# Importing modules

In [3]:
import regex as re
import sys
import os
import pandas as pd
import datetime as dt
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from typing import Any

In [4]:
def documentImporter(file_path):

    # Imports a text file and returns its content. I am not sure if this is still used in the final verison. Version control says yes, but I don't see where.
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            TextInhalt = file.read()
        return TextInhalt
    else:
        print(f"File not found: {file_path}")
        return None

In [5]:
def dategetter(RawText):

    datelist = re.findall(r'"datum":"\d{4}-\d{2}-\d{2}"', RawText)
    date_list = []
    for n_date in datelist:
        date_str = n_date.split('"')[3]  # Extract the date string from the match
        date_list.append(dt.datetime.strptime(date_str, '%Y-%m-%d').date())

    # Only every even entry is relevant. This is the case because the ID is included in each document twice. Removing the uneven entries
    date_list = date_list[::2]
    
    return date_list


In [6]:
def name_and_party_getter(Chunk, patterns):
    for pattern in patterns:
        matches = re.findall(pattern, Chunk)
        if matches:
            return matches[0]


In [7]:
# Store patterns in dictionaries for better organization
patterns = {
    "preamble": re.compile(r'("id":"\d{4}")(.*?)Uhr.{0,150}(Alterspräsidentin|Alterspräsident|Vizepräsidentin|Vizepräsident|Vizekanzlerin|Vizekanzler|Präsidentin|Präsident|Kanzlerin|Kanzler).{0,50}?:', re.DOTALL), #I swear this makes sense!
    "appendix": re.compile(r'(\(Schluß der Sitzung: \d+(.|:)\d+ Uhr.?\)|\\nAnlagen zum Stenographischen Bericht|\\nAnlage 1)(.*?)("id":"\d{4})', re.DOTALL),
    "appendix_last": re.compile(r'(\(Schluß der Sitzung: \d+(.|:)\d+ Uhr.?\)|\\nAnlagen zum Stenographischen Bericht|\\nAnlage 1)(.*?)', re.DOTALL),    # Last appendix without "id" at the end
    "party_speaker": re.compile(r'[^\s,]+ [^\s,]+ \([^\s,]+\)\s?:', re.DOTALL),                                                                         # Generic pattern for speeches, e.g. 'Speaker (Party) :'                                           
    "party_speaker_CDU": re.compile(r'[^\s,]+ [^\s,]+ \(CDU/CSU\)\s?:', re.DOTALL),                                                                     # Specific pattern for CDU speeches
    "party_speaker_FDP_random": re.compile(r'[^\s,]+ [^\s,]+ \(F.D.P.\)\s?:', re.DOTALL),                                                               # For an ungodly reason the FDP was briefly referred to as F.D.P. 1999-2000. I suspect this is a conspiracy to sabotage my thesis and social science in general.
    "party_speaker_new": re.compile(r'\[\w+\]\s?', re.DOTALL),                                                                                          # New pattern for speeches after 2013. The previous pattern 'Speaker (Party) :' was replaced with 'Speaker [Party] :' in the Bundestag protocol.
    "party_speaker_CDU_new": re.compile(r'\[CDU+/CSU\]\s?:', re.DOTALL),                                                                                # Specific pattern for CDU speeches
    "minister_speaker": re.compile(r'(?:[^\n,]+,\s+Bundesminister(?:in)?\s+(?:der|für|des)\s+[^\n:]+:)', re.DOTALL | re.UNICODE),                      # Ministers are usually addressed with 'Bundesminister der ... i.e. Finanzen'
    "chancellor_speaker": re.compile(r', (?:(?:Bundes|Vize)?[Kk]anzlerin?):', re.DOTALL),                                                                 # Chancellor speeches are usually addressed with 'Bundeskanzlerin:' or 'Vizekanzlerin:'
    "reactions": re.compile(r'\(\w\w+ (.*?)\)', re.DOTALL),                                                                                             # Reactions are usually in the form '(Applaus)', '(Beifall)', '(Zuruf)', these simple reactions are removed here
    "remarks": re.compile(r'\((?!CDU/CSU|CDU|CSU|SPD|FDP|F.D.P.|AfD|Die Linke|Bündnis 90/Die Grünen|Bündnis 90 / Die Grüne|Die Grünen|LINKE|PDS|Piraten|NPD|REP|DVU|ÖDP|Tierschutzpartei|MLPD|DKP|BP|SSW|Fraktionslos)[^(]*?:[^()]+\)', re.DOTALL) # Excludes party markers i.e. --> Joachim Gauck (CDU) : I need to keep these to identify individual speeches which
}

keywords = [
    "Schwangerschaftsabbruch",
    "Abtreibung",
    "abgetrieben",
    "Abtreibungsgesetz",
    "Abtreibungsrecht",
    "§ 218",
    "Schwangerschaft abgebrochen",
    "Schwangerschaft",
    "Schwanger",
    "Paragraf 218",
    "Schwangerschaftskonfliktgesetz",
    "Fristenlösung",
    "Indikationsregelung",
    "Beratungspflicht",
    "Beratungsschein",
    "werdendes Leben",
    "ungeborenes Kind",
    "ungeborenes Leben",
    "werdende Mutter",
    "Strafbarkeit Schwangerschaftsabbruch",
    "Entkriminalisierung",
    "Legalisierung",
    "Abbruchsversorgung",
    "medizinische Indikation",
    "kriminologische Indikation",
    "embryopathische Indikation",
    "Schwangerschaftskonfliktberatung",
    "Ärzt*innen Schwangerschaftsabbruch",
    "Arzt Schwangerschaftsabbruch",
    "Kostenübernahme Abtreibung",
    "Gesundheitsversorgung Schwangere",
    "Komplikationen Schwangerschaftsabbruch",
    "psychologische Betreuung",
    "Selbstbestimmungsrecht",
    "reproduktive Rechte",
    "reproduktive Selbstbestimmung",
    "Frauenrechte",
    "Stigmatisierung Abbruch",
    "Tabu Abtreibung",
    "Lebensschutz",
    "ungewollt schwanger",
    "Indikationslösung",
    "Fristenlösung",
    "Schutz des ungeborenen Lebens",
    "Diskriminierung Schwangerer",
    "Versorgung ungewollt Schwangerer",
    "Bundestagsdebatte Schwangerschaftsabbruch",
    "Parlamentsdebatte Abtreibung",
    "Gesetzesentwurf Schwangerschaftsabbruch",
    "Gesetzesänderung §218",
    "Antrag Schwangerschaftsabbruch",
    "Abstimmung Schwangerschaftsabbruch",
    "Expertenkommission Schwangerschaftsabbruch",
    "Öffentlichkeit Schwangerschaftsabbruch",
    "Gruppenantrag Abtreibung",
    "Ampel-Koalition Abtreibung",
    "CDU/CSU Position Abtreibung",
    "Liberalisierung Abtreibungsrecht",
    "Werbeverbot Schwangerschaftsabbruch",
    "Paragraf 219a",
    "Beratungsregel",
    "Kompromiss Schwangerschaftsabbruch",
    "Verfassungsgericht Urteil Schwangerschaftsabbruch",
    "Spätabbruch",
    "Minderjährige Schwangere",
    "Schwangere Jugendliche",
    "religiöse Verbände Abtreibung",
    "Statistik Schwangerschaftsabbruch",
    "Wir haben abgetrieben",
    "219a Werbeverbot",
    "Schwangerschaftsabbruch EU",
    "Internationaler Vergleich Abtreibungsrecht",
    "ungewollt schwanger",
    "ungeborenes Leben",
    "Aufhebung Werbeverbot",
    "Pro Familia",
    "SPD Schwangerschaftsabbruch",
    "Grüne Schwangerschaftsabbruch",
    "FDP Schwangerschaftsabbruch",
    "Linke Schwangerschaftsabbruch",
    "AfD Schwangerschaftsabbruch",
    "Abtreibungskonflikt",
    "Kindstötung", 
    "Memmingen"
]

In [8]:
def isolate_session_content(RawText):
    '''The sequence of removing preamble (Table of content, list of appendices etc.) and appendix (Appendix, list of speakers etc.) is important.
    If the appendix is removed first, the preamble will not be removed correctly, because it relies on the presence of the appendix to identify the end of the preamble.'''
    # Remove preamble first
    textIsolated = patterns["preamble"].sub(r'\1\3', RawText)

    # Remove appendix patterns
    textIsolated = patterns["appendix"].sub(r'\4', textIsolated)
    textIsolated = patterns["appendix_last"].sub("", textIsolated)

    return textIsolated

In [9]:
def reactions_remarks_processing(text):
    remarksList = re.findall(patterns["remarks"], text)
    text = re.sub(patterns["remarks"], "", text)
    text = re.sub(patterns["reactions"], "", text)
    return text, remarksList

In [10]:
text_splitter_pre2013 = RecursiveCharacterTextSplitter(
    
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=True,

    separators=[
        patterns['party_speaker'].pattern,
        patterns['party_speaker_CDU'].pattern,
        patterns['chancellor_speaker'].pattern,
        patterns['minister_speaker'].pattern,
        patterns['party_speaker_FDP_random'].pattern, # Strange pattern that occurs in the parliamentary protocols around 1999 idk why. 
    ]
)

# The utilization of two splitter is necessary due to some structural changes in the protocols after 2013 as the patterns changed. Don't as me  why, I just work here.

text_splitter_post2013 = RecursiveCharacterTextSplitter(

    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=True,
    keep_separator=True,
    
    separators=[
    patterns['chancellor_speaker'].pattern,
    patterns['minister_speaker'].pattern,
    patterns['party_speaker_CDU'].pattern,  
    patterns['party_speaker'].pattern,
    ]
)


In [11]:
def chunk_processing(chunks):
    merged_chunks = []
    i = 0
    n = 0
    while i < len(chunks):
        current_chunk = chunks[i]

        if len(current_chunk) >= 300 or n >= 3:
            merged_chunks.append(current_chunk)
            i += 1
            n = 0

        elif len(current_chunk) < 300 and i >= 1 and n < 4:
            merged_chunks.append(chunks[i-1] + " " + current_chunk)
            # The previous structure lead to looped failure as chunks were merged in a way that they were never longer than 300 characters.
            n += 1
            i += 1

        else:
            print(
                f"Chunk length is less than 300 characters: {len(current_chunk)}")
            print("Failure in merging chunks. Tiny chunk detected.")
            n += 1

    return merged_chunks

In [12]:
def corpus_cleaner(Documents_df):
    # Reset index to ensure continuous indexing after deduplication
    Documents_df = Documents_df.reset_index(drop=True)
    
    # Clean speaker column - remove \\n patterns
    pattern = re.compile(r'(.*?)(\\n)', re.DOTALL | re.MULTILINE)
    Documents_df['speaker'] = Documents_df['speaker'].astype(str).apply(
        lambda x: pattern.sub('', x) if pd.notna(x) and x != 'nan' else x
    )
    
    # Clean chunk column - replace \\n with spaces
    Documents_df['chunk'] = Documents_df['chunk'].astype(str).apply(
        lambda x: x.replace('\\n', ' ') if pd.notna(x) and x != 'nan' else x
    )

    return Documents_df

### New superior chunkifikation approach (less chunks :D)
The previous approach had the issue that the final corpus got unreasonably bloated with a high amount of noise. This new approach focuses more on sessions that feature lots of discussions about reproductive rights. The datapreparation thus follows a first quick and dirty preselection of documents and then a more targeted LLM annotation for increased chunk validity.

In [13]:
folder = Path("../Speeches/IndividualSessions")
corpus_chunks = []
Corpus_Chunked = []

# First pass: Count keyword matches in all files
print("First pass: Counting keyword matches in all files...")
file_keyword_counts = []

for file in folder.glob('*.txt'):
    try:
        RawText = documentImporter(file)
        if RawText:
            # Count keyword matches
            total_matches = 0
            for keyword in keywords:
                matches = len(re.findall(r'\b' + re.escape(keyword.lower()) + r'\b', RawText.lower()))
                total_matches += matches
            
            file_keyword_counts.append({
                'file': file,
                'keyword_count': total_matches
            })
            print(f"{file.name}: {total_matches} keyword matches")
    except Exception as e:
        print(f"Error counting keywords in {file.name}: {e}")
        continue

# Sort files by keyword count and determine top 25%
file_keyword_counts.sort(key=lambda x: x['keyword_count'], reverse=True)
top_25_percent_count = len(file_keyword_counts) // 4
top_25_percent_files = {item['file'].name for item in file_keyword_counts[:top_25_percent_count]}

print(f"\nTop 25% files ({top_25_percent_count} files) with highest keyword matches:")
for item in file_keyword_counts[:top_25_percent_count]:
    print(f"  {item['file'].name}: {item['keyword_count']} matches")

print(f"\nRemaining 75% files ({len(file_keyword_counts) - top_25_percent_count} files) will use pattern-based context approach")

# Second pass: Process files according to their quartile
print("\nSecond pass: Processing files...")

for file in folder.glob('*.txt'):
    print(f"\nProcessing file: {file.name}")

    try:
        # Import and process the document
        RawText = documentImporter(file)
        if not RawText:
            continue

        # Extract date and content
        date_of_session = dategetter(RawText)
        IsolatedText = isolate_session_content(RawText)
        ProcessedText = reactions_remarks_processing(IsolatedText)[0]

        # Find keyword matches with positions
        matches = []
        for keyword in keywords:
            matches.extend([(m.start(), m.end(), keyword) for m in re.finditer(
                r'\b' + re.escape(keyword.lower()) + r'\b', ProcessedText.lower())])

        # Determine year for processing logic
        year = date_of_session[0].year if date_of_session else 999
        year = int(year)
        print(f"Year of session: {year}")
        print(f"Number of keyword matches found: {len(matches)}")

        # Check if file is in top 25%
        if file.name in top_25_percent_files:
            print(f"Processing as TOP 25% file (full processing)")
            
            # Full processing approach for top 25%
            if ProcessedText:
                # Split text based on year
                if year >= 2013:
                    chunks = text_splitter_post2013.split_text(ProcessedText)
                    print(f"Number of chunks (post-2013): {len(chunks)}")
                else:
                    chunks = text_splitter_pre2013.split_text(ProcessedText)
                    print(f"Number of chunks (pre-2013): {len(chunks)}")

                print(f"Processing chunks for {file.name}")
                merged_chunks = chunk_processing(chunks)
            else:
                merged_chunks = []

        else:
            print(f"Processing as BOTTOM 75% file (pattern-based context around keywords)")
            
            # Pattern-based context approach for bottom 75%
            merged_chunks = []
            if matches:
                # Sort matches by position to avoid overlaps
                matches.sort(key=lambda x: x[0])
                
                # Define speaker patterns based on year
                if year >= 2013:
                    speaker_patterns = [
                        patterns['party_speaker_new'],
                        patterns['party_speaker_CDU_new'],
                        patterns['chancellor_speaker'],
                        patterns['minister_speaker']
                    ]
                else:
                    speaker_patterns = [
                        patterns['party_speaker'],
                        patterns['party_speaker_CDU'],
                        patterns['party_speaker_FDP_random'],
                        patterns['chancellor_speaker'],
                        patterns['minister_speaker']
                    ]
                
                processed_ranges = []
                for start, end, keyword in matches:
                    # Find speaker patterns around the keyword
                    search_start = max(0, start - 2000)  # Search 2000 chars before
                    search_end = min(len(ProcessedText), end + 2000)  # Search 2000 chars after
                    search_text = ProcessedText[search_start:search_end]
                    
                    # Find all speaker pattern matches in the search area
                    pattern_matches = []
                    for pattern in speaker_patterns:
                        for match in pattern.finditer(search_text):
                            absolute_start = search_start + match.start()
                            absolute_end = search_start + match.end()
                            pattern_matches.append((absolute_start, absolute_end, match.group()))
                    
                    # Sort pattern matches by position
                    pattern_matches.sort(key=lambda x: x[0])
                    
                    # Find the pattern before and after the keyword
                    pattern_before = None
                    pattern_after = None
                    
                    for p_start, p_end, p_text in pattern_matches:
                        if p_end <= start:  # Pattern before keyword
                            pattern_before = (p_start, p_end, p_text)
                        elif p_start >= end and pattern_after is None:  # First pattern after keyword
                            pattern_after = (p_start, p_end, p_text)
                            break
                    
                    # Determine chunk boundaries
                    if pattern_before and pattern_after:
                        # Use pattern before -> keyword -> pattern after
                        chunk_start = pattern_before[0]
                        chunk_end = pattern_after[1]
                        structure_type = "pattern-keyword-pattern"
                        print(f"  Found pattern-keyword-pattern structure around '{keyword}'")
                    elif pattern_before:
                        # Use pattern before -> keyword + 500 chars
                        chunk_start = pattern_before[0]
                        chunk_end = min(len(ProcessedText), end + 500)
                        structure_type = "pattern-keyword-context"
                        print(f"  Found pattern-keyword-context structure around '{keyword}'")
                    elif pattern_after:
                        # Use 500 chars before keyword -> pattern after
                        chunk_start = max(0, start - 500)
                        chunk_end = pattern_after[1]
                        structure_type = "context-keyword-pattern"
                        print(f"  Found context-keyword-pattern structure around '{keyword}'")
                    else:
                        # Fallback to simple context
                        chunk_start = max(0, start - 500)
                        chunk_end = min(len(ProcessedText), end + 500)
                        structure_type = "context-only"
                        print(f"  Using context-only fallback around '{keyword}'")
                    
                    # Check for overlap with previously processed ranges
                    overlap = False
                    for prev_start, prev_end in processed_ranges:
                        if not (chunk_end < prev_start or chunk_start > prev_end):
                            overlap = True
                            break
                    
                    if not overlap:
                        context_chunk = ProcessedText[chunk_start:chunk_end].strip()
                        if len(context_chunk) > 50:  # Only add non-trivial chunks
                            merged_chunks.append(context_chunk)
                            processed_ranges.append((chunk_start, chunk_end))
                            print(f"  Added {structure_type} chunk around '{keyword}' ({len(context_chunk)} chars)")
            
            if not merged_chunks:
                print(f"  No keyword matches found, skipping file")
                continue

        # Add each chunk to the corpus
        for chunk in merged_chunks:
            corpus_chunks.append({
                'chunk': chunk,
                'date': date_of_session,
                'file_name': file.name,
                'processing_type': 'full' if file.name in top_25_percent_files else 'pattern_context',
                'speaker': name_and_party_getter(chunk, [
                    patterns['party_speaker'],
                    patterns['party_speaker_CDU'],
                    patterns['party_speaker_FDP_random'],
                    patterns['party_speaker_new'],
                    patterns['party_speaker_CDU_new'],
                    patterns['chancellor_speaker'],
                    patterns['minister_speaker']
                ]),
            })

        print(f"Added {len(merged_chunks)} chunks from {file.name}")

    except Exception as e:
        print(f"Error processing {file.name}: {str(e)}")
        continue

# Create DataFrame and save
print(f"\nTotal chunks collected: {len(corpus_chunks)}")

if corpus_chunks:
    Corpus_Chunked = pd.DataFrame(corpus_chunks)
    print(f"DataFrame created with shape: {Corpus_Chunked.shape}")

    # Show processing type distribution
    processing_counts = Corpus_Chunked['processing_type'].value_counts()
    print(f"Processing type distribution:")
    for proc_type, count in processing_counts.items():
        print(f"  {proc_type}: {count} chunks")

    initial_count = len(Corpus_Chunked)
    Corpus_Chunked = Corpus_Chunked.drop_duplicates(subset=['chunk'], keep='first').reset_index(drop=True)
    duplicates_removed = initial_count - len(Corpus_Chunked)
    print(f"Removed {duplicates_removed} duplicate chunks")
    print(f"DataFrame after deduplication: {Corpus_Chunked.shape}")

    # Clean the corpus
    Corpus_Chunked = corpus_cleaner(Corpus_Chunked)
    print(f"DataFrame after cleaning: {Corpus_Chunked.shape}")

    # Save to CSV
    output_file = "corpus_chunks.csv"
    Corpus_Chunked.to_csv(output_file, index=False)
    print(f"Corpus saved to {output_file}")

    # Display some info about the corpus
    print(f"\nCorpus summary:")
    print(f"- Total chunks: {len(Corpus_Chunked)}")
    print(f"- Unique files: {Corpus_Chunked['file_name'].nunique()}")
    print(f"- Date range: {Corpus_Chunked['date'].min()} to {Corpus_Chunked['date'].max()}")
    
    # Show final processing type distribution
    final_processing_counts = Corpus_Chunked['processing_type'].value_counts()
    print(f"Final processing type distribution:")
    for proc_type, count in final_processing_counts.items():
        print(f"  {proc_type}: {count} chunks")

First pass: Counting keyword matches in all files...
documents_TextDaten1985-02-01-1985-03-01.txt_7_2351.txt: 2 keyword matches
documents_TextDaten1999-11-15-1999-12-15.txt_5_1266.txt: 5 keyword matches
documents_TextDaten2022-11-15-2022-12-15.txt_9_5535.txt: 13 keyword matches
documents_TextDaten2018-05-01-2018-06-01.txt_5_4739.txt: 3 keyword matches
documents_TextDaten1973-05-01-1973-06-01.txt_7_3155.txt: 2 keyword matches
documents_TextDaten1974-11-01-1974-12-01.txt_9_3249.txt: 2 keyword matches
documents_TextDaten1990-11-15-1991-02-15.txt_3_1747.txt: 8 keyword matches
documents_TextDaten2024-06-01-2024-07-01.txt_15_5651.txt: 12 keyword matches
documents_TextDaten2020-05-01-2020-06-01.txt_1_5334.txt: 5 keyword matches
documents_TextDaten2022-09-15-2022-10-15.txt_5_5519.txt: 4 keyword matches
documents_TextDaten2021-05-01-2021-06-01.txt_3_5416.txt: 3 keyword matches
documents_TextDaten2020-05-01-2020-06-01.txt_5_5332.txt: 4 keyword matches
documents_TextDaten2019-04-01-2019-05-01.txt

# Saving the chunkified corpus

In [20]:
with open ("./CSV/Corpus_Chunkified_adjusted.csv", "w", encoding="utf-8") as f:
    Corpus_Chunked.to_csv(f, index=False)