# Keyword Search

List of keywords:

In [1]:
import re
from pathlib import Path

# Search terms (German) used to find parliamentary speeches about abortion law.
# Every entry must be unique: find_files_with_keywords reports one hit per list
# entry, and the later filtering step counts the commas in that hit list to
# require more than one keyword, so a repeated entry would make a document that
# mentions a single keyword look like it mentions two.
keywords = [
    "Schwangerschaftsabbruch",
    "Abtreibung",
    "abgetrieben",
    "Abtreibungsgesetz",
    "Abtreibungsrecht",
    "§ 218",
    "Schwangerschaft abgebrochen",
    "Schwangerschaft",
    "Schwanger",
    "Paragraf 218",
    "Schwangerschaftskonfliktgesetz",
    "Fristenlösung",
    "Indikationsregelung",
    "Beratungspflicht",
    "Beratungsschein",
    "werdendes Leben",
    "ungeborenes Kind",
    "ungeborenes Leben",
    "werdende Mutter",
    "Strafbarkeit Schwangerschaftsabbruch",
    "Entkriminalisierung",
    "Legalisierung",
    "Abbruchsversorgung",
    "medizinische Indikation",
    "kriminologische Indikation",
    "embryopathische Indikation",
    "Schwangerschaftskonfliktberatung",
    "Ärzt*innen Schwangerschaftsabbruch",
    "Arzt Schwangerschaftsabbruch",
    "Kostenübernahme Abtreibung",
    "Gesundheitsversorgung Schwangere",
    "Komplikationen Schwangerschaftsabbruch",
    "psychologische Betreuung",
    "Selbstbestimmungsrecht",
    "reproduktive Rechte",
    "reproduktive Selbstbestimmung",
    "Frauenrechte",
    "Stigmatisierung Abbruch",
    "Tabu Abtreibung",
    "Lebensschutz",
    "ungewollt schwanger",
    "Indikationslösung",
    "Schutz des ungeborenen Lebens",
    "Diskriminierung Schwangerer",
    "Versorgung ungewollt Schwangerer",
    "Bundestagsdebatte Schwangerschaftsabbruch",
    "Parlamentsdebatte Abtreibung",
    "Gesetzesentwurf Schwangerschaftsabbruch",
    "Gesetzesänderung §218",
    "Antrag Schwangerschaftsabbruch",
    "Abstimmung Schwangerschaftsabbruch",
    "Expertenkommission Schwangerschaftsabbruch",
    "Öffentlichkeit Schwangerschaftsabbruch",
    "Gruppenantrag Abtreibung",
    "Ampel-Koalition Abtreibung",
    "CDU/CSU Position Abtreibung",
    "Liberalisierung Abtreibungsrecht",
    "Werbeverbot Schwangerschaftsabbruch",
    "Paragraf 219a",
    "Beratungsregel",
    "Kompromiss Schwangerschaftsabbruch",
    "Verfassungsgericht Urteil Schwangerschaftsabbruch",
    "Spätabbruch",
    "Minderjährige Schwangere",
    "Schwangere Jugendliche",
    "religiöse Verbände Abtreibung",
    "Statistik Schwangerschaftsabbruch",
    "Wir haben abgetrieben",
    "219a Werbeverbot",
    "Schwangerschaftsabbruch EU",
    "Internationaler Vergleich Abtreibungsrecht",
    "Aufhebung Werbeverbot",
    "Pro Familia",
    "SPD Schwangerschaftsabbruch",
    "Grüne Schwangerschaftsabbruch",
    "FDP Schwangerschaftsabbruch",
    "Linke Schwangerschaftsabbruch",
    "AfD Schwangerschaftsabbruch",
    "Abtreibungskonflikt",
    "Kindstötung",
    "Memmingen",
]

## Keyword function:

In [None]:
import pandas as pd

#Function for identifying files that contain listed keywords:

def find_files_with_keywords(folder_path, keywords, case_sensitive=False):
    """Scan the ``.txt`` files of a folder for whole-word keyword matches.

    Parameters
    ----------
    folder_path : str or Path
        Directory whose ``*.txt`` files are searched (non-recursive).
    keywords : iterable of str
        Literal phrases to look for. Regex metacharacters are escaped and
        repeated entries are only searched (and reported) once.
    case_sensitive : bool, default False
        Match case exactly when True; otherwise ignore case.

    Returns
    -------
    pandas.DataFrame
        One row per file with at least one hit, with the columns ``filename``
        and ``matched_keywords`` (list of the keywords found, in input order).
        The frame is empty, but still has both columns, when nothing matches
        or the folder contains no ``.txt`` files.
    """
    folder = Path(folder_path)
    flags = 0 if case_sensitive else re.IGNORECASE

    # Preserve the input order while dropping repeats, so that a keyword listed
    # twice cannot be reported twice for the same file.
    unique_keywords = list(dict.fromkeys(keywords))

    # Whole-word matching via look-arounds rather than \b: \b requires a word
    # character on one side, so a keyword that starts or ends with a non-word
    # character (for example "§ 218") could never match after a space.
    compiled = [
        (keyword, re.compile(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', flags))
        for keyword in unique_keywords
    ]

    records = []
    # Sorted so the resulting table has a stable row order between runs.
    for txt_file in sorted(folder.glob('*.txt')):
        try:
            content = txt_file.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            # Best effort: report unreadable files and carry on with the rest.
            print(f"Error reading {txt_file.name}: {e}")
            continue

        found_keywords = [keyword for keyword, pattern in compiled if pattern.search(content)]
        if found_keywords:
            records.append({'filename': txt_file.name, 'matched_keywords': found_keywords})

    # Built once, after the loop, with explicit columns so that an empty result
    # is still a well-formed table.
    return pd.DataFrame(records, columns=['filename', 'matched_keywords'])


In [5]:
if __name__ == "__main__":
    # Search the configured folder with the module-level keyword list and
    # persist the hits as a plain-text table for the next step to read back.
    folder = "/home/pc/Uni/MasterThesis/Speeches/IndividualSessions"
    matches_in_docs = find_files_with_keywords(folder, keywords, case_sensitive=False)

    Path("keyword_search_results.txt").write_text(
        matches_in_docs.to_string(index=False),
        encoding="utf-8",
    )


In [None]:
import pandas as pd

# Read the plain-text results table back in. Each line is split on "[": the
# part before it becomes the file name, the part after it the keyword list.
df = pd.read_csv(
    "keyword_search_results.txt",
    sep="[",
    header=0,
    names=["filename", "matched_keywords"],
)
matching_files = [name.strip() for name in df["filename"].tolist()]

# Remove the closing bracket left over from the list representation.
df["matched_keywords"] = df["matched_keywords"].map(lambda listed: listed.strip("]"))

# Keep documents with more than one keyword (at least one comma in the list).
# Documents with only one keyword are considered cursory mentions.
df_filtered = df[df["matched_keywords"].str.count(",") >= 1]

SelectedSessions = pd.DataFrame(df_filtered)
SelectedSessions.to_csv("FilteredSessions.csv", index=False)

**Output** = Matching files list object

In [4]:
def split_sessions_by_pattern(text, pattern):
    """Cut ``text`` into sessions, each beginning at a match of ``pattern``.

    A source document can bundle several protocol sessions, and the start of
    each one is marked by a match of ``pattern`` (for example ``"id":"1234"``).

    Parameters
    ----------
    text : str
        Full content of one source document.
    pattern : str
        Regular expression marking the beginning of a session.

    Returns
    -------
    list of str
        One string per match, running from that match up to the next match
        (or to the end of ``text`` for the last one). Text before the first
        match is not included; the list is empty when nothing matches.
    """
    starts = [marker.start() for marker in re.finditer(pattern, text)]
    # Each session ends where the next one starts; the last runs to the end.
    stops = starts[1:] + [len(text)]
    return [text[begin:stop] for begin, stop in zip(starts, stops)]


**Output** = Sessions list object

## Deleting redundant files (files not containing keywords)

In [1]:
def delete_redundant(folder, new_docs):
    """Delete every ``.txt`` file in ``folder`` whose name is not in ``new_docs``.

    Parameters
    ----------
    folder : str or Path
        Directory to clean up (only its direct ``*.txt`` entries are examined).
    new_docs : iterable
        File names to keep; each entry is compared, as a string, against the
        bare file name.

    Returns
    -------
    None
        Progress and failures are reported with ``print``; a file that cannot
        be deleted is skipped rather than aborting the clean-up.
    """
    names_to_keep = {str(name) for name in new_docs}

    for candidate in Path(folder).glob('*.txt'):
        if candidate.name in names_to_keep:
            continue
        try:
            candidate.unlink()
            print(f"Deleted: {candidate.name} (does not contain any matching document name)")
        except Exception as err:
            print(f"Could not delete {candidate.name}: {err}")


## Parsing individual protocol sessions

In [19]:
def save_selected(matching_files, folder, output_folder, new_docs=None):
    """Split each selected document into sessions and write one file per session.

    Every document is cut at the ``"id":"NNNN"`` markers (four digits) with
    ``split_sessions_by_pattern`` and each resulting session is written to
    ``output_folder`` as ``documents_<filename>_<index>_<id>.txt``.

    Parameters
    ----------
    matching_files : iterable of str
        Names of the documents inside ``folder`` to split.
    folder : str or Path
        Directory containing the source documents.
    output_folder : str or Path
        Directory receiving the session files; created if it does not exist.
    new_docs : list, optional
        List that receives the name of every session file written. When
        omitted, a module-level list called ``new_docs`` is used if one
        exists, which keeps existing three-argument calls working.

    Returns
    -------
    list of str
        Names of the session files written by this call, in writing order.
    """
    folder = Path(folder)
    output_folder = Path(output_folder)
    # exist_ok=True already tolerates an existing directory, so no prior
    # existence check is needed.
    output_folder.mkdir(parents=True, exist_ok=True)

    if new_docs is None:
        # Backward compatibility: earlier callers prepared a module-level
        # `new_docs` list before calling. Use it when present instead of
        # failing with a NameError when it is absent.
        new_docs = globals().get("new_docs")

    split_pattern = r'"id":"\d{4}"'  # Pattern to split sessions by 'id'
    id_pattern = r'"id":"(\d{4})"'   # Same marker, capturing the four digits

    written = []
    for filename in matching_files:
        # Read the whole document first so the input handle is closed before
        # any session file is written.
        content = (folder / filename).read_text(encoding='utf-8')

        for i, session in enumerate(split_sessions_by_pattern(content, split_pattern)):
            documentID = re.search(id_pattern, session)
            if documentID:
                session_file_name = f"documents_{filename}_{i}_{documentID.group(1)}.txt"
            else:
                # Fallback name for a session without a recognisable id.
                session_file_name = f"documents_{filename}_{i}.txt"

            (output_folder / session_file_name).write_text(session, encoding="utf-8")
            written.append(session_file_name)
            if new_docs is not None:
                new_docs.append(session_file_name)

    return written



In [None]:

if __name__ == "__main__":
    # save_selected records the name of every session file it writes in the
    # module-level `new_docs` list, so each run starts from an empty list.
    new_docs = []
    folder = "/home/pc/Uni/MasterThesis/Speeches"
    output_folder = "/home/pc/Uni/MasterThesis/Speeches/IndividualSessions"
    save_selected(matching_files, folder, output_folder)


## Identifying and deleting redundant documents (sessions that contain no mention of keywords)

In [None]:
if __name__ == "__main__":
    folder = "/home/pc/Uni/MasterThesis/Speeches/IndividualSessions"
    target_folder = "/home/pc/Uni/MasterThesis/Speeches/IndividualSessions"

    # NOTE(review): the result of this scan is not used by the deletion below,
    # which is driven by `df_filtered` from an earlier cell. Confirm which of
    # the two is meant to decide what gets deleted.
    matches_in_docs = find_files_with_keywords(folder, keywords, case_sensitive=False)

    # Every .txt file in target_folder whose name is not listed in df_filtered
    # is deleted.
    filenames = df_filtered.filename.str.strip().tolist()
    delete_redundant(target_folder, filenames)
