In [None]:
import os
import pandas as pd
import nltk
import re

In [None]:
# Change these to your own working directories.
# FOLDER_PATH: folder that is scanned for the input .csv files.
# OUTPUT_PATH: folder the .xlsx result files are written to.
FOLDER_PATH = "data/combined data"
OUTPUT_PATH = "applications"

In [None]:
# Combine all CSV files found directly inside FOLDER_PATH into one DataFrame.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# keep the row order (and therefore the row labels) of `df` reproducible
# between runs and machines.
file_paths = sorted(
    os.path.join(FOLDER_PATH, f)
    for f in os.listdir(FOLDER_PATH)
    if f.endswith('.csv')
)
if not file_paths:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .csv files found in {FOLDER_PATH!r}")
list_of_data_frames = [pd.read_csv(file) for file in file_paths]
# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat(list_of_data_frames, ignore_index=True)

In [None]:
# Keep only the speeches whose text mentions "living wage" or "minimum wage".
# The match is case-insensitive and rows with missing text count as non-matches.
search_phrase = r"living wage|minimum wage"
mentions_wage = df['text'].str.contains(search_phrase, case=False, na=False, regex=True)
# Copy so that later edits to relevant_df do not touch df.
relevant_df = df.loc[mentions_wage].copy()

In [None]:
# Save the filtered DataFrame (one row per matching speech) as an Excel file.
# Make sure the output folder exists first so the write does not fail on a
# fresh checkout.
os.makedirs(OUTPUT_PATH, exist_ok=True)
relevant_df.to_excel(os.path.join(OUTPUT_PATH, "minimumwage_speech_df.xlsx"), index=False)

In [None]:
# Clean up the text:
#   1. remove anything enclosed in square brackets (non-greedy, within one line);
#   2. drop the period from the abbreviation "hon." / "Hon." -- presumably so
#      the sentence tokenizer used later does not treat it as a sentence end;
#   3. strip leading and trailing whitespace.
# The abbreviation is matched with a leading word boundary (\b) so that words
# merely ending in "hon" (e.g. "marathon.") keep their sentence-ending period;
# a plain substring replace of 'hon.' would delete it.
relevant_df['text'] = (relevant_df['text']
                       .str.replace(r'\[.*?\]', '', regex=True)
                       .str.replace(r'\b([Hh]on)\.', r'\1', regex=True)
                       .str.strip())

In [None]:
# Split each relevant speech into sentences and keep only the sentences that
# mention the search phrase, recording the row label of the speech each
# sentence came from so the metadata can be joined back on afterwards.
phrase_pattern = re.compile(search_phrase, re.IGNORECASE)  # compile once, reuse per sentence
matching_sentences = []
# Only the text column is needed, so iterate over it directly instead of
# building a full row Series with iterrows().
for index, text in relevant_df['text'].items():
    if not isinstance(text, str):
        continue  # skip missing / non-string values
    for sentence in nltk.sent_tokenize(text):
        if phrase_pattern.search(sentence):
            matching_sentences.append({
                'original_index': index,
                'sentence': sentence.strip()
            })

# Name the columns explicitly so the frame keeps its schema even when no
# sentence matched; without them the merge on 'original_index' in the next
# cell would raise a KeyError. That merge is done once, in the next cell.
sentences_df = pd.DataFrame(matching_sentences, columns=['original_index', 'sentence'])


In [None]:
# Build the final sentence-level dataset: attach each speech's metadata from
# df to its matching sentences (left join on the stored row label), then
# remove the redundant columns -- the full speech text and the join key.
final_df = (
    sentences_df
    .merge(df, how='left', left_on='original_index', right_index=True)
    .drop(columns=['text', 'original_index'], errors='ignore')
)
print(final_df)


In [None]:
# Save the sentence-level DataFrame as an Excel file.
# Make sure the output folder exists first so the write does not fail on a
# fresh checkout.
os.makedirs(OUTPUT_PATH, exist_ok=True)
final_df.to_excel(os.path.join(OUTPUT_PATH, "minimumwage_sentence_df.xlsx"), index=False)