In [8]:
import os
import pandas as pd
import re

In [23]:
# Folder that holds one plain-text file per speech (relative to the notebook).
speeches_dir = '../data/wps_speeches'

# Keep only directory entries whose name ends in '.txt'; os.listdir returns
# names in arbitrary order, so ordering is imposed later via sorted().
speeches = [entry for entry in os.listdir(speeches_dir) if entry.endswith('.txt')]

# Accumulates one record (dict) per successfully read speech.
speeches_list = []

def sort_speeches(filename):
    """Build the sort key for a speech filename.

    Parameters
    ----------
    filename : str
        Name of a speech file.

    Returns
    -------
    tuple
        ``(session, speech_num)`` where

        * ``session`` is the text matched by ``SPV.<digits>`` with an optional
          ``Resumption<digits>`` suffix, or ``''`` when the filename has no
          such part. It is a string, so keys compare lexicographically on it.
        * ``speech_num`` is the integer following ``spch``, or ``float('inf')``
          when the filename has no speech number, which places such files
          after all numbered speeches of the same session.
    """
    session_hit = re.search(r'SPV\.\d+(Resumption\d+)?', filename)
    number_hit = re.search(r'spch(\d+)', filename)

    if session_hit is None:
        session = ''
    else:
        session = session_hit.group(0)

    if number_hit is None:
        speech_num = float('inf')
    else:
        speech_num = int(number_hit.group(1))

    return session, speech_num
    
# Order the filenames by the (session, speech number) key from sort_speeches.
sorted_speeches = list(speeches)
sorted_speeches.sort(key=sort_speeches)

for speech in sorted_speeches:
    file_path = os.path.join(speeches_dir, speech)

    # First attempt: UTF-8; the 'utf-8-sig' codec also drops a leading BOM.
    try:
        with open(file_path, encoding='utf-8-sig') as f:
            text = f.read()
    except UnicodeDecodeError:
        # Second attempt: UTF-16. If this fails for any reason the file is
        # reported and skipped, so it produces no row in the final frame.
        try:
            with open(file_path, encoding='utf-16') as f:
                text = f.read()
        except Exception as e:
            print(f"Could not read {file_path}: {e}")
            continue

    # The first run of four digits found in the filename is taken as the year.
    # It is kept as a string; None when the filename has no such run.
    # NOTE(review): assumes the year precedes any other 4-digit number in the
    # filename (e.g. a session number) - confirm against the naming scheme.
    year_match = re.search(r'(\d{4})', speech)
    year = None if year_match is None else year_match.group(1)

    # Store the text together with its metadata.
    speeches_list.append(dict(filename=speech, text=text, year=year))

# One row per speech that could be read; columns in the order text, year, filename.
speeches_df = pd.DataFrame(speeches_list, columns=['text', 'year', 'filename'])

In [24]:
# Flatten each speech onto a single line: every newline character becomes one
# space. regex=False spells out that '\n' is matched as a literal string.
speeches_df['text'] = speeches_df['text'].str.replace('\n', ' ', regex=False)

In [17]:
# Show (n_rows, n_columns); n_rows is the number of speeches that were read.
# NOTE(review): the recorded output below shows 1 column and this cell's
# execution count (17) is lower than the load cell's (23), so the output
# appears to predate the year/filename columns - re-run to refresh it.
speeches_df.shape

(4668, 1)

In [25]:
# Write the frame to CSV next to the raw data; index=False leaves the
# DataFrame's row index out of the file.
output_csv = '../data/wps_speeches.csv'
speeches_df.to_csv(output_csv, index=False)