In [3]:
import os
import pandas as pd
import re

In [4]:
# Directory that contains the speech .txt files
speeches_dir = '../data/wps_speeches'
# Keep only the directory entries whose name ends in .txt
speeches = list(filter(lambda name: name.endswith('.txt'), os.listdir(speeches_dir)))

# Records (one dict per speech) are appended here by the loading loop
speeches_list = []

def sort_speeches(filename):
    """Build the sort key for a speech file name.

    Returns a tuple ``(session, speech_num)``:
    session    -- the ``SPV.<digits>`` part of the name, including an optional
                  ``Resumption<digits>`` suffix; '' when the name has none.
    speech_num -- the integer following ``spch``; ``float('inf')`` when the
                  name has none, so such files sort last within their session.
    """
    found_session = re.search(r'SPV\.\d+(Resumption\d+)?', filename)
    found_number = re.search(r'spch(\d+)', filename)

    if found_session is None:
        session = ''
    else:
        session = found_session.group(0)

    if found_number is None:
        speech_num = float('inf')
    else:
        speech_num = int(found_number.group(1))

    return session, speech_num
    
# Order the file names by (session, speech number) using the key function above
sorted_speeches = list(speeches)
sorted_speeches.sort(key=sort_speeches)

def extract_information(text):
    """Split a raw speech into speaker metadata and the spoken text.

    The header is everything before the first colon. It is matched against
    the shape ``Speaker (first) (second)``, where the parenthesised parts are
    optional.

    Args:
        text: Full contents of one speech file.

    Returns:
        Tuple ``(speaker, country, language, gender, after_colon)``:
        speaker     -- header text before the first parenthesis, or the whole
                       header when it contains no parentheses.
        country     -- content of the first parentheses; ``None`` when those
                       parentheses hold a "spoke in ..." note instead; ''
                       when the header has no parentheses at all.
        language    -- language named in a "spoke in ..." note, otherwise
                       "English".
        gender      -- "m", "f" or "u" (unknown), derived from the leftmost
                       title found in the speaker string.
        after_colon -- everything after the first colon ('' if there is none).
    """
    # partition() yields (text, '', '') when there is no colon to split on
    header, _, after_colon = text.partition(":")
    header = header.strip()

    # extract country (if included, inside parentheses) and person (before parentheses)
    match = re.match(r"(.*?)\(([^)]+)\)(?:.*\(([^)]+)\))?", header)
    if match:
        speaker = match.group(1).strip()  # Text before the first parentheses
        first_paren, second_paren = match.group(2), match.group(3)
        if 'spoke in' in first_paren.lower():
            # The only parenthesised part is the language note, so no country is given
            language = re.sub(r'\b(spoke in)\b', '', first_paren.strip(), flags=re.IGNORECASE).strip()
            country = None
        else:
            country = first_paren.strip()
            if second_paren:
                language = re.sub(r'\b(spoke in)\b', '', second_paren.strip(), flags=re.IGNORECASE).strip()
            else:
                language = "English"
    else:
        speaker, country, language = header, '', "English"

    # sometimes USA as US, sometimes USA, normalize
    if country in ['United States', 'United States of America']:
        country = 'United States of America'

    # get gender of the speaker (if obvious).
    # Titles must stand alone as words: without the boundaries "Sir" matched
    # inside "Sirleaf", "Prince" inside "Princess" and "Miss" inside "Mission".
    # re.search returns the leftmost match, so in "Ms. King" the honorific
    # "Ms." decides rather than the surname.
    title = re.search(
        r"(?<!\w)(?:"
        r"(?P<male>Mr\.|(?:King|Prince|Lord|Sir)(?!\w))"
        r"|(?P<female>Ms\.|Mrs\.|(?:Miss|Queen|Princess|Lady|Baroness)(?!\w))"
        r")",
        speaker,
    )
    if title is None:
        gender = "u"
    elif title.group("male"):
        gender = "m"
    else:
        gender = "f"

    return speaker, country, language, gender, after_colon

# Read every speech file, extract its metadata and collect one record per speech.

# Encodings are tried in this order; the second is the fallback for files
# that cannot be decoded with the first.
encodings = ('utf-8-sig', 'utf-16')

for speech in sorted_speeches:
    # sort out duplicates
    if speech.startswith('UNSC_2010_SPV.6396Resumption1'):
        continue

    # Session number and year are taken from the file name; each stays None
    # when the name does not contain the expected pattern.
    session_match = re.search(r'SPV\.(\d+)', speech)
    session_number = session_match.group(1) if session_match else None
    year_match = re.search(r'(\d{4})', speech)
    year = year_match.group(1) if year_match else None

    file_path = os.path.join(speeches_dir, speech)
    text = None
    read_error = None
    for encoding in encodings:
        try:
            with open(file_path, encoding=encoding) as f:
                text = f.read()
        except UnicodeError as e:
            # `e` is unbound once the except block ends, so keep a reference
            # for the message printed when every encoding has failed.
            read_error = e
        else:
            break

    if text is None:
        print(f"Could not read {file_path}: {read_error}")
        continue

    speaker, country, language, gender, after_colon = extract_information(text)

    # Append both text and metadata
    speeches_list.append({'filename': speech, 'session number': session_number, 'text': text, 'year': year, 'only text': after_colon,
                          'speaker' : speaker, 'country/organization': country, 'gender': gender, 'language': language})

speeches_df = pd.DataFrame(speeches_list, columns=['filename', 'year', 'session number', 'text', 'speaker', 'country/organization', 'language', 'gender', 'only text'])

In [5]:
# Replace every newline character with a single space, column by column
for column_name in speeches_df.columns:
    flattened = speeches_df[column_name].str.replace('\n', ' ')
    speeches_df[column_name] = flattened

In [6]:
# show the table dimensions as (number of rows, number of columns)
speeches_df.shape

(4613, 9)

In [13]:
# keep only the rows whose speaker is not exactly 'The President'
speeches_df = speeches_df.loc[speeches_df['speaker'].ne('The President')]

In [14]:
# write the table to CSV, leaving out the index column
speeches_df.to_csv(path_or_buf='../data/wps_speeches.csv', index=False)