In this file, we illustrate:
- How we extracted the unique speakers from the scraped XML data of the TheyWorkForYou (TWFY) repository.
- This step is needed so that these data can later be combined with data from the Comparative Legislators Database (CLD).


In [None]:
## Load libraries

from bs4 import BeautifulSoup, SoupStrainer
import urllib.request 
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import xml.etree.ElementTree as ET

In [None]:
# Load the debates table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source — confirm its provenance.
df = pd.read_pickle("uk_debates_1970_1979.pkl")

In [None]:
# Pull the first YYYY-MM-DD substring out of every `speech_id_link` value and
# store it (still as text) in a new `date` column; links without such a
# substring get a missing value.
df['date'] = df['speech_id_link'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)



In [None]:
# Legislatures with their start and end dates, written compactly as
# (number, start, end) rows and expanded into one dict per legislature.
# Dates are ISO "YYYY-MM-DD" strings at this point; each listed end date is
# the day before the next row's start date.
# NOTE(review): the last row (58) has no following row to check its end date
# against — confirm that 2024-05-22 is the intended cut-off.
legislatures = [
    {"number": number, "start": start, "end": end}
    for number, start, end in (
        (45, "1970-06-18", "1974-02-27"),
        (46, "1974-02-28", "1974-10-09"),
        (47, "1974-10-10", "1979-05-02"),
        (48, "1979-05-03", "1983-06-08"),
        (49, "1983-06-09", "1987-06-10"),
        (50, "1987-06-11", "1992-04-08"),
        (51, "1992-04-09", "1997-04-30"),
        (52, "1997-05-01", "2001-06-06"),
        (53, "2001-06-07", "2005-05-04"),
        (54, "2005-05-05", "2010-05-05"),
        (55, "2010-05-06", "2015-05-06"),
        (56, "2015-05-07", "2017-06-07"),
        (57, "2017-06-08", "2019-12-11"),
        (58, "2019-12-12", "2024-05-22"),
    )
]

In [None]:
# Swap the "start"/"end" values of every legislature record for their
# `pd.to_datetime` equivalents, updating the records in place.
for leg in legislatures:
    leg.update({bound: pd.to_datetime(leg[bound]) for bound in ("start", "end")})


In [None]:
# Parse the `date` column with pandas so it can be compared against the
# legislature bounds.
df["date"] = df["date"].pipe(pd.to_datetime)


In [None]:
def get_legislature(date, periods=None):
    """Return the number of the legislature whose term contains ``date``.

    Parameters
    ----------
    date : datetime-like scalar
        The day to look up. It must be comparable with the ``"start"`` and
        ``"end"`` values of the period records.
    periods : iterable of dict, optional
        Records with ``"number"``, ``"start"`` and ``"end"`` keys. When
        omitted, the notebook-level ``legislatures`` list is used (looked up
        at call time), which keeps existing one-argument calls working.

    Returns
    -------
    The ``"number"`` of the first record for which
    ``start <= date <= end`` holds (both bounds inclusive), or ``None`` when
    ``date`` is missing or lies outside every record.
    """
    # A missing date (None / NaN / NaT) cannot fall inside any term; answer
    # early so that a plain ``None`` does not raise on comparison.
    if pd.isna(date):
        return None

    if periods is None:
        periods = legislatures

    for period in periods:
        if period["start"] <= date <= period["end"]:
            return period["number"]
    return None

# Look up the legislature number for every row's date, one value at a time,
# and store the result in a new `legislature` column.
df["legislature"] = df["date"].map(get_legislature)


In [None]:
# Keep only rows dated strictly after 1970-06-17, i.e. from the first day of
# legislature 45 onwards; rows whose date is missing fail the comparison and
# are dropped as well.
df = df.loc[df["date"].gt("1970-06-17")]

In [None]:
# Split the rows by legislature number: one frame each for 45, 46, 47 and 48.
df_45, df_46, df_47, df_48 = (
    df[df["legislature"] == number] for number in (45, 46, 47, 48)
)

In [None]:
# Within each legislature, keep only the first row seen for every
# `twfy_member_id`, leaving one row per distinct member id.
u_df_45, u_df_46, u_df_47, u_df_48 = (
    frame.drop_duplicates(subset=["twfy_member_id"])
    for frame in (df_45, df_46, df_47, df_48)
)

In [None]:
# Write each legislature's unique-speaker frame to its own CSV file in the
# working directory, in legislature order.
for csv_path, speakers in {
    "unique_speaker_45.csv": u_df_45,
    "unique_speaker_46.csv": u_df_46,
    "unique_speaker_47.csv": u_df_47,
    "unique_speaker_48.csv": u_df_48,
}.items():
    speakers.to_csv(csv_path)

Once we have data on the unique speakers, we need to merge it with data from the Comparative Legislators Database (CLD).