Having annotated all parliamentary speeches with the metadata contained in the Comparative Legislators Database (CLD), we can now concatenate the data from the different decades, standardize the column nomenclature, and split the result into one dataframe per legislature.

In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Load the annotated speeches, one pickle file per period.
# NOTE(review): unpickling can execute arbitrary code, so only load files you produced yourself.
(
    df_1970s,
    df_1980s,
    df_1990s,
    df_2000s,
    df_2010_2015,
    df_2016_2025,
) = map(
    pd.read_pickle,
    (
        "./output_1970s.pkl",
        "./output_1980s.pkl",
        "./output_1990s.pkl",
        "./output_2000s.pkl",
        "./output_2010_2015.pkl",
        "./output_2016_2025.pkl",
    ),
)

In [None]:
# Stack the 1970s-2015 frames into one dataframe with a fresh 0..n-1 index.
# The 2016-2025 frame is appended in a later cell, once its columns have been aligned.
frames_up_to_2015 = [df_1970s, df_1980s, df_1990s, df_2000s, df_2010_2015]
df = pd.concat(frames_up_to_2015, ignore_index=True)

In [None]:
# Standardize the Hansard identifier column name.
# NOTE(review): the '_x' suffix is presumably left over from an earlier merge - confirm.
hansard_id_rename = {'hansard_id_x': 'hansard_id'}
df = df.rename(columns=hansard_id_rename)

In [None]:
# Add an empty (all-NaN) 'speaker_id' column at position 3 so this frame has the same
# column as the 2016-2025 frame it is later concatenated with.
# DataFrame.insert raises ValueError if the column already exists, so guard the call:
# this keeps the cell safe to re-run without restarting the kernel.
if 'speaker_id' not in df.columns:
    df.insert(3, 'speaker_id', np.nan)

In [None]:
# The TWFY dataset does not contain a 'twfy_member_id' tag, so we add the column at
# position 2 filled with NaN values.
# DataFrame.insert raises ValueError if the column already exists, so guard the call:
# this keeps the cell safe to re-run without restarting the kernel.
if 'twfy_member_id' not in df_2016_2025.columns:
    df_2016_2025.insert(2, 'twfy_member_id', np.nan)

In [None]:
# Standardize the Hansard identifier column name in the 2016-2025 frame as well.
# NOTE(review): the '_x' suffix is presumably left over from an earlier merge - confirm.
hansard_id_rename = {'hansard_id_x': 'hansard_id'}
df_2016_2025 = df_2016_2025.rename(columns=hansard_id_rename)

In [None]:
# Restrict the 2016-2025 frame to the standard set of columns, in this exact order.
# Selecting with a list raises KeyError if any of the columns is missing.
# NOTE(review): presumably this mirrors the column layout of the earlier frames - confirm.
standard_columns = [
    'speech_id_link', 'speaker', 'twfy_member_id', 'speaker_id',
    'hansard_id', 'text', 'date', 'legislature', 'chair', 'name_cld',
    'wikidataid', 'pageid', 'ethnicity', 'religion', 'sex', 'birth',
    'death', 'birthplace', 'deathplace', 'session', 'party',
    'constituency',
]
df_2016_2025 = df_2016_2025[standard_columns]

In [None]:
# Append the 2016-2025 frame to the earlier decades; concat aligns the two frames on their
# column names, which is why the names were standardized beforehand.
# NOTE: `df` is overwritten here, so re-running this cell appends df_2016_2025 a second
# time - use "Restart Kernel -> Run All" rather than re-executing it.
df = pd.concat([df, df_2016_2025])

# Sort chronologically and rebuild a clean 0..n-1 index.
# A stable algorithm is requested explicitly: the pandas default (quicksort) gives no
# guarantee about the relative order of rows that share a date, which would make the order
# of same-day speeches (and any ids later derived from row position) arbitrary. With a
# stable sort, same-day speeches keep the order they had in the source frames.
df = df.sort_values(by='date', kind='mergesort').reset_index(drop=True)

In [None]:
# Give every speech an identifier: its 1-based row position, zero-padded to seven digits
# and stored as a string (e.g. the first row gets '0000001').

speech_numbers = pd.Series(df.index + 1, index=df.index)
df['id'] = speech_numbers.map('{:07d}'.format)

In [None]:
# Load the PartyFacts lookup table used to attach party identifiers to the speeches.
# Only the id and name columns are kept, and 'name' is renamed to 'party' so that it can
# serve as the join key against the speech dataframe.

partyfacts_df = (
    pd.read_excel("./partyfacts_uk.xlsx")
    [["partyfacts_id", "name"]]
    .rename(columns={"name": "party"})
)

In [None]:
# Attach 'partyfacts_id' to every speech via a left join on the party name.
#
# The lookup is cleaned first so that the join can never change the number of speech rows:
#   * rows without a party name are dropped, because pandas matches null join keys to each
#     other and such a row would otherwise be attached to every speech with a missing party;
#   * repeated party names are collapsed to their first occurrence, because a left join
#     emits the left row once per matching right row and would duplicate speeches (and ids).
# validate='many_to_one' makes pandas check that the party keys in the lookup are unique.
party_lookup = (
    partyfacts_df
    .dropna(subset=['party'])
    .drop_duplicates(subset='party')
)
df = pd.merge(df, party_lookup, how='left', on='party', validate='many_to_one')

In [None]:
# Split the combined corpus into one dataframe per legislature (45 through 58).
# The frames are built in a single loop rather than fourteen copy-pasted filters, and are
# also collected in `legislature_frames`, keyed by legislature number, so that later cells
# can iterate over them. A legislature with no rows yields an empty dataframe.
legislature_frames = {
    number: df[df["legislature"] == number]
    for number in range(45, 59)  # upper bound is exclusive: legislatures 45..58
}

# Keep the individual df_45 ... df_58 names used elsewhere in the notebook.
# Dicts preserve insertion order, so the values come out in ascending legislature order.
(df_45, df_46, df_47, df_48, df_49, df_50, df_51,
 df_52, df_53, df_54, df_55, df_56, df_57, df_58) = legislature_frames.values()

In [None]:
# Each per-legislature dataframe can now be written to disk; legislature 45 serves as the
# example here. These files are the data available on the Harvard Dataverse repository.
# Note: with pandas defaults, to_csv also writes the row index as an unnamed first column.

for save, destination in (
    (df_45.to_pickle, "./output_45.pkl"),
    (df_45.to_csv, "./output_45.csv"),
):
    save(destination)
