In this file, we showcase how we used OpenAI's GPT-4o model to aid the fuzzy matching of the speaker information available in the TWFY dataset with the speaker information available in the Comparative Legislators Database (CLD). The goal is to match the speaker names from TWFY with those in the CLD. We divided this analysis by legislative period.

In [None]:
import pandas as pd
from openai import OpenAI
import os
from openai import OpenAI
import tiktoken
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re
from dotenv import load_dotenv


In [None]:
# Load variables from a local .env file (if one is found) into the process
# environment; variables that are already set are not overridden. Without this
# call, a key stored only in .env is never visible to os.getenv below.
load_dotenv()

# Create the OpenAI client using the key stored in the "api_key" variable.
client = OpenAI(api_key=os.getenv("api_key"))

In [None]:
# Load the CSV file containing the list of unique speakers recorded by the CLD
# and TWFY datasets. The data are divided by legislature, one file each.
#
# In this example, we use the file for the 46th legislature.

df = pd.read_csv("./combined_df_46.csv")

In [None]:
# We first separate the data into two different dataframes: one containing the
# CLD speaker names (with the associated metadata and unique IDs) and the other
# containing the TWFY speaker names (with the associated metadata and unique IDs).

# CLD dataset: keep the CLD identifier columns, the name and the legislator metadata.
df_cld = df[["wikidataid","pageid","name", "ethnicity", "religion", "sex", "birth", "death", "birthplace", "deathplace", "session", "party", "constituency"]]

In [None]:
# TWFY dataset: keep the speaker text, the candidate name column and the TWFY /
# Hansard identifiers. An explicit copy is taken because new columns are
# assigned to this frame later on; assigning into a bare selection of another
# frame can raise pandas' SettingWithCopyWarning.
df = df[["speaker", "name", "twfy_member_id", "hansard_id"]].copy()

In [None]:
# Now we specify the prompt that we will use to match the speaker names from
# the TWFY dataset with those in the CLD dataset. It is sent to the model as
# the system message and asks for a new column called "match".

instructions = """
Your task is to link the text recorded in the speaker column with the closest text you can find in the name column. 

You should do this by creating a new column called match and reporting the text in the name column which constitutes the closest match to the text in the speaker column. 

Do this for every row. If unsure, insert unsure
"""

In [None]:
# Test the prompt with a single request: the system message carries the
# instructions and the user message is the first value of the speaker column.
# NOTE(review): unlike classify_text, no list of candidate names is sent here.

messages = [ 
    {"role": "system", "content": instructions},
    {"role": "user", "content": df['speaker'].values[0]}
]

# Fixed seed and zero temperature, presumably to keep the reply as
# reproducible as possible across runs.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=messages,
    seed=42,
    temperature=0.0,
)

# Text of the first returned choice; the bare expression displays it in the notebook.
results = response.choices[0].message.content
results

In [None]:
# Define a function that asks the model to match one row's speaker against the list of names

def classify_text(row):
    """Ask the chat model for the closest name match for one row's speaker.

    The speaker text is sent together with every non-missing value of the
    global ``df["name"]`` column, using the module-level ``instructions``
    as the system prompt.

    Args:
        row: A mapping-like row (for example the rows passed by
            ``df.apply(..., axis=1)``) that has a ``"speaker"`` entry.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the speaker is missing or the reply has no text content.
    """
    speaker_value = row["speaker"]

    # A missing speaker would otherwise be sent as the literal text
    # "nan"/"None", spending an API call on a row that cannot be matched.
    if pd.isna(speaker_value):
        return ""

    speaker = str(speaker_value)
    all_names = df["name"].dropna().astype(str).tolist()
    name_list_str = "\n".join(all_names)

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": f"Speaker: {speaker}\nName list:\n{name_list_str}"}
    ]

    # Call the API
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        seed=42,
        temperature=0.0,
    )

    # The message content can be None (e.g. for a refusal); calling .strip()
    # on it would raise and abort the whole DataFrame.apply run.
    content = response.choices[0].message.content
    if content is None:
        return ""
    return content.strip()

In [None]:
# Query the model once per row (one API request per row) and store the raw reply.
df["match"] = df.apply(classify_text, axis=1)

In [None]:
## clean the resulting match column

def extract_match_value(match_str):
    """Extract the ``[speaker, match]`` pair from a pipe-delimited table reply.

    The third line of the reply is read as the table's data row (after a
    header line and a separator line), and the cells between its outer
    ``|`` characters are returned.

    Args:
        match_str: Raw reply text. Non-string values (``None``, ``NaN``)
            are treated as unparseable.

    Returns:
        A two-element list ``[speaker, match]`` with whitespace stripped
        from each cell, or ``[None, None]`` when the input is not a string,
        has fewer than three lines, or its third line does not contain
        exactly two cells.
    """
    # Explicit type check instead of a bare ``except`` that would also hide
    # unrelated errors.
    if not isinstance(match_str, str):
        return [None, None]

    lines = match_str.strip().split("\n")
    if len(lines) < 3:
        return [None, None]

    # Drop the empty pieces before the first and after the last "|".
    cells = [cell.strip() for cell in lines[2].strip().split("|")[1:-1]]

    # Callers unpack the result into exactly two columns, so any other row
    # shape is reported as unparseable rather than returned as-is.
    if len(cells) != 2:
        return [None, None]
    return cells

# Apply the extraction to each row's raw reply and spread the returned list
# across two new columns: match_speaker and match_name.
df[['match_speaker', 'match_name']] = df['match'].apply(extract_match_value).apply(pd.Series)

In [None]:
# Remove the original name column from the TWFY frame.
df = df.drop(columns="name")

In [None]:
# Rename the extracted match to "name" so it can serve as the key when merging with df_cld.
df = df.rename(columns={"match_name":"name"})

In [None]:
# Attach the CLD metadata to each TWFY row via the matched name. Every TWFY
# row is kept (left join). CLD rows without a name are removed first: pandas
# matches null keys against each other, so speakers with no extracted match
# would otherwise be joined to every CLD row whose name is missing.
df = pd.merge(df, df_cld.dropna(subset=["name"]), on="name", how="left")

In [None]:
# Rename the merge key to name_cld.
df = df.rename(columns={"name":"name_cld"})

In [None]:
# Write the merged table to disk. No index argument is passed, so the
# DataFrame index is written as the first column.
df.to_csv("./merge_openai_46.csv")