In [18]:
# CSV with one gender label per (imdb_id, character_name_search) pair.
# It is read back as a cache when it already exists and rewritten as new
# annotations arrive.
out_path = "data/protagonist_gender.csv"

from llm_annotate import call_model
import pandas as pd
import os

In [2]:
# Load the Kaggle dataset. Later cells read its imdb_id,
# character_name_search, script_filename and title columns.
df = pd.read_csv("data/data.csv")


In [22]:
# Resume from a previous run when the output CSV is already on disk;
# otherwise start a fresh table with one blank gender per row of df.
if os.path.exists(out_path):
    genders = pd.read_csv(out_path)
else:
    blank_genders = [""] * len(df)
    genders = pd.DataFrame(
        {
            "imdb_id": df.imdb_id,
            "character_name_search": df.character_name_search,
            "gender": blank_genders,
        }
    )

def annotate_one(i, model="gpt-4.1-mini", snippet_chars=4000):
    """Ask the model for the gender of the protagonist in row ``i`` of ``df``.

    Reads the module-level ``df`` (columns ``character_name_search``,
    ``script_filename``, ``title``, ``imdb_id``) and ``genders`` tables. A row
    whose (imdb_id, character name) pair already has a non-empty gender in
    ``genders`` is skipped, so an interrupted run can be resumed without
    repeating the model call.

    Args:
        i: Positional (``iloc``) index of the row in ``df``.
        model: Model name forwarded to ``call_model``.
        snippet_chars: Number of leading characters of the script file that
            are embedded in the prompt.

    Returns:
        ``None`` when the row is already annotated, otherwise a dict with the
        keys ``imdb_id``, ``character_name_search`` and ``gender``.

    Raises:
        OSError: if the script file cannot be opened. Whatever ``call_model``
        raises is propagated unchanged.
    """
    current_name = df.character_name_search.iloc[i]
    snippet_file = df.script_filename.iloc[i]
    title = df.title.iloc[i]
    imdb_id = df.imdb_id.iloc[i]

    # Look the pair up with ONE combined mask. Testing name and id membership
    # separately lets a name that exists under a different id (and vice versa)
    # through, and indexing the resulting empty selection raises IndexError.
    cached = genders.loc[
        (genders.imdb_id == imdb_id) & (genders.character_name_search == current_name),
        "gender",
    ]
    if not cached.empty:
        previous_value = cached.iloc[0]
        if previous_value != "" and not pd.isna(previous_value):
            return None

    print(f"{i+1}/{df.shape[0]}")
    script_path = f"data/screenplay_data/data/raw_texts/raw_texts/{snippet_file}"
    with open(script_path, "r", encoding="iso-8859-1") as f:
        # Only the opening of the script is needed; do not load the whole file.
        snippet = f.read(snippet_chars)

    # NOTE: the prompt text is kept verbatim (including the "Repond" typo) so
    # that new annotations stay comparable with ones already in the cache.
    prompt = f"""Determine the gender of the protagonist in a story based on their name.
    Protagonist Name: {current_name}
    Respond with one of the following options:
    - Male
    - Female
    - Other
    - Unknown

    <Title> {title} </Title>
    <Snippet> {snippet} </Snippet>
    <Task> You're determining the gender of the character: {current_name}
    Repond with a single word answer (Male, Female, Other, Unknown). </Task>
    """

    # call_model is presumably returning the completion as a str - confirm in
    # llm_annotate. Keep the first line only and drop stray whitespace or "\r".
    response = call_model(prompt=prompt, model=model)
    gender = response.strip().split("\n")[0].strip()
    print(f"Annotated gender for {current_name}: {gender}")
    return {"imdb_id": imdb_id, "character_name_search": current_name, "gender": gender}

In [23]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Fan the rows out to a small thread pool and checkpoint the CSV after every
# new annotation, so an interrupted run keeps everything finished so far.
failed_rows = []
with ThreadPoolExecutor(max_workers=4) as executor:
    # Remember which row each future belongs to so failures can be reported.
    future_to_row = {executor.submit(annotate_one, i): i for i in range(df.shape[0])}
    for future in as_completed(future_to_row):
        row = future_to_row[future]
        try:
            result = future.result()
        except Exception as exc:
            # One bad row (missing script file, API error, ...) must not abort
            # the loop: the executor would still run every remaining call on
            # exit, but none of those results would be written. The row stays
            # blank in the CSV, so the next run retries it.
            failed_rows.append(row)
            print(f"Row {row + 1}/{df.shape[0]} failed: {exc!r}")
            continue
        if result is None:
            continue  # already annotated in a previous run

        imdb_id = result["imdb_id"]
        name = result["character_name_search"]
        gender = result["gender"]
        row_mask = (genders.imdb_id == imdb_id) & (genders.character_name_search == name)
        if row_mask.any():
            genders.loc[row_mask, "gender"] = gender
        elif pd.isna(imdb_id) or pd.isna(name):
            # NaN never compares equal, so such a key can be neither matched
            # nor looked up again later; say so instead of dropping it silently.
            print(f"Row {row + 1}/{df.shape[0]}: missing id or name, result not stored")
            continue
        else:
            # The cached CSV has no row for this pair (for example it was
            # written from an older df). Append it rather than lose the result.
            genders = pd.concat([genders, pd.DataFrame([result])], ignore_index=True)
        # Save to CSV after each result.
        genders.to_csv(out_path, index=False)
genders.to_csv(out_path, index=False)

if failed_rows:
    print(f"{len(failed_rows)} row(s) failed and remain unannotated; re-run this cell to retry them.")

202/2138203/2138
201/2138
204/2138

Annotated gender for Blade : Male
205/2138
Annotated gender for Billy Peltzer: Male
206/2138
Annotated gender for Billy Peltzer : Male
207/2138
Annotated gender for Bitti Mishra : Female
208/2138
Annotated gender for Blade: Male
209/2138
Annotated gender for Bloom : Male
210/2138
Annotated gender for Bo "Bandit" Darville : Male
211/2138
Annotated gender for Bob Barnes: Male
212/2138
Annotated gender for Blade: Male
213/2138
Annotated gender for Bob Harris: Male
214/2138
Annotated gender for Bob Arctor: Male
215/2138
Annotated gender for Bob Parr (A.K.A. Mr. Incredible): Male
216/2138
Annotated gender for Bob Wiley: Male
217/2138
Annotated gender for Bob Hughes: Male
218/2138
Annotated gender for Bob Wallace: Male
219/2138
Annotated gender for Bob Wilton : Male
220/2138
Annotated gender for Bobby : Male
221/2138
Annotated gender for Bobby Cooper : Male
222/2138
Annotated gender for Bobby Dupea: Male
223/2138
Annotated gender for Bobby Fischer : Male
2