In [6]:
import pandas as pd
import requests
import time

# User-Agent header sent with every request this notebook makes to Wikidata.
HEADERS = {"User-Agent": "CastNetBot/1.0 (academic project)"}

# Load the cast table and keep each non-missing actor name exactly once.
df = pd.read_csv("malayalam_movie_cast_dataset.csv")
actors = (
    df["actor_name"]
    .dropna()
    .unique()
)

def get_wikidata_id_by_name(name):
    """Return the Wikidata entity ID of the top search hit for ``name``.

    Sends a ``wbsearchentities`` query (English, limit 1) to the Wikidata API
    and returns the ID of the first result.

    Args:
        name: Label to search for, e.g. an actor's name.

    Returns:
        The entity ID string (such as ``"Q6122250"``) of the first hit, or
        ``None`` when the name is blank, the search has no hits, or the
        request / response could not be completed.

    Note:
        Only the top-ranked hit is used and nothing checks that it is a
        person, so an ambiguous name can resolve to an unrelated entity.
    """
    # A blank query can never match; skip the network round-trip.
    if name is None or not str(name).strip():
        return None

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "search": name,
        "language": "en",
        "format": "json",
        "limit": 1
    }
    try:
        response = requests.get(url, params=params, headers=HEADERS, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort lookup: one failed request must not abort a long batch.
        # Report it so the failure is distinguishable from "no match".
        print("  ! Wikidata search failed for", repr(name), "-", exc)
        return None

    hits = payload.get("search") if isinstance(payload, dict) else None
    if hits:
        return hits[0].get("id")
    return None

def _first_claim_value(claims, prop):
    """Return the datavalue payload of the first ``prop`` claim, or None if absent."""
    try:
        return claims[prop][0]["mainsnak"]["datavalue"]["value"]
    except (KeyError, IndexError, TypeError):
        # Property missing, empty claim list, or a snak that carries no
        # "datavalue" entry.
        return None


def _parse_gender(claims):
    """Map the first P21 claim to "Male", "Female" or "Other"; None if unavailable."""
    value = _first_claim_value(claims, "P21")
    gender_id = value.get("id") if isinstance(value, dict) else None
    if gender_id is None:
        return None
    if gender_id == "Q6581097":
        return "Male"
    if gender_id == "Q6581072":
        return "Female"
    return "Other"


def _parse_birth_year(claims):
    """Extract the signed year from the first P569 claim; None if unavailable."""
    value = _first_claim_value(claims, "P569")
    try:
        stamp = value["time"]
        # Read everything up to the first "-" after the sign instead of a
        # fixed 4-character slice, so zero-padded or longer years parse too.
        year = int(stamp.lstrip("+-").split("-", 1)[0])
    except (KeyError, TypeError, ValueError, AttributeError):
        return None
    return -year if stamp.startswith("-") else year


def get_gender_birthyear(wikidata_id):
    """Fetch the gender and birth year recorded on a Wikidata entity.

    Downloads the entity's ``Special:EntityData`` JSON and reads the first
    ``P21`` claim (gender) and the first ``P569`` claim (birth date).

    Args:
        wikidata_id: Entity ID such as ``"Q6122250"``.

    Returns:
        A ``(gender, birth_year)`` tuple. ``gender`` is ``"Male"``,
        ``"Female"``, ``"Other"`` or ``None``; ``birth_year`` is an ``int`` or
        ``None``. The two fields are parsed independently, so a missing or
        malformed value for one does not discard the other. ``(None, None)``
        is returned when the entity cannot be fetched or decoded.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        claims = response.json()["entities"][wikidata_id]["claims"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup. Only expected failures are caught, so
        # KeyboardInterrupt and programming errors still surface.
        return None, None

    return _parse_gender(claims), _parse_birth_year(claims)

# Look up every actor on Wikidata and collect one
# [actor_name, gender, birth_year] row per actor.
results = []
try:
    for i, actor in enumerate(actors, start=1):

        print("Processing:", actor," ",i,"/",len(actors))

        wikidata_id = get_wikidata_id_by_name(actor)

        if wikidata_id:
            gender, birth_year = get_gender_birthyear(wikidata_id)
            print("  ✓ Found:", wikidata_id)
        else:
            gender, birth_year = None, None
            print("  ✗ Not found")

        results.append([actor, gender, birth_year])
        time.sleep(1)  # pause between actors to throttle requests
except BaseException:
    # Each actor costs network calls plus a one-second pause, so a full run
    # is long. If it is interrupted or a lookup raises, keep the rows gathered
    # so far. They go to a separate file so a complete actor_metadata.csv from
    # an earlier run is never overwritten by a partial one.
    pd.DataFrame(results, columns=["actor_name", "gender", "birth_year"]).to_csv(
        "actor_metadata.partial.csv", index=False
    )
    print("Stopped after", len(results), "of", len(actors),
          "actors; partial results saved to actor_metadata.partial.csv")
    raise

meta_df = pd.DataFrame(results, columns=["actor_name", "gender", "birth_year"])
meta_df.to_csv("actor_metadata.csv", index=False)

print("Saved actor_metadata.csv")


Processing: Sharbani Mukherjee   1 / 3466
  ✓ Found: Q7489442
Processing: Thampi Antony   2 / 3466
  ✗ Not found
Processing: Prakash Bare   3 / 3466
  ✓ Found: Q7238167
Processing: Jagathy Sreekumar   4 / 3466
  ✓ Found: Q6122250
Processing: V. K. Sreeraman   5 / 3466
  ✓ Found: Q7906091
Processing: Babu Antony   6 / 3466
  ✓ Found: Q3595252
Processing: Augustine   7 / 3466
  ✓ Found: Q1300961
Processing: Indrans   8 / 3466
  ✓ Found: Q6026949
Processing: Irshad   9 / 3466
  ✓ Found: Q37568473
Processing: Vineeth Kumar   10 / 3466
  ✓ Found: Q16239097
Processing: Mullanezhi   11 / 3466
  ✓ Found: Q6934146
Processing: K. B. Venu   12 / 3466
  ✓ Found: Q131760104
Processing: Samvrutha Sunil   13 / 3466
  ✓ Found: Q3595584
Processing: Hima Shankar   14 / 3466
  ✓ Found: Q41634671
Processing: Sona Nair   15 / 3466
  ✓ Found: Q7560721
Processing: Geetha Vijayan   16 / 3466
  ✓ Found: Q5529984
Processing: Valsala Menon   17 / 3466
  ✓ Found: Q16107046
Processing: Sunitha Nedungadi   18 / 346

In [7]:
# Sanity check: number of distinct actor names queued for lookup.
actor_count = len(actors)
print(actor_count)

3466
