In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
# Two-digit cycle suffix interpolated into every data path in this notebook
# (presumably the 2020 election cycle -- confirm against the data directory).
year = 20
# NOTE: `df` is only a path string at this point; the same name is rebound to a
# DataFrame further down when ../output/USIN.csv is loaded.
df = f"../data/CampaignFin{year}/indivs{year}.txt"
# Per-donor summary CSV for the same cycle; read into `donors` by the next cell.
donors_csv = f"../data/CampaignFin{year}/donors_state{year}.csv"

In [3]:
# Load the per-donor summary and derive lower-cased, stripped name parts from the
# comma-separated `name` field: the text before the first comma becomes `lastname`,
# the text after the last comma becomes `firstname`.
donors = pd.read_csv(donors_csv)
name_parts = donors["name"].astype(str).str.split(",")
# A name without any comma yields the same token for both columns.
donors["firstname"] = name_parts.str[-1].str.lower().str.strip()
donors["lastname"] = name_parts.str[0].str.lower().str.strip()
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,firstname,lastname
0,,ACTBLUE,actblue actblue,,,Y4000,,,,WASHINGTON,CA,1261253000.0,25821,48846.03,1000.0,actblue,actblue
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,[Candidate Contribution],,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1127731000.0,958,1177172.0,682.5,michael r,bloomberg
2,U00000036521,"STEYER, TOM",tom steyer,[Candidate Contribution],,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,379478200.0,756,501955.3,2800.0,tom,steyer
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,F7000,M,INVESTMENTS,SELF-EMPLOYED,SARATOGA,WY,45133560.0,23,1962328.0,2800.0,timothy,mellon
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON CLINIC,LAS VEGAS,NV,44999550.0,124,362899.6,2800.0,miriam,adelson
5,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44847950.0,119,376873.5,2800.0,sheldon g,adelson
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M7000,M,CEO,ULINE,LAKE FOREST,IL,35364330.0,319,110860.0,2800.0,richard,uihlein
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Citadel LLC,,F2700,M,FOUNDER CEO,CITADEL LLC,CHICAGO,IL,33667630.0,188,179083.2,2800.0,kenneth,griffin
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN,BLACKSTONE,NEW YORK,NY,33454000.0,226,148026.5,2800.0,stephen a,schwarzman
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,Karla T Jurvetson MD,,H1110,F,PHYSICIAN,SELF,LOS ALTOS,CA,33088100.0,914,36201.42,2800.0,karla,jurvetson


In [4]:
# Precomputed per-name ratio tables. The cells below read their `firstname` /
# `lastname` and `ratio` columns to score and flag donors.
firstname_ratios = pd.read_csv("../output/USIN_firstnames_ratios.csv")
lastname_ratios = pd.read_csv("../output/USIN_lastnames_ratios.csv")

In [5]:
# Build name -> ratio lookups keyed by the normalised (stripped, lower-cased) name.
firstname_keys = firstname_ratios['firstname'].str.strip().str.lower()
lastname_keys = lastname_ratios['lastname'].str.strip().str.lower()
firstname_ratio_dict = firstname_ratios.set_index(firstname_keys)['ratio'].to_dict()
lastname_ratio_dict = lastname_ratios.set_index(lastname_keys)['ratio'].to_dict()

# Score every donor; a name that is missing from a table contributes 0.
firstname_score = donors['firstname'].map(firstname_ratio_dict).fillna(0)
lastname_score = donors['lastname'].map(lastname_ratio_dict).fillna(0)
donors['combined_ratio'] = firstname_score + lastname_score

In [6]:
# Threshold the ratio tables into four name sets:
#   * indian_*   -- names whose ratio is high (>= 8 for first names, >= 5 for last names)
#   * unindian_* -- names whose ratio is very low (<= 0.05)
# Names are stripped as well as lower-cased so the sets use the same normalisation as
# the ratio lookup dictionaries and the donor name columns they are compared against.
indian_firstnames = set(firstname_ratios[firstname_ratios["ratio"] >= 8]["firstname"].str.strip().str.lower())
indian_lastnames = set(lastname_ratios[lastname_ratios["ratio"] >= 5]["lastname"].str.strip().str.lower())
unindian_firstnames = set(firstname_ratios[firstname_ratios["ratio"] <= 0.05]["firstname"].str.strip().str.lower())
unindian_lastnames = set(lastname_ratios[lastname_ratios["ratio"] <= 0.05]["lastname"].str.strip().str.lower())

In [7]:
# Membership of each donor's name parts in the thresholded name sets.
first_in_indian = donors["firstname"].str.lower().isin(indian_firstnames)
last_in_indian = donors["lastname"].str.lower().isin(indian_lastnames)
first_in_unindian = donors["firstname"].str.lower().isin(unindian_firstnames)
last_in_unindian = donors["lastname"].str.lower().isin(unindian_lastnames)

# A strong first (last) name counts unless the other name part is strongly non-Indian.
donors["indian_first"] = first_in_indian & ~last_in_unindian
donors["indian_last"] = last_in_indian & ~first_in_unindian

# Final label: a high combined score OR either single-name rule firing.
donors["indian"] = (donors["combined_ratio"] >= 15) | donors["indian_first"] | donors["indian_last"]
donors["indian"].value_counts()

indian
False    3545523
True       43413
Name: count, dtype: int64

In [8]:
# https://github.com/philipperemy/name-dataset
'''df_us = pd.read_csv("../data/US.csv")
df_us.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_us['firstname'] = df_us['firstname'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())
df_us['lastname'] = df_us['lastname'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())
df_us['name'] = df_us['firstname'].apply(lambda x: x.lower()) + ' ' + df_us['lastname'].apply(lambda x: x.lower())
df_us["indian"] = df_us["ethnicity"].apply(lambda x: False)

df_us = df_us[
    (df_us['firstname'].str.match(r'^[A-Za-z]+$', na=False)) & 
    (df_us['firstname'].str.len() > 1) &
    (df_us['firstname'].str.lower() != 'nan') &
    (df_us['lastname'].str.match(r'^[A-Za-z]+$', na=False)) &
    (df_us['lastname'].str.len() > 1) &
    (df_us['lastname'].str.lower() != 'nan')
]

df_us = df_us[['firstname', 'lastname', 'name', 'indian']]
df_us.head(10)'''

'df_us = pd.read_csv("../data/US.csv")\ndf_us.columns = [\'firstname\', \'lastname\', \'gender\', \'ethnicity\']\ndf_us[\'firstname\'] = df_us[\'firstname\'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())\ndf_us[\'lastname\'] = df_us[\'lastname\'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())\ndf_us[\'name\'] = df_us[\'firstname\'].apply(lambda x: x.lower()) + \' \' + df_us[\'lastname\'].apply(lambda x: x.lower())\ndf_us["indian"] = df_us["ethnicity"].apply(lambda x: False)\n\ndf_us = df_us[\n    (df_us[\'firstname\'].str.match(r\'^[A-Za-z]+$\', na=False)) & \n    (df_us[\'firstname\'].str.len() > 1) &\n    (df_us[\'firstname\'].str.lower() != \'nan\') &\n    (df_us[\'lastname\'].str.match(r\'^[A-Za-z]+$\', na=False)) &\n    (df_us[\'lastname\'].str.len() > 1) &\n    (df_us[\'lastname\'].str.lower() != \'nan\')\n]\n\ndf_us = df_us[[\'firstname\', \'lastname\', \'name\', \'indian\']]\ndf_us.head(10)'

In [9]:
'''total_names = len(df_us)

firstname_counts = df_us['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})
firstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_us)) * 100

lastname_counts = df_us['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})
lastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_us)) * 100

df_us = df_us.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')
df_us = df_us.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')
df_us'''

"total_names = len(df_us)\n\nfirstname_counts = df_us['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})\nfirstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_us)) * 100\n\nlastname_counts = df_us['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})\nlastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_us)) * 100\n\ndf_us = df_us.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')\ndf_us = df_us.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')\ndf_us"

In [10]:
'''# https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("../data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_indian['firstname'] = df_indian['firstname'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())
df_indian['lastname'] = df_indian['lastname'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())
df_indian['name'] = df_indian['firstname'].apply(lambda x: x.lower()) + ' ' + df_indian['lastname'].apply(lambda x: x.lower())
df_indian["indian"] = df_indian["ethnicity"].apply(lambda x: True)

df_indian = df_indian[
    (df_indian['firstname'].str.match(r'^[A-Za-z]+$', na=False)) & 
    (df_indian['firstname'].str.len() > 1) &
    (df_indian['firstname'].str.lower() != 'nan') &
    (df_indian['lastname'].str.match(r'^[A-Za-z]+$', na=False)) &
    (df_indian['lastname'].str.len() > 1) &
    (df_indian['lastname'].str.lower() != 'nan')
]

df_indian = df_indian[['firstname', 'lastname', 'name', 'indian']]
df_indian.head(10)'''

'# https://github.com/philipperemy/name-dataset\ndf_indian = pd.read_csv("../data/IN.csv")\ndf_indian.columns = [\'firstname\', \'lastname\', \'gender\', \'ethnicity\']\ndf_indian[\'firstname\'] = df_indian[\'firstname\'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())\ndf_indian[\'lastname\'] = df_indian[\'lastname\'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())\ndf_indian[\'name\'] = df_indian[\'firstname\'].apply(lambda x: x.lower()) + \' \' + df_indian[\'lastname\'].apply(lambda x: x.lower())\ndf_indian["indian"] = df_indian["ethnicity"].apply(lambda x: True)\n\ndf_indian = df_indian[\n    (df_indian[\'firstname\'].str.match(r\'^[A-Za-z]+$\', na=False)) & \n    (df_indian[\'firstname\'].str.len() > 1) &\n    (df_indian[\'firstname\'].str.lower() != \'nan\') &\n    (df_indian[\'lastname\'].str.match(r\'^[A-Za-z]+$\', na=False)) &\n    (df_indian[\'lastname\'].str.len() > 1) &\n    (df_indian[\'lastname\'].str.lower() !=

In [11]:
'''total_names = len(df_indian)

firstname_counts = df_indian['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})
firstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_indian)) * 100

lastname_counts = df_indian['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})
lastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_indian)) * 100

df_indian = df_indian.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')
df_indian = df_indian.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')
df_indian'''

"total_names = len(df_indian)\n\nfirstname_counts = df_indian['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})\nfirstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_indian)) * 100\n\nlastname_counts = df_indian['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})\nlastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_indian)) * 100\n\ndf_indian = df_indian.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')\ndf_indian = df_indian.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')\ndf_indian"

In [12]:
'''df_combined = pd.concat([df_us, df_indian], ignore_index=True)
df_combined.to_csv("../output/USIN.csv", index=False)'''

'df_combined = pd.concat([df_us, df_indian], ignore_index=True)\ndf_combined.to_csv("../output/USIN.csv", index=False)'

In [13]:
# Load the combined name table from ../output/USIN.csv (the file the disabled
# cells above would write).
# NOTE: this rebinds `df`, which earlier in the notebook held a file-path string.
df = pd.read_csv("../output/USIN.csv")
df

Unnamed: 0,firstname,lastname,name,indian,firstname_count,firstname_rate,lastname_count,lastname_rate
0,Brandon,Sylvester,brandon sylvester,False,58421,0.189127,1272,0.004118
1,Chris,Toussaint,chris toussaint,False,131039,0.424215,1691,0.005474
2,Willie,Gotti,willie gotti,False,10987,0.035568,693,0.002243
3,Cristobal,Corona,cristobal corona,False,2640,0.008547,9672,0.031311
4,Wilmer,Diaz,wilmer diaz,False,4269,0.013820,91634,0.296648
...,...,...,...,...,...,...,...,...
36734983,Vikas,Chakchanpur,vikas chakchanpur,True,8871,0.151765,1,0.000017
36734984,Dipu,Gupta,dipu gupta,True,1526,0.026107,43396,0.742419
36734985,Riya,Naharwal,riya naharwal,True,6367,0.108927,8,0.000137
36734986,Jashandeep,Hanjra,jashandeep hanjra,True,17,0.000291,43,0.000736


In [14]:
def _smoothed_indian_share(frame, key, smoothing):
    """Per-row share of Indian-labelled rows among all rows sharing the same ``key`` value.

    For every row the result is ``indian_rows / (group_rows + smoothing)``, where both
    counts are taken over the group of rows whose ``frame[key]`` equals the row's own.
    Groups without any Indian-labelled row evaluate to 0. Rows whose key is missing are
    not assigned a value here (the caller fills them with 0).
    """
    indian_flag = (frame['indian'] == True).astype(int)
    by_key = indian_flag.groupby(frame[key])
    indian_rows = by_key.transform('sum')
    group_rows = by_key.transform('count')
    return indian_rows / (group_rows + smoothing)


def create_train_features(train_data_label, output_file_path="USIN_features.csv"):
    """Build name-based frequency features for the labelled donor file of one cycle.

    Reads ``../output/yearly/donors_{train_data_label}_pred_lastname.csv``, derives
    prefix / suffix / full-name Indian-share features plus low-frequency indicators,
    writes the result to ``../data/donors{train_data_label}_{output_file_path}`` and
    returns it.

    Parameters
    ----------
    train_data_label : str
        Label interpolated into the input and output file names.
    output_file_path : str
        File-name suffix of the output CSV (not a full path).

    Returns
    -------
    pandas.DataFrame
        One row per input row that has ``firstname``, ``lastname`` and ``indian`` set,
        restricted to the feature columns listed in ``final_columns``.

    Notes
    -----
    Every ``pop_*`` feature is computed from the ``indian`` label of the same rows it
    is attached to, so the features carry label information for those rows.
    """
    # --------------------
    # Load Data
    # --------------------
    path = f"../output/yearly/donors_{train_data_label}_pred_lastname.csv"
    train_data = pd.read_csv(path)

    # ids number the rows of the raw file, so they stay stable when rows are dropped below.
    train_data['id'] = range(1, len(train_data) + 1)

    # Drop incomplete rows BEFORE the string cast: str(NaN) is the text "nan", which
    # would otherwise slip past dropna and be treated as a real first name.
    train_data = train_data.dropna(subset=['firstname', 'lastname', 'indian']).reset_index(drop=True)
    # Keep only the first whitespace-separated token of the first name.
    train_data['firstname'] = train_data['firstname'].apply(lambda x: str(x).split(" ")[0])

    # --------------------
    # Cutpoints
    # --------------------
    c = 1       # additive smoothing in the denominator of every share feature
    cutl = 10   # names seen fewer than this many times are flagged as low-frequency

    # --------------------
    # Feature Creation
    # --------------------

    # 1. First four letters of the first/last name
    train_data['first_name_f4'] = train_data['firstname'].str[:4]
    train_data['last_name_f4'] = train_data['lastname'].str[:4]
    train_data['pop_fn_indian_f4'] = _smoothed_indian_share(train_data, 'first_name_f4', c)
    # Uses the same smoothing as every other share feature.
    train_data['pop_ln_indian_f4'] = _smoothed_indian_share(train_data, 'last_name_f4', c)
    train_data = train_data.fillna(0)

    # 2. Last four letters of the first/last name
    train_data['first_name_l4'] = train_data['firstname'].str[-4:]
    train_data['last_name_l4'] = train_data['lastname'].str[-4:]
    train_data['pop_fn_indian_l4'] = _smoothed_indian_share(train_data, 'first_name_l4', c)
    train_data['pop_ln_indian_l4'] = _smoothed_indian_share(train_data, 'last_name_l4', c)
    train_data = train_data.fillna(0)

    # 3. Full first/last name
    train_data['pop_fn_indian'] = _smoothed_indian_share(train_data, 'firstname', c)
    train_data['pop_ln_indian'] = _smoothed_indian_share(train_data, 'lastname', c)
    train_data = train_data.fillna(0)

    # Indicator Low Frequency of Name
    train_data['first_name_low'] = (train_data.groupby('firstname')['firstname'].transform('count') < cutl).astype(int)
    train_data['last_name_low'] = (train_data.groupby('lastname')['lastname'].transform('count') < cutl).astype(int)

    # Best Evidence: the stronger of the two full-name shares
    train_data['best_evidence_indian'] = train_data[['pop_ln_indian', 'pop_fn_indian']].max(axis=1)

    # Select final columns
    final_columns = [
        'id', 'firstname', 'lastname', 'indian',
        'first_name_f4', 'first_name_l4',
        'last_name_f4', 'last_name_l4',
        'pop_ln_indian', 'pop_fn_indian',
        'best_evidence_indian',
        'pop_ln_indian_f4', 'pop_fn_indian_f4',
        'pop_ln_indian_l4', 'pop_fn_indian_l4',
        'last_name_low', 'first_name_low'
    ]

    train_data = train_data[final_columns]

    # Save output
    output_file_path = f"../data/donors{train_data_label}_{output_file_path}"
    train_data.to_csv(output_file_path, index=False)

    return train_data

In [15]:
# Feature generation is left disabled; the cached CSV is read instead. The path below
# matches the file create_train_features writes for this `year` with its default suffix.
# create_train_features(str(year))
output_path = f"../data/donors{year}_USIN_features.csv"
features = pd.read_csv(output_path)
features

Unnamed: 0,id,firstname,lastname,indian,first_name_f4,first_name_l4,last_name_f4,last_name_l4,pop_ln_indian,pop_fn_indian,best_evidence_indian,pop_ln_indian_f4,pop_fn_indian_f4,pop_ln_indian_l4,pop_fn_indian_l4,last_name_low,first_name_low
0,1,actblue,actblue,False,actb,blue,actb,blue,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,1
1,2,michael,bloomberg,False,mich,hael,bloo,berg,0.000000,0.001818,0.001818,0.000000,0.001930,0.000404,0.001825,0,0
2,3,tom,steyer,False,tom,tom,stey,eyer,0.000000,0.000903,0.000903,0.000000,0.000903,0.000175,0.000903,0,0
3,4,timothy,mellon,False,timo,othy,mell,llon,0.000000,0.001856,0.001856,0.005252,0.001843,0.044110,0.002094,0,0
4,5,miriam,adelson,False,miri,riam,adel,lson,0.000000,0.000805,0.000805,0.002618,0.000775,0.000659,0.003446,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588744,3588932,elizabeth,english,False,eliz,beth,engl,lish,0.001623,0.002059,0.002059,0.001106,0.002022,0.002060,0.002251,0,0
3588745,3588933,linda,rapp,False,lind,inda,rapp,rapp,0.000000,0.001796,0.001796,0.000000,0.001779,0.000000,0.002167,0,0
3588746,3588934,julie,oldham,False,juli,ulie,oldh,dham,0.000000,0.001888,0.001888,0.000000,0.002424,0.009736,0.001882,0,0
3588747,3588935,nancy,martin,False,nanc,ancy,mart,rtin,0.000873,0.001560,0.001560,0.000792,0.001580,0.000780,0.001601,0,0


In [28]:
# Sentinel class that stands in for any name an encoder was not fitted on.
_UNKNOWN_NAME = 'unknown'


def _encode_name_column(names, encoder):
    """Integer-encode one name column, fitting a new encoder when none is supplied.

    Returns ``(names, codes, encoder)`` where ``names`` is the column actually encoded
    (names the encoder has never seen are replaced by the ``'unknown'`` sentinel).

    Raises
    ------
    ValueError
        If a supplied encoder meets unseen names but has no ``'unknown'`` class.
    """
    names = names.fillna(_UNKNOWN_NAME)

    if encoder is None:
        encoder = LabelEncoder()
        # Always reserve a slot for the sentinel so that data encoded later with this
        # encoder can fall back to it for unseen names.
        encoder.fit(pd.concat([names, pd.Series([_UNKNOWN_NAME])], ignore_index=True))
        return names, encoder.transform(names), encoder

    # Set membership instead of scanning the classes array once per row.
    known = set(encoder.classes_)
    unseen = ~names.isin(known)
    if unseen.any():
        if _UNKNOWN_NAME not in known:
            raise ValueError(
                f"{int(unseen.sum())} name(s) are not in the encoder vocabulary and the "
                f"encoder has no '{_UNKNOWN_NAME}' class to map them to"
            )
        names = names.mask(unseen, _UNKNOWN_NAME)
    return names, encoder.transform(names), encoder


def process_data_set_indian(features, num_rows=100000, full_data=False, first_name_encoder=None, last_name_encoder=None):
    """Turn the feature table into model inputs with integer-encoded first/last names.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain ``firstname``, ``lastname``, ``indian`` and ``id``. Not modified.
    num_rows : int
        Size of the random sample (seed 42) taken when ``full_data`` is False; capped at
        the number of available rows.
    full_data : bool
        When True every row is used and ``num_rows`` is ignored.
    first_name_encoder, last_name_encoder : LabelEncoder or None
        Pass fitted encoders to encode new data consistently (unseen names map to
        ``'unknown'``); leave as None to fit new encoders on this data.

    Returns
    -------
    tuple
        ``(X, y, main_features, first_name_encoder, last_name_encoder)`` where ``X`` holds
        the two encoded-name columns and ``y`` the integer ``indian`` label.
    """
    # dropna/fillna return new frames, so the caller's DataFrame is left untouched.
    features = features.dropna(subset=['firstname', 'lastname', 'indian']).fillna(0)

    if not full_data:
        # Cap the request so small inputs do not make sample() raise.
        features = features.sample(n=min(num_rows, len(features)), random_state=42)

    features = features.reset_index(drop=True)

    # Encode both name columns with the same procedure.
    first_names, X_first_name, first_name_encoder = _encode_name_column(features['firstname'], first_name_encoder)
    last_names, X_last_name, last_name_encoder = _encode_name_column(features['lastname'], last_name_encoder)
    features['firstname'] = first_names
    features['lastname'] = last_names

    # Add encoded names as features
    features['first_name_encoded'] = X_first_name
    features['last_name_encoded'] = X_last_name

    # Keep relevant columns
    main_features = features[['firstname', 'lastname', 'indian', 'id', 'first_name_encoded', 'last_name_encoded']]
    y = main_features['indian'].astype(int)
    X = main_features.drop(['id', 'firstname', 'lastname', 'indian'], axis=1, errors='ignore')

    print(f"X shape = {X.shape}, y shape = {y.shape}")
    print(f"Indian count: {y.sum()}, Non-Indian count: {len(y) - y.sum()}")

    return X, y, main_features, first_name_encoder, last_name_encoder

In [29]:
# full_data=True keeps every row, so the num_rows argument is ignored here. No encoders
# are passed in, so both label encoders are fitted on this data and returned.
X, y, main_features, first_name_encoder, last_name_encoder = process_data_set_indian(features, num_rows=1, full_data=True)
X

X shape = (3587209, 2), y shape = (3587209,)
Indian count: 43220, Non-Indian count: 3543989


Unnamed: 0,first_name_encoded,last_name_encoded
0,656,2160
1,59248,39073
2,88498,383238
3,88136,262404
4,60162,2597
...,...,...
3587204,24747,115509
3587205,51619,325882
3587206,42924,292638
3587207,62628,251703


In [52]:
def train_indian_classifier(train_data_label, X_train, y_train, first_name_encoder, last_name_encoder, epochs=5):
    """Train a small embedding + MLP binary classifier on encoded first/last names.

    The network embeds the ``first_name_encoded`` and ``last_name_encoded`` columns
    (32 dimensions each), concatenates them with any remaining columns of ``X_train``,
    and feeds the result through ``Linear -> ReLU -> Linear -> Sigmoid``. It is trained
    with Adam (lr 0.001) on binary cross-entropy in shuffled batches of 1024.

    Every 1024 iterations the mean training loss of the last window is logged. If it is
    the best so far a ``*_best.pt`` checkpoint is written; after 5 consecutive windows
    without improvement training stops early (ending the epoch loop as well).

    Parameters
    ----------
    train_data_label : str
        Label used in the checkpoint and figure file names.
    X_train : pandas.DataFrame
        Must contain ``first_name_encoded`` and ``last_name_encoded``; any other columns
        are treated as numeric inputs.
    y_train : pandas.Series
        Binary labels.
    first_name_encoder, last_name_encoder
        Fitted encoders; only ``classes_`` is read (vocabulary size, checkpoint content).
    epochs : int
        Maximum number of passes over the training data.

    Returns
    -------
    dict
        The trained modules under ``first_name_embedding``, ``last_name_embedding``,
        ``linear1`` and ``linear2``.

    Side effects: writes checkpoints under ``./models/`` and two PNG figures under
    ``../images/``; both directories must already exist.
    """
    print("date and time =", datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    best_loss = float("inf")
    patience = 5
    patience_counter = 0
    early_stop = False
    batch_size = 1024
    log_every = 1024  # iterations per logging / early-stopping window

    first_name_vocab_size = len(first_name_encoder.classes_)
    last_name_vocab_size = len(last_name_encoder.classes_)
    print(f"First name vocabulary size: {first_name_vocab_size}")
    print(f"Last name vocabulary size: {last_name_vocab_size}")

    # NOTE: the encoded ids travel through this float32 tensor and are cast back to
    # long per batch; float32 represents integers exactly only below 2**24.
    X_numeric = X_train.values.astype(np.float32)
    print(f"Using {X_numeric.shape[1]} numeric features + character embeddings")

    X_numeric_tensor = torch.tensor(X_numeric, dtype=torch.float32)
    y_tensor = torch.tensor(y_train.values.astype(np.float32), dtype=torch.float32)
    first_name_encoded_idx = X_train.columns.get_loc('first_name_encoded')
    last_name_encoded_idx = X_train.columns.get_loc('last_name_encoded')
    # Columns that are fed to the network directly; computed once instead of per batch.
    numeric_cols = [idx for idx in range(X_numeric.shape[1])
                    if idx not in (first_name_encoded_idx, last_name_encoded_idx)]

    dataset = TensorDataset(X_numeric_tensor, y_tensor)
    train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    numeric_dim = X_numeric.shape[1] - 2
    embedding_dim = 32
    hidden_dim = 64

    first_name_embedding = nn.Embedding(num_embeddings=first_name_vocab_size + 10, embedding_dim=embedding_dim)
    last_name_embedding = nn.Embedding(num_embeddings=last_name_vocab_size + 10, embedding_dim=embedding_dim)
    linear1 = nn.Linear(numeric_dim + 2 * embedding_dim, hidden_dim)
    linear2 = nn.Linear(hidden_dim, 1)
    relu = nn.ReLU()
    sigmoid = nn.Sigmoid()

    parameters = list(first_name_embedding.parameters()) + list(last_name_embedding.parameters()) + list(linear1.parameters()) + list(linear2.parameters())

    def _checkpoint():
        # Snapshot of everything needed to rebuild the model and encode its inputs.
        return {
            'first_name_embedding': first_name_embedding.state_dict(),
            'last_name_embedding': last_name_embedding.state_dict(),
            'linear1': linear1.state_dict(),
            'linear2': linear2.state_dict(),
            'first_name_encoder_classes': first_name_encoder.classes_,
            'last_name_encoder_classes': last_name_encoder.classes_,
            'first_name_encoded_idx': first_name_encoded_idx,
            'last_name_encoded_idx': last_name_encoded_idx,
            'feature_columns': list(X_train.columns)
        }

    print("date and time =", datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(parameters, lr=0.001)

    iteration_list = []
    loss_list = []
    accuracy_list = []
    average_loss = 0
    iteration = 0

    for epoch in range(int(epochs)):
        correct = 0
        total = 0

        for features_batch, labels in train_loader:
            iteration = iteration + 1
            optimizer.zero_grad()

            # Split the batch into numeric features and encoded names
            numeric_features = features_batch[:, numeric_cols]
            first_name_ids = features_batch[:, first_name_encoded_idx].long()
            last_name_ids = features_batch[:, last_name_encoded_idx].long()

            # Get embeddings
            first_name_emb = first_name_embedding(first_name_ids)
            last_name_emb = last_name_embedding(last_name_ids)

            # Combine features
            combined = torch.cat([numeric_features, first_name_emb, last_name_emb], dim=1)
            hidden = relu(linear1(combined))
            # squeeze(1) keeps the batch dimension even for a final batch of one row,
            # so the output shape always matches `labels`.
            probabilities = sigmoid(linear2(hidden)).squeeze(1)
            loss = criterion(probabilities, labels)

            loss.backward()
            optimizer.step()

            predicted = (probabilities > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            average_loss += loss.item()

            if iteration % log_every != 0:
                continue

            current_loss = average_loss / log_every if average_loss > 0 else loss.item()

            # Check if loss improved
            if current_loss < best_loss:
                best_loss = current_loss
                patience_counter = 0
                torch.save(_checkpoint(), f"./models/indian_classifier_{train_data_label}_best.pt")
            else:
                patience_counter += 1
                print(f"No improvement for {patience_counter}/{patience} iterations")

            # Check for early stopping
            if patience_counter >= patience:
                print(f"Early stopping triggered at iteration {iteration}")
                early_stop = True
                break

            iteration_list.append(iteration)
            loss_list.append(average_loss / log_every)
            accuracy = 100 * correct / total
            print("Iteration: {}. Loss: {}. Accuracy: {}.".format(iteration, average_loss / log_every, accuracy))
            accuracy_list.append(accuracy)
            total = 0
            correct = 0
            average_loss = 0

        # Leave the epoch loop too; breaking only the batch loop would let the next
        # epoch resume training.
        if early_stop:
            break

    path = f"./models/indian_classifier_{train_data_label}.pt"
    torch.save(_checkpoint(), path)

    plt.plot(iteration_list, loss_list)
    plt.xlabel('Number of Iterations')
    plt.ylabel('Loss on Training Set')
    plt.title('Indian Classifier - Logistic Regression')
    plt.savefig(f"../images/ethnicia_{train_data_label}_loss.png")
    plt.clf()

    plt.plot(iteration_list, accuracy_list)
    plt.xlabel('Number of Iterations')
    plt.ylabel('Accuracy on Training Set')
    plt.title('Indian Classifier - Logistic Regression')
    plt.savefig(f"../images/ethnicia_{train_data_label}_accuracy.png")
    plt.clf()

    print("date and time =", datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    print(f"Training completed. Model saved as: indian_classifier_{train_data_label}.pt")

    return {
        'first_name_embedding': first_name_embedding,
        'last_name_embedding': last_name_embedding,
        'linear1': linear1,
        'linear2': linear2
    }

In [53]:
def test_indian_classifier(model_dict, X_test, y_test, first_name_encoder, last_name_encoder):
    
    X_numeric_tensor = torch.tensor(X_test.values.astype(np.float32), dtype=torch.float32)
    y_tensor = torch.tensor(y_test.values.astype(np.float32), dtype=torch.float32)
    
    # Get the indices of encoded name columns from the saved model info
    first_name_encoded_idx = X_test.columns.get_loc('first_name_encoded')
    last_name_encoded_idx = X_test.columns.get_loc('last_name_encoded')
    
    # Set model to eval mode
    print(model_dict)
    model_dict['first_name_embedding'].eval()
    model_dict['last_name_embedding'].eval()
    model_dict['linear1'].eval()
    model_dict['linear2'].eval()
    
    with torch.no_grad():
        
        numeric_features = X_numeric_tensor[:, [idx for idx in range(X_numeric_tensor.shape[1]) 
                                             if idx not in [first_name_encoded_idx, last_name_encoded_idx]]]
        first_name_ids = X_numeric_tensor[:, first_name_encoded_idx].long()
        last_name_ids = X_numeric_tensor[:, last_name_encoded_idx].long()
        
        # Forward pass
        first_name_emb = model_dict['first_name_embedding'](first_name_ids)
        last_name_emb = model_dict['last_name_embedding'](last_name_ids)
        combined = torch.cat([numeric_features, first_name_emb, last_name_emb], dim=1)
        hidden = torch.relu(model_dict['linear1'](combined))
        outputs = torch.sigmoid(model_dict['linear2'](hidden))
        
        predictions = (outputs > 0.5).float()
        correct = (predictions.squeeze() == y_tensor).sum().item()
        total = y_tensor.size(0)
        accuracy = 100 * correct / total
        
        true_positives = ((predictions.squeeze() == 1) & (y_tensor == 1)).sum().item()
        false_positives = ((predictions.squeeze() == 1) & (y_tensor == 0)).sum().item()
        true_negatives = ((predictions.squeeze() == 0) & (y_tensor == 0)).sum().item()
        false_negatives = ((predictions.squeeze() == 0) & (y_tensor == 1)).sum().item()
        
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        test_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'true_negatives': true_negatives,
            'false_negatives': false_negatives,
            'total_samples': total
        }
    
    return test_metrics

In [54]:
def train_and_test_indian_classifier(train_data_label, X, y, test_size=0.3, epochs=50,
                                     first_name_encoder=None, last_name_encoder=None):
    """Split the data, train the classifier, evaluate it and print the test metrics.

    Parameters
    ----------
    train_data_label : str
        Label forwarded to training (used in checkpoint and figure file names).
    X, y
        Encoded features and binary labels; split with stratification on ``y``
        (``random_state=42``).
    test_size : float
        Fraction of rows held out for testing.
    epochs : int
        Maximum number of training epochs.
    first_name_encoder, last_name_encoder
        Fitted name encoders matching ``X``. When omitted, the notebook-level variables
        of the same names are used, which keeps older three-argument calls working.

    Returns
    -------
    tuple
        ``(model_dict, test_metrics)``.

    Raises
    ------
    NameError
        If an encoder is neither passed in nor defined at notebook level.
    """
    def _resolve_encoder(passed, variable_name):
        # Prefer the explicit argument; fall back to the notebook namespace otherwise.
        if passed is not None:
            return passed
        notebook_scope = globals()
        if variable_name not in notebook_scope:
            raise NameError(
                f"name '{variable_name}' is not defined; pass it as an argument or "
                f"run process_data_set_indian first"
            )
        return notebook_scope[variable_name]

    first_name_encoder = _resolve_encoder(first_name_encoder, 'first_name_encoder')
    last_name_encoder = _resolve_encoder(last_name_encoder, 'last_name_encoder')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    print("Starting training...")
    model_dict = train_indian_classifier(train_data_label, X_train, y_train, first_name_encoder, last_name_encoder, epochs=epochs)

    print("\nTesting model...")
    test_metrics = test_indian_classifier(model_dict, X_test, y_test, first_name_encoder, last_name_encoder)

    # Print results
    print("\n" + "="*50)
    print("TEST RESULTS:")
    print("="*50)
    print(f"Accuracy: {test_metrics['accuracy']:.2f}%")
    print(f"Precision: {test_metrics['precision']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"F1 Score: {test_metrics['f1_score']:.4f}")
    print(f"True Positives: {test_metrics['true_positives']}")
    print(f"False Positives: {test_metrics['false_positives']}")
    print(f"True Negatives: {test_metrics['true_negatives']}")
    print(f"False Negatives: {test_metrics['false_negatives']}")

    return model_dict, test_metrics

In [51]:
# Train and evaluate with the function defaults (test_size=0.3, epochs=50).
# NOTE(review): this cell's execution count (51) is lower than the cells defining the
# functions it calls (52-54) -- re-run top to bottom to confirm the output is current.
model, metrics = train_and_test_indian_classifier(str(year), X, y)

Starting training...
date and time = 25/08/2025 13:45:25
First name vocabulary size: 98623
Last name vocabulary size: 450582
Using 2 numeric features + character embeddings
date and time = 25/08/2025 13:45:26
Iteration: 1024. Loss: 0.08619794135847769. Accuracy: 98.06890487670898.
Iteration: 2048. Loss: 0.04049863769159856. Accuracy: 98.97441864013672.
Iteration: 3072. Loss: 0.030566009686936013. Accuracy: 99.20076484248789.
Iteration: 4096. Loss: 0.025415988099211972. Accuracy: 99.28693771362305.
Iteration: 5120. Loss: 0.022274870089177057. Accuracy: 99.47064836448598.
Iteration: 6144. Loss: 0.018684919646148046. Accuracy: 99.47052001953125.
Iteration: 7168. Loss: 0.017270058229769347. Accuracy: 99.50103759765625.
Iteration: 8192. Loss: 0.014756163797528643. Accuracy: 99.58627982442977.
Iteration: 9216. Loss: 0.013528279008596655. Accuracy: 99.6027946472168.
Iteration: 10240. Loss: 0.012004251794678567. Accuracy: 99.68216084988318.
Iteration: 11264. Loss: 0.010408573815197997. Accurac

<Figure size 640x480 with 0 Axes>