In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [204]:
year = 20
df = f"./data/CampaignFin{year}/indivs{year}.txt"
donors_csv = f"./data/CampaignFin{year}/donors_state{year}.csv"

In [205]:
donors = pd.read_csv(donors_csv)
donors["firstname"] = donors["name"].apply(lambda x: str(x).split(",")[-1].lower().strip())
donors["lastname"] = donors["name"].apply(lambda x: str(x).split(",")[0].lower().strip())
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,firstname,lastname
0,,ACTBLUE,actblue actblue,,,Y4000,,,,WASHINGTON,CA,1261253000.0,25821,48846.03,1000.0,actblue,actblue
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,[Candidate Contribution],,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1127731000.0,958,1177172.0,682.5,michael r,bloomberg
2,U00000036521,"STEYER, TOM",tom steyer,[Candidate Contribution],,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,379478200.0,756,501955.3,2800.0,tom,steyer
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,F7000,M,INVESTMENTS,SELF-EMPLOYED,SARATOGA,WY,45133560.0,23,1962328.0,2800.0,timothy,mellon
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON CLINIC,LAS VEGAS,NV,44999550.0,124,362899.6,2800.0,miriam,adelson
5,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44847950.0,119,376873.5,2800.0,sheldon g,adelson
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M7000,M,CEO,ULINE,LAKE FOREST,IL,35364330.0,319,110860.0,2800.0,richard,uihlein
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Citadel LLC,,F2700,M,FOUNDER CEO,CITADEL LLC,CHICAGO,IL,33667630.0,188,179083.2,2800.0,kenneth,griffin
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN,BLACKSTONE,NEW YORK,NY,33454000.0,226,148026.5,2800.0,stephen a,schwarzman
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,Karla T Jurvetson MD,,H1110,F,PHYSICIAN,SELF,LOS ALTOS,CA,33088100.0,914,36201.42,2800.0,karla,jurvetson


In [4]:
# https://github.com/philipperemy/name-dataset
df_us = pd.read_csv("./data/US.csv")
df_us.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_us['firstname'] = df_us['firstname'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())
df_us['lastname'] = df_us['lastname'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())
df_us['name'] = df_us['firstname'].apply(lambda x: x.lower()) + ' ' + df_us['lastname'].apply(lambda x: x.lower())
df_us["indian"] = df_us["ethnicity"].apply(lambda x: False)

df_us = df_us[
    (df_us['firstname'].str.match(r'^[A-Za-z]+$', na=False)) & 
    (df_us['firstname'].str.len() > 1) &
    (df_us['firstname'].str.lower() != 'nan') &
    (df_us['lastname'].str.match(r'^[A-Za-z]+$', na=False)) &
    (df_us['lastname'].str.len() > 1) &
    (df_us['lastname'].str.lower() != 'nan')
]

df_us = df_us[['firstname', 'lastname', 'name', 'indian']]
df_us.head(10)

Unnamed: 0,firstname,lastname,name,indian
0,Brandon,Sylvester,brandon sylvester,False
1,Chris,Toussaint,chris toussaint,False
2,Willie,Gotti,willie gotti,False
3,Cristobal,Corona,cristobal corona,False
4,Wilmer,Diaz,wilmer diaz,False
5,Angela,Renee,angela renee,False
6,Duke,Duke,duke duke,False
7,Gonzales,Gricelda,gonzales gricelda,False
8,Jaren,Celestine,jaren celestine,False
9,Nathaniel,Smith,nathaniel smith,False


In [5]:
total_names = len(df_us)

firstname_counts = df_us['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})
firstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_us)) * 100

lastname_counts = df_us['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})
lastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_us)) * 100

df_us = df_us.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')
df_us = df_us.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')
df_us

Unnamed: 0,firstname,lastname,name,indian,firstname_count,firstname_rate,lastname_count,lastname_rate
0,Brandon,Sylvester,brandon sylvester,False,58421,0.189127,1272,0.004118
1,Chris,Toussaint,chris toussaint,False,131039,0.424215,1691,0.005474
2,Willie,Gotti,willie gotti,False,10987,0.035568,693,0.002243
3,Cristobal,Corona,cristobal corona,False,2640,0.008547,9672,0.031311
4,Wilmer,Diaz,wilmer diaz,False,4269,0.013820,91634,0.296648
...,...,...,...,...,...,...,...,...
30889765,Albert,Hall,albert hall,False,15251,0.049372,29195,0.094513
30889766,Dede,Love,dede love,False,1043,0.003377,21827,0.070661
30889767,Kylie,Smith,kylie smith,False,4148,0.013428,179391,0.580746
30889768,Koy,Smith,koy smith,False,109,0.000353,179391,0.580746


In [6]:
# https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_indian['firstname'] = df_indian['firstname'].apply(lambda x: x.split(" ")[0].strip() if " " in str(x) else str(x).strip())
df_indian['lastname'] = df_indian['lastname'].apply(lambda x: x.split(" ")[-1].strip() if " " in str(x) else str(x).strip())
df_indian['name'] = df_indian['firstname'].apply(lambda x: x.lower()) + ' ' + df_indian['lastname'].apply(lambda x: x.lower())
df_indian["indian"] = df_indian["ethnicity"].apply(lambda x: True)

df_indian = df_indian[
    (df_indian['firstname'].str.match(r'^[A-Za-z]+$', na=False)) & 
    (df_indian['firstname'].str.len() > 1) &
    (df_indian['firstname'].str.lower() != 'nan') &
    (df_indian['lastname'].str.match(r'^[A-Za-z]+$', na=False)) &
    (df_indian['lastname'].str.len() > 1) &
    (df_indian['lastname'].str.lower() != 'nan')
]

df_indian = df_indian[['firstname', 'lastname', 'name', 'indian']]
df_indian.head(10)

Unnamed: 0,firstname,lastname,name,indian
0,Kapil,Kumar,kapil kumar,True
1,Mitali,Aggarwal,mitali aggarwal,True
2,Vikas,Jangra,vikas jangra,True
3,Ravi,Lungay,ravi lungay,True
4,Jagat,Yadav,jagat yadav,True
5,Kajal,Kumari,kajal kumari,True
6,Vimal,Kumar,vimal kumar,True
7,Surya,Singham,surya singham,True
8,Kawaljarnail,Hanjra,kawaljarnail hanjra,True
9,Kuldeep,Rathor,kuldeep rathor,True


In [7]:
total_names = len(df_indian)

firstname_counts = df_indian['firstname'].value_counts().reset_index().rename(columns={'count': 'firstname_count'})
firstname_counts['firstname_rate'] = (firstname_counts['firstname_count'] / len(df_indian)) * 100

lastname_counts = df_indian['lastname'].value_counts().reset_index().rename(columns={'count': 'lastname_count'})
lastname_counts['lastname_rate'] = (lastname_counts['lastname_count'] / len(df_indian)) * 100

df_indian = df_indian.merge(firstname_counts[['firstname', 'firstname_count', 'firstname_rate']], on='firstname', how='left')
df_indian = df_indian.merge(lastname_counts[['lastname', 'lastname_count', 'lastname_rate']], on='lastname', how='left')
df_indian

Unnamed: 0,firstname,lastname,name,indian,firstname_count,firstname_rate,lastname_count,lastname_rate
0,Kapil,Kumar,kapil kumar,True,4041,0.069133,396702,6.786779
1,Mitali,Aggarwal,mitali aggarwal,True,639,0.010932,1372,0.023472
2,Vikas,Jangra,vikas jangra,True,8871,0.151765,1018,0.017416
3,Ravi,Lungay,ravi lungay,True,27501,0.470487,1,0.000017
4,Jagat,Yadav,jagat yadav,True,511,0.008742,92416,1.581053
...,...,...,...,...,...,...,...,...
5845213,Vikas,Chakchanpur,vikas chakchanpur,True,8871,0.151765,1,0.000017
5845214,Dipu,Gupta,dipu gupta,True,1526,0.026107,43396,0.742419
5845215,Riya,Naharwal,riya naharwal,True,6367,0.108927,8,0.000137
5845216,Jashandeep,Hanjra,jashandeep hanjra,True,17,0.000291,43,0.000736


In [8]:
all_firstnames = pd.concat([
    df_indian['firstname'].drop_duplicates(),
    df_us['firstname'].drop_duplicates()
]).drop_duplicates()

all_lastnames = pd.concat([
    df_indian['lastname'].drop_duplicates(),
    df_us['lastname'].drop_duplicates()
]).drop_duplicates()

In [None]:
chunk_size = 100000
firstname_ratio_dfs, lastname_ratio_dfs = [], []

for i in tqdm(range(0, len(all_lastnames), chunk_size)):
    
    chunk_firstnames = all_firstnames.iloc[i:i + chunk_size]
    chunk_lastnames = all_lastnames.iloc[i:i + chunk_size]
    
    indian_chunk_first = df_indian[df_indian['firstname'].isin(chunk_firstnames)]
    indian_rates_first = indian_chunk_first[['firstname', 'firstname_count', 'firstname_rate']].drop_duplicates()
    indian_chunk_last = df_indian[df_indian['lastname'].isin(chunk_lastnames)]
    indian_rates_last = indian_chunk_last[['lastname', 'lastname_count', 'lastname_rate']].drop_duplicates()
    
    us_chunk_first = df_us[df_us['firstname'].isin(chunk_firstnames)]
    us_rates_first = us_chunk_first[['firstname', 'firstname_count', 'firstname_rate']].drop_duplicates()
    us_chunk_last = df_us[df_us['lastname'].isin(chunk_lastnames)]
    us_rates_last = us_chunk_last[['lastname', 'lastname_count', 'lastname_rate']].drop_duplicates()
    
    merged_first = pd.merge(indian_rates_first, us_rates_first, on='firstname', how='outer', suffixes=('_india', '_us')).fillna(0)
    merged_last = pd.merge(indian_rates_last, us_rates_last, on='lastname', how='outer', suffixes=('_india', '_us')).fillna(0)
    
    merged_first['ratio'] = (merged_first['firstname_rate_india'] / merged_first['firstname_rate_us']).replace(float('inf'), 1000)
    merged_last['ratio'] = (merged_last['lastname_rate_india'] / merged_last['lastname_rate_us']).replace(float('inf'), 1000)
    
    firstname_ratio_dfs.append(merged_first)
    lastname_ratio_dfs.append(merged_last)

100%|██████████| 20/20 [05:35<00:00, 16.78s/it]


In [10]:
firstname_ratios = pd.concat(firstname_ratio_dfs)
firstname_ratios.sort_values(by='ratio', ascending=False)

Unnamed: 0,firstname,firstname_count_india,firstname_rate_india,firstname_count_us,firstname_rate_us,ratio
17123,Chhotu,1924.0,0.032916,1.0,0.000003,10167.613506
95032,Vasava,1648.0,0.028194,1.0,0.000003,8709.057722
85885,Solanki,1265.0,0.021642,1.0,0.000003,6685.047341
26138,Gamit,1021.0,0.017467,1.0,0.000003,5395.599475
57022,Navnath,574.0,0.009820,1.0,0.000003,3033.373260
...,...,...,...,...,...,...
27363,Eviany,0.0,0.000000,1.0,0.000003,0.000000
27364,Evichiz,0.0,0.000000,1.0,0.000003,0.000000
27365,Evidence,0.0,0.000000,4.0,0.000013,0.000000
27366,Eviee,0.0,0.000000,11.0,0.000036,0.000000


In [11]:
lastname_ratios = pd.concat(lastname_ratio_dfs)
lastname_ratios.sort_values(by='ratio', ascending=False)

Unnamed: 0,lastname,lastname_count_india,lastname_rate_india,lastname_count_us,lastname_rate_us,ratio
71725,Rajbhar,2151.0,0.036799,1.0,0.000003,11367.222791
64235,Oraon,2129.0,0.036423,1.0,0.000003,11250.961098
92747,Tudu,1522.0,0.026038,1.0,0.000003,8043.195299
95160,Vasava,4389.0,0.075087,3.0,0.000010,7731.402577
66609,Paswan,7105.0,0.121552,6.0,0.000019,6257.873697
...,...,...,...,...,...,...
23493,Dlifegaurd,0.0,0.000000,1.0,0.000003,0.000000
23492,Dlgado,0.0,0.000000,7.0,0.000023,0.000000
23491,Dleofjp,0.0,0.000000,1.0,0.000003,0.000000
23490,Dlenaime,0.0,0.000000,1.0,0.000003,0.000000


In [12]:
firstname_ratios.to_csv("./output/USIN_firstnames_ratios.csv", index=False)
lastname_ratios.to_csv("./output/USIN_lastnames_ratios.csv", index=False)

In [211]:
firstname_ratio_dict = firstname_ratios.set_index(firstname_ratios['firstname'].str.strip().str.lower())['ratio'].to_dict()
lastname_ratio_dict = lastname_ratios.set_index(lastname_ratios['lastname'].str.strip().str.lower())['ratio'].to_dict()

donors['combined_ratio'] = (
    donors['firstname'].map(firstname_ratio_dict).fillna(0) + 
    donors['lastname'].map(lastname_ratio_dict).fillna(0)
)

In [212]:
sample = pd.read_csv("./manual/top_donors20_pred.csv")
sample["firstname"] = sample["name"].apply(lambda x: x.split(",")[-1] if "," in x else str(x))
sample["lastname"] = sample["name"].apply(lambda x: x.split(",")[0] if "," in x else str(x))
sample["name_new"] = sample["firstname"].apply(lambda x: x.lower()) + ' ' + sample["lastname"].apply(lambda x: x.lower())
sample = sample[['name', 'lastname', 'name_new', 'indian']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,indian
0,"for america, amy",for america,amy for america,
1,"bloomberg, michael",bloomberg,michael bloomberg,False
2,"steyer, thomas f",steyer,thomas f steyer,False
3,"mellon, timothy",mellon,timothy mellon,False
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False
6,"uihlein, richard e",uihlein,richard e uihlein,False
7,"griffin, kenneth",griffin,kenneth griffin,False
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False
9,"jurvetson, karla",jurvetson,karla jurvetson,False


In [215]:
# get the most common indian last names
indian_firstnames = set(firstname_ratios[firstname_ratios["ratio"] >= 8]["firstname"].str.lower())
indian_lastnames = set(lastname_ratios[lastname_ratios["ratio"] >= 5]["lastname"].str.lower())
unindian_firstnames = set(firstname_ratios[firstname_ratios["ratio"] <= 0.05]["firstname"].str.lower())
unindian_lastnames = set(lastname_ratios[lastname_ratios["ratio"] <= 0.05]["lastname"].str.lower())

# add the known indian-american last names
known_indian_lastnames = sample[sample["indian"] == True]["lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)

In [216]:
donors["ethnic_first"] = np.where(donors["firstname"].str.lower().isin(indian_firstnames) & ~donors["lastname"].str.lower().isin(unindian_lastnames), "ind", "not")
donors["ethnic_last"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames) & ~donors["firstname"].str.lower().isin(unindian_firstnames), "ind", "not")
donors["ethnic_new"] = np.where((donors["combined_ratio"] >= 15) | (donors["ethnic_first"] == "ind") | (donors["ethnic_last"] == "ind"), "ind", "not")
donors["ethnic_new"].value_counts()

ethnic_new
not    3543091
ind      45845
Name: count, dtype: int64

In [270]:
# add missed definitive indian names
donors.loc[donors["lastname"].isin(["rao", "amin", "sandhu", "dhillon", "chaudhry", "kapoor", 
                         "rahman", "mahal", "trivedi", "madan", "rashid", "khanna",
                         "patel", "grewal", "dube", "sidhu", "walia", "kaur", "brar",
                         "sabharwal", "chaudhry", "choudhry", "choudry", "kapadia", "reddy",
                         "jain", "narula", "chandra", "shah", "bedi", "gulati", "chandi",
                         "juneja", "khosla", "mehta", "dhaliwal", "mohan", "satter", "chowdhury", "chatterjee",
                         "chahal", "luthra", "arora", "malhotra", "randhawa", "sethi", "gandhi",
                         "parekh", "bhasin", "anand", "kothari", "agrawal"]), "ethnic_new"] = "ind"

# remove muslim names and names that are commonly american
donors.loc[donors["firstname"].isin(["peer", "goldene", "jaan",
                                     "mohammad", "mohamad", "mohamed", "mohammed"]), "ethnic_new"] = "not"
donors.loc[donors["lastname"].isin(["khan", "hussein", "hussain", "husain", "ali", "ansari", 
                                    "ansary", "alam", "ahmed", "ahmad", "islam", "sheikh",
                                    "salman", "akhtar", "null", "mohammad", "mohamad", "mohamed", "mohammed",
                                    "aslam", "shahid", "junaid", "quazi", "qaazi", "siddiqui",
                                    "qureshi", "raza",
                                    "paul", "swain", "ingle", "routh", "crozer", "kale",
                                    "roy", "harman", "gund", "sifaris", "varis",
                                    "more", "rander", "rather", "sable", "grover", "boy", 
                                    "mochary", "vind", "rock", "perwin", "kant", "karsh",
                                    "mule", "riaz", "boyd", "liberman", "monis"]), "ethnic_new"] = "not"
donors["ethnic_new"].value_counts()

ethnic_new
not    3545711
ind      43225
Name: count, dtype: int64

In [None]:
# donors.sort_values(by="total_donated", ascending=False).loc[:9999, "ethnic"] = sample["ethnic"]
# donors.to_csv(f"./output/donors_state{year}_pred_lastname.csv", index=False)

In [279]:
donors[donors["ethnic"] == "ind"]["lastname"].value_counts()[:10]

lastname
patel     2725
singh     1148
shah       928
khan       868
gupta      541
reddy      499
kumar      467
sharma     428
rao        414
mehta      315
Name: count, dtype: int64

In [85]:
lastname_ratios[lastname_ratios["lastname"].isin(["Gund", "Perwin", "Bennett", "Roy", "Ingle", "Routh", "Null", "Pai"])]

Unnamed: 0,lastname,lastname_count_india,lastname_rate_india,lastname_count_us,lastname_rate_us,ratio
10477,Bennett,7.0,0.00012,15409.0,0.049884,0.002401
30673,Gund,118.0,0.002019,19.0,6.2e-05,32.820287
34901,Ingle,821.0,0.014046,618.0,0.002001,7.02051
63738,Null,721.0,0.012335,659.0,0.002133,5.78181
64830,Pai,159.0,0.00272,565.0,0.001829,1.487177
75976,Routh,376.0,0.006433,183.0,0.000592,10.858022
76000,Roy,34300.0,0.586804,5601.0,0.018132,32.362533
65121,Perwin,10.0,0.000171,0.0,0.0,1000.0


In [227]:
donors[donors["lastname"] == "roy"]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,...,total_donated,donation_count,avg_donation,med_donation,firstname,lastname,ethnic,ethnic_new,ethnic_first,ethnic_last
13810,f9000267988,"ROY, JEAN FRANCOIS",jean francois roy,Ocean Land Investment,,Y4000,F,,OCEAN LAND INVESTMENT,POMPANO,...,11000.0,2,5500.000000,5500.0,jean francois,roy,not,not,not,not
18199,f9000226020,"ROY, JEAN F",jean f roy,Ocean Land Investment,,Y4000,F,,OCEAN LAND INVESTMENT,POMPANO,...,9000.0,1,9000.000000,9000.0,jean f,roy,not,not,not,not
32712,b0060186155,"ROY, ELLEN",ellen roy,Inter Continental Energy Group,,Y4000,F,,IEC,COHASSET,...,5400.0,9,600.000000,500.0,ellen,roy,not,not,not,not
42914,a00000699851,"ROY, G",g roy,Rahway Animal Hospital,,A4500,N,,VETERINARIAN,SCOTCH PLAINS,...,4500.0,7,642.857143,500.0,g,roy,not,not,not,not
50022,f1000214249,"ROY, RISHIN",rishin roy,Clinton Group,,F2100,N,,CLINTON GROUP,NEW YORK,...,4000.0,4,1000.000000,1000.0,rishin,roy,not,not,not,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753029,d1004714810,"ROY, LOIS G",lois g roy,Retired,,X1200,F,,RETIRED,MANCHESTER,...,200.0,1,200.000000,200.0,lois g,roy,not,not,not,not
758232,f1100342436,"ROY, JOSEPH",joseph roy,Optometrist,,H1120,M,,OPTOMETRIST,DERIDDER,...,200.0,1,200.000000,200.0,joseph,roy,not,not,not,not
759747,f1100253717,"ROY, SATHYABHAMA",sathyabhama roy,Retired,,X1200,N,,RETIRED,GAITHERSBURG,...,200.0,1,200.000000,200.0,sathyabhama,roy,not,ind,ind,not
768891,f9000241397,"ROY, MICHAEL J",michael j roy,Powder Test Vaccines,,Y4000,M,,POWDER TEST VACCINES,MADISON,...,200.0,1,200.000000,200.0,michael j,roy,not,not,not,not
