In [34]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()
# Install a catch-all "ignore" filter so warnings do not clutter cell output.
# NOTE(review): this also hides pandas' chained-assignment warnings.
warnings.simplefilter("ignore")

In [35]:
# Two-digit election cycle used in the campaign-finance directory and file names.
year = 14

# Directory holding this cycle's campaign-finance extracts.
campaign_dir = f"./data/CampaignFin{year}"

# Path to the raw individual-contributions file (a path string, not a DataFrame).
df = f"{campaign_dir}/indivs{year}.txt"
# Path to the per-donor aggregate CSV for this cycle.
donors_csv = f"{campaign_dir}/donors_state{year}.csv"
# Path to the name dataset CSV.
names = "./data/USIN.csv"

In [36]:
# Load the per-donor aggregates for the configured cycle.
donors = pd.read_csv(donors_csv)

# Names are stored as "LAST, FIRST ..."; keep the text before the first comma,
# lower-cased, as the surname. Non-string values are stringified first.
donors["lastname"] = [
    str(full_name).partition(",")[0].lower() for full_name in donors["name"]
]
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname
0,,ACTBLUE,actblue actblue,[24I Contribution],,Z9500,,,,WASHINGTON,DC,269989381.0,16880,15994.63,1500.0,actblue
1,U00000036521,"STEYER, THOMAS F",thomas f steyer,Fahr LLC/Tom Steyer,Fahr LLC,JE300,M,FOUNDER,"FAHR, LLC",SAN FRANCISCO,CA,78918267.0,51,1547417.0,2600.0,steyer
2,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,Bloomberg Lp,,F5500,M,EXECUTIVE,BLOOMBERG LP,NEW YORK,NY,28532529.0,81,352253.4,105000.0,bloomberg
3,U00000000661,"SINGER, PAUL",paul singer,Elliott Management,,F2700,M,PRINCIPAL,ELLIOTT MANAGEMENT CORP.,NEW YORK,NY,11371474.0,195,58315.25,2600.0,singer
4,U00000036821,"MERCER, ROBERT",robert mercer,Renaissance Technologies,,F2700,M,FINANCIAL CONSULTANT,RENAISSANCE TECHNOLOGIES,EAST SETAUKET,NY,9655999.0,118,81830.5,2600.0,mercer
5,U0000003235,"EYCHANER, FRED",fred eychaner,Newsweb Corp,,C1100,M,PRESIDENT,NEWSWEB CORPORATION,CHICAGO,IL,8755400.0,99,88438.38,2600.0,eychaner
6,U00000003151,"SIMONS, JAMES H",james h simons,Renaissance Technologies,,F2700,M,PHILANTHROPIST,EUCLIDEAN CAPITAL,NEW YORK,NY,7488700.0,39,192017.9,2600.0,simons
7,f00106275301,"CARDON, WILFORD R",wilford r cardon,[Candidate Contribution],,Z9000,M,EXECUTIVE,CARDON GROUP,MESA,AZ,6251785.0,9,694642.8,198000.0,cardon
8,U00000003201,"RICKETTS, J JOE",j joe ricketts,Entrepreneur/Joe Ricketts,,F7000,M,ENTREPRENEUR,SELF-EMPLOYED,BONDURANT,WY,6200025.0,31,200000.8,2600.0,ricketts
9,U00000003101,"ADELSON, SHELDON",sheldon adelson,Las Vegas Sands,,G6500,M,CHAIRMAN,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,5878900.0,47,125083.0,2600.0,adelson


In [37]:
# https://github.com/philipperemy/name-dataset
def _edge_token(value, position):
    """Return the space-separated token at `position`, or `str(value)` if there is no space."""
    text = str(value)
    if " " not in text:
        return text
    return value.split(" ")[position]


df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']

# Reduce multi-word entries to one token: the first word of the first name
# and the last word of the last name.
df_indian['firstname'] = [_edge_token(value, 0) for value in df_indian['firstname']]
df_indian['lastname'] = [_edge_token(value, -1) for value in df_indian['lastname']]

# Lower-cased "first last" key; the `lastname` column itself keeps its original casing.
df_indian['name'] = df_indian['firstname'].map(str.lower) + ' ' + df_indian['lastname'].map(str.lower)

# Every row of this dataset gets the "ind" label.
df_indian["ethnic"] = "ind"
df_indian = df_indian[['lastname', 'name', 'ethnic']]
df_indian.head(10)

Unnamed: 0,lastname,name,ethnic
0,Kumar,kapil kumar,ind
1,Aggarwal,mitali aggarwal,ind
2,Jangra,vikas jangra,ind
3,Lungay,ravi lungay,ind
4,Yadav,jagat yadav,ind
5,Kumari,kajal kumari,ind
6,Kumar,vimal kumar,ind
7,Singham,surya singham,ind
8,Hanjra,kawaljarnail hanjra,ind
9,Rathor,kuldeep rathor,ind


In [38]:
def _split_donor_name(raw_name):
    """Split a "last, first" donor name into whitespace-stripped (first, last) parts.

    The first part is the text after the last comma and the last part is the
    text before the first comma. A name without a comma is returned (stripped)
    as both parts.
    """
    if "," not in raw_name:
        whole = str(raw_name).strip()
        return whole, whole
    pieces = raw_name.split(",")
    # Strip each piece: splitting "last, first" on "," leaves a leading space
    # on the first name, which previously leaked into `name_new`.
    return pieces[-1].strip(), pieces[0].strip()


# Manually reviewed donors with an `indian` flag.
sample = pd.read_csv("./manual/top_donors20_pred.csv")

name_parts = [_split_donor_name(raw_name) for raw_name in sample["name"]]
sample["firstname"] = [parts[0] for parts in name_parts]
sample["lastname"] = [parts[1] for parts in name_parts]

# Lower-cased "first last" key, in the same format as donors["name_new"].
sample["name_new"] = sample["firstname"].str.lower() + ' ' + sample["lastname"].str.lower()

# `== True` is deliberate: a missing flag (NaN) falls through to "not".
sample["ethnic"] = sample["indian"].apply(lambda flag: "ind" if flag == True else "not")
sample = sample[['name', 'lastname', 'name_new', 'indian', 'ethnic']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,indian,ethnic
0,"for america, amy",for america,amy for america,,not
1,"bloomberg, michael",bloomberg,michael bloomberg,False,not
2,"steyer, thomas f",steyer,thomas f steyer,False,not
3,"mellon, timothy",mellon,timothy mellon,False,not
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False,not
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False,not
6,"uihlein, richard e",uihlein,richard e uihlein,False,not
7,"griffin, kenneth",griffin,kenneth griffin,False,not
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False,not
9,"jurvetson, karla",jurvetson,karla jurvetson,False,not


In [39]:
# Seed the surname set with the first 5,000 rows of the name dataset
# (treated as the most common surnames; assumes the file is ordered by
# frequency, TODO confirm).
indian_lastnames = set(df_indian["lastname"].head(5000).str.lower())

# Add the surnames of donors manually labelled "ind".
known_indian_lastnames = sample.loc[sample["ethnic"] == "ind", "lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)

# Surnames dropped from the list because they are more commonly American.
more_commonly_american = {
    "ali", "ansari", "roy", "sk", "alam", "ahmed", "hussain", "islam", "ray",
    "king", "mann", "gill", "m", "paul", "farmer", "john", "kay",
    "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder",
    "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku",
    "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane",
    "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi",
    "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
    "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou",
    "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
    "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
    "ch", "sah", "come", "adam", "harry", "virk", "mian", "san",
    "vik", "butte",
}

# The result is a list (not a set) of the surviving surnames.
indian_lastnames = [
    surname for surname in indian_lastnames if surname not in more_commonly_american
]

In [40]:
# Baseline heuristic: flag a donor as "ind" when the surname is in the curated list.
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames), "ind", "not")

# Manual labels take precedence over the surname heuristic.
# The previous `donors.sort_values(...).loc[:9999, "ethnic"] = sample["ethnic"]`
# wrote into the temporary frame returned by sort_values, so `donors` was never
# updated. It also paired rows by index label rather than by donor identity.
# Match on the normalised "first last" name instead, and use only rows that were
# actually reviewed (a missing `indian` flag means unlabelled, not "not Indian").
# Only exact name matches are overridden; names that differ (e.g. by an
# honorific or middle initial) keep the surname-based label.
reviewed = sample[sample["indian"].notna()]
manual_labels = (
    reviewed.assign(name_key=reviewed["name_new"].str.strip().str.lower())
    .drop_duplicates(subset="name_key")
    .set_index("name_key")["ethnic"]
)
manual_match = donors["name_new"].str.strip().str.lower().map(manual_labels)
# Keep the heuristic label wherever there is no manual match.
donors["ethnic"] = donors["ethnic"].where(manual_match.isna(), manual_match)

donors.to_csv(f"./output/donors_state{year}_pred_lastname.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    735563
ind      5560
Name: count, dtype: int64

In [41]:
# Ten most frequent surnames among donors flagged "ind".
donors.loc[donors["ethnic"] == "ind", "lastname"].value_counts().head(10)

lastname
patel     883
singh     309
shah      277
khan      191
gupta     169
reddy     130
mehta     104
sharma     98
kumar      96
ahmad      70
Name: count, dtype: int64

In [42]:
# Spot check: all donors with the surname "roy", which is on the exclusion list.
donors.loc[donors["lastname"].eq("roy")]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
31409,m00013275941,"ROY, RAHUL",rahul roy,ARC Document Solutions,,C5000,N,CTO,ARC DOCUMENT SOLUTION,FREMONT,CA,10300.0,4,2575.000000,2600.0,roy,not
37387,a0000939990,"ROY, JAMES P",james p roy,"Domengeaux, Wright et al",,K1000,M,ATTORNEY,"DOMENGEAUX, WRIGHT, ROY & EDWARDS",LAFAYETTE,LA,9500.0,4,2375.000000,1750.0,roy,not
44596,m0001478431,"ROY, ROB",rob roy,Switch Supernap,,C6500,M,CEO,SWITCH,LAS VEGAS,NV,7800.0,2,3900.000000,3900.0,roy,not
67764,k0001312427,"ROY, SULAIMAN",sulaiman roy,Imagineering Inc,,Y4000,N,VP SALES,IMAGINEERING,HAWTHORN WOODS,IL,5200.0,4,1300.000000,1300.0,roy,not
83221,b0060859920,"ROY, RANDALL",randall roy,Retired,,X1200,M,RETIRED,RETIRED,TOPEKA,KS,4900.0,11,445.454545,250.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685535,n0001172200,"ROY, KAUSHIK",kaushik roy,Shanti,,Y4000,N,EXECUTIVE DIRECTOR,SHANTI,SAN FRANCISCO,CA,250.0,1,250.000000,250.0,roy,not
702530,n0001131070,"ROY, ALLEN",allen roy,Wenzel Strategies,,Y4000,M,RESEARCH CONSULTANT,WENZEL STRATEGIES,MYRTLE BEACH,SC,215.0,1,215.000000,215.0,roy,not
704479,n0001194712,"ROY, CRAIG",craig roy,,,F1300,M,CFO,HEART OF LOUISIANA FEDERAL CREDIT UNIO,DEVILLE,LA,210.0,1,210.000000,210.0,roy,not
715128,m0001351147,"ROY, CHARLES N",charles n roy,Real Estate,,F4000,M,REAL ESTATE,SELF,ORANGE,CA,200.0,1,200.000000,200.0,roy,not


In [43]:
# Donors flagged "ind", largest total contribution first.
flagged_ind = donors["ethnic"].eq("ind")
donors.loc[flagged_ind].sort_values(by="total_donated", ascending=False)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
196,U00000035581,"SATTER, MUNEER",muneer satter,Satter Investment Management,,F7000,M,PRIVATE INVESTOR,SATTER INVESTMENT MANAGEMENT,CHICAGO,IL,484000.0,62,7806.451613,2600.0,satter,ind
455,j1001328262,"KUMAR, VIKRAM ADITYA MR",vikram aditya mr kumar,AVG Advanced Technologies,,C5400,M,COO,AVG,KILDEER,IL,279350.0,7,39907.142857,20000.0,kumar,ind
456,m0001734575@,"RAO, WILLA",willa rao,Cai Industries,,H4200,F,PARTNER,CAI INDUSTRIES,EL MONTE,CA,278900.0,60,4648.333333,2600.0,rao,ind
679,U00000033011,"KHOSLA, VINOD",vinod khosla,Khosla Ventures,,F2500,M,VENTURE CAPITALIST,KHOSLA VENTURES,PORTOLA VALLEY,CA,217900.0,23,9473.913043,2600.0,khosla,ind
921,b0060775432,"NARASIMHAN, SHEKAR",shekar narasimhan,Beekman Advisors,,J7500,M,ADVISOR,BEEKMAN ADVISORS,DUNN LORING,VA,178475.0,35,5099.285714,1000.0,narasimhan,ind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740143,n0001504648,"PAI, FANG YAO",fang yao pai,Cathay Bank,,F1000,N,BANKER,CATHAY BANK,GREAT NECK,NY,125.0,1,125.000000,125.0,pai,ind
740667,h3001848945,"PATEL, ASHWIN",ashwin patel,Asha Patel Llc,,G5100,M,HAIRDRESSER,ASHA PATEL LLC,SPARTANBURG,SC,40.0,2,20.000000,20.0,patel,ind
740697,d0000999508,"RAJAN, GOVIN T",govin t rajan,,,Y2000,U,INFO REQUESTED,INFO REQUESTED,BRADENTON,FL,40.0,1,40.000000,40.0,rajan,ind
740712,i3003959058,"KAPADIA, ASHA SETH",asha seth kapadia,Retired,,X1200,F,PROF EMERITUS,RETD,HOUSTON,TX,40.0,2,20.000000,20.0,kapadia,ind
