In [41]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

# Register tqdm's progress-bar variants of apply/map on pandas objects.
tqdm.pandas()
# Blanket filter: every warning category is hidden for the rest of the session,
# including pandas' own diagnostics.
warnings.simplefilter("ignore")

In [42]:
# Cycle suffix shared by every cycle-specific path below; change it in one place only.
year = 22

# Raw individual-contributions file. The directory and the file name both carry the
# same cycle suffix, so both are derived from `year`.
df = f"./data/CampaignFin{year}/indivs{year}.txt"

# Per-donor aggregate table for the same cycle.
donors_csv = f"./data/CampaignFin{year}/donors_state{year}.csv"

# Name list that does not depend on the cycle.
names = "./data/USIN.csv"

In [43]:
# Load the per-donor table from the path configured in `donors_csv`.
donors = pd.read_csv(donors_csv)

# Surname = text before the first comma of the "LAST, FIRST" name, lower-cased.
# str() converts a missing name to the literal "nan" instead of raising.
donors["lastname"] = [
    str(full_name).partition(",")[0].lower() for full_name in donors["name"]
]
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname
0,,"A, RAYMOND",raymond a,National Republican Senatorial Cmte,,Z9600,,RETIRED,RETIRED,WASHINGTON,CA,1649385000.0,115116,14328.03,25.0,a
1,U00000003641,"SOROS, GEORGE",george soros,Soros Fund Management,,F2700,M,EXECUTIVE,SOROS FUND MANAGEMENT,NEW YORK,NY,180017300.0,204,882437.6,2900.0,soros
2,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M7000,M,CEO,ULINE,LAKE FOREST,IL,84221530.0,369,228242.6,2900.0,uihlein
3,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Citadel LLC,,F2700,M,FOUNDER CEO,CITADEL LLC,CHICAGO,IL,74375500.0,317,234623.0,2900.0,griffin
4,U00000042451,"YASS, JEFF",jeff yass,Susquehanna International Group,,F2100,M,MANAGING DIRECTOR,SIG,BALA CYNWYD,PA,56324500.0,49,1149480.0,5800.0,yass
5,U0000004705,"BANKMAN-FRIED, SAMUEL",samuel bankman-fried,FTX.US,,F2800,M,CEO,NOT EMPLOYED,NEW PROVIDENC,CA,42044880.0,338,124393.1,5000.0,bankman-fried
6,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,F7000,M,INVESTMENTS,SELF-EMPLOYED,SARATOGA,WY,41746400.0,31,1346658.0,2900.0,mellon
7,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,Bloomberg Lp,,F5500,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,41330510.0,39,1059757.0,200000.0,bloomberg
8,U0000003235,"EYCHANER, FRED",fred eychaner,Newsweb Corp,,C1100,M,PRESIDENT,NEWS WEB CORPORATION,CHICAGO,IL,37827500.0,234,161656.0,10000.0,eychaner
9,U00000003611,"SCHWARZMAN, STEPHEN",stephen schwarzman,Blackstone Group,,F2600,M,,BLACKSTONE,NEW YORK,NY,37544900.0,281,133611.7,2900.0,schwarzman


In [44]:
# Name list source: https://github.com/philipperemy/name-dataset
# NOTE(review): read_csv consumes the first line of the file as a header, and the
# column names are then overwritten below. If IN.csv has no header row, its first
# record is silently dropped -- confirm, and pass header=None / names=... if so.
df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']

# Keep the first word of the given name and the last word of the surname.
# Splitting on any run of whitespace (rather than a single " ") means padded values
# such as "Kumar " or " Ravi" no longer collapse to an empty string.
# str() maps missing values to the literal "nan"; blank values become "".
first_words = df_indian['firstname'].map(str).str.split().str[0].fillna("")
last_words = df_indian['lastname'].map(str).str.split().str[-1].fillna("")
df_indian['firstname'] = first_words
df_indian['lastname'] = last_words

# Lower-cased "first last" key; `lastname` itself keeps its original casing.
df_indian['name'] = first_words.str.lower() + ' ' + last_words.str.lower()

# Every row of this file gets the same label, so assign the constant directly.
df_indian["ethnic"] = "ind"

df_indian = df_indian[['lastname', 'name', 'ethnic']]
df_indian.head(10)

Unnamed: 0,lastname,name,ethnic
0,Kumar,kapil kumar,ind
1,Aggarwal,mitali aggarwal,ind
2,Jangra,vikas jangra,ind
3,Lungay,ravi lungay,ind
4,Yadav,jagat yadav,ind
5,Kumari,kajal kumari,ind
6,Kumar,vimal kumar,ind
7,Singham,surya singham,ind
8,Hanjra,kawaljarnail hanjra,ind
9,Rathor,kuldeep rathor,ind


In [45]:
# Hand-reviewed donor sample; `indian` holds the manual True/False verdict and is
# empty for rows that were not reviewed.
sample = pd.read_csv("./manual/top_donors20_pred.csv")

# Names are stored as "last, first". A name without a comma yields the whole string
# for both parts. The pieces are stripped because the text after the comma starts
# with a space ("bloomberg, michael" -> " michael"), which would otherwise leak into
# `name_new` as a leading blank. Missing names propagate as missing values.
name_parts = sample["name"].str.split(",")
sample["firstname"] = name_parts.str[-1].str.strip()
sample["lastname"] = name_parts.str[0].str.strip()
sample["name_new"] = sample["firstname"].str.lower() + ' ' + sample["lastname"].str.lower()

# Only an explicit True counts as "ind"; False and unreviewed (missing) rows are "not".
sample["ethnic"] = np.where(sample["indian"].eq(True), "ind", "not")

sample = sample[['name', 'lastname', 'name_new', 'indian', 'ethnic']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,indian,ethnic
0,"for america, amy",for america,amy for america,,not
1,"bloomberg, michael",bloomberg,michael bloomberg,False,not
2,"steyer, thomas f",steyer,thomas f steyer,False,not
3,"mellon, timothy",mellon,timothy mellon,False,not
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False,not
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False,not
6,"uihlein, richard e",uihlein,richard e uihlein,False,not
7,"griffin, kenneth",griffin,kenneth griffin,False,not
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False,not
9,"jurvetson, karla",jurvetson,karla jurvetson,False,not


In [46]:
# Seed the lookup with the surnames found in the first 5000 rows of the name list.
# NOTE(review): head(5000) follows file order; it is not a frequency ranking. Use
# value_counts().head(5000).index if "most common" is what is wanted -- the exclusion
# list below was tuned against the current selection, so re-check it after any change.
indian_lastnames = set(df_indian["lastname"].str.lower().head(5000))

# add the known indian-american last names
known_indian_lastnames = sample[sample["ethnic"] == "ind"]["lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)

# remove the names that are more commonly american
# (a frozenset gives constant-time membership checks and tolerates repeated entries)
excluded_lastnames = frozenset([
    "ali", "ansari", "roy", "sk", "alam", "ahmed", "hussain", "islam", "ray",
    "king", "mann", "gill", "m", "paul", "farmer", "john", "kay",
    "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder",
    "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku",
    "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane",
    "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi",
    "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
    "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou",
    "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
    "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
    "ch", "sah", "come", "adam", "harry", "virk", "mian", "san",
    "vik", "butte",
])

# Sorted so the list is identical from run to run (set iteration order is not).
# Non-string entries (e.g. a missing surname) are dropped so they can never match.
indian_lastnames = sorted(
    surname
    for surname in indian_lastnames
    if isinstance(surname, str) and surname not in excluded_lastnames
)

In [47]:
# Baseline label: "ind" when the lower-cased surname is in the curated list.
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames), "ind", "not")

# Override the baseline with the hand-reviewed labels.
# The override is written straight into `donors`; assigning into the result of
# donors.sort_values(...) would only modify a temporary copy and be lost.
# Rows are matched on the normalised "last, first" name rather than on row position,
# because `sample` comes from a separate file whose row order is not tied to `donors`.
# NOTE(review): exact-name matching is conservative (middle initials or suffixes
# prevent a match) -- confirm this is the intended join key.
reviewed = sample.loc[sample["indian"].notna(), ["name", "ethnic"]].copy()
reviewed["name"] = reviewed["name"].str.strip().str.lower()
manual_labels = (
    reviewed.dropna(subset=["name"])
    .drop_duplicates(subset="name")  # Series.map needs a unique lookup index
    .set_index("name")["ethnic"]
)
donor_keys = donors["name"].str.strip().str.lower()
# Donors without a reviewed counterpart keep their surname-based label.
donors["ethnic"] = donor_keys.map(manual_labels).fillna(donors["ethnic"])

donors.to_csv(f"./output/donors_state{year}_pred_lastname.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    2745057
ind      15961
Name: count, dtype: int64

In [48]:
# Ten most frequent surnames among donors labelled "ind".
donors.loc[donors["ethnic"].eq("ind"), "lastname"].value_counts().head(10)

lastname
patel     2537
singh      984
shah       722
khan       705
gupta      386
kumar      321
reddy      304
sharma     285
rao        256
grover     212
Name: count, dtype: int64

In [49]:
# Spot-check one excluded surname: every donor whose surname is "roy".
donors.loc[donors["lastname"].eq("roy")]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
46614,s00051139011,"ROY, ALEXANDER",alexander roy,Retired,,X1200,M,RETIRED,RETIRED,MUNCY,PA,20300.0,5,4060.000000,2900.0,roy,not
50344,a0000939990,"ROY, JAMES",james roy,"Domengeaux, Wright et al",,K1100,M,LAWYER,DOMENGEAUX WRIGHT ROY & EDWARDS LLC,LAFAYETTE,LA,19085.0,153,124.738562,83.0,roy,not
59857,r0016819951,"ROY, ASIT",asit roy,[24T Contribution],,Z9500,M,NOT EMPLOYED,NOT EMPLOYED,E. BRUNSWICK,NJ,16464.0,534,30.831461,15.0,roy,not
62072,i3004199421,"ROY, JOSEPHINE",josephine roy,West New York Board of Education,,X3500,F,TEACHER,WEST NEW YORK BOARD OF ED,BELMAR,NJ,15853.0,267,59.374532,50.0,roy,not
69147,s0005113901@,"ROY, JENNIFER",jennifer l roy,Retired,,X1200,F,RETIRED,RETIRED,MUNCY,PA,14500.0,4,3625.000000,2900.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2725277,r0002263211,"ROY, MICHAEL",michael roy,Vaughn D Thibodeau,,Y4000,M,TRUCK DRIVER,VAUGHN D THIBODEAU,BRADFORD,ME,11.0,1,11.000000,11.0,roy,not
2742379,r0014335626,"ROY, PARAMITA",paramita roy,Dentist,,H1400,F,DENTIST,SELF-EMPLOYED,FOSTER CITY,CA,7.0,1,7.000000,7.0,roy,not
2752515,s0013690941,"ROY, RARITA",rarita roy,Unemployed,,Y1000,M,NOT EMPLOYED,NONE,MINDEN,NV,5.0,1,5.000000,5.0,roy,not
2753660,s0007031065,"ROY, ROBERT",robert roy,Retired,,X1200,M,RETIRED,RETIRED,WILMINGTON,NC,5.0,1,5.000000,5.0,roy,not


In [50]:
# Donors labelled "ind", largest total contribution first.
donors.loc[donors["ethnic"].eq("ind")].sort_values("total_donated", ascending=False)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
37,U0000004761,"SINGH, NISHAD",nishad singh,FTX.US,,F2800,M,DIRECTOR OF ENGINEERING,FTX,LOS ALTOS HILLS,CA,8476000.0,148,57270.270270,2900.0,singh,ind
197,U00000033011,"KHOSLA, VINOD",vinod khosla,Khosla Ventures,,F2500,M,VENTURE CAPITALIST,KHOSLA VENTURES,PORTOLA VALLEY,CA,2084900.0,30,69496.666667,5000.0,khosla,ind
385,h10013502001,"REDDY, PREM",prem reddy,Prime Healthcare Services,,H2100,M,FOUNDEE AND CEO,PRIME HEALTHCARE,ONTARIO,CA,1088749.0,78,13958.320513,3205.0,reddy,ind
542,p00042647281,"SRIVASTAVA, GAURAV",gaurav srivastava,Unity Group,,Y4000,M,BUSINESS,UNITY GROUP,LOS ANGELES,CA,854000.0,15,56933.333333,12900.0,srivastava,ind
690,U00000035581,"SATTER, MUNEER A",muneer a satter,Satter Investment Management,,F7000,M,INVESTOR,SATTER INVESTMENT MANAGEMENT,CHICAGO,IL,706900.0,93,7601.075269,2900.0,satter,ind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2759680,q0002006606,"BISHT, NARENDRA",narendra bisht,Green Apple,,Y4000,M,SALES,GREEN APPLE,WOODSIDE,NY,1.0,1,1.000000,1.0,bisht,ind
2760143,r0024072584,"RAHMAN, BNP MOYNUR",bnp moynur rahman,[24T Contribution],,Z9500,U,,,BIRMINGHAM,ZZ,1.0,1,1.000000,1.0,rahman,ind
2760345,h3001740863,"RAO, C K",c k rao,Medical Director,,H0000,U,MEDICAL DIRECTOR,SELF-EMPLOYED,NEWPORT COAST,CA,1.0,1,1.000000,1.0,rao,ind
2760535,r0017386326@,"BHATT, NACHIKETA MR",nachiketa mr bhatt,Nurse,...,H1710,M,SELF-EMPLOYED,SELF-EMPLOYED,PROSPECT,KY,1.0,1,1.000000,1.0,bhatt,ind
