In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
year = "20"
df = f"./data/CampaignFin{year}/indivs{year}.txt"
donors_csv = f"./data/CampaignFin{year}/donors_state{year}.csv"
names = f"./data/USIN.csv"

In [3]:
donors = pd.read_csv(donors_csv)
donors["lastname"] = donors["name"].apply(lambda x: str(x).split(",")[0].lower())
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname
0,,ACTBLUE,actblue actblue,,,Y4000,,,,WASHINGTON,CA,1261253000.0,25821,48846.03,1000.0,actblue
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,[Candidate Contribution],,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1127731000.0,958,1177172.0,682.5,bloomberg
2,U00000036521,"STEYER, TOM",tom steyer,[Candidate Contribution],,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,379478200.0,756,501955.3,2800.0,steyer
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,F7000,M,INVESTMENTS,SELF-EMPLOYED,SARATOGA,WY,45133560.0,23,1962328.0,2800.0,mellon
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON CLINIC,LAS VEGAS,NV,44999550.0,124,362899.6,2800.0,adelson
5,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44847950.0,119,376873.5,2800.0,adelson
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M7000,M,CEO,ULINE,LAKE FOREST,IL,35364330.0,319,110860.0,2800.0,uihlein
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Citadel LLC,,F2700,M,FOUNDER CEO,CITADEL LLC,CHICAGO,IL,33667630.0,188,179083.2,2800.0,griffin
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN,BLACKSTONE,NEW YORK,NY,33454000.0,226,148026.5,2800.0,schwarzman
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,Karla T Jurvetson MD,,H1110,F,PHYSICIAN,SELF,LOS ALTOS,CA,33088100.0,914,36201.42,2800.0,jurvetson


In [4]:
# https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_indian['firstname'] = df_indian['firstname'].apply(lambda x: x.split(" ")[0] if " " in str(x) else str(x))
df_indian['lastname'] = df_indian['lastname'].apply(lambda x: x.split(" ")[-1] if " " in str(x) else str(x))
df_indian['name'] = df_indian['firstname'].apply(lambda x: x.lower()) + ' ' + df_indian['lastname'].apply(lambda x: x.lower())
df_indian["ethnic"] = df_indian["ethnicity"].apply(lambda x: "ind")
df_indian = df_indian[['lastname', 'name', 'ethnic']]
df_indian.head(10)

Unnamed: 0,lastname,name,ethnic
0,Kumar,kapil kumar,ind
1,Aggarwal,mitali aggarwal,ind
2,Jangra,vikas jangra,ind
3,Lungay,ravi lungay,ind
4,Yadav,jagat yadav,ind
5,Kumari,kajal kumari,ind
6,Kumar,vimal kumar,ind
7,Singham,surya singham,ind
8,Hanjra,kawaljarnail hanjra,ind
9,Rathor,kuldeep rathor,ind


In [5]:
sample = pd.read_csv("./manual/top_donors20_pred.csv")
sample["firstname"] = sample["name"].apply(lambda x: x.split(",")[-1] if "," in x else str(x))
sample["lastname"] = sample["name"].apply(lambda x: x.split(",")[0] if "," in x else str(x))
sample["name_new"] = sample["firstname"].apply(lambda x: x.lower()) + ' ' + sample["lastname"].apply(lambda x: x.lower())
sample["ethnic"] = sample["indian"].apply(lambda x: "ind" if x == True else "not")
sample = sample[['name', 'lastname', 'name_new', 'indian', 'ethnic']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,indian,ethnic
0,"for america, amy",for america,amy for america,,not
1,"bloomberg, michael",bloomberg,michael bloomberg,False,not
2,"steyer, thomas f",steyer,thomas f steyer,False,not
3,"mellon, timothy",mellon,timothy mellon,False,not
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False,not
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False,not
6,"uihlein, richard e",uihlein,richard e uihlein,False,not
7,"griffin, kenneth",griffin,kenneth griffin,False,not
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False,not
9,"jurvetson, karla",jurvetson,karla jurvetson,False,not


In [6]:
# get the most common indian last names
indian_lastnames = set(df_indian["lastname"].str.lower().head(5000))

# add the known indian-american last names
known_indian_lastnames = sample[sample["ethnic"] == "ind"]["lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)

# remove the names that are more commonly american
indian_lastnames = [x for x in indian_lastnames if x not in ["ali", "ansari", "roy", "sk", "alam", "ahmed", "hussain", "islam", "ray",
                                                             "king", "mann", "ray", "gill", "m", "paul", "farmer", "john", "kay", 
                                                             "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder", 
                                                             "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku", 
                                                             "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane", 
                                                             "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi", 
                                                             "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
                                                             "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou", 
                                                             "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
                                                             "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
                                                             "ch", "sah", "come", "roy", "adam", "harry", "virk", "mian", "san",
                                                             "vik", "butte"]]

In [None]:
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames), "ind", "not")
donors.sort_values(by="total_donated", ascending=False).loc[:9999, "ethnic"] = sample["ethnic"]
# donors.to_csv(f"./output/donors_state{year}_pred_lastname.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    774839
ind      4843
Name: count, dtype: int64

In [38]:
donors[donors["ethnic"] == "ind"]["lastname"].value_counts()[:10]

lastname
patel     633
singh     319
shah      264
khan      175
gupta     128
reddy     123
mehta     108
kumar      92
bhakta     86
sharma     77
Name: count, dtype: int64

In [39]:
donors[donors["lastname"] == "roy"]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
16340,a0000939990,"ROY, JAMES P",james p roy,"Domengeaux, Wright et al",,K1000,M,ATTORNEY,DOMENGEAUX WRIGHT ROY & EDWARDS,LAFAYETTE,LA,13200.0,13,1015.384615,700.0,roy,not
23065,h3001821090,"ROY, MARK A",mark a roy,[24I Contribution],,Z9500,M,PRESIDENT,E-CAVERN,CRESTWOOD,KY,10000.0,9,1111.111111,1000.0,roy,not
24923,i3003928255,"ROY, HELEN",helen roy,,,Y2000,F,NONE,,LEXINGTON,MA,10000.0,1,10000.000000,10000.0,roy,not
28096,h3001307171,"ROY, DENIS MR",denis mr roy,,,F3200,M,SR. VP-CHIEF INFORMATION OFFICER,WELLMARK INC.,CLIVE,IA,9000.0,2,4500.000000,4500.0,roy,not
32769,c0180495563,"ROY, JOHNNY",johnny b roy,,,H1100,M,PHYSICIAN,SELF,EDMOND,OK,7750.0,3,2583.333333,500.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760199,i3003600610,"ROY, JITENDRA",jitendra roy,Mt Sinai Hospital,,H2100,N,DIRECTOR OF ONCOLOGY,MT. SINAI HOSPITAL,FLUSHING,NY,200.0,1,200.000000,200.0,roy,not
761211,d0000897819,"ROY, DONALD MR",donald mr roy,Self-Employed,,G0000,M,MEDICAL DOCTORE,SELF-EMPLOYED,FRESNO,CA,200.0,1,200.000000,200.0,roy,not
764998,i3003664870,"ROY, PAT A MS",pat a ms roy,AXA Equitable Life Insurance,AXA,F3300,F,ATTORNEY,AXA EQUITABLE,NEW HYDE PARK,NY,200.0,1,200.000000,200.0,roy,not
767900,i3003853602,"ROY, PHILIP H",philip h roy,,,Y2000,M,BEST EFFORT,BEST EFFORT/BEST EFFORT,PHOENIX,AZ,200.0,1,200.000000,200.0,roy,not


In [40]:
donors[donors["ethnic"] == "ind"].sort_values(by="total_donated", ascending=False)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
70,h30012305521,"LANGAN, JOHN",john langan,,,J7400,M,PUBLISHER,SELF-EMPLOYED,VOORHEES,NJ,159800.0,96,1664.583333,2100.0,langan,ind
141,U00000035581,"SATTER, MUNEER A MR",muneer satter,Goldman Sachs,,F2300,M,INVESTMENT BANKER,GOLDMAN SACHS,WINNETKA,IL,136900.0,23,5952.173913,2100.0,satter,ind
306,U0000003827,"SINGH, DINAKAR",dinakar singh,TPG-Axon Capital,,F2700,M,INVESTMENT BANKER,TPG - AXON CAPITAL,NEW YORK,NY,121350.0,30,4045.000000,2100.0,singh,ind
771,g1100977629,"MALIK, MICHAEL",michael malik,MJM Enterprises & Development,,G6500,M,DEVELOPER,SELF-EMPLOYED,DETROIT,MI,97700.0,18,5427.777778,2100.0,malik,ind
1043,g1100906075,"SINGH, VISHWA",vishwa singh,TPG-Axon Capital,,F2700,U,SCIENTIST,CONSULTANT,NEW YORK,NY,87000.0,21,4142.857143,1500.0,singh,ind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753417,h3001526127,"SINGH, BHAGWAHT",bhagwaht singh,Bhagwaht Singh Md In,,Y4000,U,RETIRED,BHAGWAHT SINGH MD IN,RIVERSIDE,CA,200.0,1,200.000000,200.0,singh,ind
753314,g1100949229,"SHAH, NITA J",nita j shah,Shah & Co,,Y4000,F,C.P.A.,SHAH AND COMPANY,JAMESTOWN,RI,200.0,1,200.000000,200.0,shah,ind
752498,i3004209529,"AMIN, RITVIG N",ritvig n amin,,,Y2000,U,INFORMATION REQUESTED,,CHARLOTTE,NC,200.0,1,200.000000,200.0,amin,ind
752482,b0060868534,"RAI, KANTI R",kanti r rai,Long Island Jewish Medical Center,,H1100,N,PHYSICIAN,LONG ISLAND JEWISH MEDICAL CENTER,GREAT NECK,NY,200.0,1,200.000000,200.0,rai,ind
