In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
year = 18
df = f"./data/CampaignFin20/indivs{year}.txt"
donors_csv = f"./data/CampaignFin{year}/donors{year}.csv"
names = f"./data/USIN.csv"

In [3]:
donors = pd.read_csv(donors_csv)
donors["lastname"] = donors["name"].apply(lambda x: str(x).split(",")[0].lower())
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname
0,,ANIBAL,anibal anibal,Attorney,,,LAWYER,SELF EMPLOYED,585387138.0,25228,23203.87,1000.0,anibal
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,Bloomberg LP,,M,FOUNDER/OWNER,BLOOMBERG L.P.,95131257.0,58,1640194.0,37578.5,bloomberg
2,U00000036521,"STEYER, THOMAS",thomas steyer,League of Conservation Voters,,M,ADVOCACY,FAHR LLC,74102881.0,402,184335.5,2700.0,steyer
3,q0001673261,"SCOTT, RICK",rick scott,[Candidate Contribution],,M,GOVERNOR,STATE OF FLORIDA,69942510.0,165,423894.0,5526.0,scott
4,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,M,OWNER / CHAIRMAN OF THE BOARD,LAS VEGAS SANDS CORPORATION,62265700.0,127,490281.1,2700.0,adelson
5,U0000000310A,"ADELSON, MIRIAM DR",miriam dr adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,F,DIRECTOR OF COMMUNITY INVOLVEMENT,LAS VEGAS SANDS CORPORATION,61861800.0,113,547449.6,2700.0,adelson
6,U00000036901,"UIHLEIN, RICHARD E MR",richard e mr uihlein,Uline Inc,,M,CEO/OWNER,ULINE,39187258.0,320,122460.2,2700.0,uihlein
7,U0000004604,"SUSSMAN, S DONALD",s donald sussman,Paloma Partners,,M,CHAIRMAN,PALOMA PARTNERS ADVISORS LP,27987200.0,647,43256.88,2700.0,sussman
8,U00000036551,"GRIFFIN, KENNETH C",kenneth c griffin,Citadel LLC,,M,FOUNDER & CEO,CITADEL LLC,19567300.0,111,176282.0,2700.0,griffin
9,U00000003151,"SIMONS, JAMES",james simons,Euclidean Capital,,M,PRESIDENT,EUCLIDEAN CAPITAL,19279110.0,258,74725.23,2700.0,simons


In [4]:
# https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_indian['firstname'] = df_indian['firstname'].apply(lambda x: x.split(" ")[0] if " " in str(x) else str(x))
df_indian['lastname'] = df_indian['lastname'].apply(lambda x: x.split(" ")[-1] if " " in str(x) else str(x))
df_indian['name'] = df_indian['firstname'].apply(lambda x: x.lower()) + ' ' + df_indian['lastname'].apply(lambda x: x.lower())
df_indian["ethnic"] = df_indian["ethnicity"].apply(lambda x: "ind")
df_indian = df_indian[['lastname', 'name', 'ethnic']]
df_indian.head(10)

Unnamed: 0,lastname,name,ethnic
0,Kumar,kapil kumar,ind
1,Aggarwal,mitali aggarwal,ind
2,Jangra,vikas jangra,ind
3,Lungay,ravi lungay,ind
4,Yadav,jagat yadav,ind
5,Kumari,kajal kumari,ind
6,Kumar,vimal kumar,ind
7,Singham,surya singham,ind
8,Hanjra,kawaljarnail hanjra,ind
9,Rathor,kuldeep rathor,ind


In [5]:
sample = pd.read_csv("./manual/top_donors_all.csv")
sample["firstname"] = sample["name"].apply(lambda x: x.split(",")[-1] if "," in x else str(x))
sample["lastname"] = sample["name"].apply(lambda x: x.split(",")[0] if "," in x else str(x))
sample["name_new"] = sample["firstname"].apply(lambda x: x.lower()) + ' ' + sample["lastname"].apply(lambda x: x.lower())
sample["ethnic"] = sample["actual"].apply(lambda x: "ind" if x == True else "not")
sample = sample[['name', 'lastname', 'name_new', 'actual', 'ethnic']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,actual,ethnic
0,"for america, amy",for america,amy for america,,not
1,"bloomberg, michael",bloomberg,michael bloomberg,False,not
2,"steyer, thomas f",steyer,thomas f steyer,False,not
3,"mellon, timothy",mellon,timothy mellon,False,not
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False,not
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False,not
6,"uihlein, richard e",uihlein,richard e uihlein,False,not
7,"griffin, kenneth",griffin,kenneth griffin,False,not
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False,not
9,"jurvetson, karla",jurvetson,karla jurvetson,False,not


In [6]:
# get the most common indian last names
indian_lastnames = set(df_indian["lastname"].str.lower().head(5000))

# add the known indian-american last names
known_indian_lastnames = sample[sample["ethnic"] == "ind"]["lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)

# remove the names that are more commonly american
indian_lastnames = [x for x in indian_lastnames if x not in ["king", "mann", "ray", "gill", "m", "paul", "farmer", "john", "kay", 
                                                             "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder", 
                                                             "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku", 
                                                             "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane", 
                                                             "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi", 
                                                             "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
                                                             "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou", 
                                                             "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
                                                             "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
                                                             "ch", "sah", "come", "roy", "adam", "harry", "virk", "mian", "san",
                                                             "vik", "butte"]]

In [7]:
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames), "ind", "not")
donors.sort_values(by="total_donated", ascending=False).loc[:9999, "ethnic"] = sample["ethnic"]
donors.to_csv(f"./output/donors{year}_pred_lastname.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    1820173
ind      14383
Name: count, dtype: int64

In [8]:
donors[donors["ethnic"] == "ind"]["lastname"].value_counts()[:10]

lastname
patel     2415
singh      722
shah       634
khan       536
gupta      344
reddy      318
kumar      270
ahmed      250
rao        249
sharma     228
Name: count, dtype: int64

In [9]:
donors[donors["lastname"] == "roy"]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
15169,m0001478431@,"ROY, STELLA",stella roy,Philanthropist,,F,INFORMATION REQUESTED,INFORMATION REQUESTED,31600.0,7,4514.285714,2700.0,roy,not
32644,b0060859920,"ROY, RANDALL J",randall j roy,Retired,,M,RETIRED,RETIRED,16050.0,28,573.214286,100.0,roy,not
33207,i3004169447,"ROY, ROB",rob roy,Switch,,M,CEO,SWITCH,15800.0,4,3950.000000,3850.0,roy,not
33214,m00014784311,"ROY, ROB",rob roy,Switch,,M,CEO,SWITCH,15800.0,3,5266.666667,2700.0,roy,not
36380,a0000939990,"ROY, JAMES",james roy,"Domengeaux, Wright et al",,M,LAWYER,DOMENGEAUX WRIGHT ROY AND EDWARDS,14750.0,66,223.484848,112.5,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807053,q0002193228,"ROY, JOHN",john roy,Yelp Inc,,M,SOFTWARE ENGINEER,YELP,25.0,1,25.000000,25.0,roy,not
1807710,q0000717249,"ROY, ROY",roy roy,Bookseller,,M,BOOK SELLER,SELF EMPLOYED,25.0,1,25.000000,25.0,roy,not
1814888,p0002974109,"ROY, CLAIRE",claire roy,Retired,,F,RETIRED,RETIRED,20.0,1,20.000000,20.0,roy,not
1815265,q0001023021,"ROY, ROBERT W MR SR",robert w mr sr roy,Retired,,M,RETIRED,RETIRED,20.0,1,20.000000,20.0,roy,not


In [10]:
donors[donors["ethnic"] == "ind"].sort_values(by="total_donated", ascending=False)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
45,h3003465387,"AYYADURAI, SHIVA DR",shiva dr ayyadurai,[Candidate Contribution],,M,CHAIRMAN & CEO,"CYTOSOLVE, INC.",4817653.0,39,123529.564103,3133.0,ayyadurai,ind
50,U00000045761,"ANSARY, HUSHANG HON",hushang hon ansary,Parman Capital Group,,M,EXECUTIVE CHAIRMAN,PARMAN CAPITAL GROUP,4374086.0,140,31243.471429,2700.0,ansary,ind
110,U00000045762,"ANSARY, SHAHLA MRS",shahla mrs ansary,Parman Capital Group,,F,HOMEMAKER,HOMEMAKER,1971900.0,86,22929.069767,2700.0,ansary,ind
148,U00000045811,"MANOCHERIAN, JED",jed manocherian,Woodbranch Investments,,M,REAL ESTATE INVESTOR,WOODBRANCH INVESTMENTS,1497400.0,266,5629.323308,2700.0,manocherian,ind
191,U00000040581,"ANWAR, JAVAID",javaid anwar,Midland Energy,,M,MIDLAND ENERGY,SELF-EMPLOYED,1165715.0,105,11102.047619,2700.0,anwar,ind
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1833128,p0003911829,"KRISHNA, KARNA",karna krishna,Massachusetts Bay Community College,,M,ADJUNCT PROFESSOR,MASSACHUSETTS BAY COMMUNITY COLLEGE,3.0,1,3.000000,3.0,krishna,ind
1833184,q0001170395,"SHARMA, SANJAY",sanjay sharma,Questcare,,M,PHYSICIAN,QUESTCARE,2.0,1,2.000000,2.0,sharma,ind
1833753,q0001157033,"NAYYAR, MIKE",mike nayyar,Insight Global,,M,BUSINESS ANALYST,INSIGHT GLOBAL,1.0,1,1.000000,1.0,nayyar,ind
1833939,q0000512958,"THUKRAL, DHRUV",dhruv thukral,Amazon.com,,M,SOLUTIONS ARCHITECT,AMAZON,1.0,1,1.000000,1.0,thukral,ind
