In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

# Hook tqdm into pandas so progress-bar variants of apply (e.g. `progress_apply`)
# become available; none of the cells visible in this section call them.
tqdm.pandas()
# Blanket filter: no category or module is given, so every warning raised after
# this point is hidden for the rest of the kernel session.
warnings.filterwarnings("ignore")

In [17]:
# Input file locations, relative to the notebook's working directory.
# All three names hold path strings at this point; `donors` is rebound to the
# loaded DataFrame by the cell that reads it.
df, donors, names = (
    "./data/CampaignFin20/indivs20.txt",
    "./data/CampaignFin20/donors20.csv",
    "./data/USIN.csv",
)

In [18]:
# `donors` enters this cell as the CSV path and leaves it as the loaded
# DataFrame, so the cell cannot be re-run without re-running the path cell first.
donors = pd.read_csv(donors)
# Surname = text before the first comma of "LAST, FIRST ...", lower-cased.
# str() turns a missing name into the literal "nan" instead of raising.
donors["lastname"] = [
    str(full_name).split(",")[0].lower() for full_name in donors["name"]
]
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname
0,,"FOR AMERICA, AMY",amy for america,[24T Contribution],,,NOT EMPLOYED,NOT EMPLOYED,1261253000.0,25821,48846.03,1000.0,for america
1,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,Bloomberg LP,,M,MAYOR,CITY OF NEW YORK,1127731000.0,958,1177172.0,682.5,bloomberg
2,U00000036521,"STEYER, THOMAS F",thomas f steyer,Fahr LLC/Tom Steyer,Fahr LLC,M,FOUNDER,NEXTGEN AMERICA,379478200.0,756,501955.3,2800.0,steyer
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,M,INVESTMENTS,SELF,45133560.0,23,1962328.0,2800.0,mellon
4,U0000000310A,"ADELSON, MIRIAM O DR",miriam o dr adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,F,PHYSICIAN,ADELSON CLINIC,44999550.0,124,362899.6,2800.0,adelson
5,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,Las Vegas Sands,,M,CHAIRMAN OF THE BOARD,THE VENETIAN,44847950.0,119,376873.5,2800.0,adelson
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M,,,35364330.0,319,110860.0,2800.0,uihlein
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Rodimer for Congress,,M,FOUNDER CEO,CITADEL LLC,33667630.0,188,179083.2,2800.0,griffin
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,M,"CHAIRMAN, CEO & CO-FOUNDER",BLACKSTONE,33454000.0,226,148026.5,2800.0,schwarzman
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,[24T Contribution],,F,PHYSICIAN,SELF,33088100.0,914,36201.42,2800.0,jurvetson


In [9]:
# Name records come from https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("./data/IN.csv")
# NOTE(review): read_csv consumes the file's first line as a header before the
# names below replace it — confirm IN.csv really has a header row, otherwise
# its first record is silently dropped here.
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']


def _keep_token(value, position):
    """Return one space-separated token of `value`; space-free values come back as str(value)."""
    as_text = str(value)
    if " " not in as_text:
        # Also covers missing values, which become the literal "nan".
        return as_text
    return value.split(" ")[position]


# Multi-word given names keep their first word, multi-word surnames their last.
df_indian['firstname'] = df_indian['firstname'].map(lambda given: _keep_token(given, 0))
df_indian['lastname'] = df_indian['lastname'].map(lambda family: _keep_token(family, -1))
# Both columns hold only str values at this point, so the .str accessor is safe.
df_indian['name'] = df_indian['firstname'].str.lower() + ' ' + df_indian['lastname'].str.lower()
# Every record in this file receives the same label.
df_indian["ethnic"] = "ind"
df_indian = df_indian[['lastname', 'name', 'ethnic']]
df_indian.head(10)

Unnamed: 0,lastname,name,ethnic
0,Kumar,kapil kumar,ind
1,Aggarwal,mitali aggarwal,ind
2,Jangra,vikas jangra,ind
3,Lungay,ravi lungay,ind
4,Yadav,jagat yadav,ind
5,Kumari,kajal kumari,ind
6,Kumar,vimal kumar,ind
7,Singham,surya singham,ind
8,Hanjra,kawaljarnail hanjra,ind
9,Rathor,kuldeep rathor,ind


In [10]:
# Manually reviewed top donors; `actual` is the hand-assigned flag.
sample = pd.read_csv("./manual/top_donors_all.csv")
# Names are written "last, first ...". A name without a comma is used whole for
# both parts. The membership test runs on str(x) so a non-string name (e.g. a
# missing value) falls through to the str(x) branch instead of raising TypeError.
# The given-name part is stripped: splitting "last, first" on "," leaves
# " first", which previously put a leading space at the front of `name_new`
# and made it differ from the "first last" format of donors["name_new"].
sample["firstname"] = sample["name"].apply(lambda x: x.split(",")[-1].strip() if "," in str(x) else str(x))
sample["lastname"] = sample["name"].apply(lambda x: x.split(",")[0] if "," in str(x) else str(x))
sample["name_new"] = sample["firstname"].apply(lambda x: x.lower()) + ' ' + sample["lastname"].apply(lambda x: x.lower())
# Only an explicit True counts as "ind"; False and unlabelled (NaN) rows become "not".
sample["ethnic"] = sample["actual"].apply(lambda x: "ind" if x == True else "not")
sample = sample[['name', 'lastname', 'name_new', 'actual', 'ethnic']]
sample.head(10)

Unnamed: 0,name,lastname,name_new,actual,ethnic
0,"for america, amy",for america,amy for america,,not
1,"bloomberg, michael",bloomberg,michael bloomberg,False,not
2,"steyer, thomas f",steyer,thomas f steyer,False,not
3,"mellon, timothy",mellon,timothy mellon,False,not
4,"adelson, miriam o dr",adelson,miriam o dr adelson,False,not
5,"adelson, sheldon g mr",adelson,sheldon g mr adelson,False,not
6,"uihlein, richard e",uihlein,richard e uihlein,False,not
7,"griffin, kenneth",griffin,kenneth griffin,False,not
8,"schwarzman, stephen a",schwarzman,stephen a schwarzman,False,not
9,"jurvetson, karla",jurvetson,karla jurvetson,False,not


In [11]:
# Seed the lookup with the distinct surnames found in the first 5,000 rows of df_indian.
# NOTE(review): head(5000) is positional, not a frequency ranking — this equals
# "the most common Indian last names" only if the file is ordered by popularity; confirm.
indian_lastnames = set(df_indian["lastname"].head(5000).str.lower())

# Add the surnames of donors manually confirmed as Indian-American.
is_confirmed = sample["ethnic"] == "ind"
indian_lastnames.update(sample.loc[is_confirmed, "lastname"].str.lower().tolist())

# Surnames judged more commonly American, plus initials and filler tokens.
# "nan" also discards the stringified missing-value surname.
more_commonly_american = {
    "king", "mann", "ray", "gill", "m", "paul", "farmer", "john", "kay",
    "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder",
    "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku",
    "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane",
    "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi",
    "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
    "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou",
    "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
    "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
    "ch", "sah", "come", "roy",
}
# The result stays a list, as before, for the later isin() lookup.
indian_lastnames = [
    surname for surname in indian_lastnames if surname not in more_commonly_american
]

In [26]:
# Baseline prediction: a donor is "ind" when the surname is in the curated list.
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(indian_lastnames), "ind", "not")

# Manual labels take precedence over the surname rule for the hand-reviewed top
# rows (index labels 0..9999).
# NOTE(review): this presumes row i of `sample` describes row i of `donors`;
# nothing here checks that — confirm against how the sample file was built.
# A bare `donors.loc[:9999, "ethnic"] = sample["ethnic"]` aligns on index and
# writes NaN into every targeted row that has no counterpart in `sample`,
# erasing the baseline prediction. Reindexing and dropping the gaps limits the
# overwrite to rows that really carry a manual label.
manual_labels = sample["ethnic"].reindex(donors.loc[:9999].index).dropna()
donors.loc[manual_labels.index, "ethnic"] = manual_labels
donors.to_csv("./output/donors20_pred_lastname.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    3565815
ind      23121
Name: count, dtype: int64

In [22]:
# Preview the first ten donor rows, now including the `ethnic` column.
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
0,,"FOR AMERICA, AMY",amy for america,[24T Contribution],,,NOT EMPLOYED,NOT EMPLOYED,1261253000.0,25821,48846.03,1000.0,for america,not
1,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,Bloomberg LP,,M,MAYOR,CITY OF NEW YORK,1127731000.0,958,1177172.0,682.5,bloomberg,not
2,U00000036521,"STEYER, THOMAS F",thomas f steyer,Fahr LLC/Tom Steyer,Fahr LLC,M,FOUNDER,NEXTGEN AMERICA,379478200.0,756,501955.3,2800.0,steyer,not
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,M,INVESTMENTS,SELF,45133560.0,23,1962328.0,2800.0,mellon,not
4,U0000000310A,"ADELSON, MIRIAM O DR",miriam o dr adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,F,PHYSICIAN,ADELSON CLINIC,44999550.0,124,362899.6,2800.0,adelson,not
5,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,Las Vegas Sands,,M,CHAIRMAN OF THE BOARD,THE VENETIAN,44847950.0,119,376873.5,2800.0,adelson,not
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M,,,35364330.0,319,110860.0,2800.0,uihlein,not
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Rodimer for Congress,,M,FOUNDER CEO,CITADEL LLC,33667630.0,188,179083.2,2800.0,griffin,not
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,M,"CHAIRMAN, CEO & CO-FOUNDER",BLACKSTONE,33454000.0,226,148026.5,2800.0,schwarzman,not
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,[24T Contribution],,F,PHYSICIAN,SELF,33088100.0,914,36201.42,2800.0,jurvetson,not


In [23]:
# Ten most frequent surnames among donors labelled "ind".
(
    donors.loc[donors["ethnic"].eq("ind"), "lastname"]
    .value_counts()
    .head(10)
)

lastname
patel     2724
singh     1142
shah       920
khan       865
gupta      539
reddy      497
kumar      467
ahmed      437
sharma     427
ali        421
Name: count, dtype: int64

In [24]:
# Spot check: "roy" is on the exclusion list, so these rows should be labelled
# "not" unless a manual label says otherwise.
donors.loc[donors["lastname"].eq("roy")]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
15345,a0000939990,"ROY, JAMES",james roy,,,M,LAWYER,DOMENGEAUX WRIGHT ROY & EDWARDS LLC,53844.0,188,286.404255,100.0,roy,not
54114,b0060859920,"ROY, RANDALL",randall roy,[24T Contribution],,M,NOT EMPLOYED,NOT EMPLOYED,18550.0,32,579.687500,500.0,roy,not
55394,h30013501341,"ROY, PETER",peter roy,[24T Contribution],,M,NOT EMPLOYED,NOT EMPLOYED,18150.0,22,825.000000,500.0,roy,not
67016,r0000133800,"ROY, JIM",jim roy,Los Angeles County Fire Dept,"Los Angeles County, CA",M,NOT EMPLOYED,NOT EMPLOYED,15517.0,122,127.188525,50.0,roy,not
78505,j10027645071,"ROY, ROGER",roger roy,"Pillsbury, Winthrop et al",,M,CONSULTANT,PILLSBURY WINTHROP SHAW PITTMAN,13611.0,56,243.053571,100.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3569571,r0016388523,"ROY, ROBERT A MR",robert a mr roy,US Postal Service,,M,LETTER CARRIER,US POSTAL SERVICE,6.0,1,6.000000,6.0,roy,not
3574301,r0002847935,"ROY, LORI",lori roy,EMPLOYEE SUPPORT SPECIALIST,,F,EMPLOYEE SUPPORT SPECIALIST,LORI ROY,5.0,1,5.000000,5.0,roy,not
3584267,r0019725756,"ROY, ANNE MS",anne ms roy,,,F,,,2.0,1,2.000000,2.0,roy,not
3585501,m0001819896,"ROY, JAMES MR",james mr roy,Retired,,M,RETIRED,RETIRED,2.0,1,2.000000,2.0,roy,not


In [25]:
# Total number of donor rows; the `ethnic` value counts should sum to this.
len(donors)

3588936