In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm
import pickle
from ethnicseer import EthnicClassifier

tqdm.pandas(desc="Processing")

In [2]:
indivs20 = "./data/CampaignFin20/indivs20.txt"
indivs22 = "./data/CampaignFin22/indivs22.txt"
names = "./data/USIN.csv"
model_path = "./models/logit_classifier.pkl"

In [3]:
df = pd.read_csv(names)
df["ethnic"].value_counts()

ethnic
not    6994611
ind    3005389
Name: count, dtype: int64

In [4]:
lf20 = (
        pl.scan_csv(
            indivs20,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        .filter(~pl.col('amount').is_null())
    )
df20 = lf20.collect()
print(df20.head(10))

shape: (10, 16)
┌─────────────┬─────────────┬───────────┬────────────┬───┬──────┬────────┬────────────┬────────────┐
│ contrib_id  ┆ name        ┆ recip_id  ┆ orgname    ┆ … ┆ type ┆ gender ┆ occupation ┆ employer   │
│ ---         ┆ ---         ┆ ---       ┆ ---        ┆   ┆ ---  ┆ ---    ┆ ---        ┆ ---        │
│ str         ┆ str         ┆ str       ┆ str        ┆   ┆ str  ┆ str    ┆ str        ┆ str        │
╞═════════════╪═════════════╪═══════════╪════════════╪═══╪══════╪════════╪════════════╪════════════╡
│ p0004869853 ┆ LONNBERG,   ┆ C00721712 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ PARTNER    ┆ BOSTON     │
│             ┆ CARL        ┆           ┆ ibution]   ┆   ┆      ┆        ┆            ┆ CONSULTING │
│             ┆             ┆           ┆            ┆   ┆      ┆        ┆            ┆ GROUP      │
│ k0001516259 ┆ LOVO, MARIO ┆ N00044240 ┆ [24T Contr ┆ … ┆ 24T  ┆ M      ┆ LAWYER     ┆ SELF       │
│             ┆             ┆           ┆ ibution]   ┆   ┆      ┆        ┆ 

In [5]:
df20 = df20.with_columns([
    pl.col("name").str.split(",").list.get(-1).str.to_lowercase().str.strip_chars().alias("firstname"),
    pl.col("name").str.split(",").list.first().str.to_lowercase().str.strip_chars().alias("lastname"),
])

df20 = df20.with_columns([
    (
        pl.col("firstname").str.to_lowercase().str.strip_chars() + " " + 
        pl.col("lastname").str.to_lowercase().str.strip_chars()
    ).alias("name_new")
])

print(df20.head(10))

shape: (10, 19)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ contrib_i ┆ name      ┆ recip_id  ┆ orgname   ┆ … ┆ employer  ┆ firstname ┆ lastname ┆ name_new  │
│ d         ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---      ┆ ---       │
│ ---       ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str      ┆ str       │
│ str       ┆           ┆           ┆           ┆   ┆           ┆           ┆          ┆           │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ p00048698 ┆ LONNBERG, ┆ C00721712 ┆ [24T Cont ┆ … ┆ BOSTON    ┆ carl      ┆ lonnberg ┆ carl      │
│ 53        ┆ CARL      ┆           ┆ ribution] ┆   ┆ CONSULTIN ┆           ┆          ┆ lonnberg  │
│           ┆           ┆           ┆           ┆   ┆ G GROUP   ┆           ┆          ┆           │
│ k00015162 ┆ LOVO,     ┆ N00044240 ┆ [24T Cont ┆ … ┆ SELF      ┆ mario    

In [6]:
donors20 = (
    df20.group_by("contrib_id")
    .agg(
        pl.col("name").first().alias("name"),
        pl.col("name_new").first().alias("name_new"),
        pl.col("lastname").first().alias("lastname"),
        pl.col("amount").sum().alias("total_donated"),
        pl.col("amount").count().alias("donation_count"),
        pl.col("amount").mean().alias("avg_donation")
    )
    .sort("total_donated", descending=True)
)
print(donors20.head(10))

shape: (10, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ contrib_id   ┆ name        ┆ name_new    ┆ lastname    ┆ total_donat ┆ donation_co ┆ avg_donatio │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ed          ┆ unt         ┆ n           │
│ str          ┆ str         ┆ str         ┆ str         ┆ ---         ┆ ---         ┆ ---         │
│              ┆             ┆             ┆             ┆ f64         ┆ u32         ┆ f64         │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│              ┆ FOR         ┆ amy for     ┆ for america ┆ 1.2556e9    ┆ 27780       ┆ 45197.81353 │
│              ┆ AMERICA,    ┆ america     ┆             ┆             ┆             ┆ 5           │
│              ┆ AMY         ┆             ┆             ┆             ┆             ┆             │
│ U00000037041 ┆ BLOOMBERG,  ┆ michael     ┆ bloomberg   ┆ 1.1277e9    ┆ 960

In [10]:
donors20 = donors20.to_pandas()

In [7]:
sample = pd.read_csv("./top_donors_all.csv")
sample["firstname"] = sample["name"].apply(lambda x: x.split(",")[-1] if "," in x else str(x))
sample["lastname"] = sample["name"].apply(lambda x: x.split(",")[0] if "," in x else str(x))
sample["name_new"] = sample["firstname"].apply(lambda x: x.lower()) + ' ' + sample["lastname"].apply(lambda x: x.lower())
sample["ethnic"] = sample["actual"].apply(lambda x: "ind" if x == True else "not")
sample

Unnamed: 0,contrib_id,name,total_donated,donation_count,avg_donation,is_indian,actual,indian,firstname,lastname,name_new,ethnic
0,,"for america, amy",1255595260,27780,4.519781e+04,,,,amy,for america,amy for america,not
1,U00000037041,"bloomberg, michael",1127712782,960,1.174701e+06,False,False,False,michael,bloomberg,michael bloomberg,not
2,U00000036521,"steyer, thomas f",379061294,779,4.865999e+05,False,False,False,thomas f,steyer,thomas f steyer,not
3,U00000046841,"mellon, timothy",45133555,23,1.962328e+06,False,False,False,timothy,mellon,timothy mellon,not
4,U0000000310A,"adelson, miriam o dr",44971550,134,3.356086e+05,False,False,False,miriam o dr,adelson,miriam o dr adelson,not
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,i3003933154,"title, lawrence",75828,146,5.193699e+02,False,False,False,lawrence,title,lawrence title,not
9996,q0001094848,"chaves, manny",75816,256,2.961562e+02,False,False,False,manny,chaves,manny chaves,not
9997,h10014870831,"adelman, david",75800,33,2.296970e+03,False,False,False,david,adelman,david adelman,not
9998,Y0000040866S,"brownstein, helen",75800,29,2.613793e+03,False,False,False,helen,brownstein,helen brownstein,not


In [13]:
# https://github.com/philipperemy/name-dataset
df_indian = pd.read_csv("./data/IN.csv")
df_indian.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_indian['firstname'] = df_indian['firstname'].apply(lambda x: x.split(" ")[0] if " " in str(x) else str(x))
df_indian['lastname'] = df_indian['lastname'].apply(lambda x: x.split(" ")[-1] if " " in str(x) else str(x))
df_indian['name'] = df_indian['firstname'].apply(lambda x: x.lower()) + ' ' + df_indian['lastname'].apply(lambda x: x.lower())
df_indian["ethnic"] = df_indian["ethnicity"].apply(lambda x: "ind")
df_indian

Unnamed: 0,firstname,lastname,gender,ethnicity,name,ethnic
0,Kapil,Kumar,M,IN,kapil kumar,ind
1,Mitali,Aggarwal,F,IN,mitali aggarwal,ind
2,Vikas,Jangra,,IN,vikas jangra,ind
3,Ravi,Lungay,M,IN,ravi lungay,ind
4,Jagat,Yadav,M,IN,jagat yadav,ind
...,...,...,...,...,...,...
6161585,Vikas,Chakchanpur,M,IN,vikas chakchanpur,ind
6161586,Dipu,Gupta,M,IN,dipu gupta,ind
6161587,Riya,Naharwal,F,IN,riya naharwal,ind
6161588,Jashandeep,Hanjra,M,IN,jashandeep hanjra,ind


In [None]:
indian_lastnames = set(df_indian["lastname"].str.lower().head(5000))
known_indian_lastnames = sample[sample["ethnic"] == "ind"]["lastname"].str.lower().tolist()
indian_lastnames.update(known_indian_lastnames)
indian_lastnames = [x for x in indian_lastnames if x not in ["king", "mann", "ray", "gill", "m", "paul", "farmer", "john", "kay", 
                                                             "r", "abraham", "camp", "g", "p", "power", "song", "k", "kang", "binder", 
                                                             "joy", "null", "sell", "n", "ko", "shi", "ricker", "robin", "rambo", "ku", 
                                                             "back", "banker", "hans", "sandy", "grand", "ji", "bains", "bale", "duane", 
                                                             "don", "mi", "mall", "baden", "panter", "sing", "barra", "davi", 
                                                             "lasseter", "munger", "kaler", "maddy", "lucky", "pop", "shalom", "ro",
                                                             "rod", "sha", "pon", "bander", "barman", "biber", "goldy", "bobby", "bou", 
                                                             "sky", "nan", "done", "boy", "summy", "boro", "all", "rings", "vali",
                                                             "shing", "dema", "farm", "lather", "suny", "kalson", "payment", "si",
                                                             "ch", "sah", "come", "roy"]]

donors20["ethnic"] = np.where(donors20["lastname"].str.lower().isin(indian_lastnames), "ind", "not")
donors20.loc[:9999, "ethnic"] = np.where(sample["actual"], "ind", donors20.loc[:9999, "ethnic"])
donors20.to_csv("./data/donors20_with_pred_lastname.csv", index=False)
donors20["ethnic"].value_counts()

ethnic
not    3823483
ind      24272
Name: count, dtype: int64

In [223]:
donors20.head(10)

Unnamed: 0,contrib_id,name,name_new,lastname,total_donated,donation_count,avg_donation,ethnic
0,,"FOR AMERICA, AMY",amy for america,for america,1255595000.0,27780,45197.81,ind
1,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,bloomberg,1127713000.0,960,1174701.0,not
2,U00000036521,"STEYER, THOMAS F",thomas f steyer,steyer,379061300.0,779,486599.9,not
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,mellon,45133560.0,23,1962328.0,not
4,U0000000310A,"ADELSON, MIRIAM O DR",miriam o dr adelson,adelson,44971550.0,134,335608.6,not
5,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,adelson,44819950.0,129,347441.5,not
6,U00000036901,"UIHLEIN, RICHARD E",richard e uihlein,uihlein,35302380.0,342,103223.3,not
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,griffin,33639930.0,197,170761.1,not
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,schwarzman,33406500.0,242,138043.4,not
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,jurvetson,32493510.0,973,33395.18,not


In [207]:
donors20[donors20["ethnic"] == "ind"]["lastname"].value_counts()[:10]

lastname
patel     2844
singh     1221
shah       959
khan       908
gupta      551
reddy      512
kumar      492
ahmed      460
ali        451
sharma     449
Name: count, dtype: int64

In [225]:
donors20[donors20["lastname"] == "roy"]

Unnamed: 0,contrib_id,name,name_new,lastname,total_donated,donation_count,avg_donation,ethnic
14560,a0000939990,"ROY, JAMES",james roy,roy,53634.0,191,280.806283,not
50181,b0060859920,"ROY, RANDALL",randall roy,roy,18550.0,32,579.687500,not
51379,h30013501341,"ROY, PETER",peter roy,roy,18150.0,22,825.000000,not
72938,j10027645071,"ROY, ROGER",roger roy,roy,13510.0,57,237.017544,not
78618,r0000133800,"ROY, JIM",jim roy,roy,12662.0,124,102.112903,not
...,...,...,...,...,...,...,...,...
3835152,r0020072734,"ROY, TIMORHY",timorhy roy,roy,-625.0,5,-125.000000,not
3840774,j1002527397,"ROY, CARLYN",carlyn roy,roy,-1000.0,3,-333.333333,not
3842744,r0018719218,"ROY, ZUNIGA",zuniga roy,roy,-1356.0,1,-1356.000000,not
3844041,j1002688735,"ROY, MELISSA",melissa roy,roy,-1788.0,20,-89.400000,not
