In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle
import re

# Register tqdm's pandas integration so `progress_apply` and friends are available.
tqdm.pandas()
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas deprecation and dtype warnings. Consider narrowing the filter
# to the specific category/module that is known to be noisy.
warnings.filterwarnings("ignore")

In [2]:
# Two-digit election cycle; the CampaignFin paths below are derived from it.
year = 20
# Path (a string, not a DataFrame, despite the name) to the cycle's `indivs` file;
# presumably the raw individual-contributions dump -- confirm against the data dir.
# The directory component now follows `year` instead of a hard-coded "20", so it
# stays consistent with `donors_csv` when the cycle is changed.
df = f"./data/CampaignFin{year}/indivs{year}.txt"
# Per-donor aggregate file for the same cycle.
donors_csv = f"./data/CampaignFin{year}/donors_state{year}.csv"
# Plain string: the original f-prefix had no placeholder to interpolate.
names = "./data/USIN.csv"

In [3]:
# Load the per-donor aggregate table.
donors = pd.read_csv(donors_csv)

# Surname = everything before the first comma of the "LAST, FIRST" name field,
# lower-cased. Values are stringified first, so a missing name becomes "nan"
# and a name without a comma (e.g. "ACTBLUE") is used whole.
donors["lastname"] = donors["name"].map(
    lambda full_name: str(full_name).partition(",")[0].lower()
)
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname
0,,ACTBLUE,actblue actblue,,,Y4000,,,,WASHINGTON,CA,1261253000.0,25821,48846.03,1000.0,actblue
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,[Candidate Contribution],,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1127731000.0,958,1177172.0,682.5,bloomberg
2,U00000036521,"STEYER, TOM",tom steyer,[Candidate Contribution],,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,379478200.0,756,501955.3,2800.0,steyer
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,F7000,M,INVESTMENTS,SELF-EMPLOYED,SARATOGA,WY,45133560.0,23,1962328.0,2800.0,mellon
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON CLINIC,LAS VEGAS,NV,44999550.0,124,362899.6,2800.0,adelson
5,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44847950.0,119,376873.5,2800.0,adelson
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M7000,M,CEO,ULINE,LAKE FOREST,IL,35364330.0,319,110860.0,2800.0,uihlein
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Citadel LLC,,F2700,M,FOUNDER CEO,CITADEL LLC,CHICAGO,IL,33667630.0,188,179083.2,2800.0,griffin
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN,BLACKSTONE,NEW YORK,NY,33454000.0,226,148026.5,2800.0,schwarzman
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,Karla T Jurvetson MD,,H1110,F,PHYSICIAN,SELF,LOS ALTOS,CA,33088100.0,914,36201.42,2800.0,jurvetson


In [4]:
# Letters A-Z/a-z, whitespace, apostrophe and hyphen only; at least one character.
_LATIN_NAME_PATTERN = re.compile(r'^[A-Za-z\s\'-]+$')


def is_english(s):
    """Return True if ``str(s)`` contains only basic Latin name characters.

    The value is stringified before matching, so non-string inputs are judged
    by their ``str()`` form (``None`` -> ``"None"`` -> True, ``42`` -> False).
    The empty string yields False because at least one character is required.

    Parameters
    ----------
    s : Any
        Value to test; typically a surname string.

    Returns
    -------
    bool
        True when the whole string consists of ASCII letters, whitespace,
        apostrophes or hyphens.
    """
    text = str(s)
    return _LATIN_NAME_PATTERN.fullmatch(text) is not None

In [5]:
# Name data source: https://github.com/philipperemy/name-dataset
# NOTE(review): `read_csv` is called without `header=None`; if IL.csv has no header
# row, its first record is consumed as column names and lost -- confirm.
il_names = pd.read_csv("./data/IL.csv")
il_names.columns = ['firstname', 'lastname', 'gender', 'ethnicity']

# Keep only the first token of the given name and the last token of the surname.
# Values without a space (including missing values) are simply stringified.
il_first = il_names['firstname'].map(
    lambda raw: raw.split(" ")[0] if " " in str(raw) else str(raw)
)
il_last = il_names['lastname'].map(
    lambda raw: raw.split(" ")[-1] if " " in str(raw) else str(raw)
)

# NOTE(review): every row of this file is labelled "jew" regardless of the value in
# the fourth column; the label is an assumption about the dataset, not a property
# of any individual name.
df_jewish = pd.DataFrame({
    'lastname': il_last,
    'name': il_first.map(str.lower) + ' ' + il_last.map(str.lower),
    'ethnic': "jew",
})

# Drop surnames containing anything other than ASCII letters, whitespace, ' or -.
df_jewish = df_jewish[df_jewish['lastname'].apply(is_english)]
df_jewish.head(10)

Unnamed: 0,lastname,name,ethnic
0,Mozo,zozo mozo,jew
1,Salama,uzi salama,jew
2,Agayev,ido agayev,jew
3,Gohar,isaac gohar,jew
5,Agayev,lior agayev,jew
6,Friedman,gadi friedman,jew
7,Bahumi,dikla bahumi,jew
8,Livshits,vitaly livshits,jew
11,Yaniv,amir yaniv,jew
13,Tetro,dedy tetro,jew


In [6]:
# NOTE(review): `read_csv` is called without `header=None`; if US.csv has no header
# row, its first record is consumed as column names and lost -- confirm.
us_names = pd.read_csv("./data/US.csv")
us_names.columns = ['firstname', 'lastname', 'gender', 'ethnicity']

# Keep only the first token of the given name and the last token of the surname.
# Values without a space (including missing values) are simply stringified.
us_first = us_names['firstname'].map(
    lambda raw: raw.split(" ")[0] if " " in str(raw) else str(raw)
)
us_last = us_names['lastname'].map(
    lambda raw: raw.split(" ")[-1] if " " in str(raw) else str(raw)
)

# Every row of the US file gets the constant reference label "not".
df_us = pd.DataFrame({
    'lastname': us_last,
    'name': us_first.map(str.lower) + ' ' + us_last.map(str.lower),
    'ethnic': "not",
})
df_us.head(10)

Unnamed: 0,lastname,name,ethnic
0,Sylvester,brandon sylvester,not
1,Toussaint,chris toussaint,not
2,Gotti,willie gotti,not
3,Corona,cristobal corona,not
4,Diaz,wilmer diaz,not
5,Renee,angela renee,not
6,Duke,duke duke,not
7,Gricelda,gonzales gricelda,not
8,Celestine,jaren celestine,not
9,Smith,nathaniel smith,not


In [7]:
# A surname must occur MORE than this many times in the IL name file to be kept.
MIN_IL_SURNAME_COUNT = 4
# How many of the most frequent US surnames count as "common in the US".
TOP_US_SURNAMES = 2000

# Surnames that occur frequently in the IL name file (counts computed once).
il_surname_counts = df_jewish["lastname"].str.lower().value_counts()
jewish_lastnames = set(il_surname_counts[il_surname_counts > MIN_IL_SURNAME_COUNT].index)

# Most common surnames in the US name file.
common_us_lastnames = set(
    df_us["lastname"].str.lower().value_counts().head(TOP_US_SURNAMES).index
)

# Remove surnames that are also common in the US.
jewish_lastnames = jewish_lastnames - common_us_lastnames

# Hand-maintained exclusions (edge cases).
# Fix: the original list read `"root" "land"`; the missing comma made Python
# concatenate the two literals into "rootland", so neither "root" nor "land"
# was actually excluded. They are now separate entries.
# The two-letter entries ("sh", "kh", "or") are already dropped by the length
# filter below and are kept only for fidelity with the original list.
excluded_lastnames = {
    "ahmad", "ali", "dahan", "amar", "omar", "awad", "saleh",
    "hadad", "abed", "odeh", "mohammad", "mohamed", "mohammed",
    "sh", "nan", "hassan", "ahmed", "mansour", "nassar", "hamdan",
    "ohana", "kh", "hazan", "dayan", "chen", "khalil", "yousef",
    "zoabi", "shaheen", "naser", "hasan", "salman", "mahmoud",
    "mahmud", "amir", "hamad", "khaled", "nasser", "perez",
    "khatib", "haddad", "masri", "abu", "mohamad", "jamal", "awwad",
    "hamed", "najjar", "naim", "nahum", "amsalem", "halabi",
    "salameh", "hammad", "or", "shaked", "nagar", "natsheh", "maimon",
    "morad", "sultan", "abbas", "mosa", "tamir", "shimon",
    "lord", "roe", "sweet", "swan", "read", "hilton", "rooney",
    "reed", "reid", "clifford", "masters", "mcallister", "dick",
    "whitman", "sherwood", "grove", "rudolph", "clement", "brand",
    "dubois", "blank", "root", "land", "urban", "light", "quick",
    "justice", "pool", "fair", "street", "stock", "seaman", "poe", "leone", "jameson",
    "luke", "atwood", "castle", "berlin", "vitale", "ham", "waterman", "nathan",
}

# Hand-added surnames. These bypass BOTH the US-common filter and the exclusion
# list above, so entries such as "miller" or "david" are matched even though they
# are very common US surnames.
manual_lastnames = [
    "cohen", "david", "miller", "schwartz", "friedman", "levine",
    "levy", "kaplan", "katz", "shapiro", "stein", "bernstein", "kaufman",
    "weiner", "goldberg", "goldstein", "klein", "greenberg",
    "rosenberg", "stern", "gordon", "weiss", "rubin", "rosen",
    "soros", "bankman-fried", "zuckerberg", "moskovitz", "moskowitz",
    "koum", "schusterman", "peretz", "drescher", "steyer", "dreyfus",
    "sussman", "simon", "laufer", "bekenstein", "berkenstein", "ballmer",
]

# NOTE(review): a surname match is a coarse heuristic. It says nothing reliable
# about any individual's ethnicity or religion, and the result should not be
# treated as a per-person fact.
# Final list: filtered candidates (longer than two characters, not excluded) plus
# the manual additions, de-duplicated. Sorted so the result is reproducible
# across kernel restarts (set iteration order is not).
jewish_lastnames = sorted(
    {
        surname
        for surname in jewish_lastnames
        if len(str(surname)) > 2 and surname not in excluded_lastnames
    }
    | set(manual_lastnames)
)

In [8]:
# Flag donors whose lower-cased surname appears in the surname list; all others
# get "not".
# NOTE(review): this is a surname-only heuristic and will mislabel individuals in
# both directions; treat the column as a noisy aggregate proxy at best.
surname_on_list = donors["lastname"].str.lower().isin(jewish_lastnames)
donors["ethnic"] = np.where(surname_on_list, "jew", "not")

# Persist the labelled frame (the ./output directory must already exist).
labelled_csv = f"./output/donors_state{year}_pred_lastname_jew.csv"
donors.to_csv(labelled_csv, index=False)

# Label distribution.
donors["ethnic"].value_counts()

ethnic
not    3171666
jew     417270
Name: count, dtype: int64

In [9]:
# Spot check: how donors with the surname "roy" were labelled.
donors.loc[donors["lastname"].eq("roy")]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
15345,a0000939990,"ROY, JAMES",james roy,,,J1200,M,LAWYER,DOMENGEAUX WRIGHT ROY & EDWARDS LLC,LAFAYETTE,LA,53844.0,188,286.404255,100.0,roy,not
54083,b0060859920,"ROY, RANDALL",randall roy,Retired,,X1200,M,RETIRED,RETIRED,TOPEKA,KS,18550.0,32,579.687500,500.0,roy,not
55407,h30013501341,"ROY, PETER",peter roy,[24T Contribution],,Z9500,M,NOT EMPLOYED,NOT EMPLOYED,PAWLEYS ISLAND,SC,18150.0,22,825.000000,500.0,roy,not
67017,r0000133800,"ROY, JIM",jim roy,Los Angeles County Fire Dept,"Los Angeles County, CA",X3000,M,NOT EMPLOYED,NOT EMPLOYED,SANTA ANA,CA,15517.0,122,127.188525,50.0,roy,not
78507,j10027645071,"ROY, ROGER",roger roy,"Pillsbury, Winthrop et al",,K1200,M,CONSULTANT,PILLSBURY WINTHROP SHAW PITTMAN,SAN RAMON,CA,13611.0,56,243.053571,100.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3569842,r0016388523,"ROY, ROBERT A MR",robert a mr roy,US Postal Service,,X3700,M,LETTER CARRIER,US POSTAL SERVICE,FORT SMITH,AR,6.0,1,6.000000,6.0,roy,not
3570517,r0002847935,"ROY, LORI",lori roy,EMPLOYEE SUPPORT SPECIALIST,,Y4000,F,EMPLOYEE SUPPORT SPECIALIST,LORI ROY,SPRINGBORO,OH,5.0,1,5.000000,5.0,roy,not
3583897,m0001819896,"ROY, JAMES MR",james mr roy,Retired,,X1200,M,RETIRED,RETIRED,SPICKARD,MO,2.0,1,2.000000,2.0,roy,not
3585986,r0019725756,"ROY, ANNE MS",anne ms roy,,,Y2000,F,,,BOYCE,LA,2.0,1,2.000000,2.0,roy,not


In [11]:
# Ten largest donors (by total donated) among rows carrying the "jew" label.
flagged_donors = donors.loc[donors["ethnic"].eq("jew")]
flagged_donors.sort_values("total_donated", ascending=False).head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,[Candidate Contribution],,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1127731000.0,958,1177172.0,682.5,bloomberg,jew
2,U00000036521,"STEYER, TOM",tom steyer,[Candidate Contribution],,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,379478200.0,756,501955.3,2800.0,steyer,jew
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON CLINIC,LAS VEGAS,NV,44999550.0,124,362899.6,2800.0,adelson,jew
5,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44847950.0,119,376873.5,2800.0,adelson,jew
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN,BLACKSTONE,NEW YORK,NY,33454000.0,226,148026.5,2800.0,schwarzman,jew
10,U00000045921,"MOSKOVITZ, DUSTIN AARON",dustin aaron moskovitz,Asana,,C5120,M,CO-FOUNDER,ASANA,SAN FRANCISCO,CA,29304190.0,112,261644.6,10000.0,moskovitz,jew
11,U00000003151,"SIMONS, JAMES",james simons,Euclidean Capital,,Z9500,M,PHILANTHROPIST,EUCLIDEAN CAPITAL,NEW YORK,NY,21979600.0,241,91201.66,2800.0,simons,jew
12,U0000004604,"SUSSMAN, S DONALD",s donald sussman,Paloma Partners,,F2700,M,INVESTMENT ADVISOR,PALOMA PARTNERS ADVISORS LP,FT LAUDERDALE,FL,21817600.0,465,46919.57,2800.0,sussman,jew
13,U00000042451,"YASS, JEFF",jeff yass,Susquehanna International Group,,F2100,M,MANAGING DIRECTOR,SIG,BALA CYNWYD,PA,16061400.0,94,170866.0,2800.0,yass,jew
15,U0000004682,"SIMON, DEBORAH",deborah simon,Simon Youth Foundation,,X4100,F,NOT EMPLOYED,NOT EMPLOYED,CARMEL,IN,15421360.0,1015,15193.46,675.0,simon,jew
