In [164]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle
import re

tqdm.pandas()
warnings.filterwarnings("ignore")

In [165]:
year = 16
df = f"./data/CampaignFin20/indivs{year}.txt"
donors_csv = f"./data/CampaignFin{year}/donors_state{year}.csv"
names = f"./data/USIN.csv"

In [166]:
donors = pd.read_csv(donors_csv)
donors["lastname"] = donors["name"].apply(lambda x: str(x).split(",")[0].lower())
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname
0,,,,,,Y4000,,,,WASHINGTON,CA,856084910.0,27811,30782.2412,1003.0,
1,U00000036521,"STEYER, THOMAS",thomas steyer,Fahr LLC/Tom Steyer,Fahr LLC,JE300,M,MANAGING PARTNER,FAHR LLC,REDWOOD CITY,CA,91197510.0,196,465293.418367,2700.0,steyer
2,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,Las Vegas Sands,,G6500,M,CEO,LAS VEGAS SANDS CORPORATION,LAS VEGAS,NV,44554500.0,147,303091.836735,5000.0,adelson
3,U0000004604,"SUSSMAN, S DONALD",s donald sussman,Paloma Partners,,F2700,M,INVESTMENT ADVISOR,PALOMA PARTNERS ADVISORS LP,FT LAUDERDALE,FL,43007600.0,408,105410.784314,2700.0,sussman
4,U0000000310A,"ADELSON, MIRIAM",miriam adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,H3200,F,PHYSICIAN,ADELSON DRUG CLINIC,LAS VEGAS,NV,39938500.0,145,275437.931034,5000.0,adelson
5,U0000003235,"EYCHANER, FRED",fred eychaner,Newsweb Corp,,C1100,M,CEO,NEWSWEB CORPORATION,CHICAGO,IL,38815778.0,222,174845.846847,9909.0,eychaner
6,U00000000661,"SINGER, PAUL",paul singer,Elliott Management,,F2700,M,PRINCIPAL,ELLIOTT MANAGEMENT,NEW YORK,NY,26530353.0,266,99738.169173,2700.0,singer
7,U00000036821,"MERCER, ROBERT",robert mercer,Renaissance Technologies,,F2700,M,FINANCIAL CONSULTANT,RENAISSANCE TECHNOLOGIES,EAST SETAUKET,NY,24173000.0,145,166710.344828,2700.0,mercer
8,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,Bloomberg Lp,,X3100,M,EXECUTIVE,BLOOMBERG LP,NEW YORK,NY,23769124.0,49,485084.163265,5400.0,bloomberg
9,U00000003151,"SIMONS, JAMES H",james h simons,Renaissance Technologies,,F2700,M,PHILANTHROPIST,EUCLIDEAN CAPITAL,NEW YORK,NY,22961150.0,87,263921.264368,2700.0,simons


In [167]:
def is_english(s):
    return bool(re.fullmatch(r'^[A-Za-z\s\'-]+$', str(s)))

In [168]:
# https://github.com/philipperemy/name-dataset
df_jewish = pd.read_csv("./data/IL.csv")
df_jewish.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_jewish['firstname'] = df_jewish['firstname'].apply(lambda x: x.split(" ")[0] if " " in str(x) else str(x))
df_jewish['lastname'] = df_jewish['lastname'].apply(lambda x: x.split(" ")[-1] if " " in str(x) else str(x))
df_jewish['name'] = df_jewish['firstname'].apply(lambda x: x.lower()) + ' ' + df_jewish['lastname'].apply(lambda x: x.lower())
df_jewish["ethnic"] = df_jewish["ethnicity"].apply(lambda x: "jew")
df_jewish = df_jewish[['lastname', 'name', 'ethnic']]
df_jewish = df_jewish[df_jewish['lastname'].apply(is_english)]
df_jewish.head(10)

Unnamed: 0,lastname,name,ethnic
0,Mozo,zozo mozo,jew
1,Salama,uzi salama,jew
2,Agayev,ido agayev,jew
3,Gohar,isaac gohar,jew
5,Agayev,lior agayev,jew
6,Friedman,gadi friedman,jew
7,Bahumi,dikla bahumi,jew
8,Livshits,vitaly livshits,jew
11,Yaniv,amir yaniv,jew
13,Tetro,dedy tetro,jew


In [169]:
df_us = pd.read_csv("./data/US.csv")
df_us.columns = ['firstname', 'lastname', 'gender', 'ethnicity']
df_us['firstname'] = df_us['firstname'].apply(lambda x: x.split(" ")[0] if " " in str(x) else str(x))
df_us['lastname'] = df_us['lastname'].apply(lambda x: x.split(" ")[-1] if " " in str(x) else str(x))
df_us['name'] = df_us['firstname'].apply(lambda x: x.lower()) + ' ' + df_us['lastname'].apply(lambda x: x.lower())
df_us["ethnic"] = df_us["ethnicity"].apply(lambda x: "not")
df_us = df_us[['lastname', 'name', 'ethnic']]
df_us.head(10)

Unnamed: 0,lastname,name,ethnic
0,Sylvester,brandon sylvester,not
1,Toussaint,chris toussaint,not
2,Gotti,willie gotti,not
3,Corona,cristobal corona,not
4,Diaz,wilmer diaz,not
5,Renee,angela renee,not
6,Duke,duke duke,not
7,Gricelda,gonzales gricelda,not
8,Celestine,jaren celestine,not
9,Smith,nathaniel smith,not


In [170]:
# Get the most common Indian last names
jewish_lastnames = set(df_jewish["lastname"].str.lower().value_counts()[df_jewish["lastname"].str.lower().value_counts() > 4].index)

# Get the most common American last names
common_us_lastnames = set(df_us["lastname"].str.lower().value_counts().head(2000).index)

# Remove last names that are also common in the US
jewish_lastnames = list(jewish_lastnames - common_us_lastnames)

# remove edge cases
jewish_lastnames = list(set([x for x in jewish_lastnames if len(str(x)) > 2 
                             and x not in ["ahmad", "ali", "dahan", "amar", "omar", "awad", "saleh", 
                                           "hadad", "abed", "odeh", "mohammad", "mohamed", "mohammed", 
                                           "sh", "nan", "hassan", "ahmed", "mansour", "nassar", "hamdan", 
                                           "ohana", "kh", "hazan", "dayan", "chen", "khalil", "yousef", 
                                           "zoabi", "shaheen", "naser", "hasan", "salman", "mahmoud", 
                                           "mahmud", "amir", "hamad", "khaled", "nasser", "perez", 
                                           "khatib", "haddad", "masri", "abu", "mohamad", "jamal", "awwad", 
                                           "hamed", "najjar", "mohamed", "naim", "nahum", "amsalem", "halabi", 
                                           "salameh", "hammad", "or", "shaked", "nagar", "natsheh", "maimon", 
                                           "morad", "sultan", "abbas", "mosa", "tamir", "shimon",
                                           "lord", "roe", "sweet", "swan", "read", "hilton", "rooney", 
                                           "reed", "reid", "clifford", "masters", "mcallister", "dick", 
                                           "whitman", "sherwood", "grove", "rudolph", "clement", "brand", 
                                           "dubois", "blank", "root" "land", "urban", "light", "quick", 
                                           "justice", "pool", "fair", "street", "stock", "seaman", "poe", "leone", "jameson", 
                                           "luke", "atwood", "castle", "berlin", "vitale", "ham", "waterman", "nathan"]] 
                            + ["cohen", "david", "miller", "schwartz", "friedman", "levine",
                               "levy", "kaplan", "katz", "shapiro", "stein", "bernstein", "kaufman", 
                               "weiner", "goldberg", "goldstein", "klein", "greenberg", 
                               "rosenberg", "stern", "gordon", "weiss", "rubin", "rosen",
                               "soros", "bankman-fried", "zuckerberg", "moskovitz", "moskowitz", 
                               "koum", "schusterman", "peretz", "drescher", "steyer", "dreyfus",
                               "sussman", "simon", "laufer", "bekenstein", "berkenstein", "ballmer"]))

In [171]:
donors["ethnic"] = np.where(donors["lastname"].str.lower().isin(jewish_lastnames), "jew", "not")
donors.to_csv(f"./output/donors_state{year}_pred_lastname_jew.csv", index=False)
donors["ethnic"].value_counts()

ethnic
not    2150742
jew     285398
Name: count, dtype: int64

In [172]:
donors[donors["lastname"] == "roy"]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
3456,m00014784311,"ROY, ROB",rob roy,Switch Supernap,,C6500,M,CEO,SWITCH,LAS VEGAS,NV,135100.0,10,13510.000000,2700.0,roy,not
9936,m0001478431@,"ROY, STELLA",stella roy,Switch Supernap,,C6500,F,PHILANTHROPIST,,LAS VEGAS,NV,51000.0,13,3923.076923,2700.0,roy,not
10003,i3004169447,"ROY, ROB",rob roy,Switch Supernap,,C6500,M,CEO/FOUNDER/CHAIRMAN,SWITCH,LAS VEGAS,NV,50700.0,4,12675.000000,8650.0,roy,not
15021,p0005017709,"ROY, JONES R MR",jones r mr roy,Rw Jones & Sons,,B1000,M,CONSTRUCTION,"RW JONES & SONS, INC.",SAN ANTONIO,TX,33400.0,1,33400.000000,33400.0,roy,not
34823,m0002233644,"ROY, HARRY",harry roy,Rensselaer Polytechnic Institute,,H5100,M,PROFESSOR,RENSSELAER POLYTECHNIC INSTITUTE,TROY,NY,15390.0,94,163.723404,200.0,roy,not
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412336,p0004683600,"ROY, AYUSHI",ayushi roy,Oakland City Hall,,Y4000,F,DIGITAL SERVICES COORDINATOR,OAKLAND CITY HALL,SAN JOSE,CA,10.0,2,5.000000,5.0,roy,not
2416936,p0005466657,"ROY, ROBERT C MR",robert c mr roy,,,Y2000,M,INFORMATION REQUESTED,INFORMATION REQUESTED,ST AUGUSTINE,FL,8.0,1,8.000000,8.0,roy,not
2424479,p0005427204,"ROY, LAURA",laura roy,Unemployed,,Y1000,F,NURSE PRACTITIONER,NOT EMPLOYED,KILLEEN,TX,8.0,2,4.000000,4.0,roy,not
2427690,p0004521376,"ROY, THERESA",theresa roy,Coldwell Banker,Realogy Corp,F4200,F,REALTOR,COLDWELL BANKER,DELRAY BEACH,FL,5.0,1,5.000000,5.0,roy,not


In [173]:
donors[donors["ethnic"] == "jew"].sort_values(by="total_donated", ascending=False).iloc[10:20]

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
26,U0000000377A,"PRITZKER, MARY KATHRYN",mary kathryn pritzker,Pritzker Group,,F2100,F,MANAGER,SELF-EMPLOYED,CHICAGO,IL,10319599.0,112,92139.28,10000.0,pritzker,jew
29,U00000003801,"SABAN, HAIM",haim saban,Saban Capital Group,,C2300,M,EXECUTIVE,SABAN CAPITAL GROUP INC.,LOS ANGELES,CA,9855245.0,152,64837.14,10000.0,saban,jew
30,U0000000380A,"SABAN, CHERYL",cheryl saban,Saban Capital Group,,C2300,F,AUTHOR/PRODUCER,SELF-EMPLOYED,LOS ANGELES,CA,8499042.0,154,55188.58,9696.0,saban,jew
41,U00000033221,"LAUFER, HENRY",henry laufer,Renaissance Technologies,,F2700,M,RETIRED,,LANTANA,FL,6357108.0,100,63571.08,2700.0,laufer,jew
42,U0000004181,"SOROS, ALEXANDER",alexander soros,Soros Fund Management,,F2700,M,STUDENT,,NEW YORK,NY,6327702.0,145,43639.32,2700.0,soros,jew
43,U00000027001,"SANDLER, HERBERT M",herbert m sandler,Herb & Marion Sandler/Sandler Foundation,Sandler Foundation,X4100,M,PRESIDENT,SANDLER FOUNDATION,SAN FRANCISCO,CA,6249600.0,80,78120.0,10000.0,sandler,jew
49,U00000002531,"SCHWARTZ, BERNARD L",bernard l schwartz,BLS Investments,,F7000,M,CEO,BLS INVESTMENTS,NEW YORK,NY,5590000.0,133,42030.08,2700.0,schwartz,jew
50,U00000042451,"YASS, JEFF",jeff yass,Susquehanna International Group,,F2100,M,TRADER,SIG,BALA CYNWYD,PA,5567432.0,73,76266.19,5000.0,yass,jew
53,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,F2600,M,CHAIRMAN & CEO,THE BLACKSTONE GROUP,NEW YORK,NY,5333200.0,121,44076.03,2700.0,schwarzman,jew
57,U00000045711,"PERLMUTTER, ISAAC",isaac perlmutter,Marvel Entertainment,Walt Disney Co,C2400,M,CEO,MARVEL ENTERTAINMENT,PALM BEACH,FL,5000000.0,2,2500000.0,2500000.0,perlmutter,jew
