In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
df_csv = "./data/CampaignFin20/indivs20.txt"
donors_csv = "./output/donors20_pred_lastname.csv"
names_csv = "./data/USIN.csv"

In [3]:
donors = pd.read_csv(donors_csv)
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
0,,"FOR AMERICA, AMY",amy for america,[24T Contribution],,,NOT EMPLOYED,NOT EMPLOYED,1261253000.0,25821,48846.03,1000.0,for america,not
1,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,Bloomberg LP,,M,MAYOR,CITY OF NEW YORK,1127731000.0,958,1177172.0,682.5,bloomberg,not
2,U00000036521,"STEYER, THOMAS F",thomas f steyer,Fahr LLC/Tom Steyer,Fahr LLC,M,FOUNDER,NEXTGEN AMERICA,379478200.0,756,501955.3,2800.0,steyer,not
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investments,,M,INVESTMENTS,SELF,45133560.0,23,1962328.0,2800.0,mellon,not
4,U0000000310A,"ADELSON, MIRIAM O DR",miriam o dr adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,F,PHYSICIAN,ADELSON CLINIC,44999550.0,124,362899.6,2800.0,adelson,not
5,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,Las Vegas Sands,,M,CHAIRMAN OF THE BOARD,THE VENETIAN,44847950.0,119,376873.5,2800.0,adelson,not
6,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M,,,35364330.0,319,110860.0,2800.0,uihlein,not
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,Rodimer for Congress,,M,FOUNDER CEO,CITADEL LLC,33667630.0,188,179083.2,2800.0,griffin,not
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,Blackstone Group,,M,"CHAIRMAN, CEO & CO-FOUNDER",BLACKSTONE,33454000.0,226,148026.5,2800.0,schwarzman,not
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,[24T Contribution],,F,PHYSICIAN,SELF,33088100.0,914,36201.42,2800.0,jurvetson,not


In [4]:
lf = (
        pl.scan_csv(
            df_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        # remove blank donations
        .filter(~pl.col('amount').is_null())
        # remove refunds
        .filter(pl.col('amount') > 0)
        # create a lowercase name column in the usual format
        .with_columns([
            pl.col("name").str.split(",").list.get(-1)
                .str.to_lowercase().str.strip_chars().alias("firstname"),
            pl.col("name").str.split(",").list.first()
                .str.to_lowercase().str.strip_chars().alias("lastname"),
        ])
        .with_columns([
            (pl.col("firstname") + " " + pl.col("lastname")).alias("name_new")
        ])
    )

In [5]:
df = lf.collect(streaming=True)
print(df.head(10))
print(len(df))

shape: (10, 21)
┌────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ dummy1 ┆ dummy2     ┆ contrib_id ┆ name       ┆ … ┆ employer  ┆ firstname ┆ lastname ┆ name_new  │
│ ---    ┆ ---        ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---      ┆ ---       │
│ i64    ┆ i64        ┆ str        ┆ str        ┆   ┆ str       ┆ str       ┆ str      ┆ str       │
╞════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 2020   ┆ 4030220201 ┆ p000486985 ┆ LONNBERG,  ┆ … ┆ BOSTON    ┆ carl      ┆ lonnberg ┆ carl      │
│        ┆ 214334986  ┆ 3          ┆ CARL       ┆   ┆ CONSULTIN ┆           ┆          ┆ lonnberg  │
│        ┆            ┆            ┆            ┆   ┆ G GROUP   ┆           ┆          ┆           │
│ 2020   ┆ 4030220201 ┆ k000151625 ┆ LOVO,      ┆ … ┆ SELF      ┆ mario     ┆ lovo     ┆ mario     │
│        ┆ 214334988  ┆ 9          ┆ MARIO      ┆   ┆           ┆          

In [6]:
print(len(donors))
donors = donors.drop_duplicates(subset=["contrib_id"])
print(len(donors))

3588936
3588936


In [7]:
df = df.to_pandas()
df_names = df[["contrib_id", "name_new"]].merge(donors[["contrib_id", "ethnic"]], on="contrib_id", how="left")
print(len(df_names))

34828105


In [8]:
df_names.to_csv("./output/df20_pred_lastname_keys.csv", index=False)
df_names["ethnic"].value_counts()

ethnic
not    34651814
ind      176291
Name: count, dtype: int64

In [9]:
df_names[df_names["ethnic"].isnull()]["name_new"].value_counts()

Series([], Name: count, dtype: int64)

In [10]:
df["ethnic"] = df_names["ethnic"]
df.to_csv("./output/df20_pred_lastname.csv", index=False)
df.head(10)

Unnamed: 0,dummy1,dummy2,contrib_id,name,recip_id,orgname,ultorg,realcode,amount,street,...,zip,recipcode,type,gender,occupation,employer,firstname,lastname,name_new,ethnic
0,2020,4030220201214334986,p0004869853,"LONNBERG, CARL",C00721712,[24T Contribution],,Z9500,10000.0,,...,94117.0,DP,24T,M,PARTNER,BOSTON CONSULTING GROUP,carl,lonnberg,carl lonnberg,not
1,2020,4030220201214334988,k0001516259,"LOVO, MARIO",N00044240,[24T Contribution],,Z9500,250.0,,...,33134.0,DL,24T,M,LAWYER,SELF,mario,lovo,mario lovo,not
2,2020,4030220201214335206,h3003526289,"LOGUE, KATHERINE",C00401224,,,J1200,5.0,,...,60010.0,PI,15,F,NOT EMPLOYED,NONE,katherine,logue,katherine logue,not
3,2020,4051220201742609379,m00016536071,"YINGLING, JOHN",N00044240,Steamboat Wharf of Provincetown,,G2900,25.0,,...,2657.0,DL,15E,M,RESTAURANT MANAGER,STEAMBOAT WHARF OF PROVINCETOWN,john,yingling,john yingling,not
4,2020,4051220201742609381,i3003912456,"YODAIKEN, VICTOR",N00044240,Finite State Machine Labs,,Z9600,250.0,,...,78733.0,DL,15E,M,BUSINESSMAN,FINITE STATE MACHINE LABS INC.,victor,yodaiken,victor yodaiken,not
5,2020,4051220201742609383,q0001264432,"YODER, MARY E",N00044240,Retired,,X1200,20.0,,...,46507.0,DL,15E,F,NOT EMPLOYED,,mary e,yoder,mary e yoder,not
6,2020,4051220201742609385,q0001264432,"YODER, MARY E",N00044240,Retired,,X1200,20.0,,...,46507.0,DL,15E,F,NOT EMPLOYED,,mary e,yoder,mary e yoder,not
7,2020,4051220201742609387,q0001264432,"YODER, MARY E",N00044240,Retired,,X1200,20.0,,...,46507.0,DL,15E,F,NOT EMPLOYED,,mary e,yoder,mary e yoder,not
8,2020,4051220201742609389,q0001264432,"YODER, MARY E",N00044240,Retired,,X1200,20.0,,...,46507.0,DL,15E,F,NOT EMPLOYED,,mary e,yoder,mary e yoder,not
9,2020,4051220201742609391,q0001264432,"YODER, MARY E",N00044240,Retired,,X1200,20.0,,...,46507.0,DL,15E,F,NOT EMPLOYED,,mary e,yoder,mary e yoder,not
