In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
year = 22
df_csv = f"./data/CampaignFin{year}/indivs{year}.txt"
donors_csv = f"./output/donors{year}_pred_lastname.csv"
names_csv = "./data/USIN.csv"

In [3]:
donors = pd.read_csv(donors_csv)
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
0,,"HOME, 30119 SETTLE",30119 settle home,[24T Contribution],,,NOT EMPLOYED,NOT EMPLOYED,1649385000.0,115116,14328.03,25.0,home,not
1,U00000003641,"SOROS, GEORGE",george soros,Soros Fund Management,,M,EXECUTIVE,SOROS FUND MANAGEMENT,180017300.0,204,882437.6,2900.0,soros,not
2,U00000036901,"UIHLEIN, RICHARD",richard uihlein,Uline Inc,,M,CEO/OWNER,ULINE,84221530.0,369,228242.6,2900.0,uihlein,not
3,U00000036551,"GRIFFIN, KENNETH C",kenneth c griffin,Citadel LLC,,M,CEO,CITADEL LLC,74375500.0,317,234623.0,2900.0,griffin,not
4,U00000042451,"YASS, JEFF",jeff yass,Susquehanna International Group,,M,MANAGING DIRECTOR,SIG,56324500.0,49,1149480.0,5800.0,yass,not
5,U0000004705,"BANKMAN-FRIED, SAMUEL",samuel bankman-fried,FTX.US,,M,CEO,FTX US,42044880.0,338,124393.1,5000.0,bankman-fried,not
6,U00000046841,"MELLON, TIMOTHY",timothy mellon,Investor,,M,INVESTOR,SELF-EMPLOYED,41746400.0,31,1346658.0,2900.0,mellon,not
7,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,Bloomberg Lp,,M,FOUNDER,BLOOMBERG INC.,41330510.0,39,1059757.0,200000.0,bloomberg,not
8,U0000003235,"EYCHANER, FRED",fred eychaner,Newsweb Corp,,M,OWNER,NEWSWEB CORP,37827500.0,234,161656.0,10000.0,eychaner,not
9,U00000003611,"SCHWARZMAN, STEPHEN",stephen schwarzman,Blackstone Group,,M,CHAIRMAN CEO,BLACKSTONE,37544900.0,281,133611.7,2900.0,schwarzman,not


In [4]:
lf = (
        pl.scan_csv(
            df_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        # remove blank donations
        .filter(~pl.col('amount').is_null())
        # remove refunds
        .filter(pl.col('amount') > 0)
        # create a lowercase name column in the usual format
        .with_columns([
            pl.col("name").str.split(",").list.get(-1)
                .str.to_lowercase().str.strip_chars().alias("firstname"),
            pl.col("name").str.split(",").list.first()
                .str.to_lowercase().str.strip_chars().alias("lastname"),
        ])
        .with_columns([
            (pl.col("firstname") + " " + pl.col("lastname")).alias("name_new")
        ])
    )

In [5]:
df = lf.collect(streaming=True)
print(df.head(10))
print(len(df))

shape: (10, 21)
┌────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ dummy1 ┆ dummy2     ┆ contrib_id ┆ name      ┆ … ┆ employer  ┆ firstname ┆ lastname  ┆ name_new  │
│ ---    ┆ ---        ┆ ---        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i64    ┆ i64        ┆ str        ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2022   ┆ 4061520221 ┆ r001425651 ┆ DILLARD,  ┆ … ┆ NOT       ┆ daniel    ┆ dillard   ┆ daniel    │
│        ┆ 505841534  ┆ 0          ┆ DANIEL    ┆   ┆ EMPLOYED  ┆           ┆           ┆ dillard   │
│ 2022   ┆ 4061520221 ┆ r001550361 ┆ WHITE,    ┆ … ┆ NOT       ┆ scotto    ┆ white     ┆ scotto    │
│        ┆ 505841535  ┆ 4          ┆ SCOTTO    ┆   ┆ EMPLOYED  ┆           ┆           ┆ white     │
│ 2022   ┆ 4061520221 ┆ p000386130 ┆ DOMINGUEZ ┆ … ┆ MCDONALD  ┆ sallie    

In [6]:
print(len(donors))
donors = donors.drop_duplicates(subset=["contrib_id"])
print(len(donors))

2761018
2761018


In [7]:
df_names = df.select(["contrib_id", "name_new"]).join(
    pl.from_pandas(donors).select(["contrib_id", "ethnic"]),
    on="contrib_id",
    how="left"
)

In [8]:
# df = df.to_pandas()
# df_names = df[["contrib_id", "name_new"]].merge(donors[["contrib_id", "ethnic"]], on="contrib_id", how="left")
print(len(df_names))

63581442


In [10]:
df_names.write_csv(f"./output/df{year}_pred_lastname_keys.csv")
df_names["ethnic"].value_counts()

ethnic,count
str,u32
"""ind""",216268
"""not""",63365174


In [12]:
df = df.with_columns(df_names["ethnic"])
df.write_csv(f"./output/df{year}_pred_lastname.csv")
df.head(10)

dummy1,dummy2,contrib_id,name,recip_id,orgname,ultorg,realcode,amount,street,city,state,zip,recipcode,type,gender,occupation,employer,firstname,lastname,name_new,ethnic
i64,i64,str,str,str,str,str,str,f64,str,str,str,i64,str,str,str,str,str,str,str,str,str
2022,4061520221505841534,"""r0014256510 ""","""DILLARD, DANIEL""","""C00000935""","""[24T Contribution]""","""""","""Z9500""",35.0,"""""","""HIDALGO""","""TX""",78557,"""DP""","""24T""","""M""","""NOT EMPLOYED""","""NOT EMPLOYED""","""daniel""","""dillard""","""daniel dillard""","""not"""
2022,4061520221505841535,"""r0015503614 ""","""WHITE, SCOTTO""","""C00633404""","""[24T Contribution]""","""""","""Z9500""",15.0,"""""","""YONKERS""","""NY""",10701,"""PI""","""24T""","""M""","""NOT EMPLOYED""","""NOT EMPLOYED""","""scotto""","""white""","""scotto white""","""not"""
2022,4061520221505841536,"""p0003861308 ""","""DOMINGUEZ, SALLIE""","""C00632398""","""[24T Contribution]""","""""","""Z9500""",10.0,"""""","""CAMPBELL""","""CA""",95011,"""PI""","""24T""","""F""","""HEALTHCARE""","""MCDONALD FAMILY DENTAL""","""sallie""","""dominguez""","""sallie dominguez""","""not"""
2022,4061520221505841543,"""p0003689535 ""","""BRADLEY, JANNETTE""","""C00678839""","""[24T Contribution]""","""""","""Z9500""",26.0,"""""","""OAK LAWN""","""IL""",60453,"""PI""","""24T""","""F""","""LIBRARIAN""","""MEDICAL CENTER""","""jannette""","""bradley""","""jannette bradley""","""not"""
2022,4061520221505841545,"""q0002181378 ""","""BHATT, TANYA""","""C00580068""","""[24T Contribution]""","""""","""Z9500""",50.0,"""""","""MIAMI BEACH""","""FL""",33141,"""PI""","""24T""","""F""","""MKTG""","""FRYE FINANCIAL""","""tanya""","""bhatt""","""tanya bhatt""","""ind"""
2022,4061520221505841548,"""r0015503614 ""","""WHITE, SCOTTO""","""C00633404""","""[24T Contribution]""","""""","""Z9500""",15.0,"""""","""YONKERS""","""NY""",10701,"""PI""","""24T""","""M""","""NOT EMPLOYED""","""NOT EMPLOYED""","""scotto""","""white""","""scotto white""","""not"""
2022,4061520221505841550,"""h3003472552 ""","""ALDERMAN, SUSAN""","""C00031054""","""[24T Contribution]""","""""","""Z9500""",25.0,"""""","""PORT HURON""","""MI""",48060,"""DP""","""24T""","""F""","""NOT EMPLOYED""","""NOT EMPLOYED""","""susan""","""alderman""","""susan alderman""","""not"""
2022,4061520221505841553,"""p0004825653 ""","""HOTVET, MARTIN""","""C00580068""","""[24T Contribution]""","""""","""Z9500""",10.0,"""""","""ALBANY""","""NY""",12205,"""PI""","""24T""","""M""","""LAWYERSTATE OF NY""","""MY STATE""","""martin""","""hotvet""","""martin hotvet""","""not"""
2022,4061520221505841558,"""b00603311131""","""EISENBERG, BRUCE""","""C00000935""","""[24T Contribution]""","""""","""Z9500""",1.0,"""""","""CHERRY HILL""","""NJ""",8003,"""DP""","""24T""","""M""","""ATTORNEY""","""BLANK ROME LLP""","""bruce""","""eisenberg""","""bruce eisenberg""","""not"""
2022,4061520221505841560,"""j1002282246 ""","""FLEISCHLI, MARY""","""C00748301""","""[24T Contribution]""","""""","""Z9500""",100.0,"""""","""WEST HARTFORD""","""CT""",6107,"""OI""","""24T""","""F""","""STATISTICIAN""","""UNIVERSITY OF CONNECTICUT""","""mary""","""fleischli""","""mary fleischli""","""not"""
