In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
year = 18
df_csv = f"./data/CampaignFin{year}/indivs{year}.txt"
donors_csv = f"./output/donors{year}_pred_lastname.csv"
names_csv = "./data/USIN.csv"

In [3]:
donors = pd.read_csv(donors_csv)
donors.head(10)

Unnamed: 0,contrib_id,name,name_new,orgname,ultorg,gender,occupation,employer,total_donated,donation_count,avg_donation,med_donation,lastname,ethnic
0,,ANIBAL,anibal anibal,Attorney,,,LAWYER,SELF EMPLOYED,585387138.0,25228,23203.87,1000.0,anibal,not
1,U00000037041,"BLOOMBERG, MICHAEL R",michael r bloomberg,Bloomberg LP,,M,FOUNDER/OWNER,BLOOMBERG L.P.,95131257.0,58,1640194.0,37578.5,bloomberg,not
2,U00000036521,"STEYER, THOMAS",thomas steyer,League of Conservation Voters,,M,ADVOCACY,FAHR LLC,74102881.0,402,184335.5,2700.0,steyer,not
3,q0001673261,"SCOTT, RICK",rick scott,[Candidate Contribution],,M,GOVERNOR,STATE OF FLORIDA,69942510.0,165,423894.0,5526.0,scott,not
4,U00000003101,"ADELSON, SHELDON G",sheldon g adelson,Las Vegas Sands,,M,OWNER / CHAIRMAN OF THE BOARD,LAS VEGAS SANDS CORPORATION,62265700.0,127,490281.1,2700.0,adelson,not
5,U0000000310A,"ADELSON, MIRIAM DR",miriam dr adelson,Adelson Clinic for Drug Abuse Treatment & Rese...,,F,DIRECTOR OF COMMUNITY INVOLVEMENT,LAS VEGAS SANDS CORPORATION,61861800.0,113,547449.6,2700.0,adelson,not
6,U00000036901,"UIHLEIN, RICHARD E MR",richard e mr uihlein,Uline Inc,,M,CEO/OWNER,ULINE,39187258.0,320,122460.2,2700.0,uihlein,not
7,U0000004604,"SUSSMAN, S DONALD",s donald sussman,Paloma Partners,,M,CHAIRMAN,PALOMA PARTNERS ADVISORS LP,27987200.0,647,43256.88,2700.0,sussman,not
8,U00000036551,"GRIFFIN, KENNETH C",kenneth c griffin,Citadel LLC,,M,FOUNDER & CEO,CITADEL LLC,19567300.0,111,176282.0,2700.0,griffin,not
9,U00000003151,"SIMONS, JAMES",james simons,Euclidean Capital,,M,PRESIDENT,EUCLIDEAN CAPITAL,19279110.0,258,74725.23,2700.0,simons,not


In [4]:
lf = (
        pl.scan_csv(
            df_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        .select(['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
                 'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
                 'gender', 'occupation', 'employer'])
        # remove blank donations
        .filter(~pl.col('amount').is_null())
        # remove refunds
        .filter(pl.col('amount') > 0)
        # create a lowercase name column in the usual format
        .with_columns([
            pl.col("name").str.split(",").list.get(-1)
                .str.to_lowercase().str.strip_chars().alias("firstname"),
            pl.col("name").str.split(",").list.first()
                .str.to_lowercase().str.strip_chars().alias("lastname"),
        ])
        .with_columns([
            (pl.col("firstname") + " " + pl.col("lastname")).alias("name_new")
        ])
    )

In [5]:
df = lf.collect(streaming=True)
print(df.head(10))
print(len(df))

shape: (10, 21)
┌────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ dummy1 ┆ dummy2     ┆ contrib_id ┆ name      ┆ … ┆ employer  ┆ firstname ┆ lastname  ┆ name_new  │
│ ---    ┆ ---        ┆ ---        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i64    ┆ i64        ┆ str        ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2018   ┆ 1010320180 ┆ q000074479 ┆ CAZZANI,  ┆ … ┆           ┆ serafino  ┆ cazzani   ┆ serafino  │
│        ┆ 036112450  ┆ 2          ┆ SERAFINO  ┆   ┆           ┆ v         ┆           ┆ v cazzani │
│        ┆            ┆            ┆ V         ┆   ┆           ┆           ┆           ┆           │
│ 2018   ┆ 1010320180 ┆ p000465054 ┆ JONES,    ┆ … ┆           ┆ kenneth   ┆ jones     ┆ kenneth   │
│        ┆ 036112466  ┆ 0          ┆ KENNETH   ┆   ┆           ┆           

In [6]:
print(len(donors))
donors = donors.drop_duplicates(subset=["contrib_id"])
print(len(donors))

1834556
1834556


In [7]:
df_names = df.select(["contrib_id", "name_new"]).join(
    pl.from_pandas(donors).select(["contrib_id", "ethnic"]),
    on="contrib_id",
    how="left"
)

In [8]:
# df = df.to_pandas()
# df_names = df[["contrib_id", "name_new"]].merge(donors[["contrib_id", "ethnic"]], on="contrib_id", how="left")
print(len(df_names))

16180471


In [9]:
df_names.write_csv(f"./output/df{year}_pred_lastname_keys.csv")
df_names["ethnic"].value_counts()

ethnic,count
str,u32
"""ind""",85719
"""not""",16094752


In [10]:
df = df.with_columns(df_names["ethnic"])
df.write_csv(f"./output/df{year}_pred_lastname.csv")
df.head(10)

dummy1,dummy2,contrib_id,name,recip_id,orgname,ultorg,realcode,amount,street,city,state,zip,recipcode,type,gender,occupation,employer,firstname,lastname,name_new,ethnic
i64,i64,str,str,str,str,str,str,f64,str,str,str,i64,str,str,str,str,str,str,str,str,str
2018,1010320180036112450,"""q0000744792 ""","""CAZZANI, SERAFINO V""","""N00040819""","""Cazzani Power Boats""","""""","""Y4000""",1000.0,"""""","""CRANSTON""","""RI""",2920,"""RN""","""15 ""","""M""","""""","""""","""serafino v""","""cazzani""","""serafino v cazzani""","""not"""
2018,1010320180036112466,"""p0004650540 ""","""JONES, KENNETH""","""N00040819""","""Kenneth Jones Construction""","""""","""B1500""",250.0,"""""","""WEST GREENWICH""","""RI""",2817,"""RN""","""15 ""","""M""","""""","""""","""kenneth""","""jones""","""kenneth jones""","""not"""
2018,1010320180036112472,"""i3003283827@""","""NARDOLILLO, KIM""","""N00040819""","""Nardolillo Funeral Home""","""""","""G5400""",1360.0,"""""","""NARRAGANSETT""","""RI""",2882,"""RN""","""15 ""","""F""","""""","""NARDOLILLO FUNERAL HOME""","""kim""","""nardolillo""","""kim nardolillo""","""not"""
2018,1010320180036112513,"""m0001273985A""","""NARDOLILLO, JOSEPHINE""","""N00040819""","""Retired""","""""","""X1200""",250.0,"""""","""CRANSTON""","""RI""",2921,"""RN""","""15 ""","""F""","""RETIRED""","""RETIRED""","""josephine""","""nardolillo""","""josephine nardolillo""","""not"""
2018,1010320180036112527,"""q0000444464 ""","""EDWARD POCOCK III""","""N00040819""","""Retired""","""""","""X1200""",500.0,"""""","""SOUTHINGTON""","""CT""",6489,"""RN""","""15 ""","""U""","""RETIRED""","""RETIRED""","""edward pocock iii""","""edward pocock iii""","""edward pocock iii edward pococ…","""not"""
2018,1010320180036112532,"""q0002988326 ""","""MORGAN, MICHAEL""","""N00040819""","""Extreme Airsoft""","""""","""Y4000""",300.0,"""""","""SAUNDERSTOWN""","""RI""",2874,"""RN""","""15 ""","""M""","""OWNER""","""SELF EMPLOYED""","""michael""","""morgan""","""michael morgan""","""not"""
2018,1010320180036112545,"""q0000709160 ""","""GIARRUSSO, GINA""","""N00040819""","""""","""""","""Y2000""",850.0,"""""","""NARRAGANSETT""","""RI""",2883,"""RN""","""15 ""","""F""","""NONE""","""NONE""","""gina""","""giarrusso""","""gina giarrusso""","""not"""
2018,1010320180036112549,"""m0001273985A""","""NARDOLILLO, JOSEPHINE""","""N00040819""","""Retired""","""""","""X1200""",1450.0,"""""","""CRANSTON""","""RI""",2921,"""RN""","""15 ""","""F""","""RETIRED""","""RETIRED""","""josephine""","""nardolillo""","""josephine nardolillo""","""not"""
2018,1010320180036112553,"""q0000708423 ""","""STOCKLEY, KENNETH J""","""N00040819""","""Pool & Patio Center""","""""","""Y4000""",500.0,"""""","""GREENVILLE""","""RI""",2828,"""RN""","""15 ""","""M""","""OWNER""","""POOL AND PATIO CENTER INC.""","""kenneth j""","""stockley""","""kenneth j stockley""","""not"""
2018,1010320180036112594,"""j1002343419 ""","""CALLICUTT, ANDY MR""","""N00003280""","""Self-Employed""","""""","""G0000""",1000.0,"""""","""OXFORD""","""MS""",38655,"""RW""","""15 ""","""M""","""""","""SELF EMPLOYED""","""andy mr""","""callicutt""","""andy mr callicutt""","""not"""
