In [1]:
import argparse, os, time
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import ScalarFormatter, FuncFormatter
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm
import pickle
from ethnicseer import EthnicClassifier

In [2]:
indivs20 = "./data/CampaignFin20/indivs20.txt"
indivs22 = "./data/CampaignFin22/indivs22.txt"
donors20withpred80 = "./data/donors20_with_pred80.csv"
donors20withlastname = "./data/donors20_with_pred_lastname.csv"

In [3]:
donors20 = pd.read_csv(donors20withlastname)
donors20["ethnic"] = donors20.apply(lambda x: x["ethnic"] if x["contrib_id"].strip() != "" else "not", axis=1)
donors20.head(10)

Unnamed: 0,contrib_id,name,name_new,lastname,total_donated,donation_count,avg_donation,ethnic
0,,"FOR AMERICA, AMY",amy for america,for america,1255595000.0,27780,45197.81,not
1,U00000037041,"BLOOMBERG, MICHAEL",michael bloomberg,bloomberg,1127713000.0,960,1174701.0,not
2,U00000036521,"STEYER, THOMAS F",thomas f steyer,steyer,379061300.0,779,486599.9,not
3,U00000046841,"MELLON, TIMOTHY",timothy mellon,mellon,45133560.0,23,1962328.0,not
4,U0000000310A,"ADELSON, MIRIAM O DR",miriam o dr adelson,adelson,44971550.0,134,335608.6,not
5,U00000003101,"ADELSON, SHELDON G MR",sheldon g mr adelson,adelson,44819950.0,129,347441.5,not
6,U00000036901,"UIHLEIN, RICHARD E",richard e uihlein,uihlein,35302380.0,342,103223.3,not
7,U00000036551,"GRIFFIN, KENNETH",kenneth griffin,griffin,33639930.0,197,170761.1,not
8,U00000003611,"SCHWARZMAN, STEPHEN A",stephen a schwarzman,schwarzman,33406500.0,242,138043.4,not
9,U00000046781,"JURVETSON, KARLA",karla jurvetson,jurvetson,32493510.0,973,33395.18,not


In [4]:
df20 = (
        pl.scan_csv(
            indivs20,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'dummy2', 'contrib_id', 'name', 'recip_id', 
                        'orgname', 'ultorg', 'realcode', 'dummy3', 'amount', 
                        'street', 'city', 'state', 'zip', 'recipcode', 'type', 'dummy4', 'dummy5', 'gender', 'dummy6', 'occupation', 'employer', 'dummy7'],
            schema_overrides={'amount': pl.Float64, 'name': pl.Utf8, 'state': pl.Utf8, 'city': pl.Utf8},
            ignore_errors=True
        )
        # .select(['contrib_id', 'name', 'recip_id', 'orgname', 'ultorg', 'realcode', 
        #          'amount', 'street', 'city', 'state', 'zip', 'recipcode', 'type', 
        #          'gender', 'occupation', 'employer'])
        .select(['contrib_id', 'name', 'recip_id', 'ultorg', 'amount', 'city', 'state', 
                 'recipcode', 'gender', 'occupation', 'employer'])
        .filter(~pl.col('amount').is_null())
    ).collect()
# df20 = lf20.collect()
print(df20.head(10))

shape: (10, 11)
┌─────────────┬────────────┬───────────┬────────┬───┬───────────┬────────┬────────────┬────────────┐
│ contrib_id  ┆ name       ┆ recip_id  ┆ ultorg ┆ … ┆ recipcode ┆ gender ┆ occupation ┆ employer   │
│ ---         ┆ ---        ┆ ---       ┆ ---    ┆   ┆ ---       ┆ ---    ┆ ---        ┆ ---        │
│ str         ┆ str        ┆ str       ┆ str    ┆   ┆ str       ┆ str    ┆ str        ┆ str        │
╞═════════════╪════════════╪═══════════╪════════╪═══╪═══════════╪════════╪════════════╪════════════╡
│ p0004869853 ┆ LONNBERG,  ┆ C00721712 ┆        ┆ … ┆ DP        ┆ M      ┆ PARTNER    ┆ BOSTON     │
│             ┆ CARL       ┆           ┆        ┆   ┆           ┆        ┆            ┆ CONSULTING │
│             ┆            ┆           ┆        ┆   ┆           ┆        ┆            ┆ GROUP      │
│ k0001516259 ┆ LOVO,      ┆ N00044240 ┆        ┆ … ┆ DL        ┆ M      ┆ LAWYER     ┆ SELF       │
│             ┆ MARIO      ┆           ┆        ┆   ┆           ┆        ┆ 

In [5]:
df20 = df20.with_columns([
    pl.col("name").str.split(",").list.get(-1).str.to_lowercase().str.strip_chars().alias("firstname"),
    pl.col("name").str.split(",").list.first().str.to_lowercase().str.strip_chars().alias("lastname"),
])

df20 = df20.with_columns([
    (
        pl.col("firstname").str.to_lowercase().str.strip_chars() + " " + 
        pl.col("lastname").str.to_lowercase().str.strip_chars()
    ).alias("name_new")
])

print(df20.head(10))

shape: (10, 14)
┌────────────┬────────────┬───────────┬────────┬───┬────────────┬───────────┬──────────┬───────────┐
│ contrib_id ┆ name       ┆ recip_id  ┆ ultorg ┆ … ┆ employer   ┆ firstname ┆ lastname ┆ name_new  │
│ ---        ┆ ---        ┆ ---       ┆ ---    ┆   ┆ ---        ┆ ---       ┆ ---      ┆ ---       │
│ str        ┆ str        ┆ str       ┆ str    ┆   ┆ str        ┆ str       ┆ str      ┆ str       │
╞════════════╪════════════╪═══════════╪════════╪═══╪════════════╪═══════════╪══════════╪═══════════╡
│ p000486985 ┆ LONNBERG,  ┆ C00721712 ┆        ┆ … ┆ BOSTON     ┆ carl      ┆ lonnberg ┆ carl      │
│ 3          ┆ CARL       ┆           ┆        ┆   ┆ CONSULTING ┆           ┆          ┆ lonnberg  │
│            ┆            ┆           ┆        ┆   ┆ GROUP      ┆           ┆          ┆           │
│ k000151625 ┆ LOVO,      ┆ N00044240 ┆        ┆ … ┆ SELF       ┆ mario     ┆ lovo     ┆ mario     │
│ 9          ┆ MARIO      ┆           ┆        ┆   ┆            ┆          

In [6]:
df20 = df20.to_pandas()
df20_names = df20[["name_new"]].merge(donors20[["name_new", "ethnic"]], on="name_new", how="left")
df20_names["ethnic"].value_counts()

ethnic
not    150412970
ind       288339
Name: count, dtype: int64

In [7]:
df20_names.to_csv("./data/df20_names.csv", index=False)

In [None]:
df20.head(10)