In [1]:
import argparse, os, time
import pandas as pd
import geopandas as gpd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import ScalarFormatter, FuncFormatter
import matplotlib.patheffects as path_effects
from matplotlib.cm import get_cmap
from matplotlib.patches import Patch
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from collections import defaultdict
import warnings
from tqdm import tqdm
import pickle
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd


tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
year = 20
pred_csv = f"./output/donors_state{year}_pred_lastname.csv"
donors_csv = f"./data/CampaignFin{year}/donors_recip{year}.csv"
cands_csv = f"./data/CampaignFin{year}/cands{year}.txt"
cmtes_csv = f"./data/CampaignFin{year}/cmtes{year}.txt"
pacs_csv = f"./data/CampaignFin{year}/pacs{year}.txt"
pac2pacs_csv = f"./data/CampaignFin{year}/pac_other{year}.txt"

In [3]:
donors = pd.read_csv(donors_csv)
donors

Unnamed: 0,contrib_id,recip_id,name,name_new,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation
0,U00000037041,N00029349,"BLOOMBERG, MICHAEL R",michael r bloomberg,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,1.086565e+09,868,1.251802e+06,612.0
1,U00000036521,N00044966,"STEYER, TOM",tom steyer,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,3.179460e+08,582,5.462989e+05,2036.5
2,,N00001669,"INFORMATION REQUESTED, INFORMATION",information information requested,Y4000,,,,NEW YORK,CA,9.947018e+07,589,1.688798e+05,250.0
3,,C00484642,MAJORITY FORWARD,majority forward majority forward,Z9600,,,,WASHINGTON,DC,9.866908e+07,122,8.087629e+05,117035.5
4,,C00618371,WINRED,winred winred,Z9600,,,,ARLINGTON,VA,9.195982e+07,197,4.668011e+05,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7136231,h3001209792,N00040675,"GOULD, COLIN",colin gould,C5120,M,SOFTWARE INFRASTRUCTURE ARCHITECT,ORACLE,FOSTER CITY,CA,1.000000e+00,1,1.000000e+00,1.0
7136232,p0001372401,C00003418,"BLEDSOE, PEARLIOUS MS",pearlious ms bledsoe,X1200,F,RETIRED,RETIRED,GRANITEVILLE,SC,1.000000e+00,1,1.000000e+00,1.0
7136233,r0006423692,C00003418,"CARMICHAEL, RUTH",ruth carmichael,X1200,F,RETIRED,RETIRED,KANSAS CITY,KS,1.000000e+00,1,1.000000e+00,1.0
7136234,r0015995400,C00003418,"BOOKATZ, DEBRA MRS",debra mrs bookatz,Y4000,F,RETIRED,MARINE CORP. COMMUNITY SERVICES,KANEOHE,HI,1.000000e+00,1,1.000000e+00,1.0


In [4]:
donors[donors["contrib_id"] == "q0002067764 "]

Unnamed: 0,contrib_id,recip_id,name,name_new,realcode,gender,occupation,employer,city,state,total_donated,donation_count,avg_donation,med_donation
8796,q0002067764,N00023864,"CHANDRA, SATISH",satish chandra,H1100,N,PHYSIAN,DOCTOR,NEW YORK,NY,52920.0,19,2785.263158,2772.0
18787,q0002067764,C00694323,"CHANDRA, SATISH",satish chandra,Z9600,N,PHYSIAN,SELF-EMPLOYED,NEW YORK,NY,30632.0,11,2784.727273,2772.0
2407465,q0002067764,N00001669,"CHANDRA, SATISH",satish chandra,H5100,N,PHYSICIAN,MADISON,NEW YORK,NY,525.0,3,175.0,250.0


In [5]:
cands_lf = (
        pl.scan_csv(
            cands_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'id', 'recip_id', 'name', 'party', 
                        'seat', 'seat_current', 'ran_general', 'ran_ever', 'type', 
                        'recipcode', 'nopacs'],
            ignore_errors=True
        )
        .select(['id', 'recip_id', 'name', 'party', 'seat', 'seat_current', 
                 'ran_general', 'ran_ever', 'type', 'recipcode'])
    )

cands = cands_lf.collect()
cands = cands.to_pandas()
cands = cands.drop_duplicates(subset=['recip_id', 'name', 'party', 'seat'], keep='last')
cands[(cands["seat"] == "PRES") & (cands["ran_general"] == "Y")]

Unnamed: 0,id,recip_id,name,party,seat,seat_current,ran_general,ran_ever,type,recipcode
6211,P00013524,N00011042,Jo Jorgensen (L),L,PRES,,Y,Y,C,3L
6278,P00014209,N00042370,Don Blankenship (3),3,PRES,,Y,Y,C,3L
6934,P80000722,N00001669,Joe Biden (D),D,PRES,,Y,Y,C,DW
6936,P80001571,N00023864,Donald Trump (R),R,PRES,PRES,Y,Y,I,RL


In [6]:
cands[cands["recip_id"].str.contains("N")]

Unnamed: 0,id,recip_id,name,party,seat,seat_current,ran_general,ran_ever,type,recipcode
0,H0AK00105,N00039029,Thomas Lamb (I),I,AK01,,,Y,C,3N
1,H0AK00113,N00047445,Ray Sean Tugatuk (D),D,AK01,,,Y,C,DL
3,H0AL01055,N00044245,Jerry Carl (R),R,AL01,,Y,Y,O,RW
4,H0AL01063,N00044288,Wes Lambert (R),R,AL01,,,Y,O,RL
5,H0AL01071,N00025369,Chris Pringle (R),R,AL01,,,Y,O,RL
...,...,...,...,...,...,...,...,...,...,...
8055,S8WV00135,N00041474,Bo Copley (R),R,WVS1,,,,,RN
8056,S8WV00143,N00012642,Patrick Morrisey (R),R,WVS1,,,,,RN
8057,S8WY00189,N00027793,Gary Trauner (D),D,WYS1,,,,,DN
8058,S8WY00197,N00042998,David Dodson (R),R,WYS1,,,,,RN


In [7]:
mapping = {
    'A': 'Agribusiness',
    'B': 'Construction',
    'C': 'Tech',
    'D': 'Defense',
    'E': 'Energy',
    'F': 'Finance',
    'M': 'Misc Business',
    'H': 'Health',
    'J': 'Ideology/Single Issue',
    'K': 'Law',
    'L': 'Labor',
    'M': 'Manufacturing',
    'T': 'Transportation',
    'W': 'Other',
    'Y': 'Unknown',
    'Z': 'Party'
}

donors["sector"] = donors["realcode"].apply(lambda x: str(x).upper()[0]).map(mapping)
donors["sector"].value_counts()

sector
Party                    1299587
Unknown                  1036328
Ideology/Single Issue     861610
Health                    541522
Finance                   443132
Tech                      224719
Law                       199207
Construction              122016
Transportation            118605
Labor                     112626
Energy                     89268
Agribusiness               73594
Manufacturing              55259
Defense                    37952
Name: count, dtype: int64

In [9]:
lf = (
        pl.scan_csv(
            pacs_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'rec_no', 'pac_id', 'cand_id', 'amount', 
                        'dummy2', 'realcode', 'type', 'direct', 'fec_cand_id'],
            ignore_errors=True
        )
        .select(['rec_no', 'pac_id', 'cand_id', 'amount', 
                 'realcode', 'type', 'direct', 'fec_cand_id'])
    )
pacs = lf.collect()
pacs = pacs.to_pandas()
pacs

Unnamed: 0,rec_no,pac_id,cand_id,amount,realcode,type,direct,fec_cand_id
0,1011020200194684263,C00429241,N00033395,250,H5100,24K,D,H2MI05119
1,1011020200194684266,C00429241,N00036275,500,H5100,24K,D,H4MI04126
2,1012620210264140422,C00682724,N00042308,39,J1200,24Z,D,P00006486
3,1012620210264140423,C00682724,N00044183,49,J1200,24Z,D,
4,1012620210264140425,C00682724,N00042308,26,J1200,24Z,D,P00006486
...,...,...,...,...,...,...,...,...
890573,4123120201987403573,C00035675,N00038734,1000,H4300,24K,D,S8NV00156
890574,4123120201987403574,C00035675,N00030622,1000,H4300,24K,D,H0AL07086
890575,4123120201987403575,C00035675,N00004133,1000,H4300,24K,D,H6MI04113
890576,4123120201987403886,C00574103,N00030245,2500,H1130,24K,D,S4LA00107


In [10]:
'''lf = (
        pl.scan_csv(
            pac2pacs_csv,
            separator=',', 
            quote_char='|', 
            encoding='utf8-lossy', 
            has_header=False,
            new_columns=['dummy1', 'rec_no', 'filer_id', 'donor_name', 'other_name', 
                        'city', 'state', 'zip', 'donor_occupation', 'primcode', 
                        'date', 'amount', 'recip_id', 'party', 'other_id', 'recipcode',
                        'recipprimcode', 'amend', 'report', 'pg', 'microfilm', 'type',
                        'realcode', 'source'],
            ignore_errors=True
        )
        .select(['rec_no', 'filer_id', 'donor_name', 'other_name', 
                        'city', 'state', 'donor_occupation', 'primcode', 
                        'amount', 'recip_id', 'party', 'other_id', 'recipcode',
                        'recipprimcode', 'pg', 'type', 'realcode'])
    )
pac2pacs = lf.collect(streaming=True)
pac2pacs = pac2pacs.to_pandas()
pac2pacs'''

"lf = (\n        pl.scan_csv(\n            pac2pacs_csv,\n            separator=',', \n            quote_char='|', \n            encoding='utf8-lossy', \n            has_header=False,\n            new_columns=['dummy1', 'rec_no', 'filer_id', 'donor_name', 'other_name', \n                        'city', 'state', 'zip', 'donor_occupation', 'primcode', \n                        'date', 'amount', 'recip_id', 'party', 'other_id', 'recipcode',\n                        'recipprimcode', 'amend', 'report', 'pg', 'microfilm', 'type',\n                        'realcode', 'source'],\n            ignore_errors=True\n        )\n        .select(['rec_no', 'filer_id', 'donor_name', 'other_name', \n                        'city', 'state', 'donor_occupation', 'primcode', \n                        'amount', 'recip_id', 'party', 'other_id', 'recipcode',\n                        'recipprimcode', 'pg', 'type', 'realcode'])\n    )\npac2pacs = lf.collect(streaming=True)\npac2pacs = pac2pacs.to_pandas()\npac2p

In [9]:
preds = pd.read_csv(pred_csv)
donors = donors.merge(preds[['contrib_id', 'ethnic']], on='contrib_id', how='left')
donors = donors.merge(cands, on='recip_id', how='left')

In [20]:
pacs_named = pacs.merge(cands, left_on='cand_id', right_on="recip_id", how='left')
# pacs_named = pacs_named.merge(cmtes, left_on='cand_id', right_on="recip_id", how='left')
pacs_named

Unnamed: 0,rec_no,pac_id,cand_id,amount,realcode,type_x,direct,fec_cand_id,id,recip_id,name,party,seat,seat_current,ran_general,ran_ever,type_y,recipcode
0,1011020200194684263,C00429241,N00033395,250,H5100,24K,D,H2MI05119,H2MI05119,N00033395,Dan Kildee (D),D,MI05,MI05,Y,Y,I,DW
1,1011020200194684266,C00429241,N00036275,500,H5100,24K,D,H4MI04126,H4MI04126,N00036275,John Moolenaar (R),R,MI04,MI04,Y,Y,I,RW
2,1012620210264140422,C00682724,N00042308,39,J1200,24Z,D,P00006486,P00006486,N00042308,Andrew Yang (D),D,PRES,,,Y,C,DL
3,1012620210264140423,C00682724,N00044183,49,J1200,24Z,D,,P00010298,N00044183,Pete Buttigieg (D),D,PRES,,,Y,C,DL
4,1012620210264140425,C00682724,N00042308,26,J1200,24Z,D,P00006486,P00006486,N00042308,Andrew Yang (D),D,PRES,,,Y,C,DL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890593,4123120201987403573,C00035675,N00038734,1000,H4300,24K,D,S8NV00156,S8NV00156,N00038734,Jacky Rosen (D),D,NVS1,NVS1,,,I,DI
890594,4123120201987403574,C00035675,N00030622,1000,H4300,24K,D,H0AL07086,H0AL07086,N00030622,Terri A Sewell (D),D,AL07,AL07,Y,Y,I,DW
890595,4123120201987403575,C00035675,N00004133,1000,H4300,24K,D,H6MI04113,H6MI04113,N00004133,Fred Upton (R),R,MI06,MI06,Y,Y,I,RW
890596,4123120201987403886,C00574103,N00030245,2500,H1130,24K,D,S4LA00107,S4LA00107,N00030245,Bill Cassidy (R),R,LAS1,LAS1,Y,Y,I,RW


In [22]:
pacs_named["name"].value_counts()

name
Donald Trump (R)        292184
David Perdue (R)         32093
Mitch McConnell (R)      30966
Kelly Loeffler (R)       30940
Martha McSally (R)       21436
                         ...  
Robert A. Brady (D)          1
Guy Furay (3)                1
William Martinek (R)         1
Ron DiNicola (D)             1
Donald Eason (I)             1
Name: count, Length: 1806, dtype: int64

In [32]:
pacs_named = pacs_named[(pacs_named["amount"] >= 0) & (pacs_named["direct"] == "D")]
total_per_pac = pacs_named.groupby('pac_id')['amount'].sum().rename('total_amount')
d_dollars_per_pac = pacs_named[pacs_named['party'] == 'D'].groupby('pac_id')['amount'].sum().rename('d_amount')

pac_ratios = pd.concat([total_per_pac, d_dollars_per_pac], axis=1)
pac_ratios['pac_d_ratio'] = pac_ratios['d_amount'] / pac_ratios['total_amount']
pac_ratios['pac_d_ratio'] = pac_ratios['pac_d_ratio'].fillna(0)  # Handle PACs with no D donations

pacs_named['pac_d_ratio'] = pacs_named['pac_id'].map(pac_ratios['pac_d_ratio'])
pacs_named

Unnamed: 0,rec_no,pac_id,cand_id,amount,realcode,type_x,direct,fec_cand_id,id,recip_id,name,party,seat,seat_current,ran_general,ran_ever,type_y,recipcode,pac_d_ratio
0,1011020200194684263,C00429241,N00033395,250,H5100,24K,D,H2MI05119,H2MI05119,N00033395,Dan Kildee (D),D,MI05,MI05,Y,Y,I,DW,0.382979
1,1011020200194684266,C00429241,N00036275,500,H5100,24K,D,H4MI04126,H4MI04126,N00036275,John Moolenaar (R),R,MI04,MI04,Y,Y,I,RW,0.382979
2,1012620210264140422,C00682724,N00042308,39,J1200,24Z,D,P00006486,P00006486,N00042308,Andrew Yang (D),D,PRES,,,Y,C,DL,0.488927
3,1012620210264140423,C00682724,N00044183,49,J1200,24Z,D,,P00010298,N00044183,Pete Buttigieg (D),D,PRES,,,Y,C,DL,0.488927
4,1012620210264140425,C00682724,N00042308,26,J1200,24Z,D,P00006486,P00006486,N00042308,Andrew Yang (D),D,PRES,,,Y,C,DL,0.488927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890593,4123120201987403573,C00035675,N00038734,1000,H4300,24K,D,S8NV00156,S8NV00156,N00038734,Jacky Rosen (D),D,NVS1,NVS1,,,I,DI,0.466392
890594,4123120201987403574,C00035675,N00030622,1000,H4300,24K,D,H0AL07086,H0AL07086,N00030622,Terri A Sewell (D),D,AL07,AL07,Y,Y,I,DW,0.466392
890595,4123120201987403575,C00035675,N00004133,1000,H4300,24K,D,H6MI04113,H6MI04113,N00004133,Fred Upton (R),R,MI06,MI06,Y,Y,I,RW,0.466392
890596,4123120201987403886,C00574103,N00030245,2500,H1130,24K,D,S4LA00107,S4LA00107,N00030245,Bill Cassidy (R),R,LAS1,LAS1,Y,Y,I,RW,0.602386


In [38]:
donors_pacs = donors.merge(pacs_named.drop_duplicates(['pac_id', 'pac_d_ratio'])[['pac_id', 'pac_d_ratio']], left_on='recip_id', right_on='pac_id', how='left')
donors_pacs

Unnamed: 0,contrib_id,recip_id,name_x,name_new,realcode,gender,occupation,employer,city,state,...,name_y,party,seat,seat_current,ran_general,ran_ever,type,recipcode,pac_id,pac_d_ratio
0,U00000037041,N00029349,"BLOOMBERG, MICHAEL R",michael r bloomberg,Z9000,M,FOUNDER,BLOOMBERG INC.,NEW YORK,NY,...,Michael Bloomberg (D),D,PRES,,,Y,C,DL,,
1,U00000036521,N00044966,"STEYER, TOM",tom steyer,Z9000,M,PRESIDENTIAL CANDIDATE,SELF-EMPLOYED,SAN FRANCISCO,CA,...,Tom Steyer (D),D,PRES,,,Y,C,DL,,
2,,N00001669,"INFORMATION REQUESTED, INFORMATION",information information requested,Y4000,,,,NEW YORK,CA,...,Joe Biden (D),D,PRES,,Y,Y,C,DW,,
3,,C00484642,MAJORITY FORWARD,majority forward majority forward,Z9600,,,,WASHINGTON,DC,...,,,,,,,,,,
4,,C00618371,WINRED,winred winred,Z9600,,,,ARLINGTON,VA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7136705,h3001209792,N00040675,"GOULD, COLIN",colin gould,C5120,M,SOFTWARE INFRASTRUCTURE ARCHITECT,ORACLE,FOSTER CITY,CA,...,Jon Ossoff (D),D,GAS1,,Y,Y,C,DW,,
7136706,p0001372401,C00003418,"BLEDSOE, PEARLIOUS MS",pearlious ms bledsoe,X1200,F,RETIRED,RETIRED,GRANITEVILLE,SC,...,,,,,,,,,C00003418,0.0
7136707,r0006423692,C00003418,"CARMICHAEL, RUTH",ruth carmichael,X1200,F,RETIRED,RETIRED,KANSAS CITY,KS,...,,,,,,,,,C00003418,0.0
7136708,r0015995400,C00003418,"BOOKATZ, DEBRA MRS",debra mrs bookatz,Y4000,F,RETIRED,MARINE CORP. COMMUNITY SERVICES,KANEOHE,HI,...,,,,,,,,,C00003418,0.0


In [41]:
pacs_named[pacs_named["pac_id"] == "C00484642"]

Unnamed: 0,rec_no,pac_id,cand_id,amount,realcode,type_x,direct,fec_cand_id,id,recip_id,name,party,seat,seat_current,ran_general,ran_ever,type_y,recipcode,pac_d_ratio
