In [2]:
import pandas as pd
import os

In [9]:
# Read in the bulk individual contribution file from FEC.gov

# Directory containing the text files
directory = './indiv24/by_date'

chunk_size = 200000  # Number of rows per chunk
column_names = [
    'ID', 'Field1', 'Field2', 'Field3', 'Field4', 'Field5', 'Field6', 'Name', 'City', 'State', 'Zip',
    'Employer', 'Occupation', 'Date', 'Amount', 'OtherID', 'Field16', 'Field17', 'Field18', 'Field19', 'Field20'
]

# Initialize an empty list to store DataFrames
dfs = []

# Loop over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path, delimiter='|', names=column_names, header=None, low_memory=False)
        dfs.append(df)

# Concatenate all DataFrames in the list
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.head()

Unnamed: 0,ID,Field1,Field2,Field3,Field4,Field5,Field6,Name,City,State,...,Employer,Occupation,Date,Amount,OtherID,Field16,Field17,Field18,Field19,Field20
0,C00079681,N,M6,P,202306129581782820,15,IND,"GODLEWSKI, KIM",FENTON,MI,...,IPS EQUIPMENT,VICE PRESIDENT,5282023.0,100,,2425123,1707004,,,4061220231749193699
1,C00409730,N,M6,P,202306129582118675,15E,IND,"ROBERTS, PHILIP",SEATTLE,WA,...,"RYAN, SWANSON & CLEVELAND, PLLC",ATTORNEY,5282023.0,500,C00699470,15444361,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244130
2,C00409730,N,M6,P,202306129582118670,15E,IND,"GRUNDSTEIN, LEON",MERCER ISLAND,WA,...,"GENCARE, INC","SENIOR LIVING OWNER, DEVELOPER, OPERAT",5282023.0,500,C00699470,15444363,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244117
3,C00409730,N,M6,P,202306129582118676,15E,IND,"SCHOCKEN, JOSEPH L.",SEATTLE,WA,...,BROADMARK CAPITAL,PRESIDENT,5282023.0,5000,C00699470,15444362,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244134
4,C00063164,N,M6,P,202306139582145211,15,IND,"MCMILLAN, S TODD",COLORADO SPRINGS,CO,...,MCDONALD'S,LICENSEE,5282023.0,200,,17411206,1707152,,,4061320231749248513


In [11]:
pd.set_option('display.max_columns', None)
combined_df.head()

Unnamed: 0,ID,Field1,Field2,Field3,Field4,Field5,Field6,Name,City,State,Zip,Employer,Occupation,Date,Amount,OtherID,Field16,Field17,Field18,Field19,Field20
0,C00079681,N,M6,P,202306129581782820,15,IND,"GODLEWSKI, KIM",FENTON,MI,484309630,IPS EQUIPMENT,VICE PRESIDENT,5282023.0,100,,2425123,1707004,,,4061220231749193699
1,C00409730,N,M6,P,202306129582118675,15E,IND,"ROBERTS, PHILIP",SEATTLE,WA,981092846,"RYAN, SWANSON & CLEVELAND, PLLC",ATTORNEY,5282023.0,500,C00699470,15444361,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244130
2,C00409730,N,M6,P,202306129582118670,15E,IND,"GRUNDSTEIN, LEON",MERCER ISLAND,WA,980402445,"GENCARE, INC","SENIOR LIVING OWNER, DEVELOPER, OPERAT",5282023.0,500,C00699470,15444363,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244117
3,C00409730,N,M6,P,202306129582118676,15E,IND,"SCHOCKEN, JOSEPH L.",SEATTLE,WA,981014129,BROADMARK CAPITAL,PRESIDENT,5282023.0,5000,C00699470,15444362,1707040,,* EARMARKED CONTRIBUTION: SEE BELOW,4061320231749244134
4,C00063164,N,M6,P,202306139582145211,15,IND,"MCMILLAN, S TODD",COLORADO SPRINGS,CO,809623583,MCDONALD'S,LICENSEE,5282023.0,200,,17411206,1707152,,,4061320231749248513


In [12]:
# Create a primary key column by combining Name and Zip
combined_df['PrimaryKey'] = combined_df['Name'] + '_' + combined_df['Zip'].astype(str)

# Group by the primary key and aggregate
final_df = combined_df.groupby('PrimaryKey').agg(
    RecordCount=('PrimaryKey', 'size'),
    TotalAmount=('Amount', 'sum')
).reset_index()

In [24]:
# Issue: Some entries include middle name or title, or change use a zip+4 code in place of a zip code
filtered_df = final_df.loc[final_df['PrimaryKey'].str.startswith('MAKOWSKI, BRUCE')]
print(filtered_df)

                                   PrimaryKey  RecordCount  TotalAmount
1682585             MAKOWSKI, BRUCE_480171279        20092        60654
1682583                 MAKOWSKI, BRUCE_48017        11038        31870
1682579      MAKOWSKI, BRUCE EDWARD_480171279         2146         6945
1682578  MAKOWSKI, BRUCE EDWARD MR._480171279          342         3004
1682584             MAKOWSKI, BRUCE_480171227          230         1028
1682580         MAKOWSKI, BRUCE MR._480171279           94          583
1682582                 MAKOWSKI, BRUCE_47985            5            4
1682581              MAKOWSKI, BRUCE MR_48017            4            6


In [18]:
final_df = final_df.sort_values(by='RecordCount', ascending=False)
pd.set_option('display.max_rows', None)
final_df.head(100)

Unnamed: 0,PrimaryKey,RecordCount,TotalAmount
1682585,"MAKOWSKI, BRUCE_480171279",20092,60654
1311445,"JAHNS, RICHARD_760405951",18705,360303
867301,"FORD, BEVERLY_300673614",16791,155112
810454,"FARR, GERALD_786665129",15304,203311
1427098,"KESKITALO, CANDACE_55384",12724,71559
2781986,"URBANOWICZ, WENDY_98668",11912,64469
1682583,"MAKOWSKI, BRUCE_48017",11038,31870
1049787,"GRISWOLD, EDSON_80231",10642,141357
1630983,"LOONEY, GEORGE_300402707",10587,26656
2567262,"SOLORZANO, DIANA_92056",9963,8914
