# An Analysis of Political Contributions During the 2020 House of Representatives Election

The goal of this notebook is to clean the web-scraped data produced by the previous notebook, HK01-Web Scraping.

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Load the CSV written by the web-scraping step into the working dataframe
US = pd.read_csv(filepath_or_buffer='../data/US_scraped.csv')

In [3]:
## Clean dataframe
# Purpose: derive name / party / incumbent / winner from the scraped
# 'Candidate' string and turn the 'Raised' / 'Spent' strings into integers.
# Everything is done column-wise instead of row by row.
#
# NOTE(review): the parsing assumes 'Candidate' looks like
# "Name (P) • Incumbent • Winner", i.e. ' • '-separated parts whose first part
# ends in a single-character party tag -- confirm against the scraping notebook.

# Split the candidate string into its ' • '-separated parts
candidate_parts = US['Candidate'].str.split(' • ')
name_and_party = candidate_parts.str[0]

# Party tag: one word character wrapped in non-word characters at the END of
# the name part. Anchoring at the end matters: taking the first such match
# anywhere in the string picks up middle initials and apostrophes instead
# ("Tom O'Halleran (D)" would yield "O", "John A. Smith (D)" would yield "A").
party_tag = r'\s*\W(\w)\W\s*$'

# Create new column named party; rows without a trailing tag get a missing
# value here rather than raising an IndexError
US['party'] = name_and_party.str.extract(party_tag, expand=False)

# Assign candidate name (party tag stripped) to candidate column
US['Candidate'] = name_and_party.str.replace(party_tag, '', regex=True)

# Create column named incumbent: True when one part is exactly 'Incumbent'
US['incumbent'] = candidate_parts.apply(lambda parts: 'Incumbent' in parts)

# Create column named winner: True when one part is exactly 'Winner'
US['winner'] = candidate_parts.apply(lambda parts: 'Winner' in parts)

# Remove non-digit characters from raised and spent columns and cast to a real
# integer dtype (writing Python ints cell by cell leaves an object column).
# Every non-digit is dropped, so a sign or decimal point would be discarded too.
for money_col in ('Raised', 'Spent'):
    US[money_col] = (US[money_col]
                     .str.replace(r'\D', '', regex=True)
                     .astype('int64'))

In [4]:
# Discard the columns that are not kept in the cleaned output, then give the
# remaining scraped columns their lowercase names
US = (
    US
    .drop(['Cash on Hand', 'Last Report'], axis='columns')
    .rename(columns={'Candidate': 'name',
                     'Raised': 'raised',
                     'Spent': 'spent'})
)

In [5]:
# Write the cleaned dataframe to disk, leaving the row index out of the file
US.to_csv(path_or_buf='../data/us_rep_elections.csv', index=False)