In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the merged turnout CSV into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('../Downloads/voter_turnout_merged_all_states.csv')
# Preview the first rows as a quick check of what was loaded.
df.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population,voted_voters,state
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122,23041.0,...,80668,27989.0,71356,22543.0,88.456389,80.542356,47.829424,331652,172221,CA
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122,14972.0,...,80668,27989.0,59248,17372.0,73.44672,62.067241,48.224196,331652,130249,CA
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122,16057.0,...,80668,27989.0,60766,18951.0,75.328507,67.708743,46.819512,331652,135633,CA
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122,8145.0,...,80668,27989.0,38216,10811.0,47.374424,38.625889,51.034918,331652,75153,CA
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780,9229.0,...,19051,10653.0,16014,8674.0,84.05858,81.423073,49.75255,70359,41879,CA


In [3]:
# List every column name available in the loaded frame.
df.columns

Index(['Residence_Addresses_City', 'elec_date', 'elec_year', 'elec_type',
       'total_voters_asian', 'total_voters_black', 'total_voters_hispanic',
       'total_voters_others', 'total_voters_white', 'voted_voters_asian',
       'voted_voters_black', 'voted_voters_hispanic', 'voted_voters_others',
       'voted_voters_white', 'perc_turnout_asian', 'perc_turnout_black',
       'perc_turnout_hispanic', 'perc_turnout_others', 'perc_turnout_white',
       'FECDonors_TotalDonationsAmount', 'FECDonors_NumberOfDonations',
       'mean_donation', 'total_voters_$1000-14999',
       'total_voters_$100000-124999', 'total_voters_$125000-149999',
       'total_voters_$15000-24999', 'total_voters_$150000-174999',
       'total_voters_$175000-199999', 'total_voters_$200000-249999',
       'total_voters_$25000-34999', 'total_voters_$250000+',
       'total_voters_$35000-49999', 'total_voters_$50000-74999',
       'total_voters_$75000-99999', 'voted_voters_$1000-14999',
       'voted_voters_$100000-1

In [4]:
# Columns retained for the rest of the notebook, in the order they will appear
# in the trimmed frame. None of the voted_voters_* / perc_turnout_* columns are
# listed here.
use_cols = [
    # election type
    'elec_type',
    # total_voters_* by racial/ethnic group
    'total_voters_asian',
    'total_voters_black',
    'total_voters_hispanic',
    'total_voters_others',
    'total_voters_white',
    # FEC donation aggregates
    'FECDonors_TotalDonationsAmount',
    'FECDonors_NumberOfDonations',
    'mean_donation',
    # total_voters_* by dollar-range bracket
    # (presumably household income brackets — confirm against the data source)
    'total_voters_$1000-14999',
    'total_voters_$100000-124999',
    'total_voters_$125000-149999',
    'total_voters_$15000-24999',
    'total_voters_$150000-174999',
    'total_voters_$175000-199999',
    'total_voters_$200000-249999',
    'total_voters_$25000-34999',
    'total_voters_$250000+',
    'total_voters_$35000-49999',
    'total_voters_$50000-74999',
    'total_voters_$75000-99999',
    'CommercialData_EstimatedHHIncomeAmount',
    # total_voters_* by college / no-college split
    'total_voters_college',
    'total_voters_no_college',
    # remaining descriptors
    'mean_age',
    'voter_population',
    'state',
    # 'rcv' is not a column of the CSV; it has to be added to the frame before
    # this list is used to select columns.
    'rcv',
]

In [5]:
# City names that receive the positive `rcv` label (presumably "ranked-choice
# voting" localities — confirm against the project notes).
# NOTE(review): entries are bare city names with no state qualifier, while the
# data spans several states; a same-named city in another state would match too.
# Confirm whether the state should be part of the lookup key.
rcv_cities = ['Albany', 'Berkeley', 'Eureka', 'Oakland', 'Palm Desert', 'San Francisco', 'San Leandro',
             'Las Cruces', 'Santa Fe', 'Boulder', 'Burlington', 'Portland', 'Takoma Park', 'Bluffdale',
             'Payson', 'Cottonwood Heights', 'Salt Lake City', 'Sandy', 'Midvale', 'Draper', 'Lehi',
             'Springville', 'South Salt Lake', 'Magna', 'Heber', 'Millcreek', 'Riverton', 'St. Louis Park',
             'Bloomington', 'Minneapolis', 'Minnetonka']

# Every city name present in the data that is not in `rcv_cities`.
# The set difference is sorted so the list comes out in the same order on every
# kernel start; iterating a set of strings gives no such guarantee. key=str
# keeps the sort from raising if the column holds missing values (float NaN)
# next to strings.
non_rcv_cities = sorted(set(df['Residence_Addresses_City']) - set(rcv_cities), key=str)

In [6]:
def label_rcv(row):
    """Return 1 if the row's city is listed in ``rcv_cities``, otherwise 0.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Residence_Addresses_City'`` (for example one
        row of a DataFrame).

    Returns
    -------
    int
        1 when the city value is found in the module-level ``rcv_cities``
        list, 0 when it is not.
    """
    city = row['Residence_Addresses_City']
    return 1 if city in rcv_cities else 0

In [7]:
# Add the `rcv` label: 1 where the row's city name is in `rcv_cities`, else 0.
# `isin` tests membership for the whole column at once and yields the same 1/0
# labels as calling label_rcv on each row, without a Python call per row.
# NOTE(review): the match uses the city name only even though the data has a
# `state` column, so same-named cities in different states share a label —
# confirm that is intended.
df['rcv'] = df['Residence_Addresses_City'].isin(rcv_cities).astype('int64')

# Keep only the columns in `use_cols`. The explicit .copy() makes the result an
# independent frame, so assigning to its columns later does not trigger pandas'
# SettingWithCopyWarning.
# This rebinds `df` and drops 'Residence_Addresses_City', so the cell cannot be
# re-run on its own: re-run the cell that loads `df` first.
df = df[use_cols].copy()
df.head()

Unnamed: 0,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation,total_voters_$1000-14999,...,total_voters_$35000-49999,total_voters_$50000-74999,total_voters_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,mean_age,voter_population,state,rcv
0,General,30600.0,61476.0,37174.0,8628.0,83122,44186445,403388,109.538323,5769.0,...,22561.0,42090.0,23612.0,115534.7318,80668,27989.0,47.829424,331652,CA,1
1,General,30600.0,61476.0,37174.0,8628.0,83122,42827869,394704,108.506296,5769.0,...,22561.0,42090.0,23612.0,115534.7318,80668,27989.0,48.224196,331652,CA,1
2,General,30600.0,61476.0,37174.0,8628.0,83122,42296298,390007,108.4501,5769.0,...,22561.0,42090.0,23612.0,115534.7318,80668,27989.0,46.819512,331652,CA,1
3,General,30600.0,61476.0,37174.0,8628.0,83122,37016092,341253,108.471111,5769.0,...,22561.0,42090.0,23612.0,115534.7318,80668,27989.0,51.034918,331652,CA,1
4,General,12705.0,5596.0,16028.0,2429.0,17780,1875137,30683,61.113222,875.0,...,2752.0,8386.0,11796.0,113020.5511,19051,10653.0,49.75255,70359,CA,1
