# 1. Data Loading

1. DEMOGRAPHIC_selected_cols parquet file contains all rows with selected columns (voter id, city, county, ethnicity, age, gender, education, income, donation and parties description). See Reduce_to_parquet.ipynb 
2. VOTEHISTORY_selected_cols parquet file contains all rows with selected columns (General and Local_or_Municipal). See Reduce_to_parquet.ipynb 
3. GE_LM_dates_per_city parquet file contains four most recent General election and Local_or_Municipal for each of the selected city. See Find_recent_election_dates.ipynb and ca_similarity_search.ipynb

Some observations
- We choose the 5 non-RCV cities with highest cosine similary score compared to the 7 RCV cities in CA
- There were 33 distinct cities among those 35 cities
- There are 66 non-registered voters among 21.7 million voters
- There are total of 3.9 million voters in the sampled cities
- City 'El Paso de Robles' didn't match in demographic data


In [150]:
import pandas as pd
import numpy as np
import janitor
import gc
import time
start_time = time.time()

#### Define state here

In [151]:
state = 'utah'

In [152]:
def combine_cities_list(RCV_list, NonRCV_list):

    print("total number of cities:", len(RCV_list))

    print("number of distinct cities:", len(set(NonRCV_list)))

    print("name of cities that were duplicated:", set([x for x in NonRCV_list if NonRCV_list.count(x) > 1]))

    combined_cityName = RCV_list+list(set(NonRCV_list))
    print("number of distinct RCV and sampled nonRCV cities:", len(combined_cityName))
    return combined_cityName


### California

In [153]:
if state=='california':
    # ------ California -------

    ## change the filepath as required, we have selected the folder with the latest date

    #filepath = '../Downloads/VM2--CA--2022-04-25/'
    #filepath = '../Downloads/'
    filepath = '../data/VM2--CA--2022-04-25/'
    DEMO_filename = 'VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CA.parquet'
    cities_filename = 'ca-cities.csv'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_CA = ['San Francisco',
     'Oakland',
     'Berkeley',
     'San Leandro',
     'Palm Desert',
     'Eureka',
     'Albany']

    sampled_nonRCV_cities_CA = ['Fresno',
     'San Diego',
     'Sacramento',
     'Riverside',
     'San Jose',
     'Santa Ana',
     'Anaheim',
     'Santa Rosa',
     'Merced',
     'Santa Clarita',
     'Alhambra',
     'Davis',
     'Montebello',
     'Burbank',
     'Huntington Park',
     'Bellflower',
     'Watsonville',
     'Gilroy',
     'Whittier',
     'Lynwood',
     'Lakewood',
     'Pico Rivera',
     'Lake Forest',
     'Livermore',
     'Chino Hills',
     'Paramount',
     'El Paso de Robles',
     'Pico Rivera',
     'Buena Park',
     'Whittier',
     'Calabasas',
     'Carpinteria',
     'Morro Bay',
     'San Carlos',
     'Solvang']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CA, NonRCV_list = sampled_nonRCV_cities_CA)
    # ---------------------

### Utah

In [154]:
if state=='utah':
    # # ------ Utah -------
    filepath = '../data/VM2--UT--2022-03-30/'
    DEMO_filename = 'VM2--UT--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--UT--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_UT.parquet'
    cities_filename = 'ut-cities.csv'


    ##1. List of RCV and non-RCV cities 

    RCV_cities_UT = ['Salt Lake City', 'Sandy', 'Lehi', 'Millcreek', 
                     'Draper', 'Riverton',  'Cottonwood Heights', 
                     'Springville', 'Midvale', 'Magna', 'South Salt Lake', 
                     'Payson', 'Bluffdale']

    sampled_nonRCV_cities_UT = ['Ogden', 'Provo', 'West Valley City', 
                                'Logan', 'St. George', 'Taylorsville', 
                                'Layton', 'Orem', 'South Jordan', 'Murray', 
                                'South Jordan', 'Clearfield', 'Spanish Fork', 
                                'Tooele', 'Kearns', 'Cedar City', 'Murray', 
                                'Bountiful',  'South Jordan', 'Pleasant Grove', 
                                'Vernal', 'Hurricane', 'Herriman', 'American Fork', 
                                'Washington', 'Eagle Mountain', 'Brigham City', 
                                'American Fork', 'Herriman', 'Spanish Fork', 
                                'Washington', 'Heber', 'Hurricane', 'Vernal', 
                                'Holladay', 'Pleasant Grove', 'American Fork', 
                                'Herriman', 'Eagle Mountain', 'Vernal', 
                                'Bountiful', 'Pleasant Grove', 'Washington', 
                                'South Jordan', 'Vernal', 'Tooele', 
                                'Spanish Fork', 'Clearfield', 'Kearns', 
                                'Eagle Mountain', 'Washington', 'Bountiful', 
                                'Pleasant Grove', 'Hurricane', 'Cedar City', 
                                'Saratoga Springs', 'Kaysville', 'Brigham City', 
                                'North Salt Lake', 'American Fork', 'Highland', 
                                'Lindon', 'Alpine', 'West Haven', 'North Logan']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_UT, NonRCV_list = sampled_nonRCV_cities_UT)
    # # ---------------------


total number of cities: 13
number of distinct cities: 34
name of cities that were duplicated: {'Bountiful', 'Cedar City', 'American Fork', 'Herriman', 'Clearfield', 'South Jordan', 'Brigham City', 'Tooele', 'Hurricane', 'Washington', 'Murray', 'Spanish Fork', 'Kearns', 'Pleasant Grove', 'Eagle Mountain', 'Vernal'}
number of distinct RCV and sampled nonRCV cities: 47


### Colorado

In [155]:
if state=='colorado':
    # ------ Colorado -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--CO--2022-04-26/'
    DEMO_filename = 'VM2--CO--2022-04-26-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CO--2022-04-26-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_CO = ['Boulder']

    sampled_nonRCV_cities_CO = ['Alamosa', 'Arvada', 'Brighton', 'Broomfield', 'Castle Rock', 
                                'Ca√±on City', 'Centennial', 'Commerce City', 
                                'Durango', 'Englewood', 'Fountain', 'Glenwood Springs', 'Golden', 
                                'Greenwood Village', 'Highlands Ranch', 'Lafayette', 
                                'Littleton', 'Longmont', 'Louisville', 'Loveland', 'Montrose', 'Northglenn', 
                                'Parker', 'Silverthorne', 'Steamboat Springs', 'Wheat Ridge', 'Windsor']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CO, NonRCV_list = sampled_nonRCV_cities_CO)

    # ---------------------

### Maryland

In [156]:
if state=='maryland':
    # ------ Maryland -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--MD--2022-04-08/'
    DEMO_filename = 'VM2--MD--2022-04-08-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MD--2022-04-08-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_MD = ['Takoma Park']

    sampled_nonRCV_cities_MD = ['Adelphi', 'Annapolis', 'Aspen Hill', 'Bethesda', 
                                'Cockeysville', 'College Park', 'Easton', 
                                'Hyattsville', 'New Carrollton', 
                                'North Bethesda', 'North Potomac', 'Ocean Pines', 'Potomac', 
                                'Princess Anne', 'Severna Park', 'Timonium', 
                                'Westminster']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MD, NonRCV_list = sampled_nonRCV_cities_MD)
    # ---------------------


### Maine

In [157]:
if state=='maine':
    # ------ Maine -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--ME--2022-03-02/'
    DEMO_filename = 'VM2--ME--2022-03-02-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--ME--2022-03-02-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Sanford', 'Westbrook', 'Lewiston', 'Wells', 'Standish', 'Waterville',
     'Falmouth', 'Windham', 'Kennebunk', 'Scarborough', 'South Portland', 'Bangor',
     'Augusta', 'Brunswick', 'Auburn', 'Portland', 'Biddeford', 'York', 'Saco',
     'Orono', 'Gorham']
    # ---------------------


### Minnesota

In [158]:
if state=='minnesota':
    # ------ Minnesota -------
    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--MN--2022-03-25/'
    DEMO_filename = 'VM2--MN--2022-03-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MN--2022-03-25-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_MN = ['Bloomington', 'Minneapolis', 'Minnetonka', 'St. Louis Park']

    sampled_nonRCV_cities_MN = ['Alexandria', 'Bemidji', 'Blaine', 'Blaine', 'Brainerd', 'Brainerd', 
                                'Brooklyn Center', 'Brooklyn Park', 'Brooklyn Park', 'Burnsville', 
                                'Coon Rapids', 'Coon Rapids', 'Duluth', 'Eagan', 'Eden Prairie', 'Eden Prairie', 
                                'Fridley', 'Inver Grove Heights', 'Mankato', 'Mankato', 'Maple Grove', 'Maplewood', 
                                'Moorhead', 'Plymouth', 'Richfield', 'Richfield', 'Rochester', 'Roseville', 
                                'Roseville', 'St. Cloud', 'St. Paul', 'Winona']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MN, NonRCV_list = sampled_nonRCV_cities_MN)


### New Mexico

In [159]:
if state=='new mexico':
    # ------ New Mexico -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--NM--2022-03-30/'
    DEMO_filename = 'VM2--NM--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--NM--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Los Alamos', 'Albuquerque', 'Rio Rancho', 'Farmington',
     'Taos', 'Las Cruces', 'Silver City', 'Roswell', 'Lovington', 'Deming',
     'Alamogordo', 'Chaparral', 'Las Vegas', 'Los Lunas', 'Hobbs',
     'Clovis', 'Sunland Park', 'Artesia', 'Grants', 'Carlsbad', 'Portales', 'Gallup',
     'Espa√±ola', 'Santa Fe']
    # ---------------------



### Vermont

In [160]:
if state=='vermont':
    # ------ Vermont -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--VT--2022-04-20/'
    DEMO_filename = 'VM2--VT--2022-04-20-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--VT--2022-04-20-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Burlington', 'South Burlington', 'Essex', 'Rutland', 'Bennington', 'Milton',
     'Essex Junction', 'Barre', 'Colchester', 'Brattleboro']
    # ---------------------


# 1.1 Demographic Data

1. Select only the columns required: city name ('Residence_Addresses_City'), unique voter id ('LALVOTERID'), voter's ethnicity ('EthnicGroups_EthnicGroup1Desc'), date when voter was registered ('Voters_OfficialRegDate'), voter's gender, date of birth, plus additionsl columns
2. Keep only the cities that were identified as being similar to RCV cities in CA (See ca_similarity_search.ipynb for reference) 
3. Keep only rows EthnicGroups_EthnicGroup1Desc == “European”,  “Likely African-American”,“Hispanic and Portuguese” and “East and South Asian” 
4. Keep only registered voters identified in 'Voters_OfficialRegDate'


In [161]:
def read_DEMOGRAPHIC():
    df_demographic = pd.read_parquet(f'{filepath}{DEMO_filename}')
    print("Total number of unique cities:", df_demographic.Residence_Addresses_City.nunique())
    print("Total number of unique voters:", df_demographic.LALVOTERID.nunique())
    print("Count of non-registered voters:", len(df_demographic[df_demographic['Voters_OfficialRegDate'].isnull()]))
    
    print("Number of expected cities:", len(combined_sampled_cityName))
    missing_cities = [city for city in combined_sampled_cityName if city not in df_demographic['Residence_Addresses_City'].unique()]
    if len(missing_cities) > 0:
        print("number of cities not found in demographic data:", len(missing_cities))
        print(missing_cities)
        
    return df_demographic
        
state_demographic = read_DEMOGRAPHIC()

Total number of unique cities: 389
Total number of unique voters: 1442583
Count of non-registered voters: 467
Number of expected cities: 47
number of cities not found in demographic data: 1
['St. George']


In [162]:
state_demographic.head(5)

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALUT168588809,Greenville,F,81,07/30/1941,Republican,European,09/24/2019,BEAVER,HS Diploma - Likely,$35000-49999,$37000,,
1,LALUT169183714,Beaver,F,77,03/02/1945,Republican,European,06/22/2010,BEAVER,,$50000-74999,$55155,,
2,LALUT169183783,Beaver,M,79,12/16/1943,Republican,European,06/25/2002,BEAVER,Bach Degree - Extremely Likely,$35000-49999,$48000,,
3,LALUT169165557,Beaver,F,79,03/31/1943,Republican,European,06/25/2002,BEAVER,Some College -Extremely Likely,$35000-49999,$48000,,
4,LALUT169173676,Beaver,M,75,09/22/1947,Republican,European,06/25/2002,BEAVER,HS Diploma - Extremely Likely,$35000-49999,$41000,,


### California

In [163]:
if state == 'california':
    # ----- California ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('El Paso de Robles', 'Paso Robles'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # ----------------------

    # Modifying the names of cities that were duplicate (Calabasas Hills > Calabasas and Huntington Pk > Huntington Park)
    state_demographic.loc[state_demographic['Residence_Addresses_City'] == 'Huntington Pk', 'Residence_Addresses_City'] = 'Huntington Park'
    state_demographic.loc[state_demographic['Residence_Addresses_City'] == 'Calabasas Hills', 'Residence_Addresses_City'] = 'Calabasas'
    # ----------------------

### Utah

In [164]:
if state=='utah':
    # # ----- Utah ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('St. George', 'Saint George'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------


number of expected cities: 47


### Colorado

In [165]:
if state=='colorado':
    # NOTE: ['Sherrelwood', 'Cherry Creek', 'Ken Caryl'] not found in demographic data so were removed from 
    # sampled_non_RCV_cities_CO list

    # # ----- Colorado ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('Ca√±on City', 'Canon City'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### Maryland

In [166]:
if state=='maryland':
    # NOTE: ['Colesville', 'Fairland', 'Cloverly', 'Annapolis Neck', 'Redland', 'Glenmont', 'Travilah', 
    # 'South Laurel', 'White Oak', 'Glassmanor', 'Kemp Mill', 'Parole', 'Calverton'] not found in demographic data 
    # so were removed from sampled_non_RCV_cities_MD list

    # # ----- Maryland ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace(), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### Minnesota

In [167]:
if state=='minnesota':
    # # ----- Minnesota ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Louis Park', 'St Louis Park'), combined_sampled_cityName))
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Paul', 'Saint Paul'), combined_sampled_cityName))
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Cloud', 'Saint Cloud'), combined_sampled_cityName))

    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### New Mexico

In [168]:
if state=='new mexico':
    # NOTE: ['North Valley', 'South Valley'] not found in demographic data so were removed 
    # from sampled_non_RCV_cities_NM list

    # # ----- New Mexico ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('Espa√±ola', 'Espanola'), combined_sampled_cityName))

    print("number of expected cities:", len(combined_sampled_cityName))

### Filter DEMOGRAPHIC data based on the list of cities, ethnicities, and registered voters

In [169]:
# 2. filter DEMOGRAPHIC data based on the list of cities, ethnicities and registered voters

selected_ethnicities = ['European', 'Likely African-American','Hispanic and Portuguese', 'East and South Asian']

def filter_demo(df, list_cityNames):
    filtered_df = df[df['Residence_Addresses_City'].isin(list_cityNames) &
            df['EthnicGroups_EthnicGroup1Desc'].isin(selected_ethnicities) &
            df['Voters_OfficialRegDate'].notnull()]
    #[['LALVOTERID', 'Residence_Addresses_City']]
    
    print(filtered_df.shape)
    print("number of unique cities:", filtered_df.Residence_Addresses_City.nunique())
    
    return filtered_df

state_demographic_subset = filter_demo(df = state_demographic, list_cityNames = combined_sampled_cityName)
state_demographic_subset.head()

(887909, 14)
number of unique cities: 47


Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
3133,LALUT621510668,Brigham City,F,22,01/01/2000,Democratic,European,10/30/2020,BOX ELDER,,$50000-74999,$70115,,
3137,LALUT168437912,Brigham City,F,53,12/30/1969,Republican,European,09/26/1999,BOX ELDER,Some College - Likely,$50000-74999,$58000,,
3138,LALUT169786507,Brigham City,M,49,11/15/1973,Non-Partisan,Hispanic and Portuguese,12/05/2016,BOX ELDER,,,,,
3142,LALUT596605091,Brigham City,F,35,01/01/1987,Non-Partisan,Hispanic and Portuguese,10/01/2020,BOX ELDER,,$50000-74999,$70794,,
3143,LALUT168453641,Brigham City,M,32,06/25/1990,Republican,European,09/28/2020,BOX ELDER,,$50000-74999,$70794,,


In [170]:
del state_demographic
gc.collect()

0

# 1.2 Vote History

1. Select only the columns that are 4 most recent General elections and 4 most recent Local_or_Municipal elections and EthnicGroups_EthnicGroup1Desc
2. Load Vote History 
3. Merge Vote History with the sampled Demographic Data 


## 1. Get four most recent election dates

In [171]:
# load the list of election dates for each city
GE_LM_dates_dict = pd.read_parquet(f'{filepath}{elec_dates_filename}')
GE_LM_dates_dict

Unnamed: 0,city,GE_dates,LM_dates
0,Brigham City,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
1,Logan,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
2,North Logan,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
3,Kaysville,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
4,Layton,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
5,Clearfield,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
6,Bountiful,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
7,Ogden,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
8,North Salt Lake,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
9,Draper,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."


In [172]:
e_dates = set()
for v in GE_LM_dates_dict['GE_dates']:
    for vv in v:
        e_dates.add(vv)
for v in GE_LM_dates_dict['LM_dates'] :
    for vv in v:
        e_dates.add(vv)
        
print(list(e_dates))
## when all four dates are not found e_dates will contain None, we need to remove it
if None in list(e_dates):
    e_dates.remove(None)
    print(list(e_dates))

['General_2018_11_06', 'Consolidated_General_2015_11_03', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2017_06_27', 'Consolidated_General_2019_11_05', 'General_2020_11_03', 'General_2014_11_04', 'General_2016_11_08', 'Consolidated_General_2013_11_05', None, 'Consolidated_General_2007_11_06', 'Local_or_Municipal_2020_05_12', 'General_2012_11_06', 'Consolidated_General_2003_11_04']
['General_2018_11_06', 'Consolidated_General_2015_11_03', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2017_06_27', 'Consolidated_General_2019_11_05', 'General_2020_11_03', 'General_2014_11_04', 'General_2016_11_08', 'Consolidated_General_2013_11_05', 'Consolidated_General_2007_11_06', 'Local_or_Municipal_2020_05_12', 'General_2012_11_06', 'Consolidated_General_2003_11_04']


In [173]:
# need in order to filter out rows after aggregation
def get_correct_dates(list_like_df):
    print("Shape before reshaping:",list_like_df.shape)
    list_like_df = list_like_df.explode(['GE_dates', 'LM_dates']).melt(id_vars=["city"], 
                                                                       var_name="Date", 
                                                                       value_name="Value")
    list_like_df = list_like_df.drop(columns = 'Date')
    list_like_df.columns = ['Residence_Addresses_City', 'elec_type_date']                          
    list_like_df['elec_date'] = list_like_df['elec_type_date'].str[-10:]
    list_like_df['elec_year'] = list_like_df['elec_type_date'].str[-10:-6]
    list_like_df['elec_type'] = list_like_df['elec_type_date'].str[:-11]                    
    list_like_df = list_like_df.drop(columns = 'elec_type_date')
    print("Shape after reshaping:",list_like_df.shape)
    return list_like_df

GE_LM_dates_df = get_correct_dates(GE_LM_dates_dict)
GE_LM_dates_df.head()

Shape before reshaping: (47, 3)
Shape after reshaping: (376, 4)


Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type
0,Brigham City,2020_11_03,2020,General
1,Brigham City,2018_11_06,2018,General
2,Brigham City,2016_11_08,2016,General
3,Brigham City,2014_11_04,2014,General
4,Logan,2020_11_03,2020,General


## 2. load the VOTE HISTORY data for selected election dates only

In [174]:
needed_variables = ['LALVOTERID'] + list(e_dates)

state_voterhistory_4_dates = pd.read_parquet(f'{filepath}{VOTE_filename}',
                                             columns=needed_variables)
                                
state_voterhistory_4_dates.head(5)

Unnamed: 0,LALVOTERID,General_2018_11_06,Consolidated_General_2015_11_03,Consolidated_General_2017_11_07,Local_or_Municipal_2017_06_27,Consolidated_General_2019_11_05,General_2020_11_03,General_2014_11_04,General_2016_11_08,Consolidated_General_2013_11_05,Consolidated_General_2007_11_06,Local_or_Municipal_2020_05_12,General_2012_11_06,Consolidated_General_2003_11_04
0,LALUT168588809,,,,,,Y,,,,,,,
1,LALUT169183714,Y,Y,Y,,Y,Y,,Y,Y,Y,,Y,Y
2,LALUT169183783,Y,Y,Y,,,Y,Y,Y,Y,Y,,Y,Y
3,LALUT169165557,Y,Y,Y,,,Y,Y,Y,Y,Y,,Y,Y
4,LALUT169173676,Y,Y,Y,,Y,Y,Y,Y,Y,Y,,Y,Y


## 3. Merge Vote History and Demographic Data

In [175]:
merged_file = pd.merge(state_voterhistory_4_dates, state_demographic_subset,
                       how='inner', left_on='LALVOTERID', right_on='LALVOTERID')

print(merged_file.shape)

print("number of unique cities:", merged_file.Residence_Addresses_City.nunique())

merged_file.head(5)

(887909, 27)
number of unique cities: 47


Unnamed: 0,LALVOTERID,General_2018_11_06,Consolidated_General_2015_11_03,Consolidated_General_2017_11_07,Local_or_Municipal_2017_06_27,Consolidated_General_2019_11_05,General_2020_11_03,General_2014_11_04,General_2016_11_08,Consolidated_General_2013_11_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALUT621510668,,,,,,Y,,,,...,01/01/2000,Democratic,European,10/30/2020,BOX ELDER,,$50000-74999,$70115,,
1,LALUT168437912,,,,,,,,Y,,...,12/30/1969,Republican,European,09/26/1999,BOX ELDER,Some College - Likely,$50000-74999,$58000,,
2,LALUT169786507,Y,,,,Y,Y,Y,Y,,...,11/15/1973,Non-Partisan,Hispanic and Portuguese,12/05/2016,BOX ELDER,,,,,
3,LALUT596605091,,,,,,Y,,,,...,01/01/1987,Non-Partisan,Hispanic and Portuguese,10/01/2020,BOX ELDER,,$50000-74999,$70794,,
4,LALUT168453641,,,,,,Y,,,,...,06/25/1990,Republican,European,09/28/2020,BOX ELDER,,$50000-74999,$70794,,


In [176]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
print(merge_filename)
merged_file.to_parquet(f'{filepath}{merge_filename}')

VM2--UT--2022-03-30-merged.parquet


# 3.1. Calculate voter turnout per ethnicity

In [177]:
import pandas as pd
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
merged_file = pd.read_parquet(f'{filepath}{merge_filename}')

In [178]:
def replace_ethnicities(df):
    df = df.replace('East and South Asian', 'asian')
    df = df.replace('European', 'white')
    df = df.replace('Hispanic and Portuguese', 'hispanic')
    df = df.replace('Likely African-American', 'black')
    return df

In [179]:
merged_file = replace_ethnicities(merged_file)
merged_file.head()

Unnamed: 0,LALVOTERID,General_2018_11_06,Consolidated_General_2015_11_03,Consolidated_General_2017_11_07,Local_or_Municipal_2017_06_27,Consolidated_General_2019_11_05,General_2020_11_03,General_2014_11_04,General_2016_11_08,Consolidated_General_2013_11_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALUT621510668,,,,,,Y,,,,...,01/01/2000,Democratic,white,10/30/2020,BOX ELDER,,$50000-74999,$70115,,
1,LALUT168437912,,,,,,,,Y,,...,12/30/1969,Republican,white,09/26/1999,BOX ELDER,Some College - Likely,$50000-74999,$58000,,
2,LALUT169786507,Y,,,,Y,Y,Y,Y,,...,11/15/1973,Non-Partisan,hispanic,12/05/2016,BOX ELDER,,,,,
3,LALUT596605091,,,,,,Y,,,,...,01/01/1987,Non-Partisan,hispanic,10/01/2020,BOX ELDER,,$50000-74999,$70794,,
4,LALUT168453641,,,,,,Y,,,,...,06/25/1990,Republican,white,09/28/2020,BOX ELDER,,$50000-74999,$70794,,


In [180]:
GE_cols = [col for col in merged_file.columns if col.startswith('General')]
print(GE_cols)
LM_cols = [col for col in merged_file.columns if col.startswith('Local_or_Municipal') \
           or col.startswith('Consolidated_General')]
print(LM_cols)

['General_2018_11_06', 'General_2020_11_03', 'General_2014_11_04', 'General_2016_11_08', 'General_2012_11_06']
['Consolidated_General_2015_11_03', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2017_06_27', 'Consolidated_General_2019_11_05', 'Consolidated_General_2013_11_05', 'Consolidated_General_2007_11_06', 'Local_or_Municipal_2020_05_12', 'Consolidated_General_2003_11_04']


In [181]:
# fill NA values with "N" to make it easier to compare  with "Y"
merged_file[GE_cols+LM_cols] = merged_file[GE_cols+LM_cols].fillna('N')
merged_file.head()

Unnamed: 0,LALVOTERID,General_2018_11_06,Consolidated_General_2015_11_03,Consolidated_General_2017_11_07,Local_or_Municipal_2017_06_27,Consolidated_General_2019_11_05,General_2020_11_03,General_2014_11_04,General_2016_11_08,Consolidated_General_2013_11_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALUT621510668,N,N,N,N,N,Y,N,N,N,...,01/01/2000,Democratic,white,10/30/2020,BOX ELDER,,$50000-74999,$70115,,
1,LALUT168437912,N,N,N,N,N,N,N,Y,N,...,12/30/1969,Republican,white,09/26/1999,BOX ELDER,Some College - Likely,$50000-74999,$58000,,
2,LALUT169786507,Y,N,N,N,Y,Y,Y,Y,N,...,11/15/1973,Non-Partisan,hispanic,12/05/2016,BOX ELDER,,,,,
3,LALUT596605091,N,N,N,N,N,Y,N,N,N,...,01/01/1987,Non-Partisan,hispanic,10/01/2020,BOX ELDER,,$50000-74999,$70794,,
4,LALUT168453641,N,N,N,N,N,Y,N,N,N,...,06/25/1990,Republican,white,09/28/2020,BOX ELDER,,$50000-74999,$70794,,


In [182]:
# We created the dataframe below in order to easily calculate perc_turnout when no one voted
list_ethnic_city = merged_file[['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']].drop_duplicates()
list_ethnic_city_No = list_ethnic_city.copy()
list_ethnic_city_No['voted'] = 'N'
list_ethnic_city_Yes = list_ethnic_city.copy()
list_ethnic_city_Yes['voted'] = 'Y'
list_ethnic_city = pd.concat([list_ethnic_city_No, list_ethnic_city_Yes])

In [183]:
list_ethnic_city

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,voted
0,Brigham City,white,N
2,Brigham City,hispanic,N
12,Brigham City,asian,N
9181,Logan,white,N
9193,Logan,asian,N
...,...,...,...
849193,West Haven,hispanic,Y
849205,West Haven,white,Y
849387,Ogden,black,Y
880819,West Haven,asian,Y


In [184]:
# we also need the total voters information per city and ethnicity
total_city_ethnic = merged_file.groupby(['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']).size().reset_index()
total_city_ethnic.columns = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', 'total_voters']
total_city_ethnic  = total_city_ethnic.merge(list_ethnic_city, on = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc'])

total_city_ethnic = replace_ethnicities(total_city_ethnic)
total_city_ethnic

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,total_voters,voted
0,Alpine,asian,81,N
1,Alpine,asian,81,Y
2,Alpine,black,1,N
3,Alpine,black,1,Y
4,Alpine,hispanic,97,N
...,...,...,...,...
359,West Valley City,black,203,Y
360,West Valley City,hispanic,8734,N
361,West Valley City,hispanic,8734,Y
362,West Valley City,white,26869,N


In [185]:
def calc_votes(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_ethnic.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', 'voted']) 
    voter_turnout_stats['perc_turnout'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout']] = voter_turnout_stats[['voted_voters', 'perc_turnout']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='EthnicGroups_EthnicGroup1Desc', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_merge_ethnicity = pivot_df.copy() 
    else:
        voter_turnout_merge_ethnicity = pd.concat([voter_turnout_merge_ethnicity, pivot_df])


In [186]:
print(voter_turnout_merge_ethnicity.shape)
voter_turnout_merge_ethnicity.head()

(611, 16)


Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_white
0,General,2018,2018_11_06,Alpine,81.0,1.0,97.0,5404.0,38.0,0.0,43.0,3109.0,0.469136,0.0,0.443299,0.575315
1,General,2018,2018_11_06,American Fork,276.0,6.0,750.0,13659.0,109.0,0.0,264.0,7244.0,0.394928,0.0,0.352,0.530346
2,General,2018,2018_11_06,Bluffdale,149.0,,352.0,6629.0,81.0,,149.0,3867.0,0.543624,,0.423295,0.583346
3,General,2018,2018_11_06,Bountiful,438.0,14.0,874.0,19523.0,252.0,3.0,407.0,13217.0,0.575342,0.214286,0.465675,0.676996
4,General,2018,2018_11_06,Brigham City,104.0,,524.0,8553.0,57.0,,239.0,5430.0,0.548077,,0.456107,0.634865


In [187]:
print(voter_turnout_merge_ethnicity.shape)
# remove rows where election dates are not associated with city
# need to do this only once as we will be using inner join to ensure only necessary combinations of city and election dates are present
voter_turnout_merge_ethnicity = GE_LM_dates_df.merge(voter_turnout_merge_ethnicity, 
                                how = 'left',
                                on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])
print(voter_turnout_merge_ethnicity.shape)

(611, 16)
(376, 16)


In [188]:
#should be empty dataframe because of the way we have filitered the dataframe

no_voter_turnout = voter_turnout_merge_ethnicity[(voter_turnout_merge_ethnicity['perc_turnout_asian'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_black'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_hispanic'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_white'] == 0)]

no_voter_turnout.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_white


#  3.2.  Calculate average donation 

In [189]:
def calc_donation(df):
    donations_df = df[['Residence_Addresses_City', 'FECDonors_TotalDonationsAmount', 'FECDonors_NumberOfDonations']
                  + elec_date_cols]
    melt_donations_df = donations_df.melt(id_vars=['Residence_Addresses_City', 'FECDonors_TotalDonationsAmount', 'FECDonors_NumberOfDonations'], 
              value_vars=elec_date_cols,
              var_name='elec_type_date',
              value_name='voted')
    melt_donations_df = melt_donations_df[melt_donations_df['voted'] == 'Y']

    melt_donations_df = melt_donations_df.astype({'FECDonors_TotalDonationsAmount': float, 'FECDonors_NumberOfDonations': float})                        
    melt_donations_df = melt_donations_df.groupby(['Residence_Addresses_City', 'elec_type_date']).agg({'FECDonors_TotalDonationsAmount':'sum','FECDonors_NumberOfDonations':'sum'}).reset_index()    
    melt_donations_df['mean_donation'] = melt_donations_df['FECDonors_TotalDonationsAmount']/melt_donations_df['FECDonors_NumberOfDonations']
    melt_donations_df['elec_date'] = melt_donations_df['elec_type_date'].str[-10:]
    melt_donations_df['elec_year'] = melt_donations_df['elec_type_date'].str[-10:-6]
    melt_donations_df['elec_type'] = melt_donations_df['elec_type_date'].str[:-11]
    melt_donations_df = melt_donations_df.drop(columns = 'elec_type_date').reset_index(drop=True)
    
    return melt_donations_df

avg_donations = calc_donation(merged_file)
print(avg_donations.shape)
avg_donations.head()

(526, 7)


Unnamed: 0,Residence_Addresses_City,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation,elec_date,elec_year,elec_type
0,Alpine,214625.0,1185.0,181.118143,2003_11_04,2003,Consolidated_General
1,Alpine,501671.0,3005.0,166.945424,2007_11_06,2007,Consolidated_General
2,Alpine,390120.0,2603.0,149.873223,2013_11_05,2013,Consolidated_General
3,Alpine,525128.0,3121.0,168.256328,2015_11_03,2015,Consolidated_General
4,Alpine,629087.0,3533.0,178.060289,2017_11_07,2017,Consolidated_General


In [190]:
# Merge 3.1 and 3.2
voter_turnout_merge = voter_turnout_merge_ethnicity.merge(avg_donations, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_white,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,372.0,7267.0,0.855769,,0.709924,0.849643,147115.0,2118.0,69.459396
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,239.0,5430.0,0.548077,,0.456107,0.634865,135431.0,1910.0,70.906283
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,223.0,5544.0,0.596154,,0.425573,0.648194,132378.0,1801.0,73.502499
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,54.0,2718.0,0.201923,,0.103053,0.317783,108945.0,1597.0,68.218535
4,Logan,2020_11_03,2020,General,341.0,39.0,1431.0,16833.0,232.0,27.0,853.0,12452.0,0.680352,0.692308,0.596087,0.739737,518781.0,5302.0,97.846284


#  3.3.  Calculate voter turnout per income

In [191]:
# percent missing values for income
print('Percent of rows with missing value for income:',
      100 * merged_file['CommercialData_EstimatedHHIncome'].isnull().sum() / merged_file.shape[0], '%')

Percent of rows with missing value for income: 1.6224635632705604 %


As long as this percentage is low, we can continue with our turnout calculations for income.

In [192]:
# Similar to before, but with income
list_income_city = merged_file[['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']].drop_duplicates()
list_income_city_No = list_income_city.copy()
list_income_city_No['voted'] = 'N'
list_income_city_Yes = list_income_city.copy()
list_income_city_Yes['voted'] = 'Y'
list_income_city = pd.concat([list_income_city_No, list_income_city_Yes])

In [193]:
# we also need the total voters information per city and income
total_city_income = merged_file.groupby(['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']).size().reset_index()
total_city_income.columns = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', 'total_voters']
total_city_income  = total_city_income.merge(list_income_city, on = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome'])

total_city_income

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncome,total_voters,voted
0,Alpine,$1000-14999,7,N
1,Alpine,$1000-14999,7,Y
2,Alpine,$100000-124999,989,N
3,Alpine,$100000-124999,989,Y
4,Alpine,$125000-149999,1419,N
...,...,...,...,...
1087,West Valley City,$35000-49999,3651,Y
1088,West Valley City,$50000-74999,13181,N
1089,West Valley City,$50000-74999,13181,Y
1090,West Valley City,$75000-99999,10447,N


In [194]:
# function to calculate percent turnout by income bracket
def calc_votes_income(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_income.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', 'voted']) 
    voter_turnout_stats['perc_turnout_income'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout_income']] = voter_turnout_stats[['voted_voters', 'perc_turnout_income']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='CommercialData_EstimatedHHIncome', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout_income']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes_income(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_income = pivot_df.copy() 
    else:
        voter_turnout_income = pd.concat([voter_turnout_income, pivot_df])


In [195]:
voter_turnout_income.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_$1000-14999,total_voters_$100000-124999,total_voters_$125000-149999,total_voters_$15000-24999,total_voters_$150000-174999,total_voters_$175000-199999,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,General,2018,2018_11_06,Alpine,7.0,989.0,1419.0,22.0,671.0,384.0,...,0.548273,0.590909,0.47839,0.546875,0.580645,0.776119,0.541667,0.781818,0.813043,0.613095
1,General,2018,2018_11_06,American Fork,105.0,2067.0,1674.0,196.0,536.0,564.0,...,0.544803,0.653061,0.554104,0.542553,0.474255,0.627566,0.530541,0.616257,0.483999,0.507715
2,General,2018,2018_11_06,Bluffdale,50.0,1179.0,687.0,41.0,600.0,857.0,...,0.590975,0.585366,0.588333,0.6021,0.643478,0.714286,0.589686,0.519231,0.648739,0.501211
3,General,2018,2018_11_06,Bountiful,186.0,3710.0,2796.0,286.0,1282.0,1088.0,...,0.645923,0.730769,0.679407,0.663603,0.656834,0.779221,0.666667,0.797927,0.607459,0.68158
4,General,2018,2018_11_06,Brigham City,128.0,806.0,736.0,151.0,279.0,275.0,...,0.611413,0.655629,0.655914,0.661818,0.683544,0.698073,0.674157,0.768802,0.572283,0.611892


In [196]:
# Merge 3.1, 3.2 and 3.3
# merge with previous calculations for race and donation
voter_turnout_merge = voter_turnout_merge.merge(voter_turnout_income, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.870924,0.821192,0.842294,0.901818,0.860759,0.88651,0.88764,0.901114,0.808653,0.835677
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.611413,0.655629,0.655914,0.661818,0.683544,0.698073,0.674157,0.768802,0.572283,0.611892
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.650815,0.708609,0.648746,0.68,0.658228,0.700214,0.707865,0.750696,0.578614,0.629972
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.300272,0.417219,0.247312,0.349091,0.303797,0.400428,0.404494,0.466574,0.257826,0.293692
4,Logan,2020_11_03,2020,General,341.0,39.0,1431.0,16833.0,232.0,27.0,...,0.815759,0.689433,0.817391,0.832168,0.863014,0.683529,0.854167,0.642676,0.761143,0.789592


In [197]:
# add one column that is just overall average income 
merged_file['CommercialData_EstimatedHHIncomeAmount']= merged_file['CommercialData_EstimatedHHIncomeAmount'].str.replace('$','', regex=False)

merged_file = merged_file.astype({'CommercialData_EstimatedHHIncomeAmount': float})
        
avg_income = merged_file[['Residence_Addresses_City', 'CommercialData_EstimatedHHIncomeAmount']].\
            groupby(['Residence_Addresses_City']).\
            mean().reset_index()

avg_income.head(10)

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncomeAmount
0,Alpine,157972.386335
1,American Fork,106989.984032
2,Bluffdale,133521.71303
3,Bountiful,115162.892332
4,Brigham City,85141.696468
5,Cedar City,83553.870092
6,Clearfield,84913.013905
7,Cottonwood Heights,129325.701576
8,Draper,148252.762113
9,Eagle Mountain,111949.620047


In [198]:
# Merge 3.1, 3.2 and 3.3
# merge with previous calculations for race, donation, income bracket
voter_turnout_merge = voter_turnout_merge.merge(avg_income, 
                                                          how = 'inner',
                                                          on = ['Residence_Addresses_City'])


voter_turnout_merge.head(10)

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.821192,0.842294,0.901818,0.860759,0.88651,0.88764,0.901114,0.808653,0.835677,85141.696468
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.655629,0.655914,0.661818,0.683544,0.698073,0.674157,0.768802,0.572283,0.611892,85141.696468
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.708609,0.648746,0.68,0.658228,0.700214,0.707865,0.750696,0.578614,0.629972,85141.696468
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.417219,0.247312,0.349091,0.303797,0.400428,0.404494,0.466574,0.257826,0.293692,85141.696468
4,Brigham City,2019_11_05,2019,Consolidated_General,104.0,,524.0,8553.0,23.0,,...,0.337748,0.333333,0.287273,0.367089,0.408994,0.359551,0.458217,0.273303,0.292085,85141.696468
5,Brigham City,2017_11_07,2017,Consolidated_General,104.0,,524.0,8553.0,18.0,,...,0.337748,0.261649,0.189091,0.202532,0.327623,0.314607,0.387187,0.205769,0.234231,85141.696468
6,Brigham City,2017_06_27,2017,Local_or_Municipal,104.0,,524.0,8553.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000402,85141.696468
7,Brigham City,2015_11_03,2015,Consolidated_General,104.0,,524.0,8553.0,17.0,,...,0.251656,0.204301,0.181818,0.240506,0.216274,0.202247,0.317549,0.15758,0.196063,85141.696468
8,Logan,2020_11_03,2020,General,341.0,39.0,1431.0,16833.0,232.0,27.0,...,0.689433,0.817391,0.832168,0.863014,0.683529,0.854167,0.642676,0.761143,0.789592,72519.452406
9,Logan,2018_11_06,2018,General,341.0,39.0,1431.0,16833.0,128.0,11.0,...,0.313144,0.556522,0.56993,0.659817,0.371914,0.55787,0.344623,0.518286,0.555597,72519.452406


#  3.4.  Calculate Voter turnout for college ed vs no college

In [199]:
# values for education column
merged_file['CommercialData_Education'].value_counts()

Some College - Likely                             120788
Bach Degree - Extremely Likely                     88457
Bach Degree - Likely                               77858
HS Diploma - Extremely Likely                      66214
HS Diploma - Likely                                52777
Grad Degree - Extremely Likely                     42907
Grad Degree - Likely                               40664
Some College -Extremely Likely                     28226
Less than HS Diploma - Likely                      14104
Vocational Technical Degree - Extremely Likely       802
Less than HS Diploma - Ex Like                        71
Name: CommercialData_Education, dtype: int64

In [200]:
# add column to merged file for college or no college
college_ed = ['Some College - Likely', 'Bach Degree - Extremely Likely', 'Bach Degree - Likely', 
              'Grad Degree - Likely', 'Grad Degree - Extremely Likely', 'Some College -Extremely Likely']
no_college_ed = ['HS Diploma - Extremely Likely', 'HS Diploma - Likely', 'Less than HS Diploma - Likely', 
                'Vocational Technical Degree - Extremely Likely', 'Less than HS Diploma - Ex Like']

conditions = [merged_file['CommercialData_Education'].isin(college_ed), 
             merged_file['CommercialData_Education'].isin(no_college_ed)]
outputs = ['college', 'no_college']
education_col = np.select(conditions, outputs, None)
education_col = pd.Series(education_col)

# add to merged file
merged_file['College_Ed'] = education_col

In [201]:
merged_file.head()

Unnamed: 0,LALVOTERID,General_2018_11_06,Consolidated_General_2015_11_03,Consolidated_General_2017_11_07,Local_or_Municipal_2017_06_27,Consolidated_General_2019_11_05,General_2020_11_03,General_2014_11_04,General_2016_11_08,Consolidated_General_2013_11_05,...,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount,College_Ed
0,LALUT621510668,N,N,N,N,N,Y,N,N,N,...,Democratic,white,10/30/2020,BOX ELDER,,$50000-74999,70115.0,,,
1,LALUT168437912,N,N,N,N,N,N,N,Y,N,...,Republican,white,09/26/1999,BOX ELDER,Some College - Likely,$50000-74999,58000.0,,,college
2,LALUT169786507,Y,N,N,N,Y,Y,Y,Y,N,...,Non-Partisan,hispanic,12/05/2016,BOX ELDER,,,,,,
3,LALUT596605091,N,N,N,N,N,Y,N,N,N,...,Non-Partisan,hispanic,10/01/2020,BOX ELDER,,$50000-74999,70794.0,,,
4,LALUT168453641,N,N,N,N,N,Y,N,N,N,...,Republican,white,09/28/2020,BOX ELDER,,$50000-74999,70794.0,,,


In [202]:
# get counts for voters and college ed
list_edu_city = merged_file[['Residence_Addresses_City', 'College_Ed']].drop_duplicates()
list_edu_city_No = list_edu_city.copy()
list_edu_city_No['voted'] = 'N'
list_edu_city_Yes = list_edu_city.copy()
list_edu_city_Yes['voted'] = 'Y'
list_edu_city = pd.concat([list_edu_city_No, list_edu_city_Yes])

# we also need the total voters information per city and education
total_city_edu = merged_file.groupby(['Residence_Addresses_City', 'College_Ed']).size().reset_index()
total_city_edu.columns = ['Residence_Addresses_City', 'College_Ed', 'total_voters']
total_city_edu  = total_city_edu.merge(list_edu_city, on = ['Residence_Addresses_City', 'College_Ed'])

total_city_edu

Unnamed: 0,Residence_Addresses_City,College_Ed,total_voters,voted
0,Alpine,college,2796,N
1,Alpine,college,2796,Y
2,Alpine,no_college,458,N
3,Alpine,no_college,458,Y
4,American Fork,college,6808,N
...,...,...,...,...
181,West Haven,no_college,1384,Y
182,West Valley City,college,13237,N
183,West Valley City,college,13237,Y
184,West Valley City,no_college,10269,N


In [203]:
# function to calculate percent turnout by income bracket
def calc_votes_edu(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'College_Ed', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_edu.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'College_Ed', 'voted']) 
    voter_turnout_stats['perc_turnout'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout']] = voter_turnout_stats[['voted_voters', 'perc_turnout']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='College_Ed', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes_edu(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_edu = pivot_df.copy() 
    else:
        voter_turnout_edu = pd.concat([voter_turnout_edu, pivot_df])

In [204]:
voter_turnout_edu.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,General,2018,2018_11_06,Alpine,2796.0,458.0,1933.0,337.0,0.691345,0.735808
1,General,2018,2018_11_06,American Fork,6808.0,2027.0,4172.0,1291.0,0.612808,0.636902
2,General,2018,2018_11_06,Bluffdale,3103.0,1205.0,2019.0,803.0,0.650661,0.66639
3,General,2018,2018_11_06,Bountiful,10872.0,2785.0,8119.0,2106.0,0.746781,0.756194
4,General,2018,2018_11_06,Brigham City,4077.0,2106.0,2846.0,1518.0,0.698062,0.720798


In [205]:
# Merge 3.1, 3.2, 3.3, and 3.4
voter_turnout_merge = voter_turnout_merge.merge(voter_turnout_edu, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.901114,0.808653,0.835677,85141.696468,4077.0,2106.0,3657.0,1866.0,0.896983,0.88604
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.768802,0.572283,0.611892,85141.696468,4077.0,2106.0,2846.0,1518.0,0.698062,0.720798
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.750696,0.578614,0.629972,85141.696468,4077.0,2106.0,2985.0,1534.0,0.732156,0.728395
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.466574,0.257826,0.293692,85141.696468,4077.0,2106.0,1466.0,804.0,0.359578,0.381766
4,Brigham City,2019_11_05,2019,Consolidated_General,104.0,,524.0,8553.0,23.0,,...,0.458217,0.273303,0.292085,85141.696468,4077.0,2106.0,1494.0,814.0,0.366446,0.386515


#  3.4.  Calculate voter turnout per age group

In [206]:
#just average age of voter for each election
#need to calculate age on election date using DOB
#no need to bucket!

In [207]:
def calc_age(df):
    age_df = df[['Residence_Addresses_City', 'Voters_Age'] + elec_date_cols]
    melt_age_df = age_df.melt(id_vars=['Residence_Addresses_City', 'Voters_Age'], 
              value_vars=elec_date_cols,
              var_name='elec_type_date',
              value_name='voted')
    melt_age_df = melt_age_df[melt_age_df['voted'] == 'Y']

    melt_age_df = melt_age_df.astype({'Voters_Age': float})                        
    melt_age_df = melt_age_df.groupby(['Residence_Addresses_City', 'elec_type_date']).agg({'Voters_Age':'mean'}).reset_index()    
    melt_age_df.rename(columns = {'Voters_Age': 'mean_age'}, inplace = True)
    melt_age_df['elec_date'] = melt_age_df['elec_type_date'].str[-10:]
    melt_age_df['elec_year'] = melt_age_df['elec_type_date'].str[-10:-6]
    melt_age_df['elec_type'] = melt_age_df['elec_type_date'].str[:-11]
    melt_age_df = melt_age_df.drop(columns = 'elec_type_date').reset_index(drop=True)
    
    return melt_age_df

avg_age = calc_age(merged_file)
print(avg_age.shape)
avg_age.head()

(526, 5)


Unnamed: 0,Residence_Addresses_City,mean_age,elec_date,elec_year,elec_type
0,Alpine,65.760128,2003_11_04,2003,Consolidated_General
1,Alpine,62.69945,2007_11_06,2007,Consolidated_General
2,Alpine,63.147686,2013_11_05,2013,Consolidated_General
3,Alpine,61.100508,2015_11_03,2015,Consolidated_General
4,Alpine,58.505723,2017_11_07,2017,Consolidated_General


In [208]:

# Merge 3.1, 3.2, 3.3, 3.4 and 3.5

voter_turnout_merge = voter_turnout_merge.merge(avg_age,
                                                how = 'inner',
                                                on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.808653,0.835677,85141.696468,4077.0,2106.0,3657.0,1866.0,0.896983,0.88604,53.145914
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.572283,0.611892,85141.696468,4077.0,2106.0,2846.0,1518.0,0.698062,0.720798,57.152151
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.578614,0.629972,85141.696468,4077.0,2106.0,2985.0,1534.0,0.732156,0.728395,57.16618
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.257826,0.293692,85141.696468,4077.0,2106.0,1466.0,804.0,0.359578,0.381766,64.762912
4,Brigham City,2019_11_05,2019,Consolidated_General,104.0,,524.0,8553.0,23.0,,...,0.273303,0.292085,85141.696468,4077.0,2106.0,1494.0,814.0,0.366446,0.386515,62.947498


# 3.5 Calculate total population of age 20+

In [210]:
# read in cities.csv 
df_cities = pd.read_csv(filepath + cities_filename)

# fix naming errors
if state == 'california':
    # replace El Paso de Robles with Paso Robles:
    df_cities.loc[df_cities['city'] == 'El Paso de Robles', 'city'] = 'Paso Robles'
elif state == 'utah':
    # replace St. George with Saint George
    df_cities.loc[df_cities['city'] == 'St. George', 'city'] = 'Saint George'
    df_cities = df_cities.dropna(subset=['population_proper'])
    
# make new column w/ voter_population, that is total population * percent20+
def calculate_voter_pop(row):
    percent_voting_age = row[['age_20s','age_30s', 'age_40s', 'age_50s', 'age_60s', 
                              'age_70s', 'age_over_80']].sum() / 100
    return int(np.floor(row['population_proper'] * percent_voting_age))

df_cities['voter_population'] = df_cities.apply(lambda row: calculate_voter_pop(row), axis=1)
df_cities.head()

Unnamed: 0,RCV,city,city_ascii,city_alt,state_id,state_name,county_fips,county_name,county_fips_all,county_name_all,...,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,voter_population
0,1.0,Salt Lake City,Salt Lake City,,UT,Utah,49035.0,Salt Lake,49035.0,Salt Lake,...,12.8,3.1,21.3,10.6,17.8,4.9,19.4,14.6,4.7,152613
1,,Ogden,Ogden,,UT,Utah,49057.0,Weber,49057.0,Weber,...,7.1,3.9,32.3,13.0,21.4,4.1,20.3,17.1,7.8,60921
2,,Provo,Provo,,UT,Utah,49049.0,Utah,49049.0,Utah,...,2.6,3.6,16.3,8.3,25.4,3.4,17.8,11.8,2.5,83542
3,,West Valley City,West Valley City,,UT,Utah,49035.0,Salt Lake,49035.0,Salt Lake,...,25.0,4.3,37.7,9.4,13.8,7.6,22.0,20.8,5.4,89055
4,,Saint George,St. George,Saint George,UT,Utah,49053.0,Washington,49053.0,Washington,...,4.6,2.8,13.0,13.8,15.5,2.1,15.0,13.9,8.6,60096


In [211]:
# merge with all previous calculations
voter_turnout_merge = voter_turnout_merge.merge(df_cities[['city', 'voter_population']], how = 'inner', left_on = ['Residence_Addresses_City'], right_on = ['city'])
voter_turnout_merge.drop(columns=['city'], inplace=True)
voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.835677,85141.696468,4077.0,2106.0,3657.0,1866.0,0.896983,0.88604,53.145914,13197
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.611892,85141.696468,4077.0,2106.0,2846.0,1518.0,0.698062,0.720798,57.152151,13197
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.629972,85141.696468,4077.0,2106.0,2985.0,1534.0,0.732156,0.728395,57.16618,13197
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.293692,85141.696468,4077.0,2106.0,1466.0,804.0,0.359578,0.381766,64.762912,13197
4,Brigham City,2019_11_05,2019,Consolidated_General,104.0,,524.0,8553.0,23.0,,...,0.292085,85141.696468,4077.0,2106.0,1494.0,814.0,0.366446,0.386515,62.947498,13197


# Save the merged aggregations 

In [147]:
voter_turnout_merge.to_csv(f'{filepath}voter_turnout_merged_{state}.csv', index=False)

In [148]:
del voter_turnout_merge
gc.collect()

20

In [149]:
end_time = time.time()
print("Time take to run this notebook in seconds: ", end_time - start_time)

Time take to run this notebook in seconds:  132.2906289100647
