# 1. Data Loading

1. DEMOGRAPHIC_selected_cols parquet file contains all rows with selected columns (voter id, city, county, ethnicity, age, gender, education, income, donation and parties description). See Reduce_to_parquet.ipynb 
2. VOTEHISTORY_selected_cols parquet file contains all rows with selected columns (General and Local_or_Municipal). See Reduce_to_parquet.ipynb 
3. GE_LM_dates_per_city parquet file contains four most recent General election and Local_or_Municipal for each of the selected city. See Find_recent_election_dates.ipynb and ca_similarity_search.ipynb

Some observations
- We choose the 5 non-RCV cities with highest cosine similary score compared to the 7 RCV cities in CA
- There were 33 distinct cities among those 35 cities
- There are 66 non-registered voters among 21.7 million voters
- There are total of 3.9 million voters in the sampled cities
- City 'El Paso de Robles' didn't match in demographic data


In [1]:
import pandas as pd
import numpy as np
import janitor
import gc
import time
from datetime import datetime
start_time = time.time()

#### Define state here

In [4]:
state = 'california'

In [5]:
def combine_cities_list(RCV_list, NonRCV_list):

    print("total number of cities:", len(RCV_list))

    print("number of distinct cities:", len(set(NonRCV_list)))

    print("name of cities that were duplicated:", set([x for x in NonRCV_list if NonRCV_list.count(x) > 1]))

    combined_cityName = RCV_list+list(set(NonRCV_list))
    print("number of distinct RCV and sampled nonRCV cities:", len(combined_cityName))
    return combined_cityName


### California

In [7]:
if state=='california':
    # ------ California -------

    ## change the filepath as required, we have selected the folder with the latest date

    #filepath = '../Downloads/VM2--CA--2022-04-25/'
    #filepath = '../Downloads/'
    #filepath = '../data/VM2--CA--2022-04-25/'
    filepath = '../CA/VM2--CA--2022-04-25/'
    DEMO_filename = 'VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CA.parquet'
    cities_filename = 'ca-cities.csv'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_CA = ['San Francisco',
     'Oakland',
     'Berkeley',
     'San Leandro',
     'Palm Desert',
     'Eureka',
     'Albany']

    sampled_nonRCV_cities_CA = ['Fresno',
     'San Diego',
     'Sacramento',
     'Riverside',
     'San Jose',
     'Santa Ana',
     'Anaheim',
     'Santa Rosa',
     'Merced',
     'Santa Clarita',
     'Alhambra',
     'Davis',
     'Montebello',
     'Burbank',
     'Huntington Park',
     'Bellflower',
     'Watsonville',
     'Gilroy',
     'Whittier',
     'Lynwood',
     'Lakewood',
     'Pico Rivera',
     'Lake Forest',
     'Livermore',
     'Chino Hills',
     'Paramount',
     'El Paso de Robles',
     'Pico Rivera',
     'Buena Park',
     'Whittier',
     'Calabasas',
     'Carpinteria',
     'Morro Bay',
     'San Carlos',
     'Solvang']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CA, NonRCV_list = sampled_nonRCV_cities_CA)
    # ---------------------

total number of cities: 7
number of distinct cities: 33
name of cities that were duplicated: {'Whittier', 'Pico Rivera'}
number of distinct RCV and sampled nonRCV cities: 40


### Utah

In [154]:
if state=='utah':
    # # ------ Utah -------
    filepath = '../data/VM2--UT--2022-03-30/'
    DEMO_filename = 'VM2--UT--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--UT--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_UT.parquet'
    cities_filename = 'ut-cities.csv'


    ##1. List of RCV and non-RCV cities 

    RCV_cities_UT = ['Salt Lake City', 'Sandy', 'Lehi', 'Millcreek', 
                     'Draper', 'Riverton',  'Cottonwood Heights', 
                     'Springville', 'Midvale', 'Magna', 'South Salt Lake', 
                     'Payson', 'Bluffdale']

    sampled_nonRCV_cities_UT = ['Ogden', 'Provo', 'West Valley City', 
                                'Logan', 'St. George', 'Taylorsville', 
                                'Layton', 'Orem', 'South Jordan', 'Murray', 
                                'South Jordan', 'Clearfield', 'Spanish Fork', 
                                'Tooele', 'Kearns', 'Cedar City', 'Murray', 
                                'Bountiful',  'South Jordan', 'Pleasant Grove', 
                                'Vernal', 'Hurricane', 'Herriman', 'American Fork', 
                                'Washington', 'Eagle Mountain', 'Brigham City', 
                                'American Fork', 'Herriman', 'Spanish Fork', 
                                'Washington', 'Heber', 'Hurricane', 'Vernal', 
                                'Holladay', 'Pleasant Grove', 'American Fork', 
                                'Herriman', 'Eagle Mountain', 'Vernal', 
                                'Bountiful', 'Pleasant Grove', 'Washington', 
                                'South Jordan', 'Vernal', 'Tooele', 
                                'Spanish Fork', 'Clearfield', 'Kearns', 
                                'Eagle Mountain', 'Washington', 'Bountiful', 
                                'Pleasant Grove', 'Hurricane', 'Cedar City', 
                                'Saratoga Springs', 'Kaysville', 'Brigham City', 
                                'North Salt Lake', 'American Fork', 'Highland', 
                                'Lindon', 'Alpine', 'West Haven', 'North Logan']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_UT, NonRCV_list = sampled_nonRCV_cities_UT)
    # # ---------------------


total number of cities: 13
number of distinct cities: 34
name of cities that were duplicated: {'Bountiful', 'Cedar City', 'American Fork', 'Herriman', 'Clearfield', 'South Jordan', 'Brigham City', 'Tooele', 'Hurricane', 'Washington', 'Murray', 'Spanish Fork', 'Kearns', 'Pleasant Grove', 'Eagle Mountain', 'Vernal'}
number of distinct RCV and sampled nonRCV cities: 47


### Colorado

In [155]:
if state=='colorado':
    # ------ Colorado -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--CO--2022-04-26/'
    DEMO_filename = 'VM2--CO--2022-04-26-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CO--2022-04-26-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_CO = ['Boulder']

    sampled_nonRCV_cities_CO = ['Alamosa', 'Arvada', 'Brighton', 'Broomfield', 'Castle Rock', 
                                'Ca√±on City', 'Centennial', 'Commerce City', 
                                'Durango', 'Englewood', 'Fountain', 'Glenwood Springs', 'Golden', 
                                'Greenwood Village', 'Highlands Ranch', 'Lafayette', 
                                'Littleton', 'Longmont', 'Louisville', 'Loveland', 'Montrose', 'Northglenn', 
                                'Parker', 'Silverthorne', 'Steamboat Springs', 'Wheat Ridge', 'Windsor']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CO, NonRCV_list = sampled_nonRCV_cities_CO)

    # ---------------------

### Maryland

In [156]:
if state=='maryland':
    # ------ Maryland -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--MD--2022-04-08/'
    DEMO_filename = 'VM2--MD--2022-04-08-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MD--2022-04-08-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_MD = ['Takoma Park']

    sampled_nonRCV_cities_MD = ['Adelphi', 'Annapolis', 'Aspen Hill', 'Bethesda', 
                                'Cockeysville', 'College Park', 'Easton', 
                                'Hyattsville', 'New Carrollton', 
                                'North Bethesda', 'North Potomac', 'Ocean Pines', 'Potomac', 
                                'Princess Anne', 'Severna Park', 'Timonium', 
                                'Westminster']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MD, NonRCV_list = sampled_nonRCV_cities_MD)
    # ---------------------


### Maine

In [157]:
if state=='maine':
    # ------ Maine -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--ME--2022-03-02/'
    DEMO_filename = 'VM2--ME--2022-03-02-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--ME--2022-03-02-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Sanford', 'Westbrook', 'Lewiston', 'Wells', 'Standish', 'Waterville',
     'Falmouth', 'Windham', 'Kennebunk', 'Scarborough', 'South Portland', 'Bangor',
     'Augusta', 'Brunswick', 'Auburn', 'Portland', 'Biddeford', 'York', 'Saco',
     'Orono', 'Gorham']
    # ---------------------


### Minnesota

In [158]:
if state=='minnesota':
    # ------ Minnesota -------
    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--MN--2022-03-25/'
    DEMO_filename = 'VM2--MN--2022-03-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MN--2022-03-25-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_MN = ['Bloomington', 'Minneapolis', 'Minnetonka', 'St. Louis Park']

    sampled_nonRCV_cities_MN = ['Alexandria', 'Bemidji', 'Blaine', 'Blaine', 'Brainerd', 'Brainerd', 
                                'Brooklyn Center', 'Brooklyn Park', 'Brooklyn Park', 'Burnsville', 
                                'Coon Rapids', 'Coon Rapids', 'Duluth', 'Eagan', 'Eden Prairie', 'Eden Prairie', 
                                'Fridley', 'Inver Grove Heights', 'Mankato', 'Mankato', 'Maple Grove', 'Maplewood', 
                                'Moorhead', 'Plymouth', 'Richfield', 'Richfield', 'Rochester', 'Roseville', 
                                'Roseville', 'St. Cloud', 'St. Paul', 'Winona']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MN, NonRCV_list = sampled_nonRCV_cities_MN)


### New Mexico

In [159]:
if state=='new mexico':
    # ------ New Mexico -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--NM--2022-03-30/'
    DEMO_filename = 'VM2--NM--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--NM--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Los Alamos', 'Albuquerque', 'Rio Rancho', 'Farmington',
     'Taos', 'Las Cruces', 'Silver City', 'Roswell', 'Lovington', 'Deming',
     'Alamogordo', 'Chaparral', 'Las Vegas', 'Los Lunas', 'Hobbs',
     'Clovis', 'Sunland Park', 'Artesia', 'Grants', 'Carlsbad', 'Portales', 'Gallup',
     'Espa√±ola', 'Santa Fe']
    # ---------------------



### Vermont

In [160]:
if state=='vermont':
    # ------ Vermont -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../Downloads/VM2--VT--2022-04-20/'
    DEMO_filename = 'VM2--VT--2022-04-20-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--VT--2022-04-20-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CO.parquet'

    # 1. List of RCV and non-RCV cities 
    # low number of cities
    combined_sampled_cityName = ['Burlington', 'South Burlington', 'Essex', 'Rutland', 'Bennington', 'Milton',
     'Essex Junction', 'Barre', 'Colchester', 'Brattleboro']
    # ---------------------


# 1.1 Demographic Data

1. Select only the columns required: city name ('Residence_Addresses_City'), unique voter id ('LALVOTERID'), voter's ethnicity ('EthnicGroups_EthnicGroup1Desc'), date when voter was registered ('Voters_OfficialRegDate'), voter's gender, date of birth, plus additionsl columns
2. Keep only the cities that were identified as being similar to RCV cities in CA (See ca_similarity_search.ipynb for reference) 
3. Keep only rows EthnicGroups_EthnicGroup1Desc == “European”,  “Likely African-American”,“Hispanic and Portuguese” and “East and South Asian” 
4. Keep only registered voters identified in 'Voters_OfficialRegDate'


In [8]:
def read_DEMOGRAPHIC():
    df_demographic = pd.read_parquet(f'{filepath}{DEMO_filename}')
    print("Total number of unique cities:", df_demographic.Residence_Addresses_City.nunique())
    print("Total number of unique voters:", df_demographic.LALVOTERID.nunique())
    print("Count of non-registered voters:", len(df_demographic[df_demographic['Voters_OfficialRegDate'].isnull()]))
    
    print("Number of expected cities:", len(combined_sampled_cityName))
    missing_cities = [city for city in combined_sampled_cityName if city not in df_demographic['Residence_Addresses_City'].unique()]
    if len(missing_cities) > 0:
        print("number of cities not found in demographic data:", len(missing_cities))
        print(missing_cities)
        
    return df_demographic
        
state_demographic = read_DEMOGRAPHIC()

Total number of unique cities: 1533
Total number of unique voters: 21711617
Count of non-registered voters: 66
Number of expected cities: 40
number of cities not found in demographic data: 1
['El Paso de Robles']


In [9]:
state_demographic.head(5)

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,Oakland,F,26,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Oakland,F,47,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,Oakland,M,60,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,San Leandro,F,56,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


### California

In [10]:
if state == 'california':
    # ----- California ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('El Paso de Robles', 'Paso Robles'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # ----------------------

    # Modifying the names of cities that were duplicate (Calabasas Hills > Calabasas and Huntington Pk > Huntington Park)
    state_demographic.loc[state_demographic['Residence_Addresses_City'] == 'Huntington Pk', 'Residence_Addresses_City'] = 'Huntington Park'
    state_demographic.loc[state_demographic['Residence_Addresses_City'] == 'Calabasas Hills', 'Residence_Addresses_City'] = 'Calabasas'
    # ----------------------

number of expected cities: 40


### Utah

In [164]:
if state=='utah':
    # # ----- Utah ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('St. George', 'Saint George'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------


number of expected cities: 47


### Colorado

In [165]:
if state=='colorado':
    # NOTE: ['Sherrelwood', 'Cherry Creek', 'Ken Caryl'] not found in demographic data so were removed from 
    # sampled_non_RCV_cities_CO list

    # # ----- Colorado ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('Ca√±on City', 'Canon City'), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### Maryland

In [166]:
if state=='maryland':
    # NOTE: ['Colesville', 'Fairland', 'Cloverly', 'Annapolis Neck', 'Redland', 'Glenmont', 'Travilah', 
    # 'South Laurel', 'White Oak', 'Glassmanor', 'Kemp Mill', 'Parole', 'Calverton'] not found in demographic data 
    # so were removed from sampled_non_RCV_cities_MD list

    # # ----- Maryland ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace(), combined_sampled_cityName))
    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### Minnesota

In [167]:
if state=='minnesota':
    # # ----- Minnesota ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Louis Park', 'St Louis Park'), combined_sampled_cityName))
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Paul', 'Saint Paul'), combined_sampled_cityName))
    combined_sampled_cityName = list(map(lambda x: x.replace('St. Cloud', 'Saint Cloud'), combined_sampled_cityName))

    print("number of expected cities:", len(combined_sampled_cityName))
    # # ----------------------

### New Mexico

In [168]:
if state=='new mexico':
    # NOTE: ['North Valley', 'South Valley'] not found in demographic data so were removed 
    # from sampled_non_RCV_cities_NM list

    # # ----- New Mexico ----- 
    combined_sampled_cityName = list(map(lambda x: x.replace('Espa√±ola', 'Espanola'), combined_sampled_cityName))

    print("number of expected cities:", len(combined_sampled_cityName))

### Filter DEMOGRAPHIC data based on the list of cities, ethnicities, and registered voters

In [11]:
# 2. filter DEMOGRAPHIC data based on the list of cities, ethnicities and registered voters

selected_ethnicities = ['European', 'Likely African-American','Hispanic and Portuguese', 'East and South Asian', 'Other']

def filter_demo(df, list_cityNames):
    filtered_df = df[df['Residence_Addresses_City'].isin(list_cityNames) &
            df['EthnicGroups_EthnicGroup1Desc'].isin(selected_ethnicities) &
            df['Voters_OfficialRegDate'].notnull()]
    #[['LALVOTERID', 'Residence_Addresses_City']]
    
    print(filtered_df.shape)
    print("number of unique cities:", filtered_df.Residence_Addresses_City.nunique())
    
    return filtered_df

state_demographic_subset = filter_demo(df = state_demographic, list_cityNames = combined_sampled_cityName)
state_demographic_subset.head()

(4140404, 14)
number of unique cities: 40


Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,Oakland,F,26,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Oakland,F,47,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,Oakland,M,60,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,San Leandro,F,56,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [12]:
del state_demographic
gc.collect()

20

# 1.2 Vote History

1. Select only the columns that are 4 most recent General elections and 4 most recent Local_or_Municipal elections and EthnicGroups_EthnicGroup1Desc
2. Load Vote History 
3. Merge Vote History with the sampled Demographic Data 


## 1. Get four most recent election dates

In [13]:
# load the list of election dates for each city
GE_LM_dates_dict = pd.read_parquet(f'{filepath}{elec_dates_filename}')
GE_LM_dates_dict

Unnamed: 0,city,GE_dates,LM_dates
0,Oakland,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
1,San Leandro,"[General_2020_11_03, General_2018_11_06, Gener...","[Local_or_Municipal_2020_08_03, Consolidated_G..."
2,Livermore,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
3,Berkeley,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
4,Albany,"[General_2020_11_03, General_2018_11_06, Gener...","[Local_or_Municipal_2021_03_02, Local_or_Munic..."
5,San Francisco,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
6,San Diego,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
7,San Jose,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
8,Fresno,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2021_11_02, Local_or_Mun..."
9,Eureka,"[General_2020_11_03, General_2018_11_06, Gener...","[Local_or_Municipal_2021_06_08, Local_or_Munic..."


In [14]:
e_dates = set()
for v in GE_LM_dates_dict['GE_dates']:
    for vv in v:
        e_dates.add(vv)
for v in GE_LM_dates_dict['LM_dates'] :
    for vv in v:
        e_dates.add(vv)
        
print(list(e_dates))
## when all four dates are not found e_dates will contain None, we need to remove it
if None in list(e_dates):
    e_dates.remove(None)
    print(list(e_dates))

['General_2014_11_04', 'Local_or_Municipal_2019_06_04', 'Consolidated_General_2019_11_05', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2019_08_27', 'Local_or_Municipal_2021_07_20', 'Consolidated_General_2021_11_02', 'Local_or_Municipal_2021_04_20', 'Local_or_Municipal_2021_03_02', 'General_2016_11_08', 'Local_or_Municipal_2019_08_13', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2020_04_14', 'Local_or_Municipal_2020_08_03', 'General_2018_11_06', 'Local_or_Municipal_2019_04_16', 'General_2020_11_03', 'Local_or_Municipal_2021_05_11', 'Local_or_Municipal_2021_06_01', 'Local_or_Municipal_2019_03_05']


In [28]:
# need in order to filter out rows after aggregation
def get_correct_dates(list_like_df):
    print("Shape before reshaping:",list_like_df.shape)
    list_like_df = list_like_df.explode(['GE_dates', 'LM_dates']).melt(id_vars=["city"], 
                                                                       var_name="Date", 
                                                                       value_name="Value")
    list_like_df = list_like_df.drop(columns = 'Date')
    list_like_df.columns = ['Residence_Addresses_City', 'elec_type_date']                          
    list_like_df['elec_date'] = list_like_df['elec_type_date'].str[-10:]
    list_like_df['elec_year'] = list_like_df['elec_type_date'].str[-10:-6]
    list_like_df['elec_type'] = list_like_df['elec_type_date'].str[:-11]                    
    list_like_df = list_like_df.drop(columns = 'elec_type_date')
    print("Shape after reshaping:",list_like_df.shape)
    return list_like_df

GE_LM_dates_df = get_correct_dates(GE_LM_dates_dict)
GE_LM_dates_df.head()

Shape before reshaping: (40, 3)
Shape after reshaping: (320, 4)


Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type
0,Oakland,2020_11_03,2020,General
1,Oakland,2018_11_06,2018,General
2,Oakland,2016_11_08,2016,General
3,Oakland,2014_11_04,2014,General
4,San Leandro,2020_11_03,2020,General


## 2. load the VOTE HISTORY data for selected election dates only

In [15]:
needed_variables = ['LALVOTERID'] + list(e_dates)

state_voterhistory_4_dates = pd.read_parquet(f'{filepath}{VOTE_filename}',
                                             columns=needed_variables)
                                
state_voterhistory_4_dates.head(5)

Unnamed: 0,LALVOTERID,General_2014_11_04,Local_or_Municipal_2019_06_04,Consolidated_General_2019_11_05,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_08_27,Local_or_Municipal_2021_07_20,Consolidated_General_2021_11_02,Local_or_Municipal_2021_04_20,Local_or_Municipal_2021_03_02,...,Local_or_Municipal_2019_08_13,Consolidated_General_2017_11_07,Local_or_Municipal_2020_04_14,Local_or_Municipal_2020_08_03,General_2018_11_06,Local_or_Municipal_2019_04_16,General_2020_11_03,Local_or_Municipal_2021_05_11,Local_or_Municipal_2021_06_01,Local_or_Municipal_2019_03_05
0,LALCA453164106,,,,,,,,,,...,,,,,Y,,Y,,,
1,LALCA453008306,,,,,,,,,,...,,,,,Y,,,,,
2,LALCA22129469,Y,,,,,,,,,...,,,,,Y,,Y,,,
3,LALCA549803906,,,,,,,,,,...,,,,,,,Y,,,
4,LALCA24729024,,,,,,,,,,...,,,,,,,,,,


## 3. Merge Vote History and Demographic Data

In [16]:
merged_file = pd.merge(state_voterhistory_4_dates, state_demographic_subset,
                       how='inner', left_on='LALVOTERID', right_on='LALVOTERID')

print(merged_file.shape)

print("number of unique cities:", merged_file.Residence_Addresses_City.nunique())

merged_file.head(5)

(4140404, 34)
number of unique cities: 40


Unnamed: 0,LALVOTERID,General_2014_11_04,Local_or_Municipal_2019_06_04,Consolidated_General_2019_11_05,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_08_27,Local_or_Municipal_2021_07_20,Consolidated_General_2021_11_02,Local_or_Municipal_2021_04_20,Local_or_Municipal_2021_03_02,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,,,,,,,,,,...,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,,,,,,,,,,...,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Y,,,,,,,,,...,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,,,,,,,,,,...,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,,,,,,,,,,...,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [176]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
print(merge_filename)
merged_file.to_parquet(f'{filepath}{merge_filename}')

VM2--UT--2022-03-30-merged.parquet


# 3.1. Calculate voter turnout per ethnicity

In [17]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
merged_file = pd.read_parquet(f'{filepath}{merge_filename}')

In [18]:
def replace_ethnicities(df):
    df = df.replace('East and South Asian', 'asian')
    df = df.replace('European', 'white')
    df = df.replace('Hispanic and Portuguese', 'hispanic')
    df = df.replace('Likely African-American', 'black')
    df = df.replace('Other', 'others') # including 'others' ethnical group
    return df

In [19]:
merged_file = replace_ethnicities(merged_file)
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2017_11_07,Local_or_Municipal_2019_06_04,Consolidated_General_2021_11_02,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_04_16,Local_or_Municipal_2020_08_03,General_2016_11_08,Local_or_Municipal_2019_08_13,Local_or_Municipal_2019_08_27,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,,,,,,,Y,,,...,04/29/1993,Democratic,others,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,,,,,,,,,,...,02/02/1996,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,,,,,,,Y,,,...,02/02/1975,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,,,,,,,,,,...,02/09/1962,Democratic,others,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,,,,,,,,,,...,01/01/1966,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [20]:
GE_cols = [col for col in merged_file.columns if col.startswith('General')]
print(GE_cols)
LM_cols = [col for col in merged_file.columns if col.startswith('Local_or_Municipal') \
           or col.startswith('Consolidated_General')]
print(LM_cols)

['General_2016_11_08', 'General_2020_11_03', 'General_2014_11_04', 'General_2018_11_06']
['Consolidated_General_2017_11_07', 'Local_or_Municipal_2019_06_04', 'Consolidated_General_2021_11_02', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2019_04_16', 'Local_or_Municipal_2020_08_03', 'Local_or_Municipal_2019_08_13', 'Local_or_Municipal_2019_08_27', 'Local_or_Municipal_2019_03_05', 'Local_or_Municipal_2021_03_02', 'Consolidated_General_2019_11_05', 'Local_or_Municipal_2021_05_11', 'Local_or_Municipal_2021_07_20', 'Local_or_Municipal_2021_06_01', 'Local_or_Municipal_2020_04_14', 'Local_or_Municipal_2021_04_20']


In [21]:
# fill NA values with "N" to make it easier to compare  with "Y"
merged_file[GE_cols+LM_cols] = merged_file[GE_cols+LM_cols].fillna('N')
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2017_11_07,Local_or_Municipal_2019_06_04,Consolidated_General_2021_11_02,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_04_16,Local_or_Municipal_2020_08_03,General_2016_11_08,Local_or_Municipal_2019_08_13,Local_or_Municipal_2019_08_27,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,N,N,N,N,N,N,Y,N,N,...,04/29/1993,Democratic,others,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,N,N,N,N,N,N,N,N,N,...,02/02/1996,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,N,N,N,N,N,N,Y,N,N,...,02/02/1975,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,N,N,N,N,N,N,N,N,N,...,02/09/1962,Democratic,others,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,N,N,N,N,N,N,N,N,N,...,01/01/1966,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [22]:
# We created the dataframe below in order to easily calculate perc_turnout when no one voted
list_ethnic_city = merged_file[['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']].drop_duplicates()
list_ethnic_city_No = list_ethnic_city.copy()
list_ethnic_city_No['voted'] = 'N'
list_ethnic_city_Yes = list_ethnic_city.copy()
list_ethnic_city_Yes['voted'] = 'Y'
list_ethnic_city = pd.concat([list_ethnic_city_No, list_ethnic_city_Yes])

In [23]:
list_ethnic_city

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,voted
0,Oakland,others,N
1,Oakland,black,N
2,Oakland,white,N
4,San Leandro,white,N
5,Livermore,white,N
...,...,...,...
3994321,Santa Rosa,asian,Y
3994376,Santa Rosa,others,Y
3994377,Santa Rosa,black,Y
4104814,Davis,others,Y


In [50]:
# we also need the total voters information per city and ethnicity
total_city_ethnic = merged_file.groupby(['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']).size().reset_index()
total_city_ethnic.columns = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', 'total_voters']
total_city_ethnic  = total_city_ethnic.merge(list_ethnic_city, on = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc'])

total_city_ethnic = replace_ethnicities(total_city_ethnic)
total_city_ethnic

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,total_voters,voted
0,Albany,asian,2405,N
1,Albany,asian,2405,Y
2,Albany,black,147,N
3,Albany,black,147,Y
4,Albany,hispanic,1035,N
...,...,...,...,...
395,Whittier,hispanic,76334,Y
396,Whittier,others,2525,N
397,Whittier,others,2525,Y
398,Whittier,white,26477,N


In [25]:
def calc_votes(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_ethnic.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc', 'voted']) 
    voter_turnout_stats['perc_turnout'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout']] = voter_turnout_stats[['voted_voters', 'perc_turnout']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='EthnicGroups_EthnicGroup1Desc', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_merge_ethnicity = pivot_df.copy() 
    else:
        voter_turnout_merge_ethnicity = pd.concat([voter_turnout_merge_ethnicity, pivot_df])


In [26]:
print(voter_turnout_merge_ethnicity.shape)
voter_turnout_merge_ethnicity.head()

(800, 19)


Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white
0,General,2016,2016_11_08,Albany,2405.0,147.0,1035.0,641.0,6169.0,1396.0,88.0,668.0,379.0,4659.0,0.580457,0.598639,0.645411,0.591264,0.755228
1,General,2016,2016_11_08,Alhambra,17451.0,191.0,16596.0,969.0,7359.0,7041.0,90.0,9717.0,517.0,4574.0,0.403473,0.471204,0.585503,0.53354,0.621552
2,General,2016,2016_11_08,Anaheim,26340.0,1211.0,70052.0,8857.0,54644.0,12245.0,609.0,34883.0,4078.0,35829.0,0.464882,0.50289,0.497959,0.460427,0.65568
3,General,2016,2016_11_08,Bellflower,2153.0,3614.0,19899.0,1301.0,10792.0,840.0,2043.0,9474.0,542.0,6092.0,0.390153,0.565302,0.476104,0.416603,0.564492
4,General,2016,2016_11_08,Berkeley,8549.0,5942.0,6388.0,3626.0,39425.0,4602.0,3770.0,3636.0,2066.0,27737.0,0.538309,0.634467,0.569192,0.569774,0.703538


In [29]:
print(voter_turnout_merge_ethnicity.shape)
# remove rows where election dates are not associated with city
# need to do this only once as we will be using inner join to ensure only necessary combinations of city and election dates are present
voter_turnout_merge_ethnicity = GE_LM_dates_df.merge(voter_turnout_merge_ethnicity, 
                                how = 'left',
                                on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])
print(voter_turnout_merge_ethnicity.shape)

(800, 19)
(320, 19)


In [30]:
#should be empty dataframe because of the way we have filitered the dataframe

no_voter_turnout = voter_turnout_merge_ethnicity[(voter_turnout_merge_ethnicity['perc_turnout_asian'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_black'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_hispanic'] == 0) &
                                       (voter_turnout_merge_ethnicity['perc_turnout_white'] == 0)]

no_voter_turnout.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white


#  3.2.  Calculate average donation 

In [31]:
def calc_donation(df):
    donations_df = df[['Residence_Addresses_City', 'FECDonors_TotalDonationsAmount', 'FECDonors_NumberOfDonations']
                  + elec_date_cols]
    melt_donations_df = donations_df.melt(id_vars=['Residence_Addresses_City', 'FECDonors_TotalDonationsAmount', 'FECDonors_NumberOfDonations'], 
              value_vars=elec_date_cols,
              var_name='elec_type_date',
              value_name='voted')
    melt_donations_df = melt_donations_df[melt_donations_df['voted'] == 'Y']

    melt_donations_df = melt_donations_df.astype({'FECDonors_TotalDonationsAmount': float, 'FECDonors_NumberOfDonations': float})                        
    melt_donations_df = melt_donations_df.groupby(['Residence_Addresses_City', 'elec_type_date']).agg({'FECDonors_TotalDonationsAmount':'sum','FECDonors_NumberOfDonations':'sum'}).reset_index()    
    melt_donations_df['mean_donation'] = melt_donations_df['FECDonors_TotalDonationsAmount']/melt_donations_df['FECDonors_NumberOfDonations']
    melt_donations_df['elec_date'] = melt_donations_df['elec_type_date'].str[-10:]
    melt_donations_df['elec_year'] = melt_donations_df['elec_type_date'].str[-10:-6]
    melt_donations_df['elec_type'] = melt_donations_df['elec_type_date'].str[:-11]
    melt_donations_df = melt_donations_df.drop(columns = 'elec_type_date').reset_index(drop=True)
    
    return melt_donations_df

avg_donations = calc_donation(merged_file)
print(avg_donations.shape)
avg_donations.head()

(492, 7)


Unnamed: 0,Residence_Addresses_City,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation,elec_date,elec_year,elec_type
0,Albany,3455.0,43.0,80.348837,2017_11_07,2017,Consolidated_General
1,Albany,44682.0,132.0,338.5,2019_11_05,2019,Consolidated_General
2,Albany,2564493.0,25666.0,99.917907,2014_11_04,2014,General
3,Albany,2936493.0,28594.0,102.696125,2016_11_08,2016,General
4,Albany,2893942.0,27827.0,103.997628,2018_11_06,2018,General


In [32]:
# Merge 3.1 and 3.2
voter_turnout_merge = voter_turnout_merge_ethnicity.merge(avg_donations, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,6346.0,69989.0,0.752974,0.746486,0.725077,0.735512,0.842003,44186445.0,403388.0,109.538323
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,4536.0,57872.0,0.489281,0.569523,0.480363,0.52573,0.69623,42827869.0,394704.0,108.506296
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,4560.0,57968.0,0.524739,0.606025,0.532415,0.528512,0.697385,42296298.0,390007.0,108.4501
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,2097.0,35411.0,0.266176,0.345907,0.221526,0.243046,0.426012,37016092.0,341253.0,108.471111
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,1729.0,14638.0,0.726407,0.768227,0.747692,0.711816,0.823285,1875137.0,30683.0,61.113222


#  3.3.  Calculate voter turnout per income

In [33]:
# percent missing values for income
print('Percent of rows with missing value for income:',
      100 * merged_file['CommercialData_EstimatedHHIncome'].isnull().sum() / merged_file.shape[0], '%')

Percent of rows with missing value for income: 1.5627218986359785 %


As long as this percentage is low, we can continue with our turnout calculations for income.

In [34]:
# Similar to before, but with income
list_income_city = merged_file[['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']].drop_duplicates()
list_income_city_No = list_income_city.copy()
list_income_city_No['voted'] = 'N'
list_income_city_Yes = list_income_city.copy()
list_income_city_Yes['voted'] = 'Y'
list_income_city = pd.concat([list_income_city_No, list_income_city_Yes])

In [35]:
# we also need the total voters information per city and income
total_city_income = merged_file.groupby(['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']).size().reset_index()
total_city_income.columns = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', 'total_voters']
total_city_income  = total_city_income.merge(list_income_city, on = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome'])

total_city_income

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncome,total_voters,voted
0,Albany,$1000-14999,95,N
1,Albany,$1000-14999,95,Y
2,Albany,$100000-124999,1423,N
3,Albany,$100000-124999,1423,Y
4,Albany,$125000-149999,1914,N
...,...,...,...,...
955,Whittier,$35000-49999,6486,Y
956,Whittier,$50000-74999,24453,N
957,Whittier,$50000-74999,24453,Y
958,Whittier,$75000-99999,25324,N


In [36]:
# function to calculate percent turnout by income bracket
def calc_votes_income(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_income.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome', 'voted']) 
    voter_turnout_stats['perc_turnout_income'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout_income']] = voter_turnout_stats[['voted_voters', 'perc_turnout_income']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='CommercialData_EstimatedHHIncome', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout_income']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes_income(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_income = pivot_df.copy() 
    else:
        voter_turnout_income = pd.concat([voter_turnout_income, pivot_df])


In [37]:
voter_turnout_income.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_$1000-14999,total_voters_$100000-124999,total_voters_$125000-149999,total_voters_$15000-24999,total_voters_$150000-174999,total_voters_$175000-199999,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,General,2016,2016_11_08,Albany,95.0,1423.0,1914.0,118.0,2007.0,873.0,...,0.675549,0.728814,0.694071,0.695304,0.758343,0.714286,0.772205,0.528833,0.751566,0.692308
1,General,2016,2016_11_08,Alhambra,935.0,4523.0,3644.0,968.0,1289.0,1291.0,...,0.53787,0.580579,0.573313,0.563904,0.606627,0.594378,0.58612,0.519024,0.463069,0.521314
2,General,2016,2016_11_08,Anaheim,2576.0,17191.0,17997.0,3225.0,7429.0,7195.0,...,0.580097,0.557209,0.601696,0.60417,0.636299,0.563849,0.624249,0.4955,0.501569,0.5196
3,General,2016,2016_11_08,Bellflower,857.0,3339.0,2694.0,928.0,991.0,874.0,...,0.573868,0.568966,0.54894,0.578947,0.582393,0.514286,0.593137,0.444792,0.470068,0.507695
4,General,2016,2016_11_08,Berkeley,1050.0,6466.0,6376.0,777.0,3580.0,7520.0,...,0.611355,0.724582,0.681564,0.685372,0.70482,0.728042,0.760962,0.584843,0.579962,0.634057


In [38]:
# Merge 3.1, 3.2 and 3.3
# merge with previous calculations for race and donation
voter_turnout_merge = voter_turnout_merge.merge(voter_turnout_income, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,0.844532,0.745143,0.83592,0.864074,0.88356,0.708806,0.901653,0.667258,0.692373,0.774733
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,0.676914,0.536449,0.66313,0.707414,0.73604,0.508057,0.742673,0.443464,0.472464,0.57619
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,0.684374,0.580689,0.664555,0.717283,0.739153,0.556968,0.741839,0.494703,0.514018,0.605413
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,0.398418,0.320639,0.373139,0.447006,0.467931,0.309351,0.453466,0.241035,0.2531,0.321743
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,0.780595,0.785362,0.795634,0.817673,0.852273,0.792653,0.831599,0.758358,0.729072,0.739658


In [39]:
# add one column that is just overall average income 
merged_file['CommercialData_EstimatedHHIncomeAmount']= merged_file['CommercialData_EstimatedHHIncomeAmount'].str.replace('$','', regex=False)

merged_file = merged_file.astype({'CommercialData_EstimatedHHIncomeAmount': float})
        
avg_income = merged_file[['Residence_Addresses_City', 'CommercialData_EstimatedHHIncomeAmount']].\
            groupby(['Residence_Addresses_City']).\
            mean().reset_index()

avg_income.head(10)

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncomeAmount
0,Albany,144687.939527
1,Alhambra,94583.458682
2,Anaheim,104340.105909
3,Bellflower,85210.521807
4,Berkeley,149833.4451
5,Buena Park,106024.824131
6,Burbank,122254.424497
7,Calabasas,185269.946742
8,Carpinteria,121623.341137
9,Chino Hills,141805.006169


In [40]:
# Merge 3.1, 3.2 and 3.3
# merge with previous calculations for race, donation, income bracket
voter_turnout_merge = voter_turnout_merge.merge(avg_income, 
                                                          how = 'inner',
                                                          on = ['Residence_Addresses_City'])


voter_turnout_merge.head(10)

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,0.745143,0.83592,0.864074,0.88356,0.708806,0.901653,0.667258,0.692373,0.774733,115534.731762
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,0.536449,0.66313,0.707414,0.73604,0.508057,0.742673,0.443464,0.472464,0.57619,115534.731762
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,0.580689,0.664555,0.717283,0.739153,0.556968,0.741839,0.494703,0.514018,0.605413,115534.731762
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,0.320639,0.373139,0.447006,0.467931,0.309351,0.453466,0.241035,0.2531,0.321743,115534.731762
4,Oakland,2021_11_02,2021,Consolidated_General,30600.0,61476.0,37174.0,8628.0,83122.0,1.0,...,0.0,0.000106,0.0,4.7e-05,0.0,6.9e-05,0.0,4.8e-05,4.2e-05,115534.731762
5,Oakland,2020_08_03,2020,Local_or_Municipal,30600.0,61476.0,37174.0,8628.0,83122.0,55.0,...,0.000577,0.003326,0.002903,0.006603,0.00034,0.004584,0.000975,0.000713,0.001821,115534.731762
6,Oakland,2020_04_14,2020,Local_or_Municipal,30600.0,61476.0,37174.0,8628.0,83122.0,2.0,...,0.0,5.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,8.5e-05,115534.731762
7,Oakland,2019_11_05,2019,Consolidated_General,30600.0,61476.0,37174.0,8628.0,83122.0,320.0,...,0.004616,0.015468,0.012191,0.018251,0.006355,0.019169,0.003989,0.004799,0.010418,115534.731762
8,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,0.785362,0.795634,0.817673,0.852273,0.792653,0.831599,0.758358,0.729072,0.739658,113020.551072
9,San Leandro,2018_11_06,2018,General,12705.0,5596.0,16028.0,2429.0,17780.0,5006.0,...,0.573191,0.551091,0.530947,0.584545,0.574694,0.569171,0.535247,0.465299,0.466684,113020.551072


#  3.4.  Calculate Voter turnout for college ed vs no college

In [41]:
# values for education column
merged_file['CommercialData_Education'].value_counts()

Some College - Likely                             416493
Bach Degree - Extremely Likely                    368922
Bach Degree - Likely                              303216
HS Diploma - Extremely Likely                     250953
HS Diploma - Likely                               214314
Grad Degree - Likely                              211235
Grad Degree - Extremely Likely                    191528
Less than HS Diploma - Likely                     156698
Some College -Extremely Likely                     75317
Vocational Technical Degree - Extremely Likely      2519
Less than HS Diploma - Ex Like                       127
Name: CommercialData_Education, dtype: int64

In [42]:
# add column to merged file for college or no college
college_ed = ['Some College - Likely', 'Bach Degree - Extremely Likely', 'Bach Degree - Likely', 
              'Grad Degree - Likely', 'Grad Degree - Extremely Likely', 'Some College -Extremely Likely']
no_college_ed = ['HS Diploma - Extremely Likely', 'HS Diploma - Likely', 'Less than HS Diploma - Likely', 
                'Vocational Technical Degree - Extremely Likely', 'Less than HS Diploma - Ex Like']

conditions = [merged_file['CommercialData_Education'].isin(college_ed), 
             merged_file['CommercialData_Education'].isin(no_college_ed)]
outputs = ['college', 'no_college']
education_col = np.select(conditions, outputs, None)
education_col = pd.Series(education_col)

# add to merged file
merged_file['College_Ed'] = education_col

In [43]:
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2017_11_07,Local_or_Municipal_2019_06_04,Consolidated_General_2021_11_02,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_04_16,Local_or_Municipal_2020_08_03,General_2016_11_08,Local_or_Municipal_2019_08_13,Local_or_Municipal_2019_08_27,...,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount,College_Ed
0,LALCA453164106,N,N,N,N,N,N,Y,N,N,...,Democratic,others,06/18/2021,ALAMEDA,,,,,,
1,LALCA453008306,N,N,N,N,N,N,N,N,N,...,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,,
2,LALCA22129469,N,N,N,N,N,N,Y,N,N,...,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,,no_college
3,LALCA549803906,N,N,N,N,N,N,N,N,N,...,Democratic,others,02/07/2022,ALAMEDA,,,,,,
4,LALCA24729024,N,N,N,N,N,N,N,N,N,...,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,,no_college


In [44]:
# get counts for voters and college ed
list_edu_city = merged_file[['Residence_Addresses_City', 'College_Ed']].drop_duplicates()
list_edu_city_No = list_edu_city.copy()
list_edu_city_No['voted'] = 'N'
list_edu_city_Yes = list_edu_city.copy()
list_edu_city_Yes['voted'] = 'Y'
list_edu_city = pd.concat([list_edu_city_No, list_edu_city_Yes])

# we also need the total voters information per city and education
total_city_edu = merged_file.groupby(['Residence_Addresses_City', 'College_Ed']).size().reset_index()
total_city_edu.columns = ['Residence_Addresses_City', 'College_Ed', 'total_voters']
total_city_edu  = total_city_edu.merge(list_edu_city, on = ['Residence_Addresses_City', 'College_Ed'])

total_city_edu

Unnamed: 0,Residence_Addresses_City,College_Ed,total_voters,voted
0,Albany,college,5107,N
1,Albany,college,5107,Y
2,Albany,no_college,696,N
3,Albany,no_college,696,Y
4,Alhambra,college,13958,N
...,...,...,...,...
155,Watsonville,no_college,6446,Y
156,Whittier,college,37134,N
157,Whittier,college,37134,Y
158,Whittier,no_college,23619,N


In [45]:
# function to calculate percent turnout by income bracket
def calc_votes_edu(df, col):
    voter_turnout_stats = df.groupby(['Residence_Addresses_City', 'College_Ed', col]).size().reset_index(name='voted_voters')

    # 'voted' is either 'Y' or 'N'
    voter_turnout_stats = voter_turnout_stats.rename(columns = {col: 'voted'})    

    voter_turnout_stats = total_city_edu.merge(voter_turnout_stats, 
                                                     how = 'left',
                                                     on = ['Residence_Addresses_City', 'College_Ed', 'voted']) 
    voter_turnout_stats['perc_turnout'] = voter_turnout_stats['voted_voters']/voter_turnout_stats['total_voters']

    voter_turnout_stats['elec_date'] = col[len(col)-10:]
    voter_turnout_stats['elec_year'] = col[len(col)-10:len(col)-6]
    voter_turnout_stats['elec_type'] = col[:len(col)-11]

    voter_turnout_stats[['voted_voters', 'perc_turnout']] = voter_turnout_stats[['voted_voters', 'perc_turnout']].fillna(0)
    voter_turnout_stats = voter_turnout_stats[voter_turnout_stats['voted'] == 'Y']    
    pivot_df = voter_turnout_stats.pivot(index = ['elec_type','elec_year', 'elec_date', 'Residence_Addresses_City'],
                                    columns='College_Ed', 
                                    values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    pivot_df.columns = pivot_df.columns.map('_'.join)
    pivot_df = pivot_df.rename(columns = {'elec_type_':'elec_type', 'elec_year_':'elec_year', 'elec_date_':'elec_date', 'Residence_Addresses_City_':'Residence_Addresses_City'})

    del voter_turnout_stats
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

for i in range(len(elec_date_cols)):
    col = elec_date_cols[i]
    pivot_df = calc_votes_edu(merged_file, col)    
    # stack all types of election into one dataframe 
    if i == 0:
        voter_turnout_edu = pivot_df.copy() 
    else:
        voter_turnout_edu = pd.concat([voter_turnout_edu, pivot_df])

In [46]:
voter_turnout_edu.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,General,2016,2016_11_08,Albany,5107.0,696.0,4031.0,553.0,0.789309,0.79454
1,General,2016,2016_11_08,Alhambra,13958.0,7112.0,8595.0,4471.0,0.615776,0.628656
2,General,2016,2016_11_08,Anaheim,52558.0,28944.0,34878.0,19107.0,0.66361,0.660137
3,General,2016,2016_11_08,Bellflower,10496.0,7699.0,6468.0,4823.0,0.616235,0.626445
4,General,2016,2016_11_08,Berkeley,27403.0,3717.0,21904.0,2861.0,0.799329,0.769707


In [47]:
# Merge 3.1, 3.2, 3.3, and 3.4
voter_turnout_merge = voter_turnout_merge.merge(voter_turnout_edu, 
                                                          how = 'inner',
                                                          on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,0.667258,0.692373,0.774733,115534.731762,80668.0,27989.0,71356.0,22543.0,0.884564,0.805424
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,0.443464,0.472464,0.57619,115534.731762,80668.0,27989.0,59248.0,17372.0,0.734467,0.620672
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,0.494703,0.514018,0.605413,115534.731762,80668.0,27989.0,60766.0,18951.0,0.753285,0.677087
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,0.241035,0.2531,0.321743,115534.731762,80668.0,27989.0,38216.0,10811.0,0.473744,0.386259
4,Oakland,2021_11_02,2021,Consolidated_General,30600.0,61476.0,37174.0,8628.0,83122.0,1.0,...,0.0,4.8e-05,4.2e-05,115534.731762,80668.0,27989.0,2.0,1.0,2.5e-05,3.6e-05


#  3.5.  Calculate voter average age

In [48]:
def calc_age(df):
    age_merged_file = merged_file[['Residence_Addresses_City', 'Voters_BirthDate'] + elec_date_cols]
    melt_age_df = age_merged_file.melt(id_vars=['Residence_Addresses_City', 'Voters_BirthDate'],
                                       value_vars=elec_date_cols,
                                       var_name='elec_type_date',
                                       value_name='voted')
    melt_age_df = melt_age_df[melt_age_df['voted'] == 'Y']
    
    elec_date_dict = pd.DataFrame(elec_date_cols, columns = ['elec_type_date'])
    elec_date_dict['date'] = elec_date_dict['elec_type_date'].str.slice(-5,-3) + '/' + elec_date_dict['elec_type_date'].str.slice(-2) + '/' + elec_date_dict['elec_type_date'].str.slice(-10,-6)
    melt_age_df = melt_age_df.merge(elec_date_dict,
                                    how = 'inner', 
                                    on = ['elec_type_date'])
    melt_age_df.rename(columns = {'date': 'voting_date'}, inplace = True)
    melt_age_df['Voters_BirthDate'] = pd.to_datetime(melt_age_df['Voters_BirthDate'], format='%m/%d/%Y')
    melt_age_df['voting_date'] = pd.to_datetime(melt_age_df['voting_date'], format='%m/%d/%Y')
    melt_age_df['age_on_vote'] = (melt_age_df['voting_date'] - melt_age_df['Voters_BirthDate']) / np.timedelta64(1, 'Y')
    melt_age_df['elec_date'] = melt_age_df['elec_type_date'].str[-10:]
    melt_age_df['elec_year'] = melt_age_df['elec_type_date'].str[-10:-6]
    melt_age_df['elec_type'] = melt_age_df['elec_type_date'].str[:-11]
    
    age_df = melt_age_df.groupby(['Residence_Addresses_City', 'elec_date', 'elec_year', 'elec_type']).agg({'age_on_vote':'mean'}).reset_index()
    age_df.rename(columns = {'age_on_vote': 'mean_age'}, inplace = True)
    return age_df

In [49]:
age_df = calc_age(merged_file)
age_df

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,mean_age
0,Albany,2014_11_04,2014,General,52.884080
1,Albany,2016_11_08,2016,General,49.504594
2,Albany,2017_11_07,2017,Consolidated_General,43.797614
3,Albany,2018_11_06,2018,General,50.653974
4,Albany,2019_03_05,2019,Local_or_Municipal,31.922170
...,...,...,...,...,...
487,Whittier,2020_11_03,2020,General,47.928141
488,Whittier,2021_04_20,2021,Local_or_Municipal,45.794233
489,Whittier,2021_06_01,2021,Local_or_Municipal,38.259513
490,Whittier,2021_07_20,2021,Local_or_Municipal,55.780324


In [51]:
# Merge 3.1, 3.2, 3.3, 3.4 and 3.5

voter_turnout_merge = voter_turnout_merge.merge(age_df,
                                                how = 'inner',
                                                on = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'])

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,0.692373,0.774733,115534.731762,80668.0,27989.0,71356.0,22543.0,0.884564,0.805424,47.829424
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,0.472464,0.57619,115534.731762,80668.0,27989.0,59248.0,17372.0,0.734467,0.620672,48.224196
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,0.514018,0.605413,115534.731762,80668.0,27989.0,60766.0,18951.0,0.753285,0.677087,46.819512
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,0.2531,0.321743,115534.731762,80668.0,27989.0,38216.0,10811.0,0.473744,0.386259,51.034918
4,Oakland,2021_11_02,2021,Consolidated_General,30600.0,61476.0,37174.0,8628.0,83122.0,1.0,...,4.8e-05,4.2e-05,115534.731762,80668.0,27989.0,2.0,1.0,2.5e-05,3.6e-05,55.732444


In [52]:
del age_df
gc.collect()

40

# 3.6 Calculate total population of age 20+

In [210]:
# read in cities.csv 
df_cities = pd.read_csv(filepath + cities_filename)

# fix naming errors
if state == 'california':
    # replace El Paso de Robles with Paso Robles:
    df_cities.loc[df_cities['city'] == 'El Paso de Robles', 'city'] = 'Paso Robles'
elif state == 'utah':
    # replace St. George with Saint George
    df_cities.loc[df_cities['city'] == 'St. George', 'city'] = 'Saint George'
    df_cities = df_cities.dropna(subset=['population_proper'])
    
# make new column w/ voter_population, that is total population * percent20+
def calculate_voter_pop(row):
    percent_voting_age = row[['age_20s','age_30s', 'age_40s', 'age_50s', 'age_60s', 
                              'age_70s', 'age_over_80']].sum() / 100
    return int(np.floor(row['population_proper'] * percent_voting_age))

df_cities['voter_population'] = df_cities.apply(lambda row: calculate_voter_pop(row), axis=1)
df_cities.head()

Unnamed: 0,RCV,city,city_ascii,city_alt,state_id,state_name,county_fips,county_name,county_fips_all,county_name_all,...,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,voter_population
0,1.0,Salt Lake City,Salt Lake City,,UT,Utah,49035.0,Salt Lake,49035.0,Salt Lake,...,12.8,3.1,21.3,10.6,17.8,4.9,19.4,14.6,4.7,152613
1,,Ogden,Ogden,,UT,Utah,49057.0,Weber,49057.0,Weber,...,7.1,3.9,32.3,13.0,21.4,4.1,20.3,17.1,7.8,60921
2,,Provo,Provo,,UT,Utah,49049.0,Utah,49049.0,Utah,...,2.6,3.6,16.3,8.3,25.4,3.4,17.8,11.8,2.5,83542
3,,West Valley City,West Valley City,,UT,Utah,49035.0,Salt Lake,49035.0,Salt Lake,...,25.0,4.3,37.7,9.4,13.8,7.6,22.0,20.8,5.4,89055
4,,Saint George,St. George,Saint George,UT,Utah,49053.0,Washington,49053.0,Washington,...,4.6,2.8,13.0,13.8,15.5,2.1,15.0,13.9,8.6,60096


In [211]:
# merge with all previous calculations
voter_turnout_merge = voter_turnout_merge.merge(df_cities[['city', 'voter_population']], how = 'inner', left_on = ['Residence_Addresses_City'], right_on = ['city'])
voter_turnout_merge.drop(columns=['city'], inplace=True)
voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_white,voted_voters_asian,voted_voters_black,...,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population
0,Brigham City,2020_11_03,2020,General,104.0,,524.0,8553.0,89.0,,...,0.835677,85141.696468,4077.0,2106.0,3657.0,1866.0,0.896983,0.88604,53.145914,13197
1,Brigham City,2018_11_06,2018,General,104.0,,524.0,8553.0,57.0,,...,0.611892,85141.696468,4077.0,2106.0,2846.0,1518.0,0.698062,0.720798,57.152151,13197
2,Brigham City,2016_11_08,2016,General,104.0,,524.0,8553.0,62.0,,...,0.629972,85141.696468,4077.0,2106.0,2985.0,1534.0,0.732156,0.728395,57.16618,13197
3,Brigham City,2014_11_04,2014,General,104.0,,524.0,8553.0,21.0,,...,0.293692,85141.696468,4077.0,2106.0,1466.0,804.0,0.359578,0.381766,64.762912,13197
4,Brigham City,2019_11_05,2019,Consolidated_General,104.0,,524.0,8553.0,23.0,,...,0.292085,85141.696468,4077.0,2106.0,1494.0,814.0,0.366446,0.386515,62.947498,13197


# Save the merged aggregations 

In [147]:
voter_turnout_merge.to_csv(f'{filepath}voter_turnout_merged_{state}.csv', index=False)

In [148]:
del voter_turnout_merge
gc.collect()

20

In [149]:
end_time = time.time()
print("Time take to run this notebook in seconds: ", end_time - start_time)

Time take to run this notebook in seconds:  132.2906289100647
