# 1. Data Loading

1. DEMOGRAPHIC_selected_cols parquet file contains all rows with selected columns (voter id, city, county, ethnicity, age, gender, education, income, donation and parties description). See Reduce_to_parquet.ipynb 
2. VOTEHISTORY_selected_cols parquet file contains all rows with selected columns (General and Local_or_Municipal). See Reduce_to_parquet.ipynb 
3. GE_LM_dates_per_city parquet file contains the four most recent General and Local_or_Municipal election dates for each of the selected cities. See Find_recent_election_dates.ipynb and ca_similarity_search.ipynb

Some observations
- For each of the 7 RCV cities in CA, we chose the 5 non-RCV cities with the highest cosine similarity score
- There were 33 distinct cities among those 35 cities
- There are 66 non-registered voters among 21.7 million voters
- There are a total of 3.9 million voters in the sampled cities
- City 'El Paso de Robles' didn't match in the demographic data (it is listed there as 'Paso Robles' and is renamed below)


In [60]:
import pandas as pd
import numpy as np
import janitor
import gc
import time
from datetime import datetime
start_time = time.time()

#### Define state here

In [2]:
# Two-letter code of the state to process. Leave exactly one assignment
# uncommented; the per-state configuration below is selected by this value.
state = 'CA' ##california
# state = 'CO' ##colorado
# state = 'MD' ##maryland
# state = 'ME' ##maine
# state = 'MN' ##minnesota
# state = 'NM' ##new mexico
# state = 'UT' ##utah
# state = 'VT' ##vermont

# Name of the parquet file holding the election dates per city for this state.
elec_dates_filename = f'GE_LM_dates_per_city_{state}.parquet'

In [3]:
def combine_cities_list(RCV_list, NonRCV_list):
    """Merge the RCV cities and the sampled non-RCV cities into one list.

    A short summary is printed: the size of each input list, the number of
    distinct sampled non-RCV cities, the names that were sampled more than
    once, and the size of the combined list.

    Args:
        RCV_list: names of the cities that use ranked-choice voting.
        NonRCV_list: names of the sampled comparison cities; the same city
            may appear several times.

    Returns:
        list: every city name exactly once, RCV cities first, then the
        non-RCV cities in the order they were first listed.
    """
    # Single pass: a name seen for the second time is a duplicate.
    seen = set()
    duplicated = set()
    for city in NonRCV_list:
        if city in seen:
            duplicated.add(city)
        else:
            seen.add(city)

    print("number of RCV cities:", len(RCV_list))
    print("total number of sampled nonRCV cities:", len(NonRCV_list))
    print("number of distinct cities:", len(seen))
    print("name of cities that were duplicated:", duplicated)

    # dict.fromkeys de-duplicates while keeping a stable, reproducible order,
    # and also drops a city that appears in both input lists.
    combined_cityName = list(dict.fromkeys([*RCV_list, *NonRCV_list]))
    print("number of distinct RCV and sampled nonRCV cities:", len(combined_cityName))
    return combined_cityName


In [4]:
### California
if state=='CA':
    
    filepath = '../data/VM2--CA--2022-04-25/'
    DEMO_filename = 'VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'ca-cities.csv'

    RCV_cities_CA = ['San Francisco',
     'Oakland',
     'Berkeley',
     'San Leandro',
     'Palm Desert',
     'Eureka',
     'Albany']

    sampled_nonRCV_cities_CA = ['Fresno',
     'San Diego',
     'Sacramento',
     'Riverside',
     'San Jose',
     'Santa Ana',
     'Anaheim',
     'Santa Rosa',
     'Merced',
     'Santa Clarita',
     'Alhambra',
     'Davis',
     'Montebello',
     'Burbank',
     'Huntington Park',
     'Bellflower',
     'Watsonville',
     'Gilroy',
     'Whittier',
     'Lynwood',
     'Lakewood',
     'Pico Rivera',
     'Lake Forest',
     'Livermore',
     'Chino Hills',
     'Paramount',
     'El Paso de Robles',
     'Pico Rivera',
     'Buena Park',
     'Whittier',
     'Calabasas',
     'Carpinteria',
     'Morro Bay',
     'San Carlos',
     'Solvang']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CA, NonRCV_list = sampled_nonRCV_cities_CA)

    
### Colorado
if state=='CO':
    
    filepath = '../data/VM2--CO--2022-04-26/'
    DEMO_filename = 'VM2--CO--2022-04-26-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CO--2022-04-26-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'cities.csv'
    
    RCV_cities_CO = ['Boulder']

    sampled_nonRCV_cities_CO = ['Alamosa', 'Arvada', 'Brighton', 'Broomfield', 'Castle Rock', 
                                'Canon City', 'Centennial', 'Commerce City', 
                                'Durango', 'Englewood', 'Fountain', 'Glenwood Springs', 'Golden', 
                                'Greenwood Village', 'Highlands Ranch', 'Lafayette', 
                                'Littleton', 'Longmont', 'Louisville', 'Loveland', 'Montrose', 'Northglenn', 
                                'Parker', 'Silverthorne', 'Steamboat Springs', 'Wheat Ridge', 'Windsor']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CO, NonRCV_list = sampled_nonRCV_cities_CO)


### Maryland
if state=='MD':
    
    filepath = '../data/VM2--MD--2022-04-08/'
    DEMO_filename = 'VM2--MD--2022-04-08-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MD--2022-04-08-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'cities.csv'
    
    RCV_cities_MD = ['Takoma Park']

    sampled_nonRCV_cities_MD = ['Adelphi', 'Annapolis', 'Aspen Hill', 'Bethesda', 
                                'Cockeysville', 'College Park', 'Easton', 
                                'Hyattsville', 'New Carrollton', 
                                'North Bethesda', 'North Potomac', 'Ocean Pines', 'Potomac', 
                                'Princess Anne', 'Severna Park', 'Timonium', 
                                'Westminster']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MD, NonRCV_list = sampled_nonRCV_cities_MD)

### Maine
if state=='ME':
    
    filepath = '../data/VM2--ME--2022-03-02/'
    DEMO_filename = 'VM2--ME--2022-03-02-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--ME--2022-03-02-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'cities.csv'
    
    # low number of cities
    combined_sampled_cityName = ['Sanford', 'Westbrook', 'Lewiston', 'Wells', 'Standish', 'Waterville',
     'Falmouth', 'Windham', 'Kennebunk', 'Scarborough', 'South Portland', 'Bangor',
     'Augusta', 'Brunswick', 'Auburn', 'Portland', 'Biddeford', 'York', 'Saco',
     'Orono', 'Gorham']

### Minnesota
if state=='MN':

    filepath = '../data/VM2--MN--2022-03-25/'
    DEMO_filename = 'VM2--MN--2022-03-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--MN--2022-03-25-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'cities.csv'
    
    RCV_cities_MN = ['Bloomington', 'Minneapolis', 'Minnetonka', 'St. Louis Park']

    sampled_nonRCV_cities_MN = ['Alexandria', 'Bemidji', 'Blaine', 'Blaine', 'Brainerd', 'Brainerd', 
                                'Brooklyn Center', 'Brooklyn Park', 'Brooklyn Park', 'Burnsville', 
                                'Coon Rapids', 'Coon Rapids', 'Duluth', 'Eagan', 'Eden Prairie', 'Eden Prairie', 
                                'Fridley', 'Inver Grove Heights', 'Mankato', 'Mankato', 'Maple Grove', 'Maplewood', 
                                'Moorhead', 'Plymouth', 'Richfield', 'Richfield', 'Rochester', 'Roseville', 
                                'Roseville', 'St. Cloud', 'St. Paul', 'Winona']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_MN, NonRCV_list = sampled_nonRCV_cities_MN)

### New Mexico
if state=='NM':

    filepath = '../data/VM2--NM--2022-03-30/'
    DEMO_filename = 'VM2--NM--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--NM--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'cities.csv'
    
    # low number of cities
    combined_sampled_cityName = ['Los Alamos', 'Albuquerque', 'Rio Rancho', 'Farmington',
     'Taos', 'Las Cruces', 'Silver City', 'Roswell', 'Lovington', 'Deming',
     'Alamogordo', 'Chaparral', 'Las Vegas', 'Los Lunas', 'Hobbs',
     'Clovis', 'Sunland Park', 'Artesia', 'Grants', 'Carlsbad', 'Portales', 'Gallup',
     'Espanola', 'Santa Fe']

### Utah
if state=='UT':
    
    filepath = '../data/VM2--UT--2022-03-30/'
    DEMO_filename = 'VM2--UT--2022-03-30-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--UT--2022-03-30-VOTEHISTORY_selected_cols.parquet'
    cities_filename = 'ut-cities.csv'

    RCV_cities_UT = ['Salt Lake City', 'Sandy', 'Lehi', 'Millcreek', 
                     'Draper', 'Riverton',  'Cottonwood Heights', 
                     'Springville', 'Midvale', 'Magna', 'South Salt Lake', 
                     'Payson', 'Bluffdale']

    sampled_nonRCV_cities_UT = ['Ogden', 'Provo', 'West Valley City', 
                                'Logan', 'St. George', 'Taylorsville', 
                                'Layton', 'Orem', 'South Jordan', 'Murray', 
                                'South Jordan', 'Clearfield', 'Spanish Fork', 
                                'Tooele', 'Kearns', 'Cedar City', 'Murray', 
                                'Bountiful',  'South Jordan', 'Pleasant Grove', 
                                'Vernal', 'Hurricane', 'Herriman', 'American Fork', 
                                'Washington', 'Eagle Mountain', 'Brigham City', 
                                'American Fork', 'Herriman', 'Spanish Fork', 
                                'Washington', 'Heber', 'Hurricane', 'Vernal', 
                                'Holladay', 'Pleasant Grove', 'American Fork', 
                                'Herriman', 'Eagle Mountain', 'Vernal', 
                                'Bountiful', 'Pleasant Grove', 'Washington', 
                                'South Jordan', 'Vernal', 'Tooele', 
                                'Spanish Fork', 'Clearfield', 'Kearns', 
                                'Eagle Mountain', 'Washington', 'Bountiful', 
                                'Pleasant Grove', 'Hurricane', 'Cedar City', 
                                'Saratoga Springs', 'Kaysville', 'Brigham City', 
                                'North Salt Lake', 'American Fork', 'Highland', 
                                'Lindon', 'Alpine', 'West Haven', 'North Logan']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_UT, NonRCV_list = sampled_nonRCV_cities_UT)

### Vermont
if state=='VT':
    
    filepath = '../data/VM2--VT--2022-04-20/'
    DEMO_filename = 'VM2--VT--2022-04-20-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--VT--2022-04-20-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_VT.parquet'
    cities_filename = 'cities.csv'
    
    # low number of cities
    combined_sampled_cityName = ['Burlington', 'South Burlington', 'Essex', 'Rutland', 'Bennington', 'Milton',
     'Essex Junction', 'Barre', 'Colchester', 'Brattleboro']


total number of cities: 7
number of distinct cities: 33
name of cities that were duplicated: {'Whittier', 'Pico Rivera'}
number of distinct RCV and sampled nonRCV cities: 40


# 1.1 Demographic Data

1. Select only the columns required: city name ('Residence_Addresses_City'), unique voter id ('LALVOTERID'), voter's ethnicity ('EthnicGroups_EthnicGroup1Desc'), date when voter was registered ('Voters_OfficialRegDate'), voter's gender, date of birth, plus additional columns
2. Keep only the cities that were identified as being similar to RCV cities in CA (See ca_similarity_search.ipynb for reference) 
3. Keep only rows where EthnicGroups_EthnicGroup1Desc is one of "European", "Likely African-American", "Hispanic and Portuguese", "East and South Asian" or "Other"
4. Keep only registered voters identified in 'Voters_OfficialRegDate'


In [5]:
def read_DEMOGRAPHIC():
    """Load the reduced DEMOGRAPHIC parquet file and print sanity checks.

    Reads the module-level names ``filepath``, ``DEMO_filename`` and
    ``combined_sampled_cityName``. Printed checks: number of unique cities
    and voters, number of rows without a registration date, and which of
    the expected cities are absent from the data.

    Returns:
        pandas.DataFrame: the demographic table as stored, unfiltered.
    """
    df_demographic = pd.read_parquet(f'{filepath}{DEMO_filename}')
    print("Total number of unique cities:", df_demographic.Residence_Addresses_City.nunique())
    print("Total number of unique voters:", df_demographic.LALVOTERID.nunique())
    # Summing the boolean mask counts the rows without copying them.
    print("Count of non-registered voters:", int(df_demographic['Voters_OfficialRegDate'].isnull().sum()))

    print("Number of expected cities:", len(combined_sampled_cityName))
    # Computed once so the membership test does not rescan the column per city.
    known_cities = set(df_demographic['Residence_Addresses_City'].unique())
    missing_cities = [city for city in combined_sampled_cityName if city not in known_cities]
    if missing_cities:
        print("number of cities not found in demographic data:", len(missing_cities))
        print(missing_cities)

    return df_demographic
        
state_demographic = read_DEMOGRAPHIC()

Total number of unique cities: 1533
Total number of unique voters: 21711617
Count of non-registered voters: 66
Number of expected cities: 40
number of cities not found in demographic data: 1
['El Paso de Robles']


In [6]:
state_demographic.head(5)

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,Oakland,F,26,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Oakland,F,47,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,Oakland,M,60,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,San Leandro,F,56,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [7]:
#standardize duplicate city names to names as presented in "cities.csv"

def rename_dup_city(df, old_name, new_name):
    """Relabel every ``old_name`` row of 'Residence_Addresses_City' as ``new_name``.

    The frame is modified in place and also returned. The number of records
    carrying each of the two names is printed before and after the change.
    """
    city_col = 'Residence_Addresses_City'

    def report(title):
        # Print the header followed by the current record count of both names.
        print(title)
        for name in (old_name, new_name):
            print("\t", name, int((df[city_col] == name).sum()))

    report("number of records before fixing duplicates")
    df.loc[df[city_col] == old_name, city_col] = new_name
    report("number of records after fixing duplicates")
    return df


In [8]:
# Standardize city-name variants (e.g. Paso Robles > El Paso de Robles, Huntington Pk > Huntington Park, Calabasas Hills > Calabasas)
if state == 'CA':
    rename_dup_city(state_demographic, 'Paso Robles', 'El Paso de Robles')
    rename_dup_city(state_demographic, 'Huntington Pk', 'Huntington Park')
    rename_dup_city(state_demographic, 'Calabasas Hills', 'Calabasas')


if state=='CO':
    # NOTE: ['Sherrelwood', 'Cherry Creek', 'Ken Caryl'] not found in demographic data so were removed from 
    # sampled_non_RCV_cities_CO list

    rename_dup_city(state_demographic, 'Hghlnds Ranch', 'Highlands Ranch')
    rename_dup_city(state_demographic, 'Glenwood Spgs', 'Glenwood Springs')
    rename_dup_city(state_demographic, 'Steamboat Spr', 'Steamboat Springs')
    rename_dup_city(state_demographic, 'Greenwood Vlg', 'Greenwood Village')
    

if state=='MD':
    # NOTE: ['Colesville', 'Fairland', 'Cloverly', 'Annapolis Neck', 'Redland', 'Glenmont', 'Travilah', 
    # 'South Laurel', 'White Oak', 'Glassmanor', 'Kemp Mill', 'Parole', 'Calverton'] not found in demographic data 
    # so were removed from sampled_non_RCV_cities_MD list

    rename_dup_city(state_demographic, 'N Bethesda', 'North Bethesda')
    
if state=='ME':
    rename_dup_city(state_demographic, 'S Portland', 'South Portland')
    
if state=='MN':
    rename_dup_city(state_demographic, 'St Louis Park', 'St. Louis Park')
    rename_dup_city(state_demographic, 'Saint Paul', 'St. Paul')
    rename_dup_city(state_demographic, 'Saint Cloud', 'St. Cloud')
    # Inver Grove Heights
    # Inver Grove

# didn't see any duplicate for new mexico
# if state=='NM':
    # NOTE: ['North Valley', 'South Valley'] not found in demographic data so were removed 
    # from sampled_non_RCV_cities_NM list
    
    
if state=='UT':
    rename_dup_city(state_demographic, 'W Valley City', 'West Valley City')
    rename_dup_city(state_demographic, 'Saint George', 'St. George')
    rename_dup_city(state_demographic, 'St George', 'St. George')
    rename_dup_city(state_demographic, 'Saratoga Spgs', 'Saratoga Springs')
    rename_dup_city(state_demographic, 'Salt Lake Cty', 'Salt Lake City')
    rename_dup_city(state_demographic, 'S Salt Lake', 'South Salt Lake')
    rename_dup_city(state_demographic, 'Pleasant Grv', 'Pleasant Grove')
    rename_dup_city(state_demographic, 'N Salt Lake', 'North Salt Lake')

if state=='VT':
    rename_dup_city(state_demographic, 'Essex Jct', 'Essex Junction')
    rename_dup_city(state_demographic, 'S Burlington', 'South Burlington')
    

number of records before fixing duplicates
	 Paso Robles 28787
	 El Paso de Robles 0
number of records after fixing duplicates
	 Paso Robles 0
	 El Paso de Robles 28787
number of records before fixing duplicates
	 Huntington Pk 711
	 Huntington Park 28745
number of records after fixing duplicates
	 Huntington Pk 0
	 Huntington Park 29456
number of records before fixing duplicates
	 Calabasas Hills 1
	 Calabasas 18630
number of records after fixing duplicates
	 Calabasas Hills 0
	 Calabasas 18631


### Filter DEMOGRAPHIC data based on the list of cities, ethnicities, and registered voters

In [9]:
# 2. filter DEMOGRAPHIC data based on the list of cities, ethnicities and registered voters

# Ethnic groups kept for the analysis ('Other' is included as its own group).
selected_ethnicities = [
    'European',
    'Likely African-American',
    'Hispanic and Portuguese',
    'East and South Asian',
    'Other',
]


def filter_demo(df, list_cityNames):
    """Keep registered voters of the selected ethnicities living in the given cities.

    A row is kept when its city is in ``list_cityNames``, its ethnicity is in
    the module-level ``selected_ethnicities`` and its registration date is
    not null. The resulting shape and number of unique cities are printed.
    """
    in_sampled_city = df['Residence_Addresses_City'].isin(list_cityNames)
    in_selected_ethnicity = df['EthnicGroups_EthnicGroup1Desc'].isin(selected_ethnicities)
    is_registered = df['Voters_OfficialRegDate'].notna()

    filtered_df = df.loc[in_sampled_city & in_selected_ethnicity & is_registered]

    print(filtered_df.shape)
    print("number of unique cities:", filtered_df['Residence_Addresses_City'].nunique())

    return filtered_df

state_demographic_subset = filter_demo(df = state_demographic, list_cityNames = combined_sampled_cityName)
state_demographic_subset.head()

(4140404, 14)
number of unique cities: 40


Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,Oakland,F,26,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,Oakland,F,47,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,Oakland,M,60,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,San Leandro,F,56,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [10]:
del state_demographic
gc.collect()

20

# 1.2 Vote History

1. Select only the columns for the 4 most recent General elections and the 4 most recent Local_or_Municipal elections (plus EthnicGroups_EthnicGroup1Desc)
2. Load Vote History 
3. Merge Vote History with the sampled Demographic Data 


## 1. Get four most recent election dates

In [11]:
# load the list of election dates for each city
GE_LM_dates_dict = pd.read_parquet(f'{filepath}{elec_dates_filename}')

GE_LM_dates_dict

Unnamed: 0,city,GE_dates,LM_dates
0,Oakland,"[General_2020_11_03, General_2018_11_06, Gener...",
1,San Leandro,"[General_2020_11_03, General_2018_11_06, Gener...",
2,Livermore,"[General_2020_11_03, General_2018_11_06, Gener...",
3,Berkeley,"[General_2020_11_03, General_2018_11_06, Gener...",
4,Albany,"[General_2020_11_03, General_2018_11_06, Gener...",
5,San Francisco,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2019_11_05, Consolidated..."
6,San Diego,"[General_2020_11_03, General_2018_11_06, Gener...",
7,San Jose,"[General_2020_11_03, General_2018_11_06, Gener...",
8,Fresno,"[General_2020_11_03, General_2018_11_06, Gener...",
9,Eureka,"[General_2020_11_03, General_2018_11_06, Gener...","[Consolidated_General_2017_11_07, Consolidated..."


In [12]:
# need in order to filter out rows after aggregation
def get_correct_dates(list_like_df, column_name):
    """Turn one list-valued election column into one row per (city, election).

    Each label in ``column_name`` is expected to look like
    ``<elec_type>_<YYYY>_<MM>_<DD>``; it is split by position into
    'elec_date' (last 10 characters), 'elec_year' (first 4 of those) and
    'elec_type' (everything before the separating underscore). Cities with
    no list keep a single row whose derived fields are missing. The shapes
    before and after reshaping are printed.

    Returns:
        pandas.DataFrame: columns 'Residence_Addresses_City', 'elec_date',
        'elec_year', 'elec_type', with a fresh 0..n-1 index.
    """
    print("Shape before reshaping:", list_like_df.shape)

    # One row per list element, renumbered from zero.
    exploded = (
        list_like_df[['city', column_name]]
        .explode(column_name)
        .reset_index(drop=True)
    )
    labels = exploded[column_name]

    reshaped = pd.DataFrame({
        'Residence_Addresses_City': exploded['city'],
        'elec_date': labels.str[-10:],
        'elec_year': labels.str[-10:-6],
        'elec_type': labels.str[:-11],
    })

    print("Shape after reshaping:", reshaped.shape)
    return reshaped

GE_dates_df = get_correct_dates(GE_LM_dates_dict, 'GE_dates')
LM_dates_df = get_correct_dates(GE_LM_dates_dict, 'LM_dates')
GE_LM_dates_df = pd.concat([GE_dates_df, LM_dates_df])
print(len(GE_LM_dates_df))
print(len(GE_LM_dates_df.dropna(axis = 0)))

GE_LM_dates_df.dropna()

# [, 'LM_dates']

# GE_LM_dates_df = get_correct_dates(GE_LM_dates_dict)
# GE_LM_dates_df.head()
# ['GE_dates', 'LM_dates']

Shape before reshaping: (40, 3)
Shape after reshaping: (160, 4)
Shape before reshaping: (40, 3)
Shape after reshaping: (64, 4)
224
192


Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type
0,Oakland,2020_11_03,2020,General
1,Oakland,2018_11_06,2018,General
2,Oakland,2016_11_08,2016,General
3,Oakland,2014_11_04,2014,General
4,San Leandro,2020_11_03,2020,General
...,...,...,...,...
50,Riverside,2019_06_04,2019,Local_or_Municipal
54,San Carlos,2017_11_07,2017,Consolidated_General
55,San Carlos,2015_11_03,2015,Consolidated_General
56,San Carlos,2013_11_05,2013,Consolidated_General


In [13]:
# Missing date lists become "" so that len() works on every cell below.
GE_LM_dates_dict.fillna("", inplace=True)

# Collect every election label that belongs to a complete set of four dates,
# across both the general and the local/municipal columns.
e_dates = set()
for dates_column in ('GE_dates', 'LM_dates'):
    for city_dates in GE_LM_dates_dict[dates_column]:
        if len(city_dates) == 4:
            e_dates.update(city_dates)

print(list(e_dates))


# ## when all four dates are not found e_dates will contain None, we need to remove it
# if None in list(e_dates):
#     e_dates.remove(None)
#     print(list(e_dates))

['Consolidated_General_2011_11_08', 'Consolidated_General_2015_11_03', 'Local_or_Municipal_2017_04_11', 'Consolidated_General_2013_11_05', 'General_2016_11_08', 'General_2020_11_03', 'Consolidated_General_2017_11_07', 'General_2018_11_06', 'Local_or_Municipal_2019_03_05', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2017_03_07', 'Local_or_Municipal_2019_06_04', 'General_2014_11_04', 'Local_or_Municipal_2015_04_14', 'Consolidated_General_2021_11_02', 'Consolidated_General_2019_11_05', 'Local_or_Municipal_2017_02_28']


## 2. load the VOTE HISTORY data for selected election dates only

In [14]:
needed_variables = ['LALVOTERID'] + list(e_dates)

state_voterhistory_4_dates = pd.read_parquet(f'{filepath}{VOTE_filename}',
                                             columns=needed_variables)
                                
state_voterhistory_4_dates.head(5)

Unnamed: 0,LALVOTERID,Consolidated_General_2011_11_08,Consolidated_General_2015_11_03,Local_or_Municipal_2017_04_11,Consolidated_General_2013_11_05,General_2016_11_08,General_2020_11_03,Consolidated_General_2017_11_07,General_2018_11_06,Local_or_Municipal_2019_03_05,Local_or_Municipal_2021_06_08,Local_or_Municipal_2017_03_07,Local_or_Municipal_2019_06_04,General_2014_11_04,Local_or_Municipal_2015_04_14,Consolidated_General_2021_11_02,Consolidated_General_2019_11_05,Local_or_Municipal_2017_02_28
0,LALCA453164106,,,,,Y,Y,,Y,,,,,,,,,
1,LALCA453008306,,,,,,,,Y,,,,,,,,,
2,LALCA22129469,,,,,Y,Y,,Y,,,,,Y,,,,
3,LALCA549803906,,,,,,Y,,,,,,,,,,,
4,LALCA24729024,,,,,,,,,,,,,,,,,


## 3. Merge Vote History and Demographic Data

In [15]:
# Inner join on the voter id: keeps only voters present in both the vote
# history and the filtered demographic subset.
merged_file = state_voterhistory_4_dates.merge(
    state_demographic_subset,
    how='inner',
    on='LALVOTERID',
)

print(merged_file.shape)

print("number of unique cities:", merged_file['Residence_Addresses_City'].nunique())

merged_file.head(5)

(4140404, 31)
number of unique cities: 40


Unnamed: 0,LALVOTERID,Consolidated_General_2011_11_08,Consolidated_General_2015_11_03,Local_or_Municipal_2017_04_11,Consolidated_General_2013_11_05,General_2016_11_08,General_2020_11_03,Consolidated_General_2017_11_07,General_2018_11_06,Local_or_Municipal_2019_03_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,,,,,Y,Y,,Y,,...,04/29/1993,Democratic,Other,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,,,,,,,,Y,,...,02/02/1996,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,,,,,Y,Y,,Y,,...,02/02/1975,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,,,,,,Y,,,,...,02/09/1962,Democratic,Other,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,,,,,,,,,,...,01/01/1966,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [16]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
print(merge_filename)
merged_file.to_parquet(f'{filepath}{merge_filename}')

VM2--CA--2022-04-25-merged.parquet


# 3.1. Calculate voter turnout per ethnicity

In [17]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
merged_file = pd.read_parquet(f'{filepath}{merge_filename}')

In [18]:
def replace_ethnicities(df, column='EthnicGroups_EthnicGroup1Desc'):
    """Return a copy of ``df`` with the ethnicity labels shortened.

    Only ``column`` is touched, so identical strings in other columns
    (for example a party described as 'Other') are left as they are.
    Values that are not one of the five known labels are kept unchanged.

    Args:
        df: frame containing the ethnicity column.
        column: name of the column holding the ethnicity labels.

    Returns:
        pandas.DataFrame: a new frame; ``df`` itself is not modified.

    Raises:
        KeyError: if ``column`` is not a column of ``df``.
    """
    short_labels = {
        'East and South Asian': 'asian',
        'European': 'white',
        'Hispanic and Portuguese': 'hispanic',
        'Likely African-American': 'black',
        'Other': 'others',  # including 'others' ethnical group
    }
    return df.assign(**{column: df[column].replace(short_labels)})

In [19]:
merged_file = replace_ethnicities(merged_file)
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2011_11_08,Consolidated_General_2015_11_03,Local_or_Municipal_2017_04_11,Consolidated_General_2013_11_05,General_2016_11_08,General_2020_11_03,Consolidated_General_2017_11_07,General_2018_11_06,Local_or_Municipal_2019_03_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,,,,,Y,Y,,Y,,...,04/29/1993,Democratic,others,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,,,,,,,,Y,,...,02/02/1996,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,,,,,Y,Y,,Y,,...,02/02/1975,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,,,,,,Y,,,,...,02/09/1962,Democratic,others,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,,,,,,,,,,...,01/01/1966,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [20]:
# General-election columns are the ones whose name starts with 'General'.
GE_cols = [name for name in merged_file.columns if name.startswith('General')]
print(GE_cols)
# Local elections: both 'Local_or_Municipal' and 'Consolidated_General' columns.
LM_cols = [
    name for name in merged_file.columns
    if name.startswith(('Local_or_Municipal', 'Consolidated_General'))
]
print(LM_cols)

['General_2016_11_08', 'General_2020_11_03', 'General_2018_11_06', 'General_2014_11_04']
['Consolidated_General_2011_11_08', 'Consolidated_General_2015_11_03', 'Local_or_Municipal_2017_04_11', 'Consolidated_General_2013_11_05', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2019_03_05', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2017_03_07', 'Local_or_Municipal_2019_06_04', 'Local_or_Municipal_2015_04_14', 'Consolidated_General_2021_11_02', 'Consolidated_General_2019_11_05', 'Local_or_Municipal_2017_02_28']


In [21]:
# fill NA values with "N" to make it easier to compare  with "Y"
merged_file[GE_cols+LM_cols] = merged_file[GE_cols+LM_cols].fillna('N')
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2011_11_08,Consolidated_General_2015_11_03,Local_or_Municipal_2017_04_11,Consolidated_General_2013_11_05,General_2016_11_08,General_2020_11_03,Consolidated_General_2017_11_07,General_2018_11_06,Local_or_Municipal_2019_03_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,N,N,N,N,Y,Y,N,Y,N,...,04/29/1993,Democratic,others,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,N,N,N,N,N,N,N,Y,N,...,02/02/1996,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,N,N,N,N,Y,Y,N,Y,N,...,02/02/1975,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,N,N,N,N,N,Y,N,N,N,...,02/09/1962,Democratic,others,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,N,N,N,N,N,N,N,N,N,...,01/01/1966,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [22]:
# We created the dataframe below in order to easily calculate perc_turnout when no one voted
# Every (city, ethnicity) pair present in the data, listed once per 'voted'
# flag, so that a pair in which nobody voted still gets a row later on.
list_ethnic_city = merged_file[
    ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']
].drop_duplicates()
list_ethnic_city_No = list_ethnic_city.assign(voted='N')
list_ethnic_city_Yes = list_ethnic_city.assign(voted='Y')
list_ethnic_city = pd.concat([list_ethnic_city_No, list_ethnic_city_Yes])

In [23]:
list_ethnic_city

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,voted
0,Oakland,others,N
1,Oakland,black,N
2,Oakland,white,N
4,San Leandro,white,N
5,Livermore,white,N
...,...,...,...
3994321,Santa Rosa,asian,Y
3994376,Santa Rosa,others,Y
3994377,Santa Rosa,black,Y
4104814,Davis,others,Y


In [24]:
# Number of voters per (city, ethnicity), repeated for each 'voted' flag
# through the join with list_ethnic_city.
group_keys = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']
total_city_ethnic = (
    merged_file
    .groupby(group_keys)
    .size()
    .reset_index(name='total_voters')
)
total_city_ethnic = total_city_ethnic.merge(list_ethnic_city, on=group_keys)

total_city_ethnic = replace_ethnicities(total_city_ethnic)
total_city_ethnic

Unnamed: 0,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,total_voters,voted
0,Albany,asian,2405,N
1,Albany,asian,2405,Y
2,Albany,black,147,N
3,Albany,black,147,Y
4,Albany,hispanic,1035,N
...,...,...,...,...
395,Whittier,hispanic,76334,Y
396,Whittier,others,2525,N
397,Whittier,others,2525,Y
398,Whittier,white,26477,N


In [25]:
def calc_votes(df, col):
    """Turnout per city and ethnicity for one election, one row per city.

    df  : voter-level table with 'Residence_Addresses_City',
          'EthnicGroups_EthnicGroup1Desc' and the 'Y'/'N' column `col`.
    col : election column named '<elec_type>_<YYYY>_<MM>_<DD>'.

    Group totals come from the module-level `total_city_ethnic` table.
    Returns a wide frame with elec_type, elec_year, elec_date, city and
    total_voters_<eth>, voted_voters_<eth>, perc_turnout_<eth> columns.
    """
    group_keys = ['Residence_Addresses_City', 'EthnicGroups_EthnicGroup1Desc']
    index_cols = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City']

    # number of voters per (city, ethnicity, Y/N); the flag column becomes 'voted'
    counts = (
        df.groupby(group_keys + [col])
        .size()
        .reset_index(name='voted_voters')
        .rename(columns={col: 'voted'})
    )

    # left join keeps groups in which nobody voted (their count is NaN until filled below)
    turnout = total_city_ethnic.merge(counts, how='left', on=group_keys + ['voted'])
    turnout['perc_turnout'] = turnout['voted_voters'] / turnout['total_voters'] * 100

    # the last 10 characters of the column name are the date, preceded by '_' and the type
    date_start = len(col) - 10
    turnout['elec_date'] = col[date_start:]
    turnout['elec_year'] = col[date_start:date_start + 4]
    turnout['elec_type'] = col[:date_start - 1]

    filled_cols = ['voted_voters', 'perc_turnout']
    turnout[filled_cols] = turnout[filled_cols].fillna(0)
    turnout = turnout[turnout['voted'] == 'Y']

    pivot_df = turnout.pivot(index=index_cols,
                             columns='EthnicGroups_EthnicGroup1Desc',
                             values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    # flatten the two-level header; index columns end up with a trailing '_' that is removed
    pivot_df.columns = ['_'.join(pair) for pair in pivot_df.columns]
    pivot_df = pivot_df.rename(columns={f'{name}_': name for name in index_cols})

    del turnout
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

# Stack all elections into one dataframe. The per-election frames are collected
# first and concatenated once, instead of re-copying the growing result on every pass.
turnout_frames = []
for col in elec_date_cols:
    pivot_df = calc_votes(merged_file, col)
    turnout_frames.append(pivot_df)
voter_turnout_merge_ethnicity = pd.concat(turnout_frames)


In [26]:
# shape and first rows of the stacked per-election ethnicity turnout table
print(voter_turnout_merge_ethnicity.shape)
voter_turnout_merge_ethnicity.head()

(680, 19)


Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white
0,General,2016,2016_11_08,Albany,2405.0,147.0,1035.0,641.0,6169.0,1396.0,88.0,668.0,379.0,4659.0,58.045738,59.863946,64.541063,59.126365,75.522775
1,General,2016,2016_11_08,Alhambra,17451.0,191.0,16596.0,969.0,7359.0,7041.0,90.0,9717.0,517.0,4574.0,40.347258,47.120419,58.550253,53.353973,62.155184
2,General,2016,2016_11_08,Anaheim,26340.0,1211.0,70052.0,8857.0,54644.0,12245.0,609.0,34883.0,4078.0,35829.0,46.488231,50.289017,49.795866,46.042678,65.56804
3,General,2016,2016_11_08,Bellflower,2153.0,3614.0,19899.0,1301.0,10792.0,840.0,2043.0,9474.0,542.0,6092.0,39.015327,56.53016,47.610433,41.660261,56.449222
4,General,2016,2016_11_08,Berkeley,8549.0,5942.0,6388.0,3626.0,39425.0,4602.0,3770.0,3636.0,2066.0,27737.0,53.830857,63.446651,56.919224,56.977386,70.353836


In [27]:
print(voter_turnout_merge_ethnicity.shape)
# Keep only the (city, election) pairs listed in GE_LM_dates_df: this is a left join
# from GE_LM_dates_df, so election dates not associated with a city are removed.
# It is needed only once, because the later merges are inner joins and therefore
# keep only the combinations that survive here.
voter_turnout_merge_ethnicity = pd.merge(
    GE_LM_dates_df,
    voter_turnout_merge_ethnicity,
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
    how='left',
)
print(voter_turnout_merge_ethnicity.shape)

(680, 19)
(224, 19)


In [28]:
# should be an empty dataframe because of the way we have filtered the dataframe
# (rows where the asian, black, hispanic and white turnout percentages are all 0)

no_voter_turnout = voter_turnout_merge_ethnicity[
    voter_turnout_merge_ethnicity[['perc_turnout_asian',
                                   'perc_turnout_black',
                                   'perc_turnout_hispanic',
                                   'perc_turnout_white']].eq(0).all(axis=1)
]

no_voter_turnout.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,voted_voters_black,voted_voters_hispanic,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white


#  3.2.  Calculate average donation 

In [29]:
def calc_donation(df):
    """Sum FEC donations of the people who voted, per city and election.

    df needs 'Residence_Addresses_City', the two FECDonors_* columns and one
    'Y'/'N' column for every name in the module-level `elec_date_cols`.
    Returns one row per (city, election) with the summed donation amount, the
    summed number of donations, their ratio (mean_donation) and the
    elec_date / elec_year / elec_type parsed from the election column name.
    """
    amount_col = 'FECDonors_TotalDonationsAmount'
    count_col = 'FECDonors_NumberOfDonations'
    id_cols = ['Residence_Addresses_City', amount_col, count_col]

    # long format: one row per (voter, election) with that voter's Y/N flag
    long_df = df[id_cols + elec_date_cols].melt(id_vars=id_cols,
                                                value_vars=elec_date_cols,
                                                var_name='elec_type_date',
                                                value_name='voted')
    voted_df = long_df[long_df['voted'] == 'Y'].astype({amount_col: float, count_col: float})

    totals = (
        voted_df
        .groupby(['Residence_Addresses_City', 'elec_type_date'])[[amount_col, count_col]]
        .sum()
        .reset_index()
    )
    totals['mean_donation'] = totals[amount_col] / totals[count_col]

    # election column names look like '<elec_type>_<YYYY>_<MM>_<DD>'
    tag = totals['elec_type_date']
    totals['elec_date'] = tag.str[-10:]
    totals['elec_year'] = tag.str[-10:-6]
    totals['elec_type'] = tag.str[:-11]

    return totals.drop(columns='elec_type_date').reset_index(drop=True)

# aggregate donations over the full merged table and preview the result
avg_donations = calc_donation(merged_file)
print(avg_donations.shape)
avg_donations.head()

(582, 7)


Unnamed: 0,Residence_Addresses_City,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation,elec_date,elec_year,elec_type
0,Albany,63622.0,878.0,72.462415,2011_11_08,2011,Consolidated_General
1,Albany,50323.0,602.0,83.593023,2013_11_05,2013,Consolidated_General
2,Albany,63164.0,918.0,68.8061,2015_11_03,2015,Consolidated_General
3,Albany,3455.0,43.0,80.348837,2017_11_07,2017,Consolidated_General
4,Albany,44682.0,132.0,338.5,2019_11_05,2019,Consolidated_General


In [30]:
# Merge 3.1 and 3.2 (ethnicity turnout with donation totals)
voter_turnout_merge = pd.merge(
    voter_turnout_merge_ethnicity,
    avg_donations,
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
    how='inner',
)

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,voted_voters_others,voted_voters_white,perc_turnout_asian,perc_turnout_black,perc_turnout_hispanic,perc_turnout_others,perc_turnout_white,FECDonors_TotalDonationsAmount,FECDonors_NumberOfDonations,mean_donation
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,6346.0,69989.0,75.297386,74.648643,72.507667,73.551229,84.200332,44186445.0,403388.0,109.538323
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,4536.0,57872.0,48.928105,56.952307,48.036262,52.573018,69.622964,42827869.0,394704.0,108.506296
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,4560.0,57968.0,52.473856,60.602512,53.241513,52.851182,69.738457,42296298.0,390007.0,108.4501
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,2097.0,35411.0,26.617647,34.590735,22.15258,24.30459,42.601237,37016092.0,341253.0,108.471111
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,1729.0,14638.0,72.640693,76.822731,74.769154,71.181556,82.328459,1875137.0,30683.0,61.113222


#  3.3.  Calculate voter turnout per income

In [31]:
# percentage of rows that have no estimated household-income bracket
print('Percent of rows with missing value for income:',
      100 * merged_file['CommercialData_EstimatedHHIncome'].isna().sum() / len(merged_file), '%')

Percent of rows with missing value for income: 1.5627218986359785 %


As long as this percentage is low, we can continue with our turnout calculations for income.

In [32]:
# Same construction as for ethnicity: every (city, income bracket) pair gets one 'N' row and one 'Y' row
list_income_city = merged_file[['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']].drop_duplicates()
list_income_city_No = list_income_city.assign(voted='N')
list_income_city_Yes = list_income_city.assign(voted='Y')
list_income_city = pd.concat([list_income_city_No, list_income_city_Yes])

In [33]:
# total number of voters per (city, income bracket), repeated once for voted = 'N' and once for voted = 'Y'
total_city_income = (
    merged_file
    .groupby(['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome'])
    .size()
    .reset_index(name='total_voters')
    .merge(list_income_city, on=['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome'])
)

total_city_income

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncome,total_voters,voted
0,Albany,$1000-14999,95,N
1,Albany,$1000-14999,95,Y
2,Albany,$100000-124999,1423,N
3,Albany,$100000-124999,1423,Y
4,Albany,$125000-149999,1914,N
...,...,...,...,...
955,Whittier,$35000-49999,6486,Y
956,Whittier,$50000-74999,24453,N
957,Whittier,$50000-74999,24453,Y
958,Whittier,$75000-99999,25324,N


In [34]:
# function to calculate percent turnout by income bracket
def calc_votes_income(df, col):
    """Turnout per city and income bracket for one election, one row per city.

    df  : voter-level table with 'Residence_Addresses_City',
          'CommercialData_EstimatedHHIncome' and the 'Y'/'N' column `col`.
    col : election column named '<elec_type>_<YYYY>_<MM>_<DD>'.

    Group totals come from the module-level `total_city_income` table.
    Returns a wide frame with elec_type, elec_year, elec_date, city and
    total_voters_<bracket>, voted_voters_<bracket>,
    perc_turnout_income_<bracket> columns.
    """
    group_keys = ['Residence_Addresses_City', 'CommercialData_EstimatedHHIncome']
    index_cols = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City']

    # number of voters per (city, bracket, Y/N); the flag column becomes 'voted'
    counts = (
        df.groupby(group_keys + [col])
        .size()
        .reset_index(name='voted_voters')
        .rename(columns={col: 'voted'})
    )

    # left join keeps brackets in which nobody voted (their count is NaN until filled below)
    turnout = total_city_income.merge(counts, how='left', on=group_keys + ['voted'])
    turnout['perc_turnout_income'] = turnout['voted_voters'] / turnout['total_voters'] * 100

    # the last 10 characters of the column name are the date, preceded by '_' and the type
    date_start = len(col) - 10
    turnout['elec_date'] = col[date_start:]
    turnout['elec_year'] = col[date_start:date_start + 4]
    turnout['elec_type'] = col[:date_start - 1]

    filled_cols = ['voted_voters', 'perc_turnout_income']
    turnout[filled_cols] = turnout[filled_cols].fillna(0)
    turnout = turnout[turnout['voted'] == 'Y']

    pivot_df = turnout.pivot(index=index_cols,
                             columns='CommercialData_EstimatedHHIncome',
                             values=['total_voters', 'voted_voters', 'perc_turnout_income']).reset_index()
    # flatten the two-level header; index columns end up with a trailing '_' that is removed
    pivot_df.columns = ['_'.join(pair) for pair in pivot_df.columns]
    pivot_df = pivot_df.rename(columns={f'{name}_': name for name in index_cols})

    del turnout
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

# Stack all elections into one dataframe. The per-election frames are collected
# first and concatenated once, instead of re-copying the growing result on every pass.
income_frames = []
for col in elec_date_cols:
    pivot_df = calc_votes_income(merged_file, col)
    income_frames.append(pivot_df)
voter_turnout_income = pd.concat(income_frames)


In [35]:
# first rows of the stacked per-election income-bracket turnout table
voter_turnout_income.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_$1000-14999,total_voters_$100000-124999,total_voters_$125000-149999,total_voters_$15000-24999,total_voters_$150000-174999,total_voters_$175000-199999,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,General,2016,2016_11_08,Albany,95.0,1423.0,1914.0,118.0,2007.0,873.0,...,67.554859,72.881356,69.407075,69.530355,75.834292,71.428571,77.220481,52.883263,75.156576,69.230769
1,General,2016,2016_11_08,Alhambra,935.0,4523.0,3644.0,968.0,1289.0,1291.0,...,53.787047,58.057851,57.331265,56.390395,60.662702,59.437751,58.611987,51.902369,46.306916,52.131351
2,General,2016,2016_11_08,Anaheim,2576.0,17191.0,17997.0,3225.0,7429.0,7195.0,...,58.009668,55.72093,60.169606,60.416956,63.629931,56.384869,62.424904,49.550035,50.156898,51.960025
3,General,2016,2016_11_08,Bellflower,857.0,3339.0,2694.0,928.0,991.0,874.0,...,57.386785,56.896552,54.894046,57.894737,58.239278,51.428571,59.313725,44.479213,47.006791,50.769549
4,General,2016,2016_11_08,Berkeley,1050.0,6466.0,6376.0,777.0,3580.0,7520.0,...,61.135508,72.458172,68.156425,68.537234,70.482027,72.804233,76.09616,58.484288,57.99618,63.405702


In [36]:
# Merge 3.1, 3.2 and 3.3
# add the income-bracket turnout to the previous calculations for race and donation
voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    voter_turnout_income,
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
    how='inner',
)

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$125000-149999,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,84.453235,74.51433,83.592018,86.407364,88.355971,70.880617,90.165301,66.725766,69.237349,77.473319
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,67.691438,53.644932,66.312955,70.741416,73.604037,50.805719,74.267259,44.346439,47.246377,57.619007
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,68.437412,58.068859,66.455496,71.728313,73.915299,55.696777,74.183914,49.470325,51.401758,60.54125
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,39.841763,32.063858,37.313906,44.700614,46.793058,30.935089,45.346576,24.103542,25.31005,32.174318
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,78.059452,78.536184,79.563429,81.767338,85.227273,79.265306,83.159923,75.835756,72.907226,73.965751


In [37]:
# add one column that is just overall average income
# (the amount is stored as text with a '$' sign, so strip it and convert to float first)
merged_file['CommercialData_EstimatedHHIncomeAmount'] = (
    merged_file['CommercialData_EstimatedHHIncomeAmount']
    .str.replace('$', '', regex=False)
    .astype(float)
)

# mean estimated household income per city
avg_income = (
    merged_file
    .groupby('Residence_Addresses_City')[['CommercialData_EstimatedHHIncomeAmount']]
    .mean()
    .reset_index()
)

avg_income.head(10)

Unnamed: 0,Residence_Addresses_City,CommercialData_EstimatedHHIncomeAmount
0,Albany,144687.939527
1,Alhambra,94583.458682
2,Anaheim,104340.105909
3,Bellflower,85210.521807
4,Berkeley,149833.4451
5,Buena Park,106024.824131
6,Burbank,122254.424497
7,Calabasas,185269.946742
8,Carpinteria,121623.341137
9,Chino Hills,141805.006169


In [38]:
# Merge 3.1, 3.2 and 3.3
# add the per-city average income to the calculations for race, donation, income bracket
voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    avg_income,
    on=['Residence_Addresses_City'],
    how='inner',
)


voter_turnout_merge.head(10)

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$15000-24999,perc_turnout_income_$150000-174999,perc_turnout_income_$175000-199999,perc_turnout_income_$200000-249999,perc_turnout_income_$25000-34999,perc_turnout_income_$250000+,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,74.51433,83.592018,86.407364,88.355971,70.880617,90.165301,66.725766,69.237349,77.473319,115534.731762
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,53.644932,66.312955,70.741416,73.604037,50.805719,74.267259,44.346439,47.246377,57.619007,115534.731762
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,58.068859,66.455496,71.728313,73.915299,55.696777,74.183914,49.470325,51.401758,60.54125,115534.731762
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,32.063858,37.313906,44.700614,46.793058,30.935089,45.346576,24.103542,25.31005,32.174318,115534.731762
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,78.536184,79.563429,81.767338,85.227273,79.265306,83.159923,75.835756,72.907226,73.965751,113020.551072
5,San Leandro,2018_11_06,2018,General,12705.0,5596.0,16028.0,2429.0,17780.0,5006.0,...,57.319079,55.109143,53.094705,58.454545,57.469388,56.917148,53.524709,46.529931,46.668362,113020.551072
6,San Leandro,2016_11_08,2016,General,12705.0,5596.0,16028.0,2429.0,17780.0,5602.0,...,65.378289,59.601392,58.948546,62.954545,65.22449,60.732177,60.537791,52.849988,52.449983,113020.551072
7,San Leandro,2014_11_04,2014,General,12705.0,5596.0,16028.0,2429.0,17780.0,2510.0,...,37.582237,31.414109,29.492916,32.818182,40.326531,31.984586,31.831395,24.314333,23.202781,113020.551072
8,Livermore,2020_11_03,2020,General,4749.0,287.0,9037.0,2161.0,35915.0,3795.0,...,81.359649,82.099479,84.836882,87.188224,86.183465,87.420483,85.608171,82.162512,83.389726,159543.656496
9,Livermore,2018_11_06,2018,General,4749.0,287.0,9037.0,2161.0,35915.0,2205.0,...,57.45614,57.779063,59.825394,64.195175,70.215176,62.181934,70.566388,62.997734,61.82713,159543.656496


#  3.4.  Calculate voter turnout for college-educated vs. non-college-educated voters

In [39]:
# frequency of each distinct value in the education column
# (value_counts() leaves out missing values by default)
merged_file['CommercialData_Education'].value_counts()

Some College - Likely                             416493
Bach Degree - Extremely Likely                    368922
Bach Degree - Likely                              303216
HS Diploma - Extremely Likely                     250953
HS Diploma - Likely                               214314
Grad Degree - Likely                              211235
Grad Degree - Extremely Likely                    191528
Less than HS Diploma - Likely                     156698
Some College -Extremely Likely                     75317
Vocational Technical Degree - Extremely Likely      2519
Less than HS Diploma - Ex Like                       127
Name: CommercialData_Education, dtype: int64

In [40]:
# add column to merged file for college or no college
college_ed = ['Some College - Likely', 'Bach Degree - Extremely Likely', 'Bach Degree - Likely', 
              'Grad Degree - Likely', 'Grad Degree - Extremely Likely', 'Some College -Extremely Likely']
no_college_ed = ['HS Diploma - Extremely Likely', 'HS Diploma - Likely', 'Less than HS Diploma - Likely', 
                'Vocational Technical Degree - Extremely Likely', 'Less than HS Diploma - Ex Like']

conditions = [merged_file['CommercialData_Education'].isin(college_ed), 
             merged_file['CommercialData_Education'].isin(no_college_ed)]
outputs = ['college', 'no_college']
# rows whose education value is missing or in neither list get None
education_col = np.select(conditions, outputs, None)
# np.select returns a plain array in row order. Give the Series merged_file's own
# index: column assignment aligns on index labels, so a default 0..n-1 index would
# put values on the wrong rows (or leave NaN) whenever merged_file's index is not
# exactly 0..n-1.
education_col = pd.Series(education_col, index=merged_file.index)

# add to merged file
merged_file['College_Ed'] = education_col

In [41]:
# first rows of merged_file, now including the College_Ed column
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2011_11_08,Consolidated_General_2015_11_03,Local_or_Municipal_2017_04_11,Consolidated_General_2013_11_05,General_2016_11_08,General_2020_11_03,Consolidated_General_2017_11_07,General_2018_11_06,Local_or_Municipal_2019_03_05,...,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount,College_Ed
0,LALCA453164106,N,N,N,N,Y,Y,N,Y,N,...,Democratic,others,06/18/2021,ALAMEDA,,,,,,
1,LALCA453008306,N,N,N,N,N,N,N,Y,N,...,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,,
2,LALCA22129469,N,N,N,N,Y,Y,N,Y,N,...,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,,no_college
3,LALCA549803906,N,N,N,N,N,Y,N,N,N,...,Democratic,others,02/07/2022,ALAMEDA,,,,,,
4,LALCA24729024,N,N,N,N,N,N,N,N,N,...,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,,no_college


In [42]:
# every (city, College_Ed) pair gets one 'N' row and one 'Y' row
list_edu_city = merged_file[['Residence_Addresses_City', 'College_Ed']].drop_duplicates()
list_edu_city_No = list_edu_city.assign(voted='N')
list_edu_city_Yes = list_edu_city.assign(voted='Y')
list_edu_city = pd.concat([list_edu_city_No, list_edu_city_Yes])

# total number of voters per (city, College_Ed), repeated once for voted = 'N' and once for voted = 'Y'
total_city_edu = (
    merged_file
    .groupby(['Residence_Addresses_City', 'College_Ed'])
    .size()
    .reset_index(name='total_voters')
    .merge(list_edu_city, on=['Residence_Addresses_City', 'College_Ed'])
)

total_city_edu

Unnamed: 0,Residence_Addresses_City,College_Ed,total_voters,voted
0,Albany,college,5107,N
1,Albany,college,5107,Y
2,Albany,no_college,696,N
3,Albany,no_college,696,Y
4,Alhambra,college,13958,N
...,...,...,...,...
155,Watsonville,no_college,6446,Y
156,Whittier,college,37134,N
157,Whittier,college,37134,Y
158,Whittier,no_college,23619,N


In [43]:
# function to calculate percent turnout by education level (college vs. no college)
def calc_votes_edu(df, col):
    """Turnout per city and education level for one election, one row per city.

    df  : voter-level table with 'Residence_Addresses_City', 'College_Ed'
          and the 'Y'/'N' column `col`.
    col : election column named '<elec_type>_<YYYY>_<MM>_<DD>'.

    Group totals come from the module-level `total_city_edu` table.
    Returns a wide frame with elec_type, elec_year, elec_date, city and
    total_voters_<level>, voted_voters_<level>, perc_turnout_<level> columns.
    """
    group_keys = ['Residence_Addresses_City', 'College_Ed']
    index_cols = ['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City']

    # number of voters per (city, education level, Y/N); the flag column becomes 'voted'
    counts = (
        df.groupby(group_keys + [col])
        .size()
        .reset_index(name='voted_voters')
        .rename(columns={col: 'voted'})
    )

    # left join keeps groups in which nobody voted (their count is NaN until filled below)
    turnout = total_city_edu.merge(counts, how='left', on=group_keys + ['voted'])
    turnout['perc_turnout'] = turnout['voted_voters'] / turnout['total_voters'] * 100

    # the last 10 characters of the column name are the date, preceded by '_' and the type
    date_start = len(col) - 10
    turnout['elec_date'] = col[date_start:]
    turnout['elec_year'] = col[date_start:date_start + 4]
    turnout['elec_type'] = col[:date_start - 1]

    filled_cols = ['voted_voters', 'perc_turnout']
    turnout[filled_cols] = turnout[filled_cols].fillna(0)
    turnout = turnout[turnout['voted'] == 'Y']

    pivot_df = turnout.pivot(index=index_cols,
                             columns='College_Ed',
                             values=['total_voters', 'voted_voters', 'perc_turnout']).reset_index()
    # flatten the two-level header; index columns end up with a trailing '_' that is removed
    pivot_df.columns = ['_'.join(pair) for pair in pivot_df.columns]
    pivot_df = pivot_df.rename(columns={f'{name}_': name for name in index_cols})

    del turnout
    gc.collect()
    return pivot_df

elec_date_cols = GE_cols+LM_cols

# Stack all elections into one dataframe. The per-election frames are collected
# first and concatenated once, instead of re-copying the growing result on every pass.
edu_frames = []
for col in elec_date_cols:
    pivot_df = calc_votes_edu(merged_file, col)
    edu_frames.append(pivot_df)
voter_turnout_edu = pd.concat(edu_frames)

In [44]:
# first rows of the stacked per-election education turnout table
voter_turnout_edu.head()

Unnamed: 0,elec_type,elec_year,elec_date,Residence_Addresses_City,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,General,2016,2016_11_08,Albany,5107.0,696.0,4031.0,553.0,78.930879,79.454023
1,General,2016,2016_11_08,Alhambra,13958.0,7112.0,8595.0,4471.0,61.57759,62.865579
2,General,2016,2016_11_08,Anaheim,52558.0,28944.0,34878.0,19107.0,66.360973,66.013682
3,General,2016,2016_11_08,Bellflower,10496.0,7699.0,6468.0,4823.0,61.623476,62.644499
4,General,2016,2016_11_08,Berkeley,27403.0,3717.0,21904.0,2861.0,79.932854,76.970675


In [45]:
# Merge 3.1, 3.2, 3.3, and 3.4 (adds the education turnout columns)
voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    voter_turnout_edu,
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
    how='inner',
)

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$35000-49999,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,66.725766,69.237349,77.473319,115534.731762,80668.0,27989.0,71356.0,22543.0,88.456389,80.542356
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,44.346439,47.246377,57.619007,115534.731762,80668.0,27989.0,59248.0,17372.0,73.44672,62.067241
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,49.470325,51.401758,60.54125,115534.731762,80668.0,27989.0,60766.0,18951.0,75.328507,67.708743
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,24.103542,25.31005,32.174318,115534.731762,80668.0,27989.0,38216.0,10811.0,47.374424,38.625889
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,75.835756,72.907226,73.965751,113020.551072,19051.0,10653.0,16014.0,8674.0,84.05858,81.423073


#  3.5.  Calculate voter average age

In [46]:
def calc_age(df):
    """Mean age, on election day, of the people who voted, per city and election.

    df needs 'Residence_Addresses_City', 'Voters_BirthDate' (text formatted
    mm/dd/YYYY) and one 'Y'/'N' column for every name in the module-level
    `elec_date_cols`; those names look like '<elec_type>_<YYYY>_<MM>_<DD>'.

    Returns one row per (city, election) with the columns
    Residence_Addresses_City, elec_date, elec_year, elec_type, mean_age.
    """
    id_cols = ['Residence_Addresses_City', 'Voters_BirthDate']

    # work on the frame that was passed in rather than on the global merged_file
    melt_age_df = df[id_cols + elec_date_cols].melt(id_vars=id_cols,
                                                    value_vars=elec_date_cols,
                                                    var_name='elec_type_date',
                                                    value_name='voted')
    melt_age_df = melt_age_df[melt_age_df['voted'] == 'Y']

    # the election date is the last 10 characters of the column name (YYYY_MM_DD)
    tag = melt_age_df['elec_type_date']
    elec_date = tag.str[-10:]
    birth_date = pd.to_datetime(melt_age_df['Voters_BirthDate'], format='%m/%d/%Y')
    voting_date = pd.to_datetime(elec_date, format='%Y_%m_%d')

    # 365.2425 days is the length numpy assigns to np.timedelta64(1, 'Y'); spelling it
    # out keeps the same ages while avoiding the 'Y' unit, which newer pandas
    # releases no longer accept in timedelta arithmetic
    one_year = pd.Timedelta(days=365.2425)

    melt_age_df = melt_age_df.assign(
        elec_date=elec_date,
        elec_year=tag.str[-10:-6],
        elec_type=tag.str[:-11],
        age_on_vote=(voting_date - birth_date) / one_year,
    )

    age_df = (
        melt_age_df
        .groupby(['Residence_Addresses_City', 'elec_date', 'elec_year', 'elec_type'])
        .agg({'age_on_vote': 'mean'})
        .reset_index()
    )
    return age_df.rename(columns={'age_on_vote': 'mean_age'})

In [47]:
# mean voter age per city and election, computed from the merged table
age_df = calc_age(merged_file)
age_df

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,mean_age
0,Albany,2011_11_08,2011,Consolidated_General,40.756879
1,Albany,2013_11_05,2013,Consolidated_General,45.844953
2,Albany,2014_11_04,2014,General,52.884080
3,Albany,2015_11_03,2015,Consolidated_General,39.316921
4,Albany,2016_11_08,2016,General,49.504594
...,...,...,...,...,...
577,Whittier,2019_03_05,2019,Local_or_Municipal,47.278821
578,Whittier,2019_06_04,2019,Local_or_Municipal,89.680144
579,Whittier,2019_11_05,2019,Consolidated_General,49.308055
580,Whittier,2020_11_03,2020,General,47.928141


In [48]:
# Merge 3.1, 3.2, 3.3, 3.4 and 3.5 (adds the mean_age column)

voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    age_df,
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
    how='inner',
)

voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$50000-74999,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,69.237349,77.473319,115534.731762,80668.0,27989.0,71356.0,22543.0,88.456389,80.542356,47.829424
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,47.246377,57.619007,115534.731762,80668.0,27989.0,59248.0,17372.0,73.44672,62.067241,48.224196
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,51.401758,60.54125,115534.731762,80668.0,27989.0,60766.0,18951.0,75.328507,67.708743,46.819512
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,25.31005,32.174318,115534.731762,80668.0,27989.0,38216.0,10811.0,47.374424,38.625889,51.034918
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,72.907226,73.965751,113020.551072,19051.0,10653.0,16014.0,8674.0,84.05858,81.423073,49.75255


In [49]:
# drop the standalone age table and run the garbage collector
del age_df
gc.collect()

0

#  3.6.  Calculate total population aged 20+

In [50]:
# read in the cities file and keep only the rows that have a population_proper value
df_cities = pd.read_csv(f'../data/{cities_filename}').dropna(subset=['population_proper'])

# voter_population = total population * share of residents aged 20 and over
def calculate_voter_pop(row):
    """Return the estimated 20+ population for one row of the cities table.

    The age-band columns are summed and divided by 100 (they are treated as
    percentages), multiplied by 'population_proper', and floored to an int.
    """
    age_bands = ['age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s',
                 'age_70s', 'age_over_80']
    share_20_plus = row[age_bands].sum() / 100
    estimate = np.floor(row['population_proper'] * share_20_plus)
    return int(estimate)

# Row-wise estimate of the 20+ population for every city.
df_cities['voter_population'] = df_cities.apply(calculate_voter_pop, axis=1)
df_cities.head()

Unnamed: 0,RCV,city,city_ascii,city_alt,state_id,state_name,county_fips,county_name,county_fips_all,county_name_all,...,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,voter_population
0,,Los Angeles,Los Angeles,,CA,California,6037,Los Angeles,6037,Los Angeles,...,22.9,3.5,48.7,10.0,20.4,14.9,30.9,15.5,2.9,3039816
1,1.0,San Francisco,San Francisco,,CA,California,6075,San Francisco,6075,San Francisco,...,7.5,5.1,15.3,10.6,11.7,11.8,32.8,5.5,3.3,751708
2,,San Diego,San Diego,,CA,California,6073,San Diego,6073,San Diego,...,6.2,5.1,30.0,9.0,14.5,7.2,24.0,10.4,8.4,1088768
3,,Riverside,Riverside,,CA,California,6065,Riverside,6065,Riverside,...,18.9,4.8,52.8,10.3,16.6,6.1,30.4,12.5,5.3,232359
4,,Sacramento,Sacramento,,CA,California,6067,Sacramento,6067,Sacramento,...,10.3,6.8,28.3,12.5,19.8,7.9,25.4,9.1,6.1,371908


In [51]:
# Attach each city's voter_population to the running merge. The join key
# coming from the cities table ('city') duplicates Residence_Addresses_City,
# so it is dropped right after the join.
voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    df_cities[['city', 'voter_population']],
    how='inner',
    left_on=['Residence_Addresses_City'],
    right_on=['city'],
).drop(columns=['city'])
voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,perc_turnout_income_$75000-99999,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,77.473319,115534.731762,80668.0,27989.0,71356.0,22543.0,88.456389,80.542356,47.829424,331652
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,57.619007,115534.731762,80668.0,27989.0,59248.0,17372.0,73.44672,62.067241,48.224196,331652
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,60.54125,115534.731762,80668.0,27989.0,60766.0,18951.0,75.328507,67.708743,46.819512,331652
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,32.174318,115534.731762,80668.0,27989.0,38216.0,10811.0,47.374424,38.625889,51.034918,331652
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,73.965751,113020.551072,19051.0,10653.0,16014.0,8674.0,84.05858,81.423073,49.75255,70359


# Calculate total turnout for each election

In [52]:
# Build a (city, voted) grid with one 'N' row and one 'Y' row per distinct
# city, so a left-merge against it keeps cities that have no matching count.
list_city = merged_file[['Residence_Addresses_City']].drop_duplicates()
list_city_No = list_city.assign(voted='N')
list_city_Yes = list_city.assign(voted='Y')
list_city = pd.concat([list_city_No, list_city_Yes])

list_city

Unnamed: 0,Residence_Addresses_City,voted
0,Oakland,N
4,San Leandro,N
5,Livermore,N
46,Berkeley,N
4538,Albany,N
...,...,...
3432084,Solvang,Y
3446681,Gilroy,Y
3446764,Watsonville,Y
3994274,Davis,Y


In [53]:
# Calculate total voters per election

def calc_turnout(df, col):
    """Count, per city, the voters marked 'Y' in one election column.

    Parameters
    ----------
    df : DataFrame with 'Residence_Addresses_City' and the election column.
    col : election column name; its last 10 characters are taken as the
        election date, the first 4 of those as the year, and everything
        before the separator preceding the date as the election type.

    Returns
    -------
    DataFrame with columns Residence_Addresses_City, voted_voters, elec_date,
    elec_year and elec_type: one row per 'Y' entry of the module-level
    `list_city` grid, with 0 where a city has no 'Y' rows in `df`.
    """
    date_start = len(col) - 10

    # rows per (city, value of col); the value column is renamed to 'voted'
    counts = (
        df.groupby(['Residence_Addresses_City', col])
        .size()
        .reset_index(name='voted_voters')
        .rename(columns={col: 'voted'})
    )

    # left-merge onto the full city x {'N', 'Y'} grid so every city is kept
    grid = list_city.merge(counts,
                           how='left',
                           on=['Residence_Addresses_City', 'voted'])
    grid['elec_date'] = col[date_start:]
    grid['elec_year'] = col[date_start:date_start + 4]
    grid['elec_type'] = col[:date_start - 1]

    # combinations absent from df come back as NaN from the merge -> 0
    grid['voted_voters'] = grid['voted_voters'].fillna(0)

    voted_only = grid[grid['voted'] == 'Y']
    df_out = voted_only[['Residence_Addresses_City', 'voted_voters',
                         'elec_date', 'elec_year', 'elec_type']]
    del grid, voted_only
    gc.collect()

    return df_out

elec_date_cols = GE_cols + LM_cols

# Compute the per-city turnout table for every election column and stack them
# with a single concat. This replaces an index loop that special-cased the
# first iteration and re-concatenated the growing frame on every pass.
voter_turnout = pd.concat(
    [calc_turnout(merged_file, col) for col in elec_date_cols]
)

voter_turnout

Unnamed: 0,Residence_Addresses_City,voted_voters,elec_date,elec_year,elec_type
40,Oakland,135633.0,2016_11_08,2016,General
41,San Leandro,30717.0,2016_11_08,2016,General
42,Livermore,33594.0,2016_11_08,2016,General
43,Berkeley,41811.0,2016_11_08,2016,General
44,Albany,7190.0,2016_11_08,2016,General
...,...,...,...,...,...
75,Solvang,0.0,2017_02_28,2017,Local_or_Municipal
76,Gilroy,0.0,2017_02_28,2017,Local_or_Municipal
77,Watsonville,0.0,2017_02_28,2017,Local_or_Municipal
78,Davis,0.0,2017_02_28,2017,Local_or_Municipal


In [54]:
# Add the total voted_voters per city and election to the running merge
# (inner join on election type/year/date and city).
voter_turnout_merge = pd.merge(
    voter_turnout_merge,
    voter_turnout,
    how='inner',
    on=['elec_type', 'elec_year', 'elec_date', 'Residence_Addresses_City'],
)

voter_turnout_merge.head(8)


Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,CommercialData_EstimatedHHIncomeAmount,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population,voted_voters
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,115534.731762,80668.0,27989.0,71356.0,22543.0,88.456389,80.542356,47.829424,331652,172221.0
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,115534.731762,80668.0,27989.0,59248.0,17372.0,73.44672,62.067241,48.224196,331652,130249.0
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,115534.731762,80668.0,27989.0,60766.0,18951.0,75.328507,67.708743,46.819512,331652,135633.0
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,115534.731762,80668.0,27989.0,38216.0,10811.0,47.374424,38.625889,51.034918,331652,75153.0
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,113020.551072,19051.0,10653.0,16014.0,8674.0,84.05858,81.423073,49.75255,70359,41879.0
5,San Leandro,2018_11_06,2018,General,12705.0,5596.0,16028.0,2429.0,17780.0,5006.0,...,113020.551072,19051.0,10653.0,11549.0,6219.0,60.62149,58.377922,51.129865,70359,27621.0
6,San Leandro,2016_11_08,2016,General,12705.0,5596.0,16028.0,2429.0,17780.0,5602.0,...,113020.551072,19051.0,10653.0,12551.0,7066.0,65.881056,66.328734,49.239731,70359,30717.0
7,San Leandro,2014_11_04,2014,General,12705.0,5596.0,16028.0,2429.0,17780.0,2510.0,...,113020.551072,19051.0,10653.0,6931.0,3729.0,36.381292,35.004224,54.201707,70359,15063.0


In [55]:
# Sanity check: (rows, columns) of the final merged table.
print(voter_turnout_merge.shape)

(192, 68)


In [56]:
## --- CA ---
## We found 24 cities out of 40 with the four most recent local_or_municipal + consolidated_general election dates on or after 2008.
## We found all general election dates for all 40 cities.
## Expected rows: 24 cities * 8 elections + the remaining 16 cities * 4 general elections.
## NOTE(review): these counts are hard-coded for CA; the message is printed whatever `state` is set to.
print("expected number of rows for CA:", 24*8 + (40-24)*4)


expected number of rows for CA: 256


# Save the merged aggregations 

In [57]:
# Write the merged table to the state's data directory (filepath), without the index column.
voter_turnout_merge.to_csv(f'{filepath}voter_turnout_merged_{state}.csv', index=False)

In [58]:
# The merged table has been written to CSV; drop the reference and run a garbage-collection pass.
del voter_turnout_merge
gc.collect()

0

In [59]:
# start_time was recorded with time.time() in the imports cell at the top of the notebook.
end_time = time.time()
print("Time take to run this notebook in seconds: ", end_time - start_time)

Time take to run this notebook in seconds:  270.4008569717407


# Combine all states into one file
Once the above code is run for all states and saved, run the code below to combine them into one. 

In [70]:
### Combine the per-state merged turnout files, starting with California

# Data directory that holds each state's voter_turnout_merged_<state>.csv.
# A lookup table replaces the chain of `if state == ...` checks, so a state
# without an entry cannot silently reuse another state's directory.
state_data_dirs = {
    'CA': '../data/VM2--CA--2022-04-25/',  # California
    'CO': '../data/VM2--CO--2022-04-26/',  # Colorado
    'MD': '../data/VM2--MD--2022-04-08/',  # Maryland
    'ME': '../data/VM2--ME--2022-03-02/',  # Maine
    'MN': '../data/VM2--MN--2022-03-25/',  # Minnesota
    'NM': '../data/VM2--NM--2022-03-30/',  # New Mexico
    'UT': '../data/VM2--UT--2022-03-30/',  # Utah
    'VT': '../data/VM2--VT--2022-04-20/',  # Vermont
}

# Read every state's file, tag its rows with the state code, and stack all of
# them in one concat. The comprehension variables are local, so the
# notebook-level `state` and `filepath` are no longer overwritten here.
all_states_voter_turnout_merge = pd.concat([
    pd.read_csv(f'{data_dir}voter_turnout_merged_{state_code}.csv')
      .assign(state=state_code)
    for state_code, data_dir in state_data_dirs.items()
])

In [71]:
# Check which state codes are present in the combined table.
all_states_voter_turnout_merge['state'].unique()

array(['CA', 'CO', 'MD', 'ME', 'MN', 'NM', 'UT', 'VT'], dtype=object)

In [72]:
# Persist the combined table for all states, without the index column.
all_states_voter_turnout_merge.to_csv(
    '../data/voter_turnout_merged_all_states.csv',
    index=False,
)

In [73]:
# Preview the first rows of the combined table.
all_states_voter_turnout_merge.head()

Unnamed: 0,Residence_Addresses_City,elec_date,elec_year,elec_type,total_voters_asian,total_voters_black,total_voters_hispanic,total_voters_others,total_voters_white,voted_voters_asian,...,total_voters_college,total_voters_no_college,voted_voters_college,voted_voters_no_college,perc_turnout_college,perc_turnout_no_college,mean_age,voter_population,voted_voters,state
0,Oakland,2020_11_03,2020,General,30600.0,61476.0,37174.0,8628.0,83122.0,23041.0,...,80668.0,27989.0,71356.0,22543.0,88.456389,80.542356,47.829424,331652.0,172221.0,CA
1,Oakland,2018_11_06,2018,General,30600.0,61476.0,37174.0,8628.0,83122.0,14972.0,...,80668.0,27989.0,59248.0,17372.0,73.44672,62.067241,48.224196,331652.0,130249.0,CA
2,Oakland,2016_11_08,2016,General,30600.0,61476.0,37174.0,8628.0,83122.0,16057.0,...,80668.0,27989.0,60766.0,18951.0,75.328507,67.708743,46.819512,331652.0,135633.0,CA
3,Oakland,2014_11_04,2014,General,30600.0,61476.0,37174.0,8628.0,83122.0,8145.0,...,80668.0,27989.0,38216.0,10811.0,47.374424,38.625889,51.034918,331652.0,75153.0,CA
4,San Leandro,2020_11_03,2020,General,12705.0,5596.0,16028.0,2429.0,17780.0,9229.0,...,19051.0,10653.0,16014.0,8674.0,84.05858,81.423073,49.75255,70359.0,41879.0,CA
