In [30]:
import pandas as pd
import numpy as np
import janitor
import gc
import time
from datetime import datetime
import matplotlib.pyplot as plt
start_time = time.time()

In [2]:
state = 'california'

In [3]:
def combine_cities_list(RCV_list, NonRCV_list):

    print("total number of cities:", len(RCV_list))

    print("number of distinct cities:", len(set(NonRCV_list)))

    print("name of cities that were duplicated:", set([x for x in NonRCV_list if NonRCV_list.count(x) > 1]))

    combined_cityName = RCV_list+list(set(NonRCV_list))
    print("number of distinct RCV and sampled nonRCV cities:", len(combined_cityName))
    return combined_cityName

In [4]:
if state=='california':
    # ------ California -------

    ## change the filepath as required, we have selected the folder with the latest date

    filepath = '../data/VM2--CA--2022-04-25/'
    DEMO_filename = 'VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
    VOTE_filename = 'VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
    elec_dates_filename = 'GE_LM_dates_per_city_CA.parquet'
    cities_filename = 'ca-cities.csv'

    # 1. List of RCV and non-RCV cities 

    RCV_cities_CA = ['San Francisco',
     'Oakland',
     'Berkeley',
     'San Leandro',
     'Palm Desert',
     'Eureka',
     'Albany']

    sampled_nonRCV_cities_CA = ['Fresno',
     'San Diego',
     'Sacramento',
     'Riverside',
     'San Jose',
     'Santa Ana',
     'Anaheim',
     'Santa Rosa',
     'Merced',
     'Santa Clarita',
     'Alhambra',
     'Davis',
     'Montebello',
     'Burbank',
     'Huntington Park',
     'Bellflower',
     'Watsonville',
     'Gilroy',
     'Whittier',
     'Lynwood',
     'Lakewood',
     'Pico Rivera',
     'Lake Forest',
     'Livermore',
     'Chino Hills',
     'Paramount',
     'El Paso de Robles',
     'Pico Rivera',
     'Buena Park',
     'Whittier',
     'Calabasas',
     'Carpinteria',
     'Morro Bay',
     'San Carlos',
     'Solvang']

    combined_sampled_cityName = combine_cities_list(RCV_list= RCV_cities_CA, NonRCV_list = sampled_nonRCV_cities_CA)
    # ---------------------

total number of cities: 7
number of distinct cities: 33
name of cities that were duplicated: {'Pico Rivera', 'Whittier'}
number of distinct RCV and sampled nonRCV cities: 40


In [5]:
merge_filename = DEMO_filename.replace('DEMOGRAPHIC_selected_cols.parquet', 'merged.parquet')
merged_file = pd.read_parquet(f'{filepath}{merge_filename}')

In [7]:
def replace_ethnicities(df):
    df = df.replace('East and South Asian', 'asian')
    df = df.replace('European', 'white')
    df = df.replace('Hispanic and Portuguese', 'hispanic')
    df = df.replace('Likely African-American', 'black')
    df = df.replace('Other', 'others') # including 'others' ethnical group
    return df

In [8]:
merged_file = replace_ethnicities(merged_file)

In [9]:
GE_cols = [col for col in merged_file.columns if col.startswith('General')]
print(GE_cols)
LM_cols = [col for col in merged_file.columns if col.startswith('Local_or_Municipal') \
           or col.startswith('Consolidated_General')]
print(LM_cols)

['General_2014_11_04', 'General_2016_11_08', 'General_2018_11_06', 'General_2020_11_03']
['Consolidated_General_2019_11_05', 'Local_or_Municipal_2019_06_04', 'Local_or_Municipal_2019_08_27', 'Local_or_Municipal_2021_05_11', 'Local_or_Municipal_2021_04_20', 'Local_or_Municipal_2021_07_20', 'Local_or_Municipal_2021_06_01', 'Local_or_Municipal_2019_03_05', 'Local_or_Municipal_2020_04_14', 'Local_or_Municipal_2019_08_13', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2019_04_16', 'Consolidated_General_2021_11_02', 'Local_or_Municipal_2021_03_02', 'Consolidated_General_2017_11_07', 'Local_or_Municipal_2020_08_03']


In [10]:
# fill NA values with "N" to make it easier to compare  with "Y"
merged_file[GE_cols+LM_cols] = merged_file[GE_cols+LM_cols].fillna('N')
merged_file.head()

Unnamed: 0,LALVOTERID,Consolidated_General_2019_11_05,Local_or_Municipal_2019_06_04,Local_or_Municipal_2019_08_27,Local_or_Municipal_2021_05_11,Local_or_Municipal_2021_04_20,Local_or_Municipal_2021_07_20,General_2014_11_04,Local_or_Municipal_2021_06_01,Local_or_Municipal_2019_03_05,...,Voters_BirthDate,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncome,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,N,N,N,N,N,N,N,N,N,...,04/29/1993,Democratic,others,06/18/2021,ALAMEDA,,,,,
1,LALCA453008306,N,N,N,N,N,N,N,N,N,...,02/02/1996,Non-Partisan,black,04/01/2021,ALAMEDA,,,,,
2,LALCA22129469,N,N,N,N,N,N,Y,N,N,...,02/02/1975,Democratic,white,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,,
3,LALCA549803906,N,N,N,N,N,N,N,N,N,...,02/09/1962,Democratic,others,02/07/2022,ALAMEDA,,,,,
4,LALCA24729024,N,N,N,N,N,N,N,N,N,...,01/01/1966,Democratic,white,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,,


In [11]:
merged_file.columns

Index(['LALVOTERID', 'Consolidated_General_2019_11_05',
       'Local_or_Municipal_2019_06_04', 'Local_or_Municipal_2019_08_27',
       'Local_or_Municipal_2021_05_11', 'Local_or_Municipal_2021_04_20',
       'Local_or_Municipal_2021_07_20', 'General_2014_11_04',
       'Local_or_Municipal_2021_06_01', 'Local_or_Municipal_2019_03_05',
       'Local_or_Municipal_2020_04_14', 'Local_or_Municipal_2019_08_13',
       'Local_or_Municipal_2021_06_08', 'General_2016_11_08',
       'Local_or_Municipal_2019_04_16', 'Consolidated_General_2021_11_02',
       'Local_or_Municipal_2021_03_02', 'General_2018_11_06',
       'Consolidated_General_2017_11_07', 'General_2020_11_03',
       'Local_or_Municipal_2020_08_03', 'Residence_Addresses_City',
       'Voters_Gender', 'Voters_Age', 'Voters_BirthDate',
       'Parties_Description', 'EthnicGroups_EthnicGroup1Desc',
       'Voters_OfficialRegDate', 'County', 'CommercialData_Education',
       'CommercialData_EstimatedHHIncome',
       'CommercialData_E

In [18]:
elec_date_cols = GE_cols+LM_cols

In [19]:
age_per_election = merged_file[['Residence_Addresses_City', 'Voters_BirthDate']+elec_date_cols].drop_duplicates()

In [20]:
age_per_election

Unnamed: 0,Residence_Addresses_City,Voters_BirthDate,General_2014_11_04,General_2016_11_08,General_2018_11_06,General_2020_11_03,Consolidated_General_2019_11_05,Local_or_Municipal_2019_06_04,Local_or_Municipal_2019_08_27,Local_or_Municipal_2021_05_11,...,Local_or_Municipal_2021_06_01,Local_or_Municipal_2019_03_05,Local_or_Municipal_2020_04_14,Local_or_Municipal_2019_08_13,Local_or_Municipal_2021_06_08,Local_or_Municipal_2019_04_16,Consolidated_General_2021_11_02,Local_or_Municipal_2021_03_02,Consolidated_General_2017_11_07,Local_or_Municipal_2020_08_03
0,Oakland,04/29/1993,N,Y,Y,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
1,Oakland,02/02/1996,N,N,Y,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
2,Oakland,02/02/1975,Y,Y,Y,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
3,Oakland,02/09/1962,N,N,N,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
4,San Leandro,01/01/1966,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4140397,Davis,11/11/1995,N,N,N,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
4140399,Davis,06/02/1996,N,N,N,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
4140400,Davis,10/14/1995,N,Y,N,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
4140401,Davis,10/29/1995,N,Y,Y,Y,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N


In [21]:
melt_age_df = age_per_election.melt(id_vars=['Residence_Addresses_City', 'Voters_BirthDate'],
                                       value_vars=elec_date_cols,
                                       var_name='elec_type_date',
                                       value_name='voted')

In [22]:
melt_age_df

Unnamed: 0,Residence_Addresses_City,Voters_BirthDate,elec_type_date,voted
0,Oakland,04/29/1993,General_2014_11_04,N
1,Oakland,02/02/1996,General_2014_11_04,N
2,Oakland,02/02/1975,General_2014_11_04,Y
3,Oakland,02/09/1962,General_2014_11_04,N
4,San Leandro,01/01/1966,General_2014_11_04,N
...,...,...,...,...
42950095,Davis,11/11/1995,Local_or_Municipal_2020_08_03,N
42950096,Davis,06/02/1996,Local_or_Municipal_2020_08_03,N
42950097,Davis,10/14/1995,Local_or_Municipal_2020_08_03,N
42950098,Davis,10/29/1995,Local_or_Municipal_2020_08_03,N


In [23]:
melt_age_df = melt_age_df[melt_age_df['voted'] == 'Y']

In [24]:
elec_date_dict = pd.DataFrame(elec_date_cols, columns = ['elec_type_date'])
elec_date_dict['date'] = elec_date_dict['elec_type_date'].str.slice(-5,-3) + '/' + elec_date_dict['elec_type_date'].str.slice(-2) + '/' + elec_date_dict['elec_type_date'].str.slice(-10,-6)
melt_age_df = melt_age_df.merge(elec_date_dict,
                                    how = 'inner', 
                                    on = ['elec_type_date'])
melt_age_df.rename(columns = {'date': 'voting_date'}, inplace = True)
melt_age_df['Voters_BirthDate'] = pd.to_datetime(melt_age_df['Voters_BirthDate'], format='%m/%d/%Y')
melt_age_df['voting_date'] = pd.to_datetime(melt_age_df['voting_date'], format='%m/%d/%Y')
melt_age_df['age_on_vote'] = (melt_age_df['voting_date'] - melt_age_df['Voters_BirthDate']) / np.timedelta64(1, 'Y')
melt_age_df['elec_date'] = melt_age_df['elec_type_date'].str[-10:]
melt_age_df['elec_year'] = melt_age_df['elec_type_date'].str[-10:-6]
melt_age_df['elec_type'] = melt_age_df['elec_type_date'].str[:-11]

In [25]:
melt_age_df

Unnamed: 0,Residence_Addresses_City,Voters_BirthDate,elec_type_date,voted,voting_date,age_on_vote,elec_date,elec_year,elec_type
0,Oakland,1975-02-02,General_2014_11_04,Y,2014-11-04,39.754410,2014_11_04,2014,General
1,Livermore,1959-06-28,General_2014_11_04,Y,2014-11-04,55.355004,2014_11_04,2014,General
2,Livermore,1963-04-29,General_2014_11_04,Y,2014-11-04,51.519196,2014_11_04,2014,General
3,Oakland,1967-03-02,General_2014_11_04,Y,2014-11-04,47.677913,2014_11_04,2014,General
4,Oakland,1957-09-21,General_2014_11_04,Y,2014-11-04,57.120954,2014_11_04,2014,General
...,...,...,...,...,...,...,...,...,...
4847702,Davis,1950-02-28,Local_or_Municipal_2020_08_03,Y,2020-08-03,70.429920,2020_08_03,2020,Local_or_Municipal
4847703,Davis,1941-06-23,Local_or_Municipal_2020_08_03,Y,2020-08-03,79.114561,2020_08_03,2020,Local_or_Municipal
4847704,Davis,1964-07-29,Local_or_Municipal_2020_08_03,Y,2020-08-03,56.014839,2020_08_03,2020,Local_or_Municipal
4847705,Davis,1943-04-08,Local_or_Municipal_2020_08_03,Y,2020-08-03,77.323970,2020_08_03,2020,Local_or_Municipal


In [157]:
age_df = melt_age_df.groupby(['voting_date', 'elec_type']).agg({'age_on_vote':'mean'}).reset_index()
age_df.rename(columns = {'age_on_vote': 'mean_age'}, inplace = True)
age_df

Unnamed: 0,voting_date,elec_type,mean_age
0,2014-11-04,General,53.010559
1,2016-11-08,General,48.905987
2,2017-11-07,Consolidated_General,59.334618
3,2018-11-06,General,50.722472
4,2019-03-05,Local_or_Municipal,59.683766
5,2019-04-16,Local_or_Municipal,52.777832
6,2019-06-04,Local_or_Municipal,57.549057
7,2019-08-13,Local_or_Municipal,53.850441
8,2019-08-27,Local_or_Municipal,45.308939
9,2019-11-05,Consolidated_General,53.151171


In [155]:
# ploting the graph