In [1]:
import pandas as pd
import janitor
import gc

In [7]:
# California cities used as the comparison sample; the demographic data is
# filtered to these names with `.isin(...)`.
# Each city is listed exactly once ('Pico Rivera' and 'Whittier' previously
# appeared twice, which had no effect on filtering but overstated the sample size).
sampled_cityName = [
    'Fresno',
    'San Diego',
    'Sacramento',
    'Riverside',
    'San Jose',
    'Santa Ana',
    'Anaheim',
    'Santa Rosa',
    'Merced',
    'Santa Clarita',
    'Alhambra',
    'Davis',
    'Montebello',
    'Burbank',
    'Huntington Park',
    'Bellflower',
    'Watsonville',
    'Gilroy',
    'Whittier',
    'Lynwood',
    'Lakewood',
    'Pico Rivera',
    'Lake Forest',
    'Livermore',
    'Chino Hills',
    'Paramount',
    'El Paso de Robles',
    'Buena Park',
    'Calabasas',
    'Carpinteria',
    'Morro Bay',
    'San Carlos',
    'Solvang',
]

## Demographic Data

1. Select only the columns required: city name ('Residence_Addresses_City'), unique voter id ('LALVOTERID'), voter's ethnicity ('EthnicGroups_EthnicGroup1Desc') and date when voter was registered ('Voters_OfficialRegDate')
2. Keep only the cities that were identified as being similar to RCV cities in CA (See ca_similarity_search.ipynb for reference) 
3. Keep only rows where 'EthnicGroups_EthnicGroup1Desc' is one of “European”, “Likely African-American”, “Hispanic and Portuguese” or “East and South Asian”
4. Keep only registered voters, i.e. rows with a non-null 'Voters_OfficialRegDate'


In [6]:
# Directory that contains the voter-file extract; change the filepath as required.
filepath = 'VM2--CA--2022-04-25/'

# The only demographic columns this notebook needs.
selected_variables = [
    'Residence_Addresses_City',
    'LALVOTERID',
    'EthnicGroups_EthnicGroup1Desc',
    'Voters_OfficialRegDate',
]

# Load every column as a string and restrict the read to the columns above.
state_demographic = pd.read_csv(
    f'{filepath}VM2--CA--2022-04-25-DEMOGRAPHIC.tab',
    usecols=selected_variables,
    sep='\t',
    dtype=str,
    encoding='unicode_escape',
)


In [8]:
# Preview the first five rows (the default for head()).
state_demographic.head()

Unnamed: 0,LALVOTERID,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate
0,LALCA453164106,Oakland,Other,06/18/2021
1,LALCA453008306,Oakland,Likely African-American,04/01/2021
2,LALCA22129469,Oakland,European,11/16/2021
3,LALCA549803906,Oakland,Other,02/07/2022
4,LALCA24729024,San Leandro,European,02/28/2016


In [9]:
# Ethnicity labels retained for the analysis.
selected_ethnicities = [
    'European',
    'Likely African-American',
    'Hispanic and Portuguese',
    'East and South Asian',
]

# A row is kept only when all three conditions hold: the city is in the sample,
# the ethnicity is one of the selected labels, and a registration date is present.
state_demographic_subset = state_demographic.loc[
    lambda voters: (
        voters['Residence_Addresses_City'].isin(sampled_cityName)
        & voters['EthnicGroups_EthnicGroup1Desc'].isin(selected_ethnicities)
        & voters['Voters_OfficialRegDate'].notna()
    )
]
print(state_demographic_subset.shape)
state_demographic_subset.head()

(3110292, 4)


Unnamed: 0,LALVOTERID,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate
6,LALCA22466723,Livermore,European,11/01/2021
7,LALCA22466636,Livermore,European,12/07/2021
8,LALCA22466642,Livermore,European,12/07/2021
115,LALCA581684072,Livermore,European,09/20/2019
116,LALCA549860099,Livermore,European,07/26/2020


In [None]:
# The filtered subset has already been built from this frame, so drop the
# full statewide table and ask the garbage collector to reclaim its memory.
del state_demographic
gc.collect()

## Vote History

1. Select only the voter id ('LALVOTERID') and the columns for the 4 most recent General elections and the 4 most recent Local_or_Municipal elections (EthnicGroups_EthnicGroup1Desc comes from the Demographic Data through the merge below)
2. Merge Vote History with the sampled Demographic Data 


In [10]:
# Read just the first 10 rows: that is enough to expose the column names from
# which the 4 most recent General and Local_or_Municipal elections are picked.
votehistory_preview_path = f'{filepath}VM2--CA--2022-04-25-VOTEHISTORY.tab'
state_voterhistory = pd.read_csv(
    votehistory_preview_path,
    nrows=10,
    sep='\t',
    dtype=str,
    encoding='unicode_escape',
)

state_voterhistory.head()

Unnamed: 0,LALVOTERID,Special_2022_04_19,Special_2022_04_12,Special_2022_04_05,Special_2022_02_15,Special_2022_02_01,Special_2021_12_14,Special_2021_12_07,Special_2021_11_02,Consolidated_General_2021_11_02,...,BallotReturnDate_General_2018_11_06,BallotReturnDate_Primary_2018_06_05,BallotReturnDate_General_2016_11_08,BallotReturnDate_Primary_2016_06_07,BallotReturnDate_General_2014_11_04,BallotReturnDate_Primary_2014_06_03,BallotReturnDate_General_2012_11_06,BallotReturnDate_Primary_2012_06_05,BallotReturnDate_General_2010_11_02,BallotReturnDate_Primary_2010_06_08
0,LALCA453164106,,,,,,,,,,...,,,11/07/2016,,,,,,,
1,LALCA453008306,,,,,,,,,,...,,,,,,,,,,
2,LALCA22129469,,,,,,,,,,...,11/06/2018,,,,,,,,,
3,LALCA549803906,,,,,,,,,,...,,,,,,,,,,
4,LALCA24729024,,,,,,,,,,...,,,,,,,,,,


In [11]:
def get_4_recent_date(string, df, n=4):
    """Return the names of the ``n`` most recent election columns of one type.

    Columns are matched on the exact prefix ``string + '_'`` and ranked by the
    text that follows the prefix, compared as strings in descending order.
    With zero-padded ``YYYY_MM_DD`` suffixes (e.g. ``General_2020_11_03``)
    this ordering is newest-first.

    Parameters
    ----------
    string : str
        Election-type prefix, e.g. ``'General'`` or ``'Local_or_Municipal'``.
    df : object with a ``columns`` attribute
        Frame whose column names are scanned; its data is never read.
    n : int, default 4
        Maximum number of column names to return.

    Returns
    -------
    list of str
        Up to ``n`` existing column names, newest first. Empty when no column
        carries the prefix.
    """
    prefix = string + '_'
    # Match on the prefix *including* the separator so that 'General' does not
    # also capture look-alike columns such as 'Generalized_...'.
    # Slice the prefix off instead of str.replace(): replace() would also strip
    # later occurrences of the prefix and yield a name that is not a real column.
    suffixes = sorted(
        (col[len(prefix):] for col in df.columns
         if isinstance(col, str) and col.startswith(prefix)),
        reverse=True,
    )
    return [prefix + suffix for suffix in suffixes[:n]]

# Resolve the most recent election columns for both election types, then show
# the General list followed by the Local_or_Municipal list, one per line.
GE_cols = get_4_recent_date('General', state_voterhistory)
LM_cols = get_4_recent_date('Local_or_Municipal', state_voterhistory)
print(GE_cols, LM_cols, sep='\n')

['General_2020_11_03', 'General_2018_11_06', 'General_2016_11_08', 'General_2014_11_04']
['Local_or_Municipal_2021_08_31', 'Local_or_Municipal_2021_07_20', 'Local_or_Municipal_2021_06_08', 'Local_or_Municipal_2021_06_01']


In [12]:
# The 10-row preview was only needed to discover column names; release it
# before the same variable name is reused for the full vote-history read.
del state_voterhistory
gc.collect()

706

In [13]:
# Voter id first, then the Local_or_Municipal and General election columns.
needed_variables = ['LALVOTERID', *LM_cols, *GE_cols]

# Re-read the vote-history file, this time in full but limited to the needed columns.
state_voterhistory = pd.read_csv(
    f'{filepath}VM2--CA--2022-04-25-VOTEHISTORY.tab',
    usecols=needed_variables,
    sep='\t',
    dtype=str,
    encoding='unicode_escape',
)

state_voterhistory.head()

Unnamed: 0,LALVOTERID,Local_or_Municipal_2021_08_31,Local_or_Municipal_2021_07_20,Local_or_Municipal_2021_06_08,Local_or_Municipal_2021_06_01,General_2020_11_03,General_2018_11_06,General_2016_11_08,General_2014_11_04
0,LALCA453164106,,,,,Y,Y,Y,
1,LALCA453008306,,,,,,Y,,
2,LALCA22129469,,,,,Y,Y,Y,Y
3,LALCA549803906,,,,,Y,,,
4,LALCA24729024,,,,,,,,


In [14]:
# Inner join on the shared voter id: only voters present in both the vote
# history and the filtered demographic subset are kept.
merged_file = state_voterhistory.merge(
    state_demographic_subset,
    on='LALVOTERID',
    how='inner',
)

merged_file.head()

Unnamed: 0,LALVOTERID,Local_or_Municipal_2021_08_31,Local_or_Municipal_2021_07_20,Local_or_Municipal_2021_06_08,Local_or_Municipal_2021_06_01,General_2020_11_03,General_2018_11_06,General_2016_11_08,General_2014_11_04,Residence_Addresses_City,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate
0,LALCA22466723,,,,,,,,,Livermore,European,11/01/2021
1,LALCA22466636,,,,,Y,Y,Y,Y,Livermore,European,12/07/2021
2,LALCA22466642,,,,,Y,Y,Y,Y,Livermore,European,12/07/2021
3,LALCA581684072,,,,,Y,,,,Livermore,European,09/20/2019
4,LALCA549860099,,,,,Y,,,,Livermore,European,07/26/2020


In [15]:
# Report (rows, columns) of the merged frame as a quick check on the join result.
print(merged_file.shape)

(3110292, 12)


In [16]:
# Persist the merged frame as CSV, using a path relative to the working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# confirm downstream readers expect that column (otherwise pass index=False).
merged_file.to_csv('VM2--CA--2022-04-25-MERGED.csv')