# Reduce Data to parquet

**Note: Run only once**
- select a subset of columns
- save in parquet format

In [2]:
import pandas as pd
# Folder holding the raw VM2 .tab files; change as required (we have selected the folder with the latest date).
# Keep the trailing slash: later cells build file names by appending directly to this string.
filepath = '../data/VM2--CA--2022-04-25/'

# 1. Reduce DEMOGRAPHIC file to parquet

**Note: Run only once**
- select a subset of demographic columns
- save in parquet format

In [3]:
# Subset of DEMOGRAPHIC columns to load: the voter ID plus the fields listed below.
selected_variables = [
    'LALVOTERID',
    'Residence_Addresses_City',
    'County',
    'EthnicGroups_EthnicGroup1Desc',
    'Voters_OfficialRegDate',
    'Voters_Age',
    'Voters_Gender',
    'CommercialData_Education',
    'CommercialData_EstimatedHHIncomeAmount',
    'FECDonors_NumberOfDonations',
    'FECDonors_TotalDonationsAmount',
    'Parties_Description',
]

# Load only the selected columns from the tab-separated file, keeping every value as a string.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences found in the
# data -- confirm this is intended (as opposed to e.g. 'latin-1').
state_demographic = pd.read_csv(
    f'{filepath}VM2--CA--2022-04-25-DEMOGRAPHIC.tab',
    usecols=selected_variables,
    dtype=str,
    sep='\t',
    encoding='unicode_escape',
)

# Preview the first rows of the reduced table.
state_demographic.head()

Unnamed: 0,LALVOTERID,Residence_Addresses_City,Voters_Gender,Voters_Age,Parties_Description,EthnicGroups_EthnicGroup1Desc,Voters_OfficialRegDate,County,CommercialData_Education,CommercialData_EstimatedHHIncomeAmount,FECDonors_NumberOfDonations,FECDonors_TotalDonationsAmount
0,LALCA453164106,Oakland,F,29,Democratic,Other,06/18/2021,ALAMEDA,,,,
1,LALCA453008306,Oakland,F,26,Non-Partisan,Likely African-American,04/01/2021,ALAMEDA,,,,
2,LALCA22129469,Oakland,F,47,Democratic,European,11/16/2021,ALAMEDA,HS Diploma - Extremely Likely,,,
3,LALCA549803906,Oakland,M,60,Democratic,Other,02/07/2022,ALAMEDA,,,,
4,LALCA24729024,San Leandro,F,56,Democratic,European,02/28/2016,ALAMEDA,HS Diploma - Extremely Likely,,,


In [4]:
# Persist the reduced demographic table as parquet in the same data folder.
state_demographic.to_parquet(
    f'{filepath}VM2--CA--2022-04-25-DEMOGRAPHIC_selected_cols.parquet'
)

# 2. Reduce VOTEHISTORY file to parquet

**Note: Run only once**
1. load the data with only one row to find out which columns are "General" and "Local_or_Municipal" election dates
2. create a list of columns that contains those dates
3. reload the data with only those columns and the voter ID (`LALVOTERID`) and all rows
4. save the data in parquet format

In [5]:
# 1. read a single row only: the column names are all that is needed to find
#    which columns are General and Local_or_Municipal elections
state_voterhistory_cols = pd.read_csv(
    f'{filepath}VM2--CA--2022-04-25-VOTEHISTORY.tab',
    nrows=1,
    dtype=str,
    sep='\t',
    encoding='unicode_escape',
)

state_voterhistory_cols.head()

Unnamed: 0,LALVOTERID,Special_2022_04_19,Special_2022_04_12,Special_2022_04_05,Special_2022_02_15,Special_2022_02_01,Special_2021_12_14,Special_2021_12_07,Special_2021_11_02,Consolidated_General_2021_11_02,...,BallotReturnDate_General_2018_11_06,BallotReturnDate_Primary_2018_06_05,BallotReturnDate_General_2016_11_08,BallotReturnDate_Primary_2016_06_07,BallotReturnDate_General_2014_11_04,BallotReturnDate_Primary_2014_06_03,BallotReturnDate_General_2012_11_06,BallotReturnDate_Primary_2012_06_05,BallotReturnDate_General_2010_11_02,BallotReturnDate_Primary_2010_06_08
0,LALCA453164106,,,,,,,,,,...,,,11/07/2016,,,,,,,


In [7]:
#2. keep the voter ID plus every column whose name starts with General or Local_or_Municipal

GE_cols = list(filter(lambda name: name.startswith('General'),
                      state_voterhistory_cols.columns))
print("total number of General election dates", len(GE_cols))

LM_cols = list(filter(lambda name: name.startswith('Local_or_Municipal'),
                      state_voterhistory_cols.columns))
print("total number of Local or Municipal election dates", len(LM_cols))

# Voter ID first, then the Local_or_Municipal columns, then the General columns.
needed_variables = ['LALVOTERID', *LM_cols, *GE_cols]


total number of General election dates 18
total number of Local or Municipal election dates 131


In [8]:
#3. read every row again, this time restricted to the voter ID and the
#   General / Local_or_Municipal election columns collected in needed_variables
state_voterhistory = pd.read_csv(
    f'{filepath}VM2--CA--2022-04-25-VOTEHISTORY.tab',
    usecols=needed_variables,
    dtype=str,
    sep='\t',
    encoding='unicode_escape',
)

state_voterhistory.head(5)

Unnamed: 0,LALVOTERID,Local_or_Municipal_2021_08_31,Local_or_Municipal_2021_07_20,Local_or_Municipal_2021_06_08,Local_or_Municipal_2021_06_01,Local_or_Municipal_2021_05_11,Local_or_Municipal_2021_05_04,Local_or_Municipal_2021_04_20,Local_or_Municipal_2021_03_09,Local_or_Municipal_2021_03_02,...,General_2004_11_02,General_2002_11_05,General_2000_11_07,General_1998_11_03,General_1996_11_05,General_1994_11_08,General_1992_11_03,General_1990_11_06,General_1988_11_08,General_1986_11_04
0,LALCA453164106,,,,,,,,,,...,,,,,,,,,,
1,LALCA453008306,,,,,,,,,,...,,,,,,,,,,
2,LALCA22129469,,,,,,,,,,...,Y,,,,,,,,,
3,LALCA549803906,,,,,,,,,,...,,,,,,,,,,
4,LALCA24729024,,,,,,,,,,...,,,,,,,,,,


In [10]:
#4. write the reduced vote-history table as parquet in the same data folder
state_voterhistory.to_parquet(
    f'{filepath}VM2--CA--2022-04-25-VOTEHISTORY_selected_cols.parquet'
)