In [104]:
import pandas as pd
from zipfile import ZipFile
import os
import math
import numpy as np
from slugify import slugify
import locale
locale.setlocale(locale.LC_ALL, '')
# for printing dfs
pd.options.display.max_rows = 100
# for printing lists
pd.options.display.max_seq_items = 50

In [105]:
# Set relative filepaths
# Missouri voter data is obtained via public records request to the Elections Division, Office of Secretary of State
# More info about data source can be found in the README

__file__ = 'os.path.abspath('')'

script_dir = os.path.dirname(__file__)
rel_path = './data/Missouri'
abs_file_path = os.path.join(script_dir, rel_path)

In [106]:
# Show the name of the zipfile that is opened in the next step

files = os.listdir(abs_file_path)

In [107]:
# Read the zipfile

voters = (files[1])
zf = ZipFile(abs_file_path + "/" + voters)

In [108]:
# List files in zipfile

zf.namelist()

['data\\PSR_VotersList_01032023_9-51-24 AM.txt']

In [109]:
# Load data into dataframe, first with no header for processing reasons

voters = pd.read_csv(zf.open('data\\PSR_VotersList_01032023_9-51-24 AM.txt'), sep='\t', header=None)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [110]:
# Now set the first row as header

voters.columns = voters.iloc[0] 

In [111]:
# Checking the df

voters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4268188 entries, 0 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   County                     object
 1   Voter ID                   object
 2   First Name                 object
 3   Middle Name                object
 4   Last Name                  object
 5   Suffix                     object
 6   House Number               object
 7   House Suffix               object
 8   Pre Direction              object
 9   Street Name                object
 10  Street Type                object
 11  Post Direction             object
 12  Unit Type                  object
 13  Unit Number                object
 14  Non Standard Address       object
 15  Residential City           object
 16  Residential State          object
 17  Residential ZipCode        object
 18  Mailing Address            object
 19  Mailing City               object
 20  Mailing State           

In [112]:
# Slugifying columns

voters.columns = voters.columns.str.replace(r'\W+', '_', regex=True)
voters.columns = [x.lower() for x in voters.columns]

In [113]:
# For some reason I can't figure out right now, the slugifying thru RE step left the original col names 
# But as the first records in the dataframe
# Here, we remove it

voters.drop(index=voters.index[0], axis=0, inplace=True)

In [114]:
# Checking the df

voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [115]:
# Checking the df 

voters.head()

Unnamed: 0,county,voter_id,first_name,middle_name,last_name,suffix,house_number,house_suffix,pre_direction,street_name,...,voter_history_11,voter_history_12,voter_history_13,voter_history_14,voter_history_15,voter_history_16,voter_history_17,voter_history_18,voter_history_19,voter_history_20
1,Adair,461017702,JOHN,WILLIAM,MCNEILL,,1306,,,ROOK,...,,,,,,,,,,
2,Adair,751833496,ALEXANDER,DOUGLAS STONEBURNER,KARST,,702,,S,SHERIDAN,...,,,,,,,,,,
3,Adair,751105687,KEVIN,LEE,WINDSPERGER,,17469,,,DAIRY,...,,,,,,,,,,
4,Adair,752025280,TAYLOR,ANN,CLAYTON,,809,,S,MULANIX,...,,,,,,,,,,
5,Adair,751367266,AUSTIN,BRADLEY,MORSE,,1214,,S,WABASH,...,,,,,,,,,,


In [116]:
# Columns vary in their degree of missing values\

voters.isna().sum()

county                             0
voter_id                           0
first_name                        30
middle_name                   401624
last_name                        456
suffix                       4114617
house_number                   31179
house_suffix                 4256636
pre_direction                2963552
street_name                    31198
street_type                   531846
post_direction               4233141
unit_type                    3874393
unit_number                  3874425
non_standard_address         4235200
residential_city                   0
residential_state                 16
residential_zipcode                0
mailing_address              4035157
mailing_city                 4039026
mailing_state                4039175
mailing_zipcode              4039164
birthdate                          0
political_party              4084653
registration_date                  0
precinct                           0
precinct_name                      0
s

In [117]:
# Assess the df for duplicates and create a col that denotes duplicates - the first dupe is NOT marked as a dupe
# So that we keep one of each duplicate

voters['duplicate'] = np.where(voters.duplicated(keep='first'), 'Yes','No')

In [118]:
voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 55 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [119]:
# Moving the duplicate column from the end of the voter history to the end of the data we will be keeping

dupe = voters.pop('duplicate')
voters.insert(33, dupe.name, dupe)

In [120]:
voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 55 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [123]:
voters.head()

Unnamed: 0,county,voter_id,first_name,middle_name,last_name,suffix,house_number,house_suffix,pre_direction,street_name,...,voter_history_11,voter_history_12,voter_history_13,voter_history_14,voter_history_15,voter_history_16,voter_history_17,voter_history_18,voter_history_19,voter_history_20
1,Adair,461017702,JOHN,WILLIAM,MCNEILL,,1306,,,ROOK,...,,,,,,,,,,
2,Adair,751833496,ALEXANDER,DOUGLAS STONEBURNER,KARST,,702,,S,SHERIDAN,...,,,,,,,,,,
3,Adair,751105687,KEVIN,LEE,WINDSPERGER,,17469,,,DAIRY,...,,,,,,,,,,
4,Adair,752025280,TAYLOR,ANN,CLAYTON,,809,,S,MULANIX,...,,,,,,,,,,
5,Adair,751367266,AUSTIN,BRADLEY,MORSE,,1214,,S,WABASH,...,,,,,,,,,,


In [124]:
yes = ['Yes']
voters.loc[voters['duplicate'].isin(yes)]

Unnamed: 0,county,voter_id,first_name,middle_name,last_name,suffix,house_number,house_suffix,pre_direction,street_name,...,voter_history_11,voter_history_12,voter_history_13,voter_history_14,voter_history_15,voter_history_16,voter_history_17,voter_history_18,voter_history_19,voter_history_20


In [66]:
# There are 20 voter history columns. We'll keep the most recent and store the others in a different dataframe.
# Also get the names of column headers so we can use them to put the columns we need in another dataframe.

column_headers = list(voters.columns.values)
del_cols = column_headers[35:]
print(del_cols)

['voter_history_2', 'voter_history_3', 'voter_history_4', 'voter_history_5', 'voter_history_6', 'voter_history_7', 'voter_history_8', 'voter_history_9', 'voter_history_10', 'voter_history_11', 'voter_history_12', 'voter_history_13', 'voter_history_14', 'voter_history_15', 'voter_history_16', 'voter_history_17', 'voter_history_18', 'voter_history_19', 'voter_history_20', 'Duplicate']


In [60]:
# Put old voter history 2-20 into a new, separate dataframe

voters2 = pd.DataFrame()
voters2 = pd.concat([voters2,voters[del_cols]],axis=0)

In [61]:
# Checking voter history dataframe

voters2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 19 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   voter_history_2   object
 1   voter_history_3   object
 2   voter_history_4   object
 3   voter_history_5   object
 4   voter_history_6   object
 5   voter_history_7   object
 6   voter_history_8   object
 7   voter_history_9   object
 8   voter_history_10  object
 9   voter_history_11  object
 10  voter_history_12  object
 11  voter_history_13  object
 12  voter_history_14  object
 13  voter_history_15  object
 14  voter_history_16  object
 15  voter_history_17  object
 16  voter_history_18  object
 17  voter_history_19  object
 18  voter_history_20  object
dtypes: object(19)
memory usage: 651.3+ MB


In [14]:
# Later on we will drop the extended voter history from the original, first df
# But for now we'll keep it, for the sake of the next step
# Check this data against previous MO voter data, and keep only voters not found in old data
# Loading 2020 data

voters20 = pd.read_csv('./data/mo_voters_2020.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# Slugifying columns in 2020 data

voters20.columns = voters20.columns.str.replace(r'\W+', '_', regex=True)
voters20.columns = [x.lower() for x in voters20.columns]
voters20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4609735 entries, 0 to 4609734
Data columns (total 36 columns):
 #   Column                Dtype  
---  ------                -----  
 0   county                object 
 1   voter_id              int64  
 2   first_name            object 
 3   middle_name           object 
 4   last_name             object 
 5   suffix                object 
 6   house_number          float64
 7   house_suffix          object 
 8   pre_direction         object 
 9   street_name           object 
 10  street_type           object 
 11  post_direction        object 
 12  unit_type             object 
 13  unit_number           object 
 14  non_standard_address  object 
 15  city                  object 
 16  state                 object 
 17  zip                   object 
 18  birth_date            object 
 19  reg_date              object 
 20  precinct              object 
 21  precinct_name         object 
 22  split                 object 
 23  townshi

In [29]:
# Comparing voter ID columns
# In 2020, the TAP team received a similar file. We are going to keep any
# registered voters not found in the current file.

idx1 = pd.Index(voters.voter_id)
idx2 = pd.Index(voters20.voter_id)

diff = idx2.difference(idx1).values
print("There are " + "{:,}".format((len(diff))) +  " voters, out of 4,268,187 in the 2020 data who are not in the current data.")

There are 706,371 voters, out of 4,268,187 in the 2020 data who are not in the current data.


In [17]:
# Convert diff array to list

diff = list(diff)

In [18]:
# Put those voters from 2020 data not in current data into a df; we'll need it later

keepers = voters20[voters20['voter_id'].isin(diff)]
keepers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 706372 entries, 0 to 4609734
Data columns (total 36 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   county                706372 non-null  object 
 1   voter_id              706372 non-null  int64  
 2   first_name            706358 non-null  object 
 3   middle_name           641694 non-null  object 
 4   last_name             706316 non-null  object 
 5   suffix                29167 non-null   object 
 6   house_number          671155 non-null  float64
 7   house_suffix          3461 non-null    object 
 8   pre_direction         213309 non-null  object 
 9   street_name           671150 non-null  object 
 10  street_type           598529 non-null  object 
 11  post_direction        5338 non-null    object 
 12  unit_type             113346 non-null  object 
 13  unit_number           113343 non-null  object 
 14  non_standard_address  35534 non-null   object 
 15 

In [19]:
# Drop voter history 2-20 from original dataframe

voters.drop(columns=['voter_history_2', 'voter_history_3', 'voter_history_4', 'voter_history_5', 'voter_history_6', 'voter_history_7', 'voter_history_8', 'voter_history_9', 'voter_history_10', 'voter_history_11', 'voter_history_12', 'voter_history_13', 'voter_history_14', 'voter_history_15', 'voter_history_16', 'voter_history_17', 'voter_history_18', 'voter_history_19', 'voter_history_20'], inplace=True)

In [20]:
voters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4268188 entries, 0 to 4268187
Data columns (total 35 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [21]:
# Change names of columns to match old file 

voters.rename(columns = {'mailing_address':'address_clean', 'mailing_zipcode':'zip_clean', 'mailing_city':'city_clean', 'residential_city': 'city', 'residential_zip': 'zip', 'mailing_state': 'state', 'congressional_district_20': 'congressional', 'legislative_district_20': 'legislative', 'senate_district_20': 'senate', 'voter_history_1': 'last_election'}, inplace = True)

In [22]:
print(voters['mailing_address'].unique())

KeyError: 'mailing_address'

In [None]:
voters.info()

In [None]:
# For source and dupe_flag, we will create them and fill them with values to match the old file

voters['source'] = 1
voters['dupe_flag'] = 'FALSE'

In [None]:
# Drop the residential state field because we already have a state field 

voters.drop('residential_state', axis=1, inplace=True)

In [None]:
# Drop residential ZIP because we already have a ZIP field

voters.drop('residential_zipcode', axis=1, inplace=True)

In [None]:
voters.info()