In [1]:
### Packages ###

In [2]:
import pandas as pd
import pandas_usaddress
from zipfile import ZipFile
import os
import math
import numpy as np
from slugify import slugify
import locale
from scourgify import normalize_address_record, NormalizeAddress
import pprint as pprint
import usaddress
from scourgify import normalize_address_record, NormalizeAddress
locale.setlocale(locale.LC_ALL, '')
# for printing dfs
pd.options.display.max_rows = 100
# for printing lists
pd.options.display.max_seq_items = 50



In [3]:
### Data ###

In [4]:
# Set relative filepaths
# Missouri voter data is obtained via public records request to the Elections Division, Office of Secretary of State
# More info about data source can be found in the README

__file__ = 'os.path.abspath('')'

script_dir = os.path.dirname(__file__)
rel_path = './data/Missouri'
abs_file_path = os.path.join(script_dir, rel_path)

In [5]:
### Read ###

In [6]:
# Get list of files from zipfile opened in next step

files = os.listdir(abs_file_path)

In [7]:
# Read the zipfile

voters = (files[1])
zf = ZipFile(abs_file_path + "/" + voters)

In [8]:
# List files in zipfile

zf.namelist()

['data\\PSR_VotersList_01032023_9-51-24 AM.txt']

In [9]:
# Load data into dataframe, first with no header for processing reasons

voters = pd.read_csv(zf.open('data\\PSR_VotersList_01032023_9-51-24 AM.txt'), sep='\t', header=None)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# Now set the first row as header

voters.columns = voters.iloc[0] 

In [11]:
# Info on df

voters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4268188 entries, 0 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   County                     object
 1   Voter ID                   object
 2   First Name                 object
 3   Middle Name                object
 4   Last Name                  object
 5   Suffix                     object
 6   House Number               object
 7   House Suffix               object
 8   Pre Direction              object
 9   Street Name                object
 10  Street Type                object
 11  Post Direction             object
 12  Unit Type                  object
 13  Unit Number                object
 14  Non Standard Address       object
 15  Residential City           object
 16  Residential State          object
 17  Residential ZipCode        object
 18  Mailing Address            object
 19  Mailing City               object
 20  Mailing State           

In [12]:
# Slugifying columns

voters.columns = voters.columns.str.replace(r'\W+', '_', regex=True)
voters.columns = [x.lower() for x in voters.columns]

In [13]:
# Fixing another issue with col names

voters.drop(index=voters.index[0], axis=0, inplace=True)

In [14]:
# Checking info on df

voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [15]:
# Creating a separate df of this exact copy of the data for testing later

testdf = voters.copy()
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [16]:
# Later on we will drop the extended voter history from the original, first df
# But for now we'll keep it, for the sake of the next step
# Check new, 2023 data against 2020 MO voter data, keep only voters not found in old data
# Loading 2020 data

voters20 = pd.read_csv('./data/mo_voters_2020.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
# Slugifying columns in 2020 data

voters20.columns = voters20.columns.str.replace(r'\W+', '_', regex=True)
voters20.columns = [x.lower() for x in voters20.columns]
voters20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4609735 entries, 0 to 4609734
Data columns (total 36 columns):
 #   Column                Dtype  
---  ------                -----  
 0   county                object 
 1   voter_id              int64  
 2   first_name            object 
 3   middle_name           object 
 4   last_name             object 
 5   suffix                object 
 6   house_number          float64
 7   house_suffix          object 
 8   pre_direction         object 
 9   street_name           object 
 10  street_type           object 
 11  post_direction        object 
 12  unit_type             object 
 13  unit_number           object 
 14  non_standard_address  object 
 15  city                  object 
 16  state                 object 
 17  zip                   object 
 18  birth_date            object 
 19  reg_date              object 
 20  precinct              object 
 21  precinct_name         object 
 22  split                 object 
 23  townshi

In [18]:
# Comparing voter ID columns
# In 2020, the TAP team received a similar file. We are going to keep any
# registered voters, per voter ID, not found in the current file.

idx1 = pd.Index(voters.voter_id)
idx2 = pd.Index(voters20.voter_id)

diff = idx2.difference(idx1).values
print("There are " + "{:,}".format((len(diff))) +  " voters, out of 4,268,187 in the 2020 data who are not in the current data.")

There are 706,371 voters, out of 4,268,187 in the 2020 data who are not in the current data.


In [19]:
# Convert diff array to list

diff = list(diff)

In [20]:
# Put those voters from 2020 data not in current data into a df; we'll need it later

keepers = voters20[voters20['voter_id'].isin(diff)]
keepers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 706372 entries, 0 to 4609734
Data columns (total 36 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   county                706372 non-null  object 
 1   voter_id              706372 non-null  int64  
 2   first_name            706358 non-null  object 
 3   middle_name           641694 non-null  object 
 4   last_name             706316 non-null  object 
 5   suffix                29167 non-null   object 
 6   house_number          671155 non-null  float64
 7   house_suffix          3461 non-null    object 
 8   pre_direction         213309 non-null  object 
 9   street_name           671150 non-null  object 
 10  street_type           598529 non-null  object 
 11  post_direction        5338 non-null    object 
 12  unit_type             113346 non-null  object 
 13  unit_number           113343 non-null  object 
 14  non_standard_address  35534 non-null   object 
 15 

In [21]:
### Explore ###

In [22]:
# Print number of rows and cols

num_vot = len(voters)
nvot = '{:,}'.format(num_vot)
print("There are " + nvot + " rows and " + str(len(voters.columns)) + " columns.")

There are 4,268,187 rows and 54 columns.


In [23]:
### Missing ###

In [24]:
# Columns vary in their degree of missing values

voters.isna().sum()

county                             0
voter_id                           0
first_name                        30
middle_name                   401624
last_name                        456
suffix                       4114617
house_number                   31179
house_suffix                 4256636
pre_direction                2963552
street_name                    31198
street_type                   531846
post_direction               4233141
unit_type                    3874393
unit_number                  3874425
non_standard_address         4235200
residential_city                   0
residential_state                 16
residential_zipcode                0
mailing_address              4035157
mailing_city                 4039026
mailing_state                4039175
mailing_zipcode              4039164
birthdate                          0
political_party              4084653
registration_date                  0
precinct                           0
precinct_name                      0
s

In [25]:
### Duplicates ###

In [26]:
all_dupe = voters[voters.duplicated()]
all_dupe.info

<bound method DataFrame.info of Empty DataFrame
Columns: [county, voter_id, first_name, middle_name, last_name, suffix, house_number, house_suffix, pre_direction, street_name, street_type, post_direction, unit_type, unit_number, non_standard_address, residential_city, residential_state, residential_zipcode, mailing_address, mailing_city, mailing_state, mailing_zipcode, birthdate, political_party, registration_date, precinct, precinct_name, split, township, ward, congressional_district_20, legislative_district_20, senate_district_20, voter_status, voter_history_1, voter_history_2, voter_history_3, voter_history_4, voter_history_5, voter_history_6, voter_history_7, voter_history_8, voter_history_9, voter_history_10, voter_history_11, voter_history_12, voter_history_13, voter_history_14, voter_history_15, voter_history_16, ...]
Index: []

[0 rows x 54 columns]>

In [27]:
# Check for duplicate voter IDs

voterids = voters.duplicated(subset=["voter_id"])
print("Duplicate voter IDs:")

# Print voter ID records only if duplicate = True
if voterids[2] == True:
    print(voterids)

Duplicate voter IDs:


In [28]:
# Check for duplicate voter IDs and create a Boolean Series

is_duplicate = voters['voter_id'].duplicated(keep=False)

In [29]:
# Create 'dupe_flag column' and fill it with TRUE or FALSE based on the Boolean Series
voters['dupe_flag'] = is_duplicate.map({True: 'TRUE', False: 'FALSE'})

In [30]:
# Check for duplicates across name, birthdate, mailing ZIP

dupe_names = voters.duplicated(subset=["voter_id", "first_name", "last_name", "birthdate", "mailing_zipcode"])
names_dupe = voters[dupe_names]
print("Duplicate rows:")
print(names_dupe)

# In the 2023 data, there appear to be no duplicates.

Duplicate rows:
Empty DataFrame
Columns: [county, voter_id, first_name, middle_name, last_name, suffix, house_number, house_suffix, pre_direction, street_name, street_type, post_direction, unit_type, unit_number, non_standard_address, residential_city, residential_state, residential_zipcode, mailing_address, mailing_city, mailing_state, mailing_zipcode, birthdate, political_party, registration_date, precinct, precinct_name, split, township, ward, congressional_district_20, legislative_district_20, senate_district_20, voter_status, voter_history_1, voter_history_2, voter_history_3, voter_history_4, voter_history_5, voter_history_6, voter_history_7, voter_history_8, voter_history_9, voter_history_10, voter_history_11, voter_history_12, voter_history_13, voter_history_14, voter_history_15, voter_history_16, ...]
Index: []

[0 rows x 55 columns]


In [31]:
### Categorical ###

In [32]:
# This time, we only have year for birthdate. That is how it came in the raw data. 
# MO used to give full birth date but changed the law 
# Before generating a list of unique values, cast to int
# Generate a series that tells us how many unique values in each field
# Put the series into a dataframe

cols_to_ints = ['birthdate']
voters[cols_to_ints] = voters[cols_to_ints].astype(int)
unique = voters.nunique()
unique_df = pd.DataFrame({"Column": unique.index, "Unique Values": unique.values})
print(unique_df)

                       Column  Unique Values
0                      county            116
1                    voter_id        4268187
2                  first_name         140907
3                 middle_name         133899
4                   last_name         229386
5                      suffix            985
6                house_number          67853
7                house_suffix            106
8               pre_direction              9
9                 street_name          55604
10                street_type            109
11             post_direction              9
12                  unit_type             32
13                unit_number          14843
14       non_standard_address          21251
15           residential_city            989
16          residential_state              2
17        residential_zipcode         101865
18            mailing_address         112175
19               mailing_city           3859
20              mailing_state             60
21        

In [33]:
### Dates ###

In [34]:
# Be sure this col is cast as int before running .unique()

bdunq =  voters["birthdate"].unique()
print(bdunq)

[1988 2001 1970 2002 1996 2000 1992 1995 1965 1946 1966 1956 1986 1964
 1962 1963 1943 1982 1998 1952 1968 1985 1976 1955 1958 1967 1994 1940
 1961 1991 1947 1980 1979 1957 1974 1997 1999 1972 1990 1977 1973 1983
 1989 1984 1931 1948 1936 1933 1939 1978 1944 1945 1951 1950 1993 1929
 1941 1937 1938 1975 1953 1942 1960 1969 1954 1971 1987 1981 1932 1959
 1934 1949 1935 1927 1930 1922 1928 1925 1926 1920 1919 1924 1921 2004
 2003 1923 1918 2005 1917 1901 1916 1910 1914 1902 1912 1908 1909 1900
 1913 1907 1915 1911 1906 1903 1893 1905 1904 1899 1800 1895    1]


In [36]:
# There are voters — more than 1,000 of them — whose birth dates are 1800, 1895, 1899
# There are more than 3,400 voters who are over 100 years old
# I have an inquiry in to the Missouri Secretary of State about this
# We note it in the data description but leave them in the final output here

# over_100 = [1800, 1895, 1899, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922]
supercents = [1800, 1895, 1899, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913]
supercentsdf = voters[voters["birthdate"].isin(supercents)]
supercentsdf.to_csv('supercents_23.csv')

In [42]:
# Let's look at supercentenarians, and their voting history, in the 2020 data as well
keepers['birth_date'] = pd.to_datetime(keepers['birth_date'])
supercentsdf20 = keepers[keepers['birth_date'].dt.year < 1913].copy()
supercentsdf20.to_csv('supercents20.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [40]:
# For the sake of notation I've left this in, which will filter the data to people over 100
# And also generate a file of only those rows

# over100 = voters[voters["birthdate"].isin(over_100)]
# over100.to_csv('centenarians.csv')

In [41]:
# Registration date is the date that will be used in final TAP output data
# Doing the same sort of quality check as the birthdate col 
# Only these dates are month, day, year, so format them as datetime
# Errors = coerce will put NaN values in any invalid values

voters['registration_date'] = pd.to_datetime(voters['registration_date'], errors="coerce")
unique = voters.nunique()
unique_df = pd.DataFrame({"Column": unique.index, "Unique Values": unique.values})
print(unique_df)

                       Column  Unique Values
0                      county            116
1                    voter_id        4268187
2                  first_name         140907
3                 middle_name         133899
4                   last_name         229386
5                      suffix            985
6                house_number          67853
7                house_suffix            106
8               pre_direction              9
9                 street_name          55604
10                street_type            109
11             post_direction              9
12                  unit_type             32
13                unit_number          14843
14       non_standard_address          21251
15           residential_city            989
16          residential_state              2
17        residential_zipcode         101865
18            mailing_address         112175
19               mailing_city           3859
20              mailing_state             60
21        

In [46]:
# To check the MO SOS claim about people who registered prior to 2005, let's filter both 
# Sets of data to just voters registered prior to 2005
# I want to see what the breakdown of birthdays is in those voters

keepers['reg_date'] = pd.to_datetime(keepers['reg_date'], errors='coerce')
voters_05_23 = voters[voters['registration_date'].dt.year < 2005].copy()
voters_05_20 = keepers[keepers['reg_date'].dt.year < 2005].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [56]:
values_bdays = voters_05_23['birthdate'].value_counts
print(values_bdays)

<bound method IndexOpsMixin.value_counts of 13         1946
14         1966
15         1970
16         1956
19         1962
           ... 
4268175    1957
4268182    1953
4268183    1953
4268184    1951
4268185    1986
Name: birthdate, Length: 1552501, dtype: int64>


In [102]:
### Wrangle ###
# Perform some consistent string normalization on address variables

In [103]:
### Address ###
# Create a single, unified normalized address field

voters["address_norm"] = voters["house_number"].astype(str) +"-"+ voters["house_suffix"].astype(str) +"-"+ voters["pre_direction"].astype(str) +"-"+ voters["street_name"].astype(str) +"-"+ voters["street_type"].astype(str) +"-"+ voters["post_direction"].astype(str) +"-"+ voters["unit_type"].astype(str) +"-"+ voters["unit_number"].astype(str) +"-"+ voters["non_standard_address"].astype(str)

In [104]:
# Replace and clean up nan values in the address_norm field

voters["address_norm"] = voters["address_norm"].str.replace('nan-', '')
voters["address_norm"] = voters["address_norm"].str.replace('-nan', '')
voters["address_norm"] = voters["address_norm"].str.replace(' ', '-')
voters["address_norm"] = voters["address_norm"].str.replace('-', ' ')

In [105]:
# First, fix inconsistent/misspellings/etc that may confuse code, in street type field
# I can't believe this exact dict, that I created, doesn't already exist, but
# I didn't find it anywhere, and found people on Stack asking how to do what I'm doing here

voters['street_type'].unique()

array(['DR', 'ST', 'WAY', nan, 'CT', 'AVE', 'LN', 'TRL', 'PL', 'RD',
       'PLZ', 'XXXXX', 'TER', 'CIRC', 'SQ', 'BLVD', 'TERR', 'EST', 'RDG',
       'CIR', 'LANE', 'VLG', 'CV', 'PKWY', 'RD.', 'LOOP', 'HWY', 'BND',
       'CR', 'RUN', 'ALY', 'DM', 'LK', 'HGTS', 'PT', 'SPGS', 'ST.', 'A',
       'HLS', 'PARK', 'VW', 'BR', 'GRV', 'VLY', 'HOLW', 'TRLS', 'APT',
       'PASS', 'COND', 'COR', 'TRCE', 'HILL', 'PK', 'JCT', 'BLF', 'XING',
       'AV', 'BYP', 'PATH', 'ESTS', 'TR', 'TRFY', 'MEWS', 'WAYE', 'GDNS',
       'SPUR', 'CRK', 'HL', 'HTS', 'S', 'LP', 'COVE', 'LNDG', 'RUE',
       'MNR', 'WALK', 'FLDS', 'EXPY', 'PSGE', 'WY', 'PKY', 'EXT', 'GLN',
       'FRK', 'BRK', 'CTR', 'TPKE', 'MDWS', 'TFWY', 'HVN', 'RD2', 'PLN',
       'MHP', 'ROW', 'ANX', 'STA', 'MDW', 'CMN', 'IS', 'CRST', 'CLB',
       'HBR', 'FRST', 'SHR', 'CORS', 'MTN', 'MWS', 'GTWY', 'RNCH', 'FLD'],
      dtype=object)

In [106]:
# Translations are taken from here: https://pe.usps.com/text/pub28/28apc_002.htm
# Any others needed are added (XXXXX = redacted)
# I put those into a Google Sheet, then saved it as a .csv
# Then into a df here
# Then map the values defined in the dict to the col in the df
# Then make replacements

usps_street_types = pd.read_csv('usps_street_types.csv')
mapping_dict = usps_street_types.set_index('abbr')['full'].to_dict()
voters['street_type'] = voters['street_type'].replace(mapping_dict)

In [107]:
# Cq'd any that did not show up in USPS by Googling the address
# Not sure about MHP, which in Missouri usually stands for Missouri Highway Patrol
# Also not sure about ANEX

voters['street_type'].unique()

array(['DRIVE', 'STREET', 'WAY', nan, 'COURT', 'AVENUE', 'LANE', 'TRAIL',
       'PLACE', 'ROAD', 'PLAZA', 'redacted', 'TERRACE', 'CIRCLE',
       'SQUARE', 'BOULEVARD', 'ESTATE', 'RIDGE', 'VILLAGE', 'COVE',
       'PARKWAY', 'LOOP', 'HIGHWAY', 'BEND', 'RUN', 'ALLEY', 'DAM',
       'LAKE', 'HEIGHTS', 'POINT', 'SPRINGS', 'A', 'HILLS', 'PARK',
       'VIEW', 'BRANCH', 'GROVE', 'VALLEY', 'HOLLOW', 'APT', 'PASS',
       'COND', 'CORNER', 'TRACE', 'HILL', 'JUNCTION', 'BLUFF', 'CROSSING',
       'BYPASS', 'PATH', 'ESTATES', 'TRAILS', 'TRAFFICWAY', 'MEWS',
       'WAYE', 'GARDENS', 'SPUR', 'CREEK', 'LANDING', 'RUE', 'MANOR',
       'WALK', 'FIELDS', 'EXPRESSWAY', 'PASSAGE', 'EXTENSION', 'GLEN',
       'FORK', 'BROOK', 'CENTER', 'TURNPIKE', 'MEADOWS', 'HAVEN',
       'ROAD #2', 'PLAIN', 'MHP', 'ROW', 'ANEX', 'STATION', 'COMMON',
       'ISLAND', 'CREST', 'CLUB', 'HARBOR', 'FOREST', 'SHORE', 'CORNERS',
       'MOUNTAIN', 'GATEWAY', 'RANCH', 'FIELD'], dtype=object)

In [110]:
### ZIP codes ###

# Let's see what we're working with 

zip_code_lengths1 = voters['residential_zipcode'].str.len().value_counts()
print(zip_code_lengths1)

5.0     3808894
10.0     270265
7.0       57759
9.0         161
8.0          36
Name: residential_zipcode, dtype: int64


In [111]:
# There are some 9-digit ZIPs in here; we clean those up to be just 5 digits

voters['zip_clean'] = voters['residential_zipcode'].str.slice(0, 5)

In [112]:
# This is to be sure the 5-digit ZIPs that begin with leading 0s do have the leading 0s -
# Python strips them, and, when we export to .csv, they will not show up in Excel - so the ZIP 01234 would appear as 1234
# However, if you open the file in Sublime Text, the 0s are there

voters['zip_clean'] = voters['zip_clean'].str.zfill(5)

In [113]:
zip_code_lengths2 = voters['zip_clean'].str.len().value_counts()
print(zip_code_lengths2)

5.0    4137115
Name: zip_clean, dtype: int64


In [48]:
### State ### 

In [49]:
# As expected, all voters registered to vote live in the state of Missouri

voters['residential_state'].unique()

array(['MO', 'XXXXX', nan], dtype=object)

In [50]:
### City ###
# Cities are the most difficult to normalize bc of wide variety of cities/formats
# State seems to have normalized them, though
# The only step left w/ addresses would be to standardize these fields/addresses against USPS
# There are several Python libraries that do this, but 
# Doing so is outside of the scope of work here, so we leave it

voters['residential_city'].unique()

array(['KIRKSVILLE', 'BRASHEAR', 'GIBBS', 'LA PLATA', 'NOVINGER',
       'GREENTOP', 'GREENCASTLE', 'XXXXX', 'HURDLAND', 'NEW BOSTON',
       'COUNTRY CLUB', 'SAVANNAH', 'ST JOSEPH', 'COSBY', 'UNION STAR',
       'AMAZONIA', 'ROSENDALE', 'FILLMORE', 'CLARKSDALE', 'HELENA',
       'BOLCKOW', 'KING CITY', 'REA', 'BARNARD', 'GUILFORD', 'GRAHAM',
       'ROCK PORT', 'TARKIO', 'FAIRFAX', 'BURLINGTON JUNCTION',
       'WESTBORO', 'CRAIG', 'WATSON', 'ELMO', 'SKIDMORE', 'VANDALIA',
       'MEXICO', 'LADDONIA', 'BENTON CITY', 'CENTRALIA', 'THOMPSON',
       'FARBER', 'MARTINSBURG', 'RUSH HILL', 'MIDDLETOWN', 'WELLSVILLE',
       'AUXVASSE', 'STURGEON', 'CLARK', 'MADISON', 'MONETT', 'PURDY',
       'SHELL KNOB', 'CASSVILLE', 'WASHBURN', 'GOLDEN', 'EXETER',
       'SELIGMAN', 'CRANE', 'AURORA', 'BUTTERFIELD', 'EAGLE ROCK',
       'VERONA', 'WHEATON', 'PIERCE CITY', 'CAPE FAIR', 'FAIRVIEW',
       'GALENA', 'JENKINS', 'ROCKY COMFORT', 'LAMAR', 'MINDENMINES',
       'GOLDEN CITY', 'SHELDON', 'LIBER

In [51]:
# How many unique cities are there? 989, which seems reasonable:
# There are roughly 1,000 muni governments, and several other types of smaller gov'ts in MO
# https://www2.census.gov/govs/cog/gc0212mo.pdf

print(len(voters['residential_city'].unique()))

989


In [52]:
### Conclude ###

In [53]:
# Before exporting, we need to:
# Rename specified fields in the df to match the prior TAP MO voter data file format and
# Create source flag in new file (in joined file at end, 1 = old data, 2 = new)
# Create a new df that has only the data we need
# Join it with the old file
# Check for duplicates in new, joined file, create dupe flag in join file based on check
# Export
# That's all, folks!

In [54]:
# Change names of columns to match the prior file 

voters.rename(columns = {'address_norm':'address_clean', 'residential_city':'city', 'residential_state': 'state', 'residential_zipcode': 'zip', 'birthdate': 'birth_date', 'registration_date': 'reg_date', 'congressional_district_20': 'congressional', 'legislative_district_20': 'legislative', 'senate_district_20': 'state_senate', 'voter_history_1': 'last_election'}, inplace = True)

In [55]:
voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 57 columns):
 #   Column                Dtype         
---  ------                -----         
 0   county                object        
 1   voter_id              object        
 2   first_name            object        
 3   middle_name           object        
 4   last_name             object        
 5   suffix                object        
 6   house_number          object        
 7   house_suffix          object        
 8   pre_direction         object        
 9   street_name           object        
 10  street_type           object        
 11  post_direction        object        
 12  unit_type             object        
 13  unit_number           object        
 14  non_standard_address  object        
 15  city                  object        
 16  state                 object        
 17  zip                   object        
 18  mailing_address       object        
 19  

In [56]:
# Create fields that don't exist in df currently but are in old data

voters['reg_year'] = voters['reg_date'].dt.year
voters['source'] = 2
voters['zip_clean'] = voters['zip']
voters['city_clean'] = voters['city']

In [57]:
# Define fields needed

fields_needed = ['county', 'voter_id', 'first_name', 'middle_name', 'last_name', 'suffix', 'house_number', 'house_suffix', 'pre_direction', 'street_name', 'street_type', 'post_direction', 'unit_type', 'unit_number', 'non_standard_address', 'city', 'state', 'zip', 'birth_date', 'reg_date', 'precinct', 'precinct_name', 'split', 'township', 'ward', 'congressional', 'legislative', 'state_senate', 'voter_status', 'last_election', 'source', 'dupe_flag', 'reg_year', 'address_clean', 'zip_clean', 'city_clean']

In [58]:
# Filter df to only those fields 

voters_tap = voters[fields_needed]

In [59]:
# Join (concat) the 2023 data and the prior (obtained in 2020) file

joined = pd.concat([voters_tap, keepers])

In [60]:
# Now we need to check the joined file for duplicate values
# keep=False here is marking all duplicates, based on voter_id, as True
# The dupe analysis based on unique IDs seems to indicate there aren't any
# BUT, caveat: There are unique IDs, and still duplicate voters; see below

is_duplicated = joined['voter_id'].duplicated(keep=False)

In [61]:
# Check for duplicates across name, birthdate, mailing ZIP
# Reveals that despite unique voter IDs (per Kiernan's R script), there are dupes
# Recast dupe_flag based on this

duped_names = joined.duplicated(subset=["first_name", "last_name", "birth_date", "zip"])

# This filters the dataframe to only the duplicated names 

names_duped = joined[duped_names]

# This recasts the dupe flag
joined['dupe_flag'] = np.where(joined.duplicated(subset=["first_name", "last_name", "birth_date", "zip"], keep=False), 'TRUE', 'FALSE')

t = ['TRUE']
filtered_t = joined[joined['dupe_flag'].isin(t)]
print('In the 2023 and 2020 data, combined, there are ' + "{:,}".format(len(filtered_t['voter_id'])) + ' duplicates.')
# trues = (joined['dupe_flag'] == 'TRUE')
# print(trues)
# print('In the 2023 and 2020 data, combined, there are ' + "{:,}".format(len(trues)) + ' duplicates.')

In the 2023 and 2020 data, combined, there are 3,969 duplicates.


In [62]:
print(joined.dupe_flag.value_counts())

FALSE    4970590
TRUE        3969
Name: dupe_flag, dtype: int64


In [63]:
# Export for TAP

#joined.to_csv('mo_voters_2023.csv')

In [62]:
### Steps for double-checking data are below

In [201]:
# Take a random sample of 500 rows, export to csv (for testing)

random_sample = joined.sample(n=500)
random_sample.to_csv('random_sample.csv', index=False)

In [202]:
# Checking the joined dupe flag
# Export 500 random rows

selected_rows_df = joined.head(500)
selected_rows_df.to_csv('selected_rows.csv', index=False)

In [203]:
### For checking:
column_to_filter_r = random_sample['voter_id']
column_to_filter_s = selected_rows_df['voter_id']

In [204]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4268187 entries, 1 to 4268187
Data columns (total 54 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   county                     object
 1   voter_id                   object
 2   first_name                 object
 3   middle_name                object
 4   last_name                  object
 5   suffix                     object
 6   house_number               object
 7   house_suffix               object
 8   pre_direction              object
 9   street_name                object
 10  street_type                object
 11  post_direction             object
 12  unit_type                  object
 13  unit_number                object
 14  non_standard_address       object
 15  residential_city           object
 16  residential_state          object
 17  residential_zipcode        object
 18  mailing_address            object
 19  mailing_city               object
 20  mailing_state           

In [205]:
filtered_df_r_20 = voters20[voters20['voter_id'].isin(column_to_filter_r)]
filtered_df_s_20 = voters20[voters20['voter_id'].isin(column_to_filter_s)]
filtered_df_r_23 = testdf[testdf['voter_id'].isin(column_to_filter_r)]
filtered_df_s_23 = testdf[testdf['voter_id'].isin(column_to_filter_r)]

In [207]:
filtered_df_r_20.to_csv('random_sample_test_rows20.csv')
filtered_df_s_20.to_csv('selected_rows_test_rows20.csv')
filtered_df_r_23.to_csv('random_sample_test_rows23.csv')
filtered_df_s_23.to_csv('selected_rows_test_rows23.csv')