# Texas Secretary of State Voter Data File ETL

This notebook extracts voter data from flat files provided by the Texas Secretary of State's office and loads it into a database schema for further transformation and analysis.




In [1]:
# user-supplied run parameters; together they name the destination table
# (f'{year}_{election}') when the dataframe is written to the database
year, election = '2024', 'general'

In [2]:
import os
import pandas as pd
import sqlalchemy

In [3]:
# relative location of the input dataset (currently the mock Excel file)
# todo: update to real file once data available
voter_data_file = os.path.join(*(
    '..', 'data', 'inputs',
    'tx_secretary_of_state', 'mock_data',
    'TEST_voter_data_file.xlsx',
))

In [4]:
# field layout of the voter data file, pasted from the documentation
# each entry is [field name, one-based start position, field length]
# NOTE(review): PRECINCT (start 4, length 10) covers position 13, which is also where VUID
# starts; the lengths sum to 685 but the last field ends at position 684 -- one of the
# PRECINCT length or the later start positions looks off by one; confirm against the documentation
voter_data_file_fields = [
    ['COUNTY CODE', 1, 3],
    ['PRECINCT', 4, 10],
    ['VUID', 13, 10],
    ['LAST NAME', 23, 50],
    ['FIRST NAME', 73, 50],
    ['MIDDLE NAME', 123, 50],
    ['FORMER LAST NAME', 173, 50],
    ['SUFFIX', 223, 4],
    ['GENDER', 227, 1],
    ['DOB', 228, 8],
    ['PERM HOUSE NUMBER', 236, 9],
    ['PERM DESIGNATOR', 245, 12],
    ['PERM DIRECTIONAL PREFIX', 257, 2],
    ['PERM STREET NAME', 259, 50],
    ['PERM STREET TYPE', 309, 12],
    ['PERM DIRECTIONAL SUFFIX', 321, 2],
    ['PERM UNIT NUMBER', 323, 12],
    ['PERM UNIT TYPE', 335, 12],
    ['PERM CITY', 347, 50],
    ['PERM ZIPCODE', 397, 9],
    ['MAILING ADDRESS 1', 406, 110],
    ['MAILING ADDRESS 2', 516, 50],
    ['MAILING CITY', 566, 50],
    ['MAILING STATE', 616, 20],
    ['MAILING ZIPCODE', 636, 20],
    ['EDR (EFFECTIVE DATE OF REGISTRATION)', 656, 8],
    ['STATUS CODE', 664, 1],
    ['HISPANIC SURNAME FLAG', 665, 1],
    ['ELECTION DATE', 666, 8],
    ['ELECTION TYPE', 674, 2],
    ['ELECTION PARTY', 676, 3],
    ['ELECTION VOTING METHOD', 679, 6],
]
#TOTAL N/A 685

In [5]:
# creates the list of (start, end) tuples that can be passed as `colspecs` to the pandas
# fixed width file parser, which reads each tuple as a zero-based, half-open interval
fixed_width_file_colspec = [
    # documented start positions are one-based, so subtract one (1) to match zero index;
    # the end is the shifted start plus the field length
    (start - 1, start - 1 + length)
    for _name, start, length in voter_data_file_fields
]

In [6]:
# parses the voter data file into a dataframe of strings
# dtype=str reads every column as text so that codes stored as text (e.g. county codes or
# zip codes with leading zeros) are not coerced to numbers before the string conversion;
# na_filter=False keeps blank cells as empty strings instead of NaN
# todo: once real file available, replace excel reader with fwf reader, e.g.
# (assumes the fixed width file has no header row -- confirm)
# voter_data_file_df = pd.read_fwf(voter_data_file, colspecs=fixed_width_file_colspec, names=[field[0] for field in voter_data_file_fields], header=None, dtype=str, na_filter=False)
voter_data_file_df = pd.read_excel(voter_data_file, dtype=str, na_filter=False).astype(str)

In [7]:
# builds one space-separated permanent address string per row from its component columns,
# in the order listed; blank components leave repeated spaces in the joined string
perm_address_columns = [
    'PERM HOUSE NUMBER',
    'PERM DESIGNATOR',
    'PERM DIRECTIONAL PREFIX',
    'PERM STREET NAME',
    'PERM STREET TYPE',
    'PERM DIRECTIONAL SUFFIX',
    'PERM UNIT TYPE',
    'PERM UNIT NUMBER',
    'PERM CITY',
    'PERM ZIPCODE',
]
voter_data_file_df['PERM ADDRESS'] = voter_data_file_df[perm_address_columns].agg(' '.join, axis=1)

In [8]:
# removes any extraneous spaces from entire dataframe (needed for normalization and to deal with fixed width file):
# runs of whitespace collapse to a single space, then leading/trailing whitespace is stripped from every column
# the pattern is a raw string so that \s reaches the regex engine without relying on an invalid string escape
voter_data_file_df = voter_data_file_df.replace(r"\s+", " ", regex=True).apply(lambda column: column.str.strip())

In [9]:
# normalizes column names: lower case, with each space replaced by an underscore
lowercased_columns = voter_data_file_df.columns.str.lower()
voter_data_file_df.columns = lowercased_columns.str.replace(' ', '_', regex=False)

In [12]:
# loads dataframe into a SQLite database file, creating the target directory if needed
database_dirpath = os.path.join('..', 'data', 'databases', 'texas_secretary_of_state')
# exist_ok replaces the separate existence check (and the gap between checking and creating)
os.makedirs(database_dirpath, exist_ok=True)

# the connection URL is a string rather than a filesystem path, so only the file path is
# path-joined and the scheme is prepended by formatting
database_filepath = os.path.join(database_dirpath, 'voter_data_file.db')
sql_engine = sqlalchemy.create_engine(f'sqlite:///{database_filepath}')

# one table per election, named from the user data (e.g. 2024_general); an existing table
# with the same name is replaced
# note: the dataframe index is written too (pandas default), as a column named "index"
voter_data_file_df.to_sql(f'{year}_{election}', sql_engine, if_exists='replace')

879827