# Texas Secretary of State Voter Data File ETL




Sources:
- MapBox API Pricing
    - https://www.mapbox.com/pricing#temporary-geocoding-api
| Monthly requests       | Cost per 1,000 (as of Nov 2024) |
|------------------------|----------------------------|
| Up to 100,000          | Free                       |
| 100,001 to 500,000     | \$0.75                     |
| 500,001 to 1,000,000   | \$0.60                     |
| 1,000,001 to 4,999,999 | \$0.45                     |
| 5,000,000+             | Contact sales for discount |

In [1]:
import os
import pandas as pd
import geopandas as gpd
from dotenv import load_dotenv
from tqdm import tqdm
from functools import partial
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import MapBox
from shapely import wkt

In [2]:
# file path to dataset
# todo: update to real file once data available
voter_data_file = os.path.join('..', 'data', 'inputs', 'tx_secretary_of_state', 'TEST_voter_data_file.xlsx')

In [3]:
# sets the fields for each dataset, pasted from the documentation, as list of lists
voter_data_file_fields = [['COUNTY CODE', 1, 3],
                          ['PRECINCT', 4, 10],
                          ['VUID', 13, 10],
                          ['LAST NAME', 23, 50],
                          ['FIRST NAME', 73, 50],
                          ['MIDDLE NAME', 123, 50],
                          ['FORMER LAST NAME', 173, 50],
                          ['SUFFIX', 223, 4],
                          ['GENDER', 227, 1],
                          ['DOB', 228, 8],
                          ['PERM HOUSE NUMBER', 236, 9],
                          ['PERM DESIGNATOR', 245, 12],
                          ['PERM DIRECTIONAL PREFIX', 257, 2],
                          ['PERM STREET NAME', 259, 50],
                          ['PERM STREET TYPE', 309, 12],
                          ['PERM DIRECTIONAL SUFFIX', 321, 2],
                          ['PERM UNIT NUMBER', 323, 12],
                          ['PERM UNIT TYPE', 335, 12],
                          ['PERM CITY', 347, 50],
                          ['PERM ZIPCODE', 397, 9],
                          ['MAILING ADDRESS 1', 406, 110],
                          ['MAILING ADDRESS 2', 516, 50],
                          ['MAILING CITY', 566, 50],
                          ['MAILING STATE', 616, 20],
                          ['MAILING ZIPCODE', 636, 20],
                          ['EDR (EFFECTIVE DATE OF REGISTRATION)', 656, 8],
                          ['STATUS CODE', 664, 1],
                          ['HISPANIC SURNAME FLAG', 665, 1],
                          ['ELECTION DATE', 666, 8],
                          ['ELECTION TYPE', 674, 2],
                          ['ELECTION PARTY', 676, 3],
                          ['ELECTION VOTING METHOD', 679, 6]]
                          #TOTAL N/A 685

In [4]:
# creates list of tuples that can be passed to pandas fixed width file parser
fixed_width_file_colspec = []
for index, field in enumerate(voter_data_file_fields):
    # subtract one (1) from initial condition from documentation to match zero index
    fixed_width_file_colspec.append((voter_data_file_fields[index][1] - 1,  voter_data_file_fields[index][1] - 1 + voter_data_file_fields[index][2]))

In [5]:
# parses fixed width file
# todo: once real file available, replace excel reader with fwf reader
# voter_data_file_df = pd.read_fwf(voter_data_file, col=voter_data_file_fields[0][0].to_lower().replace(' ', ''))
voter_data_file_df = pd.read_excel(voter_data_file, na_filter=False).astype(str)

In [6]:
# aggregates address data from the individual columns
voter_data_file_df['PERM ADDRESS'] = voter_data_file_df[['PERM HOUSE NUMBER', 'PERM DESIGNATOR', 'PERM DIRECTIONAL PREFIX', 'PERM STREET NAME', 'PERM STREET TYPE', 'PERM DIRECTIONAL SUFFIX', 'PERM UNIT TYPE', 'PERM UNIT NUMBER', 'PERM CITY', 'PERM ZIPCODE']].agg(' '.join, axis=1)

In [7]:
# voter_data_file_df['PERM ADDRESS'] = voter_data_file_df['PERM ADDRESS'].replace("\s+", " ", regex=True).apply(lambda x: x.strip())

# removes any extraneous spaces from entire dataframe (needed for normalization and to deal with fixed width file)
voter_data_file_df = voter_data_file_df.replace("\s+", " ", regex=True).apply(lambda x: x.str.strip())

In [8]:
# todo: delete dataframe sample creation code
# creates small sample dataframe to test geocoding source without testing rate limits
voter_data_file_df = voter_data_file_df.iloc[:5]

In [9]:
# API key string, or import from .env file
load_dotenv()
api_key = os.getenv('mapbox_access_token')

# initializes geocoder class  using Nominatim (OpenStreeMaps) as source
geolocator = MapBox(api_key=api_key)
geocode = RateLimiter(geolocator.geocode)
# creates progress bar
tqdm.pandas()

voter_data_file_df['location'] = voter_data_file_df['PERM ADDRESS'].progress_apply(partial(geocode, exactly_one=True))

100%|██████████| 5/5 [00:00<00:00, 29.71it/s]


In [10]:
# removes PERM ADDRESS column since location data is stored in new `location` column
voter_data_file_df = voter_data_file_df.drop(['PERM ADDRESS'], axis=1)

# creates column that contains only latitude and longitude data for geospatial analysis
voter_data_file_df['geometry'] = voter_data_file_df['location'].apply(lambda loc: (loc.latitude, loc.longitude) if loc else None)

# adds string `POINT` to `geometry` field to fulfil geodataframe requirements
voter_data_file_df['geometry'] = 'POINT ' + voter_data_file_df['geometry'].astype(str)

# repplaces extraneous commas in field
voter_data_file_df['geometry'] = voter_data_file_df['geometry'].str.replace(',', '')

# converts `geometry` field to geometry field
voter_data_file_df['geometry'] = voter_data_file_df['geometry'].apply(wkt.loads)

# transforms dataframe into geodataframe for proper handling of geographic data
voter_data_file_gdf = gpd.GeoDataFrame(voter_data_file_df, geometry=voter_data_file_df['geometry'])

# removes extraneous data from `location` (validated address) field
voter_data_file_gdf['location'] = voter_data_file_gdf['location'].apply(lambda loc: loc[0])

In [14]:
voter_data_file_gdf.style.hide(['VUID', 'DOB', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME'], axis='columns')

Unnamed: 0,COUNTY CODE,PRECINCT,FORMER LAST NAME,SUFFIX,GENDER,PERM HOUSE NUMBER,PERM DESIGNATOR,PERM DIRECTIONAL PREFIX,PERM STREET NAME,PERM STREET TYPE,PERM DIRECTIONAL SUFFIX,PERM UNIT NUMBER,PERM UNIT TYPE,PERM CITY,PERM ZIPCODE,MAILING ADDRESS 1,MAILING ADDRESS 2,MAILING CITY,MAILING STATE,MAILING ZIPCODE,EDR (EFFECTIVE DATE OF REGISTRATION),STATUS CODE,HISPANIC SURNAME FLAG,ELECTION DATE,ELECTION TYPE,ELECTION PARTY,ELECTION VOTING METHOD,location,geometry
0,453,,,,F,307,,,CLEARSKY,CIR,,,,AUSTIN,78745,,,,,,1/12/2000,,,,,,,"307 Clearsky Circle, Austin, Texas 78745, United States",POINT (30.19578 -97.78391)
1,453,,,,F,5504,,,LAKEMOORE,DR,,,,AUSTIN,78731,5504 LAKEMOORE DR,,AUSTIN,TX,787310000.0,1/8/2000,,,,,,,"5504 Lakemore Drive, Austin, Texas 78731, United States",POINT (30.365635 -97.785962)
2,453,,,,M,15309,,,VELIAS,WAY,,,,PFLUGERVILLE,78660,,,,,,1/8/2000,,,,,,,"15309 Velia's Way, Pflugerville, Texas 78660, United States",POINT (30.443957 -97.656907)
3,453,,,,M,9309,,,SANFORD,DR,,,,AUSTIN,78748,,,,,,1/12/2000,,,,,,,"9309 Sanford Drive, Austin, Texas 78748, United States",POINT (30.18516 -97.8344)
4,453,,,,F,909,,E,VILLAGE,LN,,,,AUSTIN,78758,,,,,,1/1/2000,,,,,,,"909 East Village Lane, Austin, Texas 78758, United States",POINT (30.368842 -97.697656)
