In [1]:
# DO NOT USE SOS DOWNLOAD FOR 2014 GENERAL/SPECIAL ELECTION. IT IS MISSING VOTERS.
# USE 'voter_history_2014' FROM DATA WAREHOUSE INSTEAD.

In [2]:
import pandas as pd
import datetime
import mariadb
import numpy as np
import os
from pathlib import Path

In [3]:
# make sure these values are correct for each iteration of this notebook
# all other fields should run without changes
election_date = '2014-11-04'
election_type1 = 'GENERAL ELECTION'
election_type2 = 'None'
election_type3 = 'None'
sql_query = "SELECT * FROM `ga_sos_voters`.`voter_history_2014`"
output_name = "general_2014.csv"


In [4]:
# Connect to data warehouse with MariaDB
try:
    conn = mariadb.connect(
        user=os.getenv("MARIADB_USER"),
        password=os.getenv("MARIADB_PASSWORD"),
        host=os.getenv("MARIADB_HOST"),
        port=int(os.getenv("MARIADB_PORT"))
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)

# Get Cursor
cur = conn.cursor()

In [5]:
# pull contents of SQL table into Pandas DataFrame
original_table = pd.read_sql(sql_query, conn, dtype='str')

  original_table = pd.read_sql(sql_query, conn, dtype='str')


In [6]:
# make copy to avoid having to redownload as often
df = original_table.copy()

In [7]:
df.head(20)

Unnamed: 0,county_num,registration_num,election_date,election_type,party,absentee,provisional,supplemental,data_from_history_year_file,ajc_data_acquisition_year,ajc_data_loader_initials,ajc_id_num
0,143,746640,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,1
1,143,5956114,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,2
2,143,5607322,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,3
3,143,5689778,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,4
4,143,10162548,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,5
5,143,10162560,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,6
6,143,5267044,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,7
7,143,7410768,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,8
8,143,8366498,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,9
9,143,7742525,2014-03-18,5,Y,Y,N,N,2014,2016,JLP,10


In [8]:
# pull county code list and make dictionary
county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')

  county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')


In [9]:
county_code_dict = dict(zip(county_code_df.COUNTY_CODE, county_code_df.COUNTY))

In [10]:
# make function to use dictionaries to fill in DataFrame columns with expanded information
def dict_lookup_list(column, dictionary):
    county_list = []
    for entry in column:
        if len(entry) == 3:
            county_name = dictionary.get(f'{entry}')
            county_list.append(county_name)
        else:
            county_list.append(entry)
    return county_list

In [11]:
# use dict_lookup_list() to fill in 'county_name' and get rid of 'county_num'
if 'county_num' in df.columns:
    df.insert(0, 'county_name', dict_lookup_list(df.county_num, county_code_dict))

In [12]:
df.pop('county_num')

0          143
1          143
2          143
3          143
4          143
          ... 
4269686    146
4269687    146
4269688    146
4269689    146
4269690    146
Name: county_num, Length: 4269691, dtype: object

In [13]:
# rename voter_registration_number column to fit other data warehouse tables
df = df.rename(columns={'registration_num': 'voter_registration_number'})

In [14]:
# function to add index to new DataFrame
def add_row_names(column):
    counter = 0
    row_name_list = []
    for row in column:
        counter += 1
        row_name_list.append(counter)
    return row_name_list

In [15]:
if 'row_names' not in df.columns:
    df.insert(0, 'row_names', add_row_names(df['county_name']))

In [16]:
# make dictionary of election types
election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)

  election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)


In [17]:
election_type_dict = dict(zip(election_type_df.type_of_election, election_type_df.description))

In [18]:
# Make election types all caps to match other tables
df['election_type'] = dict_lookup_list(df['election_type'], election_type_dict)

In [19]:
df['election_type'] = df['election_type'].str.upper()

In [20]:
# get rid of "None" values to match other tables in data warehouse

def remove_nones(column):
    entry_list = []
    for entry in column:
        if entry == "None":
            entry_list.append("")
        else:
            entry_list.append(entry)
    return entry_list

In [21]:
df['county_name'] = remove_nones(df['county_name'])

In [22]:
df['voter_registration_number'] = remove_nones(df['voter_registration_number'])

In [23]:
df['election_date'] = remove_nones(df['election_date'])

In [24]:
df['election_type'] = remove_nones(df['election_type'])

In [25]:
df['party'] = remove_nones(df['party'])

In [26]:
df['absentee'] = remove_nones(df['absentee'])

In [27]:
df['provisional'] = remove_nones(df['provisional'])

In [28]:
df['supplemental'] = remove_nones(df['supplemental'])

In [29]:
# add or delete columns not used in other tables
if 'ballot_style' not in df.columns:
    df.insert(6, 'ballot_style', '')

In [30]:
if 'data_from_history_year_file' in df.columns:
    df.pop('data_from_history_year_file')

In [31]:
if 'ajc_data_acquisition_year' in df.columns:
    df.pop('ajc_data_acquisition_year')

In [32]:
if 'ajc_data_loader_initials' in df.columns:
    df.pop('ajc_data_loader_initials')

In [33]:
# there are both "GENERAL ELECTION" values and "GENERAL/SPECIAL ELECTION" values, and the vast majority are not duplicates
# I will use both values to filter and dedupe so that I am not missing 1/3 of the voters
# it would be a good idea to run this same notebook on more recent tables to check how this process compares to what the state does to get their values filtered
df.election_type.value_counts()

election_type
GENERAL ELECTION            2595786
GENERAL PRIMARY ELECTION     984350
GENERAL PRIMARY RUNOFF       634310
SPECIAL ELECTION              24453
GENERAL ELECTION RUNOFF       17746
SPECIAL ELECTION RUNOFF       10524
MUNICIPAL ELECTION             2168
RECALL                          354
Name: count, dtype: int64

In [34]:
df[df['election_type'] == 'GENERAL ELECTION'].party.value_counts()

party
N    1640788
Y     954998
Name: count, dtype: int64

In [35]:
filtered_date_df = df[df['election_date'] == election_date]

In [36]:
filtered_election_df = df[(df['election_type'] == election_type1) | (df['election_type'] == election_type2) | (df['election_type'] == election_type3)]

In [37]:
filtered_date_election_df = filtered_election_df[filtered_election_df['election_date'] == election_date]

In [38]:
filtered_date_election_df.row_names = add_row_names(filtered_date_election_df['county_name'])

In [39]:
# it is hard to tell why there are duplicates with the general and general/special election types
filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated(keep=False)].sort_values('voter_registration_number')

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental,ajc_id_num


In [40]:
deduped_df = filtered_date_election_df.drop_duplicates('voter_registration_number')

In [41]:
# there are 114 duplicated entries, and after deduping the DataFrame I have 114 fewer entries
len(filtered_date_election_df) - len(deduped_df)

0

In [42]:
len(filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated()])

0

In [43]:
len(filtered_date_election_df.drop_duplicates('voter_registration_number'))

2595786

In [45]:
filtered_date_election_df.value_counts(['election_type'])

election_type   
GENERAL ELECTION    2595786
Name: count, dtype: int64

In [46]:
output_path = Path().cwd().parent.joinpath(f'output_csv/{output_name}')
filtered_date_election_df.to_csv(output_path)

In [47]:
filtered_date_election_df

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental,ajc_id_num
996319,1,BACON,00469744,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,996320
996320,2,BACON,00472019,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,996321
996321,3,CHATHAM,01539116,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,996322
996322,4,BROOKS,00589643,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,996323
996323,5,CHATHAM,07404428,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,996324
...,...,...,...,...,...,...,...,...,...,...,...
3592100,2595782,WHITFIELD,03911540,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,3592101
3592101,2595783,WASHINGTON,00745158,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,3592102
3592102,2595784,WAYNE,02303566,2014-11-04,GENERAL ELECTION,Y,,Y,N,N,3592103
3592103,2595785,WHITFIELD,00080406,2014-11-04,GENERAL ELECTION,N,,N,N,N,3592104
