In [1]:
import pandas as pd
import datetime
import mariadb
import numpy as np
from pathlib import Path
import os

In [2]:
# make sure these values are correct for each iteration of this notebook
# all other fields should run without changes
election_date = '2008-11-04'
election_type1 = 'GENERAL ELECTION'
election_type2 = 'None'
election_type3 = 'None'
sql_query = "SELECT * FROM `ga_sos_voters`.`voter_history_2008`"
output_name = "general_2008.csv"

In [3]:
# Connect to data warehouse with MariaDB
try:
    conn = mariadb.connect(
        user=os.getenv("MARIADB_USER"),
        password=os.getenv("MARIADB_PASSWORD"),
        host=os.getenv("MARIADB_HOST"),
        port=int(os.getenv("MARIADB_PORT"))
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)

# Get Cursor
cur = conn.cursor()

In [4]:
# pull contents of SQL table into Pandas DataFrame
original_table = pd.read_sql(sql_query, conn, dtype='str')

  original_table = pd.read_sql(sql_query, conn, dtype='str')


In [5]:
# make copy to avoid having to redownload as often
df = original_table.copy()

In [6]:
# pull county code list and make dictionary
county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')

  county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')


In [7]:
county_code_dict = dict(zip(county_code_df.COUNTY_CODE, county_code_df.COUNTY))

In [8]:
# make function to use dictionaries to fill in DataFrame columns with expanded information
def dict_lookup_list(column, dictionary):
    county_list = []
    for entry in column:
        if len(entry) == 3:
            county_name = dictionary.get(f'{entry}')
            county_list.append(county_name)
        else:
            county_list.append(entry)
    return county_list

In [9]:
# use dict_lookup_list() to fill in 'county_name' and get rid of 'county_num'
if 'county_num' in df.columns:
    df.insert(0, 'county_name', dict_lookup_list(df.county_num, county_code_dict))

In [10]:
df.pop('county_num')

0          001
1          001
2          001
3          001
4          001
          ... 
9628477    159
9628478    159
9628479    159
9628480    159
9628481    159
Name: county_num, Length: 9628482, dtype: object

In [11]:
# rename voter_registration_number column to fit other data warehouse tables
df = df.rename(columns={'registration_num': 'voter_registration_number'})

In [12]:
# function to add index to new DataFrame
def add_row_names(column):
    counter = 0
    row_name_list = []
    for row in column:
        counter += 1
        row_name_list.append(counter)
    return row_name_list

In [13]:
if 'row_names' not in df.columns:
    df.insert(0, 'row_names', add_row_names(df['county_name']))

In [14]:
# make dictionary of election types
election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)

  election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)


In [15]:
election_type_dict = dict(zip(election_type_df.type_of_election, election_type_df.description))

In [16]:
# Make election types all caps to match other tables
df['election_type'] = dict_lookup_list(df['election_type'], election_type_dict)

In [17]:
df['election_type'] = df['election_type'].str.upper()

In [18]:
df['election_date'] = df['election_date']

In [19]:
# get rid of "None" values to match other tables in data warehouse

def remove_nones(column):
    entry_list = []
    for entry in column:
        if entry == "None":
            entry_list.append("")
        else:
            entry_list.append(entry)
    return entry_list

In [20]:
df['county_name'] = remove_nones(df['county_name'])

In [21]:
df['voter_registration_number'] = remove_nones(df['voter_registration_number'])

In [22]:
df['election_date'] = remove_nones(df['election_date'])

In [23]:
df['election_type'] = remove_nones(df['election_type'])

In [24]:
df['party'] = remove_nones(df['party'])

In [25]:
df['absentee'] = remove_nones(df['absentee'])

In [26]:
df['provisional'] = remove_nones(df['provisional'])

In [27]:
df['supplemental'] = remove_nones(df['supplemental'])

In [28]:
# add or delete columns not used in other tables
if 'ballot_style' not in df.columns:
    df.insert(6, 'ballot_style', '')

In [29]:
if 'data_from_history_year_file' in df.columns:
    df.pop('data_from_history_year_file')

In [30]:
if 'ajc_data_acquisition_year' in df.columns:
    df.pop('ajc_data_acquisition_year')

In [31]:
if 'ajc_data_loader_initials' in df.columns:
    df.pop('ajc_data_loader_initials')

In [32]:
# there are both "GENERAL ELECTION" values and "GENERAL/SPECIAL ELECTION" values, and the vast majority are not duplicates
# I will use both values to filter and dedupe so that I am not missing 1/3 of the voters
# it would be a good idea to run this same notebook on more recent tables to check how this process compares to what the state does to get their values filtered
df.election_type.value_counts()

election_type
GENERAL ELECTION                         3931522
GENERAL ELECTION RUNOFF                  2109114
PRESIDENTIAL PRIMARY                     1846418
GENERAL PRIMARY ELECTION                  820096
GENERAL PRIMARY RUNOFF                    463250
GENERAL PRIMARY/SPECIAL ELECTION          192162
PRESIDENTIAL PRIMARY/SPECIAL ELECTION     176686
SPECIAL ELECTION                           73340
MUNICIPAL ELECTION                          6047
MUNICIPAL SPECIAL ELECTION                  5208
SPECIAL PRIMARY RUNOFF                      2196
MUNICIPAL SPECIAL ELECTION RUNOFF            996
MUNICIPAL ELECTION RUNOFF                    800
SPECIAL ELECTION RUNOFF                      420
SPECIAL PRIMARY                              220
MUNICIPAL RECALL ELECTION                      7
Name: count, dtype: int64

In [33]:
filtered_date_df = df[df['election_date'] == election_date]

In [45]:
filtered_election_df = df[(df['election_type'] == election_type1) | (df['election_type'] == election_type2) | (df['election_type'] == election_type3)]

In [47]:
filtered_date_election_df = filtered_election_df[filtered_election_df['election_date'] == election_date]

In [49]:
filtered_date_election_df.row_names = add_row_names(filtered_date_election_df['county_name'])

In [51]:
# it is hard to tell why there are duplicates with the general and general/special election types
filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated(keep=False)].sort_values('voter_registration_number')

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental


In [38]:
deduped_df = filtered_date_election_df.drop_duplicates('voter_registration_number')

In [39]:
# there are 114 duplicated entries, and after deduping the DataFrame I have 114 fewer entries
len(filtered_date_election_df) - len(deduped_df)

0

In [40]:
len(filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated()])

0

In [41]:
len(filtered_date_election_df.drop_duplicates('voter_registration_number'))

3931522

In [42]:
filtered_date_election_df.head(20)

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental
0,1,APPLING,4137,2008-11-04,GENERAL ELECTION,,,,,
2,2,APPLING,43572,2008-11-04,GENERAL ELECTION,,,,,
3,3,APPLING,45353,2008-11-04,GENERAL ELECTION,,,,,
5,4,APPLING,45572,2008-11-04,GENERAL ELECTION,,,,,
8,5,APPLING,45951,2008-11-04,GENERAL ELECTION,,,Y,,
11,6,APPLING,47002,2008-11-04,GENERAL ELECTION,,,,,
13,7,APPLING,69178,2008-11-04,GENERAL ELECTION,,,,,
17,8,APPLING,122112,2008-11-04,GENERAL ELECTION,,,Y,,
20,9,APPLING,172072,2008-11-04,GENERAL ELECTION,,,Y,,
23,10,APPLING,174479,2008-11-04,GENERAL ELECTION,,,Y,,


In [54]:
filtered_date_election_df.value_counts(['election_type'])

election_type   
GENERAL ELECTION    3931522
Name: count, dtype: int64

In [44]:
output_path = Path().cwd().parent.joinpath('output_csv/general_2008.csv')
filtered_date_election_df.to_csv(output_path)