In [1]:
import pandas as pd
import datetime
import mariadb
import numpy as np
from pathlib import Path
import os

In [2]:
# make sure these values are correct for each iteration of this notebook
# all other fields should run without changes
election_date = '2010-11-02'
election_type1 = 'GENERAL ELECTION'
election_type2 = 'None'
election_type3 = 'None'
sql_query = "SELECT * FROM `ga_sos_voters`.`voter_history_2010`"
output_name = "general_2010.csv"


In [3]:
# Connect to data warehouse with MariaDB
try:
    conn = mariadb.connect(
        user=os.getenv("MARIADB_USER"),
        password=os.getenv("MARIADB_PASSWORD"),
        host=os.getenv("MARIADB_HOST"),
        port=int(os.getenv("MARIADB_PORT"))
    )
except mariadb.Error as e:
    print(f"Error connecting to MariaDB Platform: {e}")
    sys.exit(1)

# Get Cursor
cur = conn.cursor()

In [4]:
# pull contents of SQL table into Pandas DataFrame
original_table = pd.read_sql(sql_query, conn, dtype='str')

  original_table = pd.read_sql(sql_query, conn, dtype='str')


In [5]:
# make copy to avoid having to redownload as often
df = original_table.copy()

In [6]:
df.head(20)

Unnamed: 0,county_num,registration_num,election_date,election_type,party,absentee,provisional,supplemental,data_from_history_year_file,ajc_data_acquisition_year,ajc_data_loader_initials
0,1,4137,2010-11-02,3,,,,,2010,2016,JLP
1,1,45353,2010-07-20,1,R,,,,2010,2016,JLP
2,1,45353,2010-08-10,2,R,,,,2010,2016,JLP
3,1,45353,2010-11-02,3,,,,,2010,2016,JLP
4,1,45572,2010-07-20,1,R,,,,2010,2016,JLP
5,1,45572,2010-11-02,3,,,,,2010,2016,JLP
6,1,45951,2010-07-20,1,R,,,,2010,2016,JLP
7,1,45951,2010-08-10,2,R,,,,2010,2016,JLP
8,1,45951,2010-11-02,3,,,,,2010,2016,JLP
9,1,47002,2010-07-20,1,R,,,,2010,2016,JLP


In [7]:
# pull county code list and make dictionary
county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')

  county_code_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_countycode`", conn, dtype='str')


In [8]:
county_code_dict = dict(zip(county_code_df.COUNTY_CODE, county_code_df.COUNTY))

In [9]:
# make function to use dictionaries to fill in DataFrame columns with expanded information
def dict_lookup_list(column, dictionary):
    county_list = []
    for entry in column:
        if len(entry) == 3:
            county_name = dictionary.get(f'{entry}')
            county_list.append(county_name)
        else:
            county_list.append(entry)
    return county_list

In [10]:
# use dict_lookup_list() to fill in 'county_name' and get rid of 'county_num'
if 'county_num' in df.columns:
    df.insert(0, 'county_name', dict_lookup_list(df.county_num, county_code_dict))

In [11]:
df.pop('county_num')

0          001
1          001
2          001
3          001
4          001
          ... 
4841788    159
4841789    159
4841790    159
4841791    159
4841792    159
Name: county_num, Length: 4841793, dtype: object

In [12]:
# rename voter_registration_number column to fit other data warehouse tables
df = df.rename(columns={'registration_num': 'voter_registration_number'})

In [13]:
# function to add index to new DataFrame
def add_row_names(column):
    counter = 0
    row_name_list = []
    for row in column:
        counter += 1
        row_name_list.append(counter)
    return row_name_list

In [14]:
if 'row_names' not in df.columns:
    df.insert(0, 'row_names', add_row_names(df['county_name']))

In [15]:
# make dictionary of election types
election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)

  election_type_df = pd.read_sql("SELECT * FROM `ga_sos_voters`.`lu_election_type`", conn)


In [16]:
election_type_dict = dict(zip(election_type_df.type_of_election, election_type_df.description))

In [17]:
# Make election types all caps to match other tables
df['election_type'] = dict_lookup_list(df['election_type'], election_type_dict)

In [18]:
df['election_type'] = df['election_type'].str.upper()

In [19]:
# get rid of "None" values to match other tables in data warehouse

def remove_nones(column):
    entry_list = []
    for entry in column:
        if entry == "None":
            entry_list.append("")
        else:
            entry_list.append(entry)
    return entry_list

In [20]:
df['county_name'] = remove_nones(df['county_name'])

In [21]:
df['voter_registration_number'] = remove_nones(df['voter_registration_number'])

In [22]:
df['election_date'] = remove_nones(df['election_date'])

In [23]:
df['election_type'] = remove_nones(df['election_type'])

In [24]:
df['party'] = remove_nones(df['party'])

In [25]:
df['absentee'] = remove_nones(df['absentee'])

In [26]:
df['provisional'] = remove_nones(df['provisional'])

In [27]:
df['supplemental'] = remove_nones(df['supplemental'])

In [28]:
# add or delete columns not used in other tables
if 'ballot_style' not in df.columns:
    df.insert(6, 'ballot_style', '')

In [29]:
if 'data_from_history_year_file' in df.columns:
    df.pop('data_from_history_year_file')

In [30]:
if 'ajc_data_acquisition_year' in df.columns:
    df.pop('ajc_data_acquisition_year')

In [31]:
if 'ajc_data_loader_initials' in df.columns:
    df.pop('ajc_data_loader_initials')

In [32]:
# there are both "GENERAL ELECTION" values and "GENERAL/SPECIAL ELECTION" values, and the vast majority are not duplicates
# I will use both values to filter and dedupe so that I am not missing 1/3 of the voters
# it would be a good idea to run this same notebook on more recent tables to check how this process compares to what the state does to get their values filtered
df.election_type.value_counts()

election_type
GENERAL ELECTION                           2568080
GENERAL PRIMARY ELECTION                    804377
GENERAL PRIMARY RUNOFF                      669725
GENERAL PRIMARY/SPECIAL ELECTION            294948
GENERAL ELECTION RUNOFF                     290322
SPECIAL ELECTION                             76550
GENERAL/SPECIAL ELECTION                     54649
SPECIAL ELECTION RUNOFF                      54622
GENERAL PRIMARY/SPECIAL ELECTION RUNOFF      18535
MUNICIPAL SPECIAL ELECTION                    5316
MUNICIPAL ELECTION                            4536
MUNICIPAL ELECTION RUNOFF                      128
MUNICIPAL SPECIAL ELECTION RUNOFF                5
Name: count, dtype: int64

In [33]:
df[df['election_type'] == 'GENERAL ELECTION'].party.value_counts()

party
     2568002
R         50
D         28
Name: count, dtype: int64

In [34]:
filtered_date_df = df[df['election_date'] == election_date]

In [35]:
filtered_election_df = df[(df['election_type'] == election_type1) | (df['election_type'] == election_type2) | (df['election_type'] == election_type3)]

In [36]:
filtered_date_election_df = filtered_election_df[filtered_election_df['election_date'] == election_date]

In [37]:
filtered_date_election_df.row_names = add_row_names(filtered_date_election_df['county_name'])

In [38]:
# it is hard to tell why there are duplicates with the general and general/special election types
filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated(keep=False)].sort_values('voter_registration_number')

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental


In [39]:
deduped_df = filtered_date_election_df.drop_duplicates('voter_registration_number')

In [40]:
# there are 114 duplicated entries, and after deduping the DataFrame I have 114 fewer entries
len(filtered_date_election_df) - len(deduped_df)

0

In [41]:
len(filtered_date_election_df[filtered_date_election_df['voter_registration_number'].duplicated()])

0

In [42]:
len(filtered_date_election_df.drop_duplicates('voter_registration_number'))

2568080

In [43]:
filtered_date_election_df.party.value_counts()

party
     2568002
R         50
D         28
Name: count, dtype: int64

In [44]:
filtered_date_election_df.value_counts(['election_type'])

election_type   
GENERAL ELECTION    2568080
Name: count, dtype: int64

In [45]:
output_path = Path().cwd().parent.joinpath('output_csv/general_2010.csv')
filtered_date_election_df.to_csv(output_path)