In [1]:
# run individual voter history file cleaning scripts first! (in cleaning_notebooks and historical_filter directories)
# use pip to install these libraries before running

import pandas as pd
import mariadb
import numpy as np
import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# optional to increase display size of dataframes
pd.options.display.max_rows = 999
pd.options.display.max_colwidth = 999
pd.options.display.width = 999
pd.options.display.max_columns = 999

In [3]:
# use MariaDB to query from data warehouse if voter history file not already in "main_voter_dataset" folder in this directory

try:
    # use after initial pull from data warehouse
    original_demographics_path = Path().cwd().parent.joinpath('main_voter_dataset/voters_20240410_clean.csv')
    original_demographics_df = pd.read_csv(original_demographics_path, dtype='str')
except:
    # only do this block once to get narrow_demographics.csv, then import that csv instead
    sql_query = "SELECT ajc_id_num, registration_num, status, last_name, first_name, middle_name, county, county_precinct_description, residence_city, municipal_precinct, state_senate_district, state_house_district, birth_year, race, gender, registration_date, voter_created_date, last_party_voted, last_vote_date FROM `ga_sos_voters`.`voters_20240410_clean`"
    # Connect to data warehouse with MariaDB
    try:
        conn = mariadb.connect(
            user=os.getenv("MARIADB_USER"),
            password=os.getenv("MARIADB_PASSWORD"),
            host=os.getenv("MARIADB_HOST"),
            port=int(os.getenv("MARIADB_PORT"))
        )
    except mariadb.Error as e:
        print(f"Error connecting to MariaDB Platform: {e}")
        sys.exit(1)
    # Get Cursor
    cur = conn.cursor()
    original_demographics_df = pd.read_sql(sql_query, conn, dtype='str')
    original_demographics_csv_path = Path().cwd().parent.joinpath('main_voter_dataset/voters_20240410_clean.csv')
    original_demographics_df.to_csv(original_demographics_csv_path)

In [4]:
demographics_df = original_demographics_df.copy()

In [5]:
demographics_df.rename(columns={'registration_num': 'voter_registration_number'}, inplace=True)

In [6]:
demographics_df.head()

Unnamed: 0.1,Unnamed: 0,ajc_id_num,voter_registration_number,status,last_name,first_name,middle_name,county,county_precinct_description,residence_city,municipal_precinct,state_senate_district,state_house_district,birth_year,race,gender,registration_date,voter_created_date,last_party_voted,last_vote_date
0,0,1,11666054,Active,ARAUJO,ALFONSO,,DEKALB,MILLER GROVE,DECATUR,,41,84,1991.0,Hispanic/Latino,Male,2018-09-14,03/23/2018,,
1,1,2,11442395,Active,ARAUJO,CAMILO,ANDRES,DEKALB,STONE MILL ELEM,STONE MOUNTAIN,,55,88,1993.0,Hispanic/Latino,Male,2017-08-29,08/29/2017,,11/08/2022
2,2,3,10826861,Active,ARAYA,YONATAN,,DEKALB,DUNAIRE ELEM,STONE MOUNTAIN,,41,85,1989.0,Black,Male,2018-07-10,07/28/2016,DEMOCRAT,11/03/2022
3,3,4,11597340,Active,ARAYA,SEMIRA,Z,DEKALB,JOLLY ELEM,CLARKSTON,,41,86,1990.0,Black,Female,2018-01-09,01/09/2018,DEMOCRAT,10/22/2020
4,4,5,4182507,Active,ARBUCKLE,WILLIAM,JOSEPH,DEKALB,CORALWOOD,DECATUR,,10,86,1960.0,White,Male,2008-09-17,08/06/1997,Democrat,11/29/2022


In [7]:
# set filepaths for voter history files

general_2008_path = Path().cwd().parent.joinpath('output_csv/general_2008.csv')
general_2010_path = Path().cwd().parent.joinpath('output_csv/general_2010.csv')
general_2012_path = Path().cwd().parent.joinpath('output_csv/general_2012.csv')
general_2014_path = Path().cwd().parent.joinpath('output_csv/general_2014.csv')
general_2016_path = Path().cwd().parent.joinpath('output_csv/general_2016.csv')
general_2018_path = Path().cwd().parent.joinpath('output_csv/general_2018.csv')
general_2020_path = Path().cwd().parent.joinpath('output_csv/general_2020.csv')
general_2022_path = Path().cwd().parent.joinpath('output_csv/general_2022.csv')
general_runoff_2022_path = Path().cwd().parent.joinpath('output_csv/general_runoff_2022.csv')
senate_runoff_2021_path = Path().cwd().parent.joinpath('output_csv/senate_runoff_2021.csv')
primary_2012_path = Path().cwd().parent.joinpath('output_csv/primary_2012.csv')
primary_2016_path = Path().cwd().parent.joinpath('output_csv/primary_2016.csv')
primary_2020_path = Path().cwd().parent.joinpath('output_csv/primary_2020.csv')
primary_2024_path = Path().cwd().parent.joinpath('output_csv/primary_2024.csv')

In [8]:
# read voter history files into variables

general_2008_df = pd.read_csv(general_2008_path, dtype='str')
general_2010_df = pd.read_csv(general_2010_path, dtype='str')
general_2012_df = pd.read_csv(general_2012_path, dtype='str')
general_2014_df = pd.read_csv(general_2014_path, dtype='str')
general_2016_df = pd.read_csv(general_2016_path, dtype='str')
general_2018_df = pd.read_csv(general_2018_path, dtype='str')
general_2020_df = pd.read_csv(general_2020_path, dtype='str')
general_2022_df = pd.read_csv(general_2022_path, dtype='str')
general_runoff_2022_df = pd.read_csv(general_runoff_2022_path, dtype='str')
senate_runoff_2021_df = pd.read_csv(senate_runoff_2021_path, dtype='str')
primary_2012_df = pd.read_csv(primary_2012_path, dtype='str')
primary_2016_df = pd.read_csv(primary_2016_path, dtype='str')
primary_2020_df = pd.read_csv(primary_2020_path, dtype='str')
primary_2024_df = pd.read_csv(primary_2024_path, dtype='str')

In [9]:
# get rid of the things we don't need

general_2008_df['general_2008'] = "1"
general_2008_df_thin = general_2008_df[['voter_registration_number', 'general_2008']]
general_2010_df['general_2010'] = "1"
general_2010_df_thin = general_2010_df[['voter_registration_number', 'general_2010']]
general_2012_df['general_2012'] = "1"
general_2012_df_thin = general_2012_df[['voter_registration_number', 'general_2012']]
general_2014_df['general_2014'] = "1"
general_2014_df_thin = general_2014_df[['voter_registration_number', 'general_2014']]
general_2016_df['general_2016'] = "1"
general_2016_df_thin = general_2016_df[['voter_registration_number', 'general_2016']]
general_2018_df['general_2018'] = "1"
general_2018_df_thin = general_2018_df[['voter_registration_number', 'general_2018']]
general_2020_df['general_2020'] = "1"
general_2020_df_thin = general_2020_df[['voter_registration_number', 'general_2020']]
general_2022_df['general_2022'] = "1"
general_2022_df_thin = general_2022_df[['voter_registration_number', 'general_2022']]
general_runoff_2022_df['general_runoff_2022'] = "1"
general_runoff_2022_df_thin = general_runoff_2022_df[['voter_registration_number', 'general_runoff_2022']]
senate_runoff_2021_df['senate_runoff_2021'] = "1"
senate_runoff_2021_df_thin = senate_runoff_2021_df[['voter_registration_number', 'senate_runoff_2021']]
primary_2012_df['primary_2012'] = "1"
primary_2012_df_thin = primary_2012_df[['voter_registration_number', 'primary_2012']]
primary_2016_df['primary_2016'] = "1"
primary_2016_df_thin = primary_2016_df[['voter_registration_number', 'primary_2016']]
primary_2020_df['primary_2020'] = "1"
primary_2020_df_thin = primary_2020_df[['voter_registration_number', 'primary_2020']]
primary_2024_df['primary_2024'] = "1"
primary_2024_df_thin = primary_2024_df[['voter_registration_number', 'primary_2024']]

In [10]:
# merge voter history files to demographics data

demographics_df = demographics_df.set_index('voter_registration_number').join(general_2008_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2010_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2012_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2014_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2016_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2018_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2020_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(senate_runoff_2021_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_2022_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(general_runoff_2022_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(primary_2012_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(primary_2016_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(primary_2020_df_thin.set_index('voter_registration_number'), how='outer')
demographics_df = demographics_df.join(primary_2024_df_thin.set_index('voter_registration_number'), how='outer')

In [None]:
df = demographics_df.copy()

In [None]:
df.reset_index(inplace=True)

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.dropna(how='all', inplace=True)

In [None]:
df.head()

Unnamed: 0,voter_registration_number,ajc_id_num,status,last_name,first_name,middle_name,county,county_precinct_description,residence_city,municipal_precinct,state_senate_district,state_house_district,birth_year,race,gender,registration_date,voter_created_date,last_party_voted,last_vote_date,general_2008,general_2010,general_2012,general_2014,general_2016,general_2018,general_2020,senate_runoff_2021,general_2022,general_runoff_2022,primary_2012,primary_2016,primary_2020,primary_2024
0,1,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,
1,2,4340064.0,Active,BOSWELL,JOURNEY,ROGER,BANKS,ANDERSON,MAYSVILLE,6.0,50.0,32.0,1937.0,Other,Male,1967-10-03,02/04/1995,Republican,03/12/2024,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,3,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
3,4,2843263.0,Active,CAGLE,RONNIE,D,BANKS,ANDERSON,MAYSVILLE,,50.0,32.0,1954.0,White,Male,1972-07-06,02/04/1995,REPUBLICAN,11/30/2022,1.0,1.0,1.0,,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,
4,5,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,


In [None]:
if 'Unnamed: 0' in df.columns:
    df.pop('Unnamed: 0')

In [None]:
# replace Pandas' NaN values with blank fields

df.status.fillna('', inplace=True)
df.last_name.fillna('', inplace=True)
df.first_name.fillna('', inplace=True)
df.middle_name.fillna('', inplace=True)
df.county.fillna('', inplace=True)
df.county_precinct_description.fillna('', inplace=True)
df.residence_city.fillna('', inplace=True)
df.municipal_precinct.fillna('', inplace=True)
df.state_senate_district.fillna('', inplace=True)
df.state_house_district.fillna('', inplace=True)
df.state_house_district.fillna('', inplace=True)
df.birth_year.fillna('', inplace=True)
df.race.fillna('', inplace=True)
df.gender.fillna('', inplace=True)
df.last_party_voted.fillna('', inplace=True)
df.last_vote_date.fillna('', inplace=True)
df.registration_date.fillna('', inplace=True)
df.voter_created_date.fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.status.fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.last_name.fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a 

In [None]:
# replace Pandas' NaN in integer columns with 0

df.general_2008.fillna('0', inplace=True)
df.general_2010.fillna('0', inplace=True)
df.general_2012.fillna('0', inplace=True)
df.general_2014.fillna('0', inplace=True)
df.general_2016.fillna('0', inplace=True)
df.general_2018.fillna('0', inplace=True)
df.general_2020.fillna('0', inplace=True)
df.general_2022.fillna('0', inplace=True)
df.general_runoff_2022.fillna('0', inplace=True)
df.senate_runoff_2021.fillna('0', inplace=True)
df.primary_2012.fillna('0', inplace=True)
df.primary_2016.fillna('0', inplace=True)
df.primary_2020.fillna('0', inplace=True)
df.primary_2024.fillna('0', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.general_2008.fillna('0', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.general_2010.fillna('0', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always be

In [None]:
# get rid of rows that do not have a voter_registration_number . . . useless 

df.dropna(subset='voter_registration_number', inplace=True)

In [None]:
df

Unnamed: 0,voter_registration_number,ajc_id_num,status,last_name,first_name,middle_name,county,county_precinct_description,residence_city,municipal_precinct,state_senate_district,state_house_district,birth_year,race,gender,registration_date,voter_created_date,last_party_voted,last_vote_date,general_2008,general_2010,general_2012,general_2014,general_2016,general_2018,general_2020,senate_runoff_2021,general_2022,general_runoff_2022,primary_2012,primary_2016,primary_2020,primary_2024
0,00000001,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,00000002,4340064,Active,BOSWELL,JOURNEY,ROGER,BANKS,ANDERSON,MAYSVILLE,6,050,032,1937.0,Other,Male,1967-10-03,02/04/1995,Republican,03/12/2024,1,1,1,0,1,1,1,1,1,1,1,1,1,1
2,00000003,,,,,,,,,,,,,,,,,,,1,1,1,0,1,1,1,1,1,1,1,1,1,0
3,00000004,2843263,Active,CAGLE,RONNIE,D,BANKS,ANDERSON,MAYSVILLE,,050,032,1954.0,White,Male,1972-07-06,02/04/1995,REPUBLICAN,11/30/2022,1,1,1,0,1,1,0,1,1,1,1,1,0,0
4,00000005,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9414227,8131680,1315547,Active,IMUS,JOSEPH,THOMAS,CHATHAM,POOLER RECREATION CENTER GYMNASIUM,POOLER,7-16P,001,164,1979.0,White,Male,2009-11-12,06/15/2023,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9414228,81430782,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9414229,8352360,4362775,Active,BETHELY,CORDARO,,STEWART,RICHLAND,LUMPKIN,,012,151,1991.0,Black,Male,2012-06-09,03/26/2022,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9414230,87666901,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df.sort_values('ajc_id_number', ascending=True)

In [None]:
voter_history_output_path = Path().cwd().parent.joinpath('main_voter_dataset/voters_20240410_clean_with_voter_history.csv')
# df.to_csv(voter_history_output_path)