In [1]:
import pandas as pd
import datetime
import mariadb
import numpy as np
from pathlib import Path

In [2]:
# The yearly voter-history export lives next to the notebook's working directory.
csv_path = Path.cwd().parent / 'voter_history_files/2020.csv'
# Every listed column is passed through str, so values are kept as text
# (registration numbers such as 00006871 keep their leading zeros).
import_csv = pd.read_csv(
    csv_path,
    converters=dict.fromkeys(
        (
            'County Name',
            'Voter Registration Number',
            'Election Date',
            'Election Type',
            'Party',
            'Ballot Style',
            'Absentee',
            'Provisional',
            'Supplemental',
        ),
        str,
    ),
)
output_name = 'primary_2020.csv'

In [3]:
# Inspect the raw frame exactly as it was loaded from the CSV.
import_csv

Unnamed: 0,County Name,Voter Registration Number,Election Date,Election Type,Party,Ballot Style,Absentee,Provisional,Supplemental
0,COLQUITT,07271776,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
1,COLQUITT,00006871,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
2,COLQUITT,00041320,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
3,COLQUITT,00267983,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
4,COLQUITT,00503206,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
...,...,...,...,...,...,...,...,...,...
8329265,DEKALB,13476309,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N
8329266,FULTON,04533303,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N
8329267,FULTON,02399012,09/29/2020,SPECIAL ELECTION,,IN PERSON,Y,N,N
8329268,DEKALB,03963464,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N


In [4]:
# Work on an independent copy so import_csv keeps the data as loaded.
df = import_csv.copy(deep=True)

In [5]:
# Replace the CSV headers with the snake_case names used by the later cells.
df = df.rename(
    {
        'County Name': 'county_name',
        'Voter Registration Number': 'voter_registration_number',
        'Election Date': 'election_date',
        'Election Type': 'election_type',
        'Party': 'party',
        'Ballot Style': 'ballot_style',
        'Absentee': 'absentee',
        'Provisional': 'provisional',
        'Supplemental': 'supplemental',
    },
    axis='columns',
)

In [6]:
def add_row_names(column):
    """Return 1-based sequence numbers, one for each entry of ``column``.

    Args:
        column: Any iterable; only the number of entries matters, the values
            themselves are ignored.

    Returns:
        list[int]: ``[1, 2, ..., n]`` where ``n`` is the number of entries.
    """
    return [position for position, _ in enumerate(column, start=1)]

In [7]:
# Add a 1-based row counter as the first column. DataFrame.insert raises
# ValueError when the column already exists, so the insert is skipped if this
# cell is run again against a frame that already carries row_names.
if 'row_names' not in df.columns:
    df.insert(0, 'row_names', add_row_names(df['county_name']))

In [8]:
def column_float_to_strings(column):
    """Convert every entry of ``column`` to its ``str()`` representation.

    A float NaN comes out as the text ``"nan"``, which is simply what
    ``str()`` produces for it, so no special case is required.

    Args:
        column: Iterable of values (floats, strings, ...).

    Returns:
        list[str]: One string per entry, in the original order.
    """
    return [str(entry) for entry in column]

In [9]:
def remove_last_two_digits(column):
    """Strip the final two characters from every entry of ``column``.

    Args:
        column: Iterable of sliceable values (e.g. strings).

    Returns:
        list: Each entry without its last two characters; entries with two or
        fewer characters become empty.
    """
    return [value[:-2] for value in column]

In [10]:
# List any rows whose registration number is null (an empty result means none).
# NOTE(review): the column was loaded with converters=str, so blank cells may
# arrive as '' instead of NaN — confirm this check would actually catch them.
df.loc[df['voter_registration_number'].isnull(), 'voter_registration_number']

Series([], Name: voter_registration_number, dtype: object)

In [11]:
# Inspect the working frame to check its current columns and values.
df

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental
0,1,COLQUITT,07271776,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
1,2,COLQUITT,00006871,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
2,3,COLQUITT,00041320,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
3,4,COLQUITT,00267983,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
4,5,COLQUITT,00503206,01/28/2020,SPECIAL ELECTION,,REGULAR,N,N,N
...,...,...,...,...,...,...,...,...,...,...
8329265,8329266,DEKALB,13476309,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N
8329266,8329267,FULTON,04533303,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N
8329267,8329268,FULTON,02399012,09/29/2020,SPECIAL ELECTION,,IN PERSON,Y,N,N
8329268,8329269,DEKALB,03963464,09/29/2020,SPECIAL ELECTION,,REGULAR,N,N,N


In [12]:
def reformat_date(column):
    """Rearrange ``MM/DD/YYYY`` strings into ``YYYY-MM-DD``.

    The pieces are taken by fixed character positions (month 0-1, day 3-4,
    year 6-9); no validation is performed, so an entry in another layout is
    sliced the same way and yields a malformed result.

    Args:
        column: Iterable of date strings.

    Returns:
        list[str]: One reformatted string per entry, in the original order.
    """
    return [f"{text[6:10]}-{text[0:2]}-{text[3:5]}" for text in column]

In [13]:
# Build the ISO dates from the untouched raw column instead of from
# df['election_date'] itself: feeding already-converted values back through
# reformat_date would scramble them if this cell were run a second time.
# df is a row-for-row copy of import_csv, so the values line up positionally.
df['election_date'] = reformat_date(import_csv['Election Date'])

In [14]:
# Inspect the frame to confirm election_date now reads YYYY-MM-DD.
df

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental
0,1,COLQUITT,07271776,2020-01-28,SPECIAL ELECTION,,REGULAR,N,N,N
1,2,COLQUITT,00006871,2020-01-28,SPECIAL ELECTION,,REGULAR,N,N,N
2,3,COLQUITT,00041320,2020-01-28,SPECIAL ELECTION,,REGULAR,N,N,N
3,4,COLQUITT,00267983,2020-01-28,SPECIAL ELECTION,,REGULAR,N,N,N
4,5,COLQUITT,00503206,2020-01-28,SPECIAL ELECTION,,REGULAR,N,N,N
...,...,...,...,...,...,...,...,...,...,...
8329265,8329266,DEKALB,13476309,2020-09-29,SPECIAL ELECTION,,REGULAR,N,N,N
8329266,8329267,FULTON,04533303,2020-09-29,SPECIAL ELECTION,,REGULAR,N,N,N
8329267,8329268,FULTON,02399012,2020-09-29,SPECIAL ELECTION,,IN PERSON,Y,N,N
8329268,8329269,DEKALB,03963464,2020-09-29,SPECIAL ELECTION,,REGULAR,N,N,N


In [15]:
# Count rows per election_type to see which elections the file contains.
df.value_counts('election_type')

election_type
GENERAL                    4973992
GENERAL PRIMARY            2281405
GENERAL PRIMARY RUNOFF      662740
PPP                         292293
SPECIAL ELECTION             56974
GENERAL ELECTION RUNOFF      33469
SPECIAL ELECTION RUNOFF      28344
RECALL                          53
Name: count, dtype: int64

In [16]:
# Keep only the rows recorded for the election held on 2020-06-09.
filtered_date_df = df.loc[df['election_date'] == "2020-06-09"]

In [17]:
# Inspect the rows kept by the election_date filter.
filtered_date_df

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental
5350104,5350105,COOK,00578947,2020-06-09,SPECIAL ELECTION,,REGULAR,N,N,N
5350105,5350106,COOK,00611960,2020-06-09,SPECIAL ELECTION,,REGULAR,N,N,N
5350106,5350107,COOK,00610063,2020-06-09,SPECIAL ELECTION,,REGULAR,N,N,N
5350107,5350108,COOK,00962640,2020-06-09,SPECIAL ELECTION,,REGULAR,N,N,N
5350108,5350109,COOK,01220392,2020-06-09,SPECIAL ELECTION,,REGULAR,N,N,N
...,...,...,...,...,...,...,...,...,...,...
7631784,7631785,FULTON,05323544,2020-06-09,GENERAL PRIMARY,DEMOCRAT,REGULAR,N,N,N
7631785,7631786,UPSON,04069157,2020-06-09,GENERAL PRIMARY,DEMOCRAT,MAIL IN,Y,N,N
7631786,7631787,HENRY,04034579,2020-06-09,GENERAL PRIMARY,REPUBLICAN,IN PERSON,Y,N,N
7631787,7631788,GWINNETT,10203088,2020-06-09,GENERAL PRIMARY,DEMOCRAT,REGULAR,N,N,N


In [18]:
# Check which election types share the selected date.
filtered_date_df.value_counts('election_type')

election_type
GENERAL PRIMARY     2281405
SPECIAL ELECTION        280
Name: count, dtype: int64

In [19]:
# Drop the other election types held on the same date. Despite the `_ppp`
# suffix, the rows kept here are GENERAL PRIMARY, not the separate PPP type.
filtered_date_ppp = filtered_date_df.loc[filtered_date_df['election_type'] == "GENERAL PRIMARY"]

In [20]:
# Inspect the rows that will be written to the output CSV.
filtered_date_ppp

Unnamed: 0,row_names,county_name,voter_registration_number,election_date,election_type,party,ballot_style,absentee,provisional,supplemental
5350384,5350385,APPLING,00044647,2020-06-09,GENERAL PRIMARY,REPUBLICAN,REGULAR,N,N,N
5350385,5350386,APPLING,00047002,2020-06-09,GENERAL PRIMARY,REPUBLICAN,REGULAR,N,N,N
5350386,5350387,APPLING,00045730,2020-06-09,GENERAL PRIMARY,REPUBLICAN,REGULAR,N,N,N
5350387,5350388,APPLING,00122112,2020-06-09,GENERAL PRIMARY,REPUBLICAN,REGULAR,N,N,N
5350388,5350389,APPLING,00167836,2020-06-09,GENERAL PRIMARY,REPUBLICAN,REGULAR,N,N,N
...,...,...,...,...,...,...,...,...,...,...
7631784,7631785,FULTON,05323544,2020-06-09,GENERAL PRIMARY,DEMOCRAT,REGULAR,N,N,N
7631785,7631786,UPSON,04069157,2020-06-09,GENERAL PRIMARY,DEMOCRAT,MAIL IN,Y,N,N
7631786,7631787,HENRY,04034579,2020-06-09,GENERAL PRIMARY,REPUBLICAN,IN PERSON,Y,N,N
7631787,7631788,GWINNETT,10203088,2020-06-09,GENERAL PRIMARY,DEMOCRAT,REGULAR,N,N,N


In [21]:
# Reuse the file name defined in the load cell rather than repeating the literal.
output_path = Path.cwd().parent / 'output_csv' / output_name
# Create the target folder when it is missing (e.g. on a fresh checkout) so
# the export does not fail on a non-existent directory.
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_date_ppp.to_csv(output_path)