In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.datasets import make_classification

In [None]:
years = np.array([2008, 2012, 2016, 2020])
ga_ballots = np.array([3940705, 3919355, 4165405, 5000511])
nat_ballots = np.array([132609063, 130292355, 138846571, 159690457])

labels = [x for x in years]

fig, ax = plt.subplots(figsize=(8,5))
width = 0.4
xlocs = np.arange(len(ga_ballots))

ax.bar(xlocs, nat_ballots, width,
      color= 'red', label='National')

ax.set_xticks(ticks= range(len(years)))
ax.set_xticklabels(labels)
ax.yaxis.grid(True)

ax.set_ylabel('Ballots Counted (100 millions)')
ax.set_title('National Voter Turnout 2008-2020')
fig.tight_layout(pad=1)
fig.savefig('../images/nat_turnout.png', dpi=125)
plt.show()

In [None]:
labels = [x for x in years]

fig, ax = plt.subplots(figsize=(8,5))
width = 0.4
xlocs = np.arange(len(ga_ballots))
ax.bar(xlocs, ga_ballots, width,
      color= 'blue', label='Georgia')


ax.set_xticks(ticks= range(len(years)))
ax.set_xticklabels(labels)
ax.yaxis.grid(True)

ax.set_ylabel('Ballots Counted (millions)')
ax.set_title('Georgia Voter Turnout 2008-2020')
fig.tight_layout(pad=1)
fig.savefig('../images/ga_turnout.png', dpi=125)
plt.show()

In [None]:
years = np.array([2008, 2012, 2016, 2020])
ga_turnout = np.array([.627, .593, .598, .677])
nat_turnout = np.array([.622, .586, .601, .667])

labels = [x for x in years]

fig, ax = plt.subplots(figsize=(8,5))
width = 0.4
xlocs = np.arange(len(ga_ballots))

ax.plot(xlocs, nat_turnout, width,
      color= 'red', label='National')
red_line,_ = plt.plot(xlocs, nat_turnout, width,
      color= 'red', label='National')

ax.plot(xlocs, ga_turnout, width,
      color= 'blue', label='Georgia')
blue_line,_ = plt.plot(xlocs, ga_turnout, width,
      color= 'blue', label='Georgia')

ax.set_xticks(ticks= range(len(years)))
ax.set_xticklabels(labels)
ax.yaxis.grid(True)
ax.legend(handles=[red_line, blue_line],loc='best')

ax.set_ylabel('Percentage of Eligible Voters')
ax.set_title('National Voter Turnout Rate 2008-2020')
fig.tight_layout(pad=1)
fig.savefig('../images/turnout_rate.png', dpi=125)
plt.show()

In [2]:
tot = pd.read_csv('../data/ga_archive/tbl_prod_GABU202012_all.csv', sep = '|')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
oct_new = pd.read_csv('../data/ga_archive/tbl_prod_GABU202010_new_records.csv', sep = '|')
nov_new = pd.read_csv('../data/ga_archive/tbl_prod_GABU202011_new_records.csv', sep = '|')
dec_new = pd.read_csv('../data/ga_archive/tbl_prod_GABU202012_new_records.csv', sep = '|')
new = pd.concat([oct_new, nov_new, dec_new], axis=0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
new_lst = list(new['registration_number'])

In [5]:
mask = tot['registration_number'].isin(new_lst)

In [6]:
tot['new_registration'] = mask

In [7]:
tot['new_registration'] = tot['new_registration'].map({True: int(1), False: int(0)})

In [8]:
tot = tot.drop([
    'land_district',
    'land_lot',
    'status_reason',
    'city_precinct_id',
    'county_districta_name',
    'county_districta_value',
    'county_districtb_name',
    'county_districtb_value',
    'city_dista_name',
    'city_dista_value',
    'city_distb_name',
    'city_distb_value',
    'city_distc_name',
    'city_distc_value',
    'city_distd_name',
    'city_distd_value',
    'party_last_voted',
    'city_school_district_name',
    'municipal_name',
    'municipal_code',
    'ward_city_council_code',
    'race_desc',
    'residence_city',
    'residence_zipcode',
    'county_precinct_id',
    'city_school_district_value',
    'senate_district',
    'house_district',
    'judicial_district',
    'commission_district',
    'school_district',
    'date_added',
    'date_changed',
    'district_combo',
    'last_contact_date',
    'ward_city_council_name',
    'date_last_voted',
    'registration_date'
    ], axis=1)

tot = tot.drop(['registration_number'], axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7729838 entries, 0 to 7729837
Data columns (total 7 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   county_code             int64 
 1   voter_status            object
 2   birthyear               int64 
 3   race                    object
 4   gender                  object
 5   congressional_district  int64 
 6   new_registration        int64 
dtypes: int64(4), object(3)
memory usage: 412.8+ MB


In [9]:
tot['voter_status'] = tot['voter_status'].map({'A': int(1), 'I': int(0)})

In [10]:
r_dummies = pd.get_dummies(tot['race'], dtype='int64')
tot[r_dummies.columns] = r_dummies

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7729838 entries, 0 to 7729837
Data columns (total 14 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   county_code             int64 
 1   voter_status            int64 
 2   birthyear               int64 
 3   race                    object
 4   gender                  object
 5   congressional_district  int64 
 6   new_registration        int64 
 7   AI                      int64 
 8   AP                      int64 
 9   BH                      int64 
 10  HP                      int64 
 11  OT                      int64 
 12  U                       int64 
 13  WH                      int64 
dtypes: int64(12), object(2)
memory usage: 825.6+ MB


In [11]:
tot = tot.drop(['race'], axis=1)
g_dummies = pd.get_dummies(tot['gender'], dtype='int64')
tot = tot.drop(['gender'], axis=1)
tot[g_dummies.columns] = g_dummies

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7729838 entries, 0 to 7729837
Data columns (total 15 columns):
 #   Column                  Dtype
---  ------                  -----
 0   county_code             int64
 1   voter_status            int64
 2   birthyear               int64
 3   congressional_district  int64
 4   new_registration        int64
 5   AI                      int64
 6   AP                      int64
 7   BH                      int64
 8   HP                      int64
 9   OT                      int64
 10  U                       int64
 11  WH                      int64
 12  F                       int64
 13  M                       int64
 14  O                       int64
dtypes: int64(15)
memory usage: 884.6 MB


In [12]:
counties = ['Appling', 
            'Atkinson',
            'Bacon',
            'Baker',
            'Baldwin',
            'Banks',
            'Barrow',
            'Bartow',
            'Ben_Hill',
            'Berrien',
            'Bibb',
            'Bleckley',
            'Brantley',
            'Brooks',
            'Bryan',
            'Bulloch',
            'Burke',
            'Butts',
            'Calhoun',
            'Camden',
            'Candler',
            'Carroll',
            'Catoosa',
            'Charlton',
            'Chatham',
            'Chattahoochee',
            'Chattooga',
            'Cherokee',
            'Clarke',
            'Clay',
            'Clayton',
            'Clinch',
            'Cobb',
            'Coffee',
            'Colquitt',
            'Columbia',
            'Cook',
            'Coweta',
            'Crawford',
            'Crisp',
            'Dade',
            'Dawson',
            'De_Kalb',
            'Decatur',
            'Dodge',
            'Dooly',
            'Dougherty',
            'Douglas',
            'Early',
            'Echols',
            'Effingham',
            'Elbert',
            'Emanuel',
            'Evans',
            'Fannin',
            'Fayette',
            'Floyd',
            'Forsyth',
            'Franklin',
            'Fulton',
            'Gilmer',
            'Glascock',
            'Glynn',
            'Gordon',
            'Grady',
            'Greene',
            'Gwinnett',
            'Habersham',
            'Hall',
            'Hancock',
            'Haralson',
            'Harris',
            'Hart',
            'Heard',
            'Henry',
            'Houston',
            'Irwin',
            'Jackson',
            'Jasper',
            'Jeff_Davis',
            'Jefferson',
            'Jenkins',
            'Johnson',
            'Jones',
            'Lamar',
            'Lanier',
            'Laurens',
            'Lee',
            'Liberty',
            'Lincoln',
            'Long',
            'Lowndes',
            'Lumpkin',
            'Macon',
            'Madison',
            'Marion',
            'McDuffie',
            'McIntosh',
            'Meriwether',
            'Miller',
            'Mitchell',
            'Monroe',
            'Montgomery',
            'Morgan',
            'Murray',
            'Muscogee',
            'Newton',
            'Oconee',
            'Oglethorpe',
            'Paulding',
            'Peach',
            'Pickens',
            'Pierce',
            'Pike',
            'Polk',
            'Pulaski',
            'Putnam',
            'Quitman',
            'Rabun',
            'Randolph',
            'Richmond',
            'Rockdale',
            'Schley',
            'Screven',
            'Seminole',
            'Spalding',
            'Stephens',
            'Stewart',
            'Sumter',
            'Talbot',
            'Taliaferro',
            'Tattnall',
            'Taylor',
            'Telfair',
            'Terrell',
            'Thomas',
            'Tift',
            'Toombs',
            'Towns',
            'Treutlen',
            'Troup',
            'Turner',
            'Twiggs',
            'Union',
            'Upson',
            'Walker',
            'Walton',
            'Ware',
            'Warren',
            'Washington',
            'Wayne',
            'Webster',
            'Wheeler',
            'White',
            'Whitfield',
            'Wilcox',
            'Wilkes',
            'Wilkinson',
            'Worth'
           ]

In [13]:
keys = range(1,161)
keys

range(1, 161)

In [14]:
county_dict = {}
for key in keys:
    for county in counties:
        county_dict[key] = county
        counties.remove(county)
        break
county_dict

{1: 'Appling',
 2: 'Atkinson',
 3: 'Bacon',
 4: 'Baker',
 5: 'Baldwin',
 6: 'Banks',
 7: 'Barrow',
 8: 'Bartow',
 9: 'Ben_Hill',
 10: 'Berrien',
 11: 'Bibb',
 12: 'Bleckley',
 13: 'Brantley',
 14: 'Brooks',
 15: 'Bryan',
 16: 'Bulloch',
 17: 'Burke',
 18: 'Butts',
 19: 'Calhoun',
 20: 'Camden',
 21: 'Candler',
 22: 'Carroll',
 23: 'Catoosa',
 24: 'Charlton',
 25: 'Chatham',
 26: 'Chattahoochee',
 27: 'Chattooga',
 28: 'Cherokee',
 29: 'Clarke',
 30: 'Clay',
 31: 'Clayton',
 32: 'Clinch',
 33: 'Cobb',
 34: 'Coffee',
 35: 'Colquitt',
 36: 'Columbia',
 37: 'Cook',
 38: 'Coweta',
 39: 'Crawford',
 40: 'Crisp',
 41: 'Dade',
 42: 'Dawson',
 43: 'De_Kalb',
 44: 'Decatur',
 45: 'Dodge',
 46: 'Dooly',
 47: 'Dougherty',
 48: 'Douglas',
 49: 'Early',
 50: 'Echols',
 51: 'Effingham',
 52: 'Elbert',
 53: 'Emanuel',
 54: 'Evans',
 55: 'Fannin',
 56: 'Fayette',
 57: 'Floyd',
 58: 'Forsyth',
 59: 'Franklin',
 60: 'Fulton',
 61: 'Gilmer',
 62: 'Glascock',
 63: 'Glynn',
 64: 'Gordon',
 65: 'Grady',
 66:

In [15]:
tot['county_code'] = tot['county_code'].replace(county_dict)
tot = tot.rename(columns={'county_code': 'county'})
tot['county']

0             Banks
1             Banks
2             Banks
3             Banks
4             Banks
             ...   
7729833     Decatur
7729834     Decatur
7729835    Muscogee
7729836    Muscogee
7729837    Muscogee
Name: county, Length: 7729838, dtype: object

In [16]:
rural = ['Appling', 
            'Atkinson',
            'Bacon',
            'Baker',
            'Baldwin',
            'Banks',          
            'Ben_Hill',
            'Berrien',
            'Bleckley',
            'Brantley',
            'Brooks',
            'Bryan',
            'Burke',
            'Butts',
            'Calhoun',
            'Candler',
            'Charlton',
            'Chattahoochee',
            'Chattooga',
            'Clay',
            'Clinch',
            'Coffee',
            'Colquitt',
            'Cook',
            'Crawford',
            'Crisp',
            'Dade',
            'Dawson',
            'Decatur',
            'Dodge',
            'Dooly',
            'Early',
            'Echols',
            'Elbert',
            'Emanuel',
            'Evans',
            'Fannin',
            'Franklin',
            'Gilmer',
            'Glascock',
            'Grady',
            'Greene',
            'Habersham',
            'Hancock',
            'Haralson',
            'Harris',
            'Hart',
            'Heard',
            'Irwin',
            'Jasper',
            'Jeff_Davis',
            'Jefferson',
            'Jenkins',
            'Johnson',
            'Jones',
            'Lamar',
            'Lanier',
            'Laurens',
            'Lee',
            'Lincoln',
            'Long',
            'Lumpkin',
            'Macon',
            'Madison',
            'Marion',
            'McDuffie',
            'McIntosh',
            'Meriwether',
            'Miller',
            'Mitchell',
            'Monroe',
            'Montgomery',
            'Morgan',
            'Murray',
            'Oconee',
            'Oglethorpe',
            'Peach',
            'Pickens',
            'Pierce',
            'Pike',
            'Polk',
            'Pulaski',
            'Putnam',
            'Quitman',
            'Rabun',
            'Randolph',
            'Schley',
            'Screven',
            'Seminole',
            'Stephens',
            'Stewart',
            'Sumter',
            'Talbot',
            'Taliaferro',
            'Tattnall',
            'Taylor',
            'Telfair',
            'Terrell',
            'Thomas',
            'Tift',
            'Toombs',
            'Towns',
            'Treutlen',
            'Turner',
            'Twiggs',
            'Union',
            'Upson',
            'Ware',
            'Warren',
            'Washington',
            'Wayne',
            'Webster',
            'Wheeler',
            'White',
            'Wilcox',
            'Wilkes',
            'Wilkinson',
            'Worth'
           ]

In [17]:
urban = ['Barrow',
    'Bartow',
    'Bibb',
    'Bulloch',
    'Carroll',
    'Catoosa',
    'Chatham',
    'Cherokee',
    'Clarke',
    'Clayton',
    'Cobb',
    'Columbia',
    'Coweta',
    'De_Kalb',
    'Dougherty',
    'Douglas',
    'Effingham',
    'Fayette',
    'Floyd',
    'Forsyth',
    'Fulton',
    'Glynn',
    'Gordon',
    'Gwinnett',
    'Hall',
    'Henry',
    'Houston',
    'Jackson',
    'Lowndes',
    'Muscogee',
    'Newton',
    'Paulding',
    'Richmond',
    'Rockdale',
    'Spalding',
    'Troup',
    'Walker',
    'Walton',
    'Whitfield'
    ]

In [18]:
military = ['Camden','Liberty']

In [19]:
r_mask = tot['county'].isin(rural)
u_mask = tot['county'].isin(urban)
m_mask = tot['county'].isin(military)

tot['rural'] = r_mask
tot['urban'] = u_mask
tot['military'] = m_mask


In [20]:
tot['rural'] = tot['rural'].map({True: int(1), False: int(0)})
tot['urban'] = tot['urban'].map({True: int(1), False: int(0)})
tot['military'] = tot['military'].map({True: int(1), False: int(0)})

tot = tot.drop('county', axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7729838 entries, 0 to 7729837
Data columns (total 17 columns):
 #   Column                  Dtype
---  ------                  -----
 0   voter_status            int64
 1   birthyear               int64
 2   congressional_district  int64
 3   new_registration        int64
 4   AI                      int64
 5   AP                      int64
 6   BH                      int64
 7   HP                      int64
 8   OT                      int64
 9   U                       int64
 10  WH                      int64
 11  F                       int64
 12  M                       int64
 13  O                       int64
 14  rural                   int64
 15  urban                   int64
 16  military                int64
dtypes: int64(17)
memory usage: 1002.6 MB


In [21]:
cd_dummies = pd.get_dummies(tot['congressional_district'], prefix='cd', dtype='int64')
tot = tot.drop(['congressional_district'], axis=1)
tot[cd_dummies.columns] = cd_dummies
tot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7729838 entries, 0 to 7729837
Data columns (total 31 columns):
 #   Column            Dtype
---  ------            -----
 0   voter_status      int64
 1   birthyear         int64
 2   new_registration  int64
 3   AI                int64
 4   AP                int64
 5   BH                int64
 6   HP                int64
 7   OT                int64
 8   U                 int64
 9   WH                int64
 10  F                 int64
 11  M                 int64
 12  O                 int64
 13  rural             int64
 14  urban             int64
 15  military          int64
 16  cd_1              int64
 17  cd_2              int64
 18  cd_3              int64
 19  cd_4              int64
 20  cd_5              int64
 21  cd_6              int64
 22  cd_7              int64
 23  cd_8              int64
 24  cd_9              int64
 25  cd_10             int64
 26  cd_11             int64
 27  cd_12             int64
 28  cd_13       

In [22]:
tot['age'] = 2020 - tot['birthyear']
tot['age'] = tot['age'].astype('int64')
tot = tot.drop(['birthyear'], axis=1)

In [23]:
msk = np.random.rand(len(tot)) < 0.8
train = tot[msk]
test = tot[~msk]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6181531 entries, 1 to 7729837
Data columns (total 31 columns):
 #   Column            Dtype
---  ------            -----
 0   voter_status      int64
 1   new_registration  int64
 2   AI                int64
 3   AP                int64
 4   BH                int64
 5   HP                int64
 6   OT                int64
 7   U                 int64
 8   WH                int64
 9   F                 int64
 10  M                 int64
 11  O                 int64
 12  rural             int64
 13  urban             int64
 14  military          int64
 15  cd_1              int64
 16  cd_2              int64
 17  cd_3              int64
 18  cd_4              int64
 19  cd_5              int64
 20  cd_6              int64
 21  cd_7              int64
 22  cd_8              int64
 23  cd_9              int64
 24  cd_10             int64
 25  cd_11             int64
 26  cd_12             int64
 27  cd_13             int64
 28  cd_14       

In [24]:
X = train.copy()
y = X.pop('new_registration')
X.shape, y.shape

((6181531, 30), (6181531,))

In [None]:
sample = train.sample(10000)
X_sample = sample.copy()
y_sample = X_sample.pop('new_registration')

In [None]:
sns.set_style("whitegrid")
sns.set(font_scale=1.4)
fig, age = plt.subplots(figsize=(10, 6))
age = sns.regplot(x= X_sample['age'], y=y_sample, data=sample, line_kws={"color": "red"}, logistic= True).set_title("Age Log Odds Linear Plot")
age.figure.savefig("../images/age_log_lin_plot.png")

In [None]:
mask = np.zeros_like(X_sample.corr())

mask[np.triu_indices_from(mask)] = True

with sns.axes_style("white"):
    
    sns.set(font_scale=1.4)

    fig, corr = plt.subplots(figsize=(10,6))

    corr = sns.heatmap(X_sample.corr(), mask=mask, square=True,
                     annot = False, cmap = 'coolwarm').set_title("Correlation among Features")
corr.figure.savefig("../images/corr_heatmap.png")


In [26]:
groupings = ['y~ C(voter_status) + C(rural) + age',
                'y~ C(AI) + C(AP) + C(BH) + C(HP) + C(OT) + C(U) + C(WH)',
                'y~ C(F) + C(M) + C(O)',
                'y~ C(rural) + C(urban) + C(military)',
                'y~ C(cd_1) + C(cd_2) + C(cd_3) + C(cd_4) + C(cd_5) + C(cd_6) + C(cd_7) + C(cd_8) + C(cd_9) + C(cd_10) + C(cd_11) + C(cd_12) + C(cd_13) + C(cd_14) + C(cd_99999)',
                'y~ C(voter_status) + C(AI) + C(AP) + C(BH) + C(HP) + C(OT) + C(U) + C(WH) + C(F) + C(M) + C(O) + C(rural) + C(urban) + C(military) + C(cd_1) + C(cd_2) + C(cd_3) + C(cd_4) + C(cd_5) + C(cd_6) + C(cd_7) + C(cd_8) + C(cd_9) + C(cd_10) + C(cd_11) + C(cd_12) + C(cd_13) + C(cd_14) + C(cd_99999) + age']

In [33]:
model= smf.logit(formula=groupings[5], data= X).fit()
model.summary()

         Current function value: 0.145037
         Iterations: 35




0,1,2,3
Dep. Variable:,y,No. Observations:,6181531.0
Model:,Logit,Df Residuals:,6181500.0
Method:,MLE,Df Model:,30.0
Date:,"Thu, 01 Apr 2021",Pseudo R-squ.:,0.04466
Time:,16:26:52,Log-Likelihood:,-896550.0
converged:,False,LL-Null:,-938460.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.3755,9.31e+04,-4.7e-05,1.000,-1.82e+05,1.82e+05
C(voter_status)[T.1],5.5953,0.147,37.940,0.000,5.306,5.884
C(AI)[T.1],-0.4067,6.13e+04,-6.63e-06,1.000,-1.2e+05,1.2e+05
C(AP)[T.1],-0.1788,6.11e+04,-2.92e-06,1.000,-1.2e+05,1.2e+05
C(BH)[T.1],-0.8289,6.24e+04,-1.33e-05,1.000,-1.22e+05,1.22e+05
C(HP)[T.1],-0.4579,6.2e+04,-7.39e-06,1.000,-1.21e+05,1.21e+05
C(OT)[T.1],-0.3711,6.22e+04,-5.97e-06,1.000,-1.22e+05,1.22e+05
C(U)[T.1],-0.9155,6.16e+04,-1.49e-05,1.000,-1.21e+05,1.21e+05
C(WH)[T.1],-0.8654,6.18e+04,-1.4e-05,1.000,-1.21e+05,1.21e+05


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
thresholds = [.62, .625, .63, .635, .64, .645, .65, .655, .66]
results = {}
for threshold in thresholds:
    regressor = LogisticRegression(class_weight = 'balanced',
                                   solver='liblinear')
    regressor.fit(X_train, y_train)
    yhat_probs = regressor.predict_proba(X_test)[:,1]
    preds = np.where(yhat_probs > threshold, 1, 0)

    results[f'threshold {threshold}'] = [('F1 Score', round(f1_score(y_test,preds), 2)),
                                        ('Accuracy Score', round(accuracy_score(y_test,preds),2)),
                                        ('AUC Score', round(roc_auc_score(y_test,preds),2))]

In [None]:
plt.style.use("ggplot")

kfold = KFold(n_splits = 5)

accuracies = []
precisions = []
recalls = []

for train_index, test_index in kfold.split(X_train):
    model = LogisticRegression(class_weight = 'balanced', solver="liblinear")
    model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
    y_predict = model.predict(X_train.iloc[test_index])
    y_true = y_train.iloc[test_index]
    accuracies.append(accuracy_score(y_true, y_predict))
    precisions.append(precision_score(y_true, y_predict))
    recalls.append(recall_score(y_true, y_predict))
    
print("Accuracy:", np.average(accuracies))
print("Precision:", np.average(precisions))
print("Recall:", np.average(recalls))