In [2]:
# Group Members: Emily Rude, Ye Sheng, Tiffany Valitis, Leon Cai

# Final Project: Predicting the political affiliations of US Counties

The goal of this project is to predict whether a given county in the U.S. leans Democratic or Republican based on characteristics such as age, educational level and racial breakdown, the types of industries that its residents work in, and median income. For the purposes of this project, a county is classified as Democratic if the Democratic presidental candidate had the highest percentage of votes in 2016, and Republican if the Republican presidental candidate had the highest percentage of votes in 2016. The dataset we are using, which shows the Democratic/Republican voting breakdown for every county in the U.S. and the charactersitics of each of these counties can be found [here](https://public.opendatasoft.com/explore/dataset/usa-2016-presidential-election-by-county/export/?disjunctive.state).


In [3]:
#You may add additional imports
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
import math
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
%matplotlib inline

## Feature Selection

While the given dataset includes a vast amount of information on each county, many of these features can be trimmed because either two or more features are closely correlated to each other and thus not all of those features are needed, or there is little or no correlation between those features and a county's political affiliation. The features selected below are the most indicative of a county's political characteristics. The percincts feature was left out for example because the votes feature was a similar indicator of the county's size, and the weather features were left out for example because they were lowly correlated to political affiliation.

In [4]:
# Read the data from csv file
col_names = []
for i in range(33):
    if i == 0:
        col_names.append('State')
    if i == 1:
        col_names.append('Fips')
    if i == 2:
        col_names.append('County')
    if i == 3:
        col_names.append('Votes')
    if i == 4:
        col_names.append('Republicans 2016')
    if i == 5:
        col_names.append('Democrats 2016')
    if i == 6:
        col_names.append('Republicans 2012')
    if i == 7:
        col_names.append('Republicans 2008')
    if i == 8:
        col_names.append('Democrats 2012')
    if i == 9:
        col_names.append('Democrats 2008')
    if i == 10:
        col_names.append('Less Than High School Diploma')
    if i == 11:
        col_names.append('At Least High School Diploma')
    if i == 12:
        col_names.append('At Least Bachelors Degree')
    if i == 13:
        col_names.append('Graduate Degree')
    if i == 14:
        col_names.append('Median Earnings 2010')
    if i == 15:
        col_names.append('Total Population')
    if i == 16:
        col_names.append('Poverty Rate below federal poverty threshold')
    if i == 17:
        col_names.append('Management professional and related occupations')
    if i == 18:
        col_names.append('Service occupations')
    if i == 19:
        col_names.append('Sales and office occupations')
    if i == 20:
        col_names.append('Farming fishing and forestry occupations')
    if i == 21:
        col_names.append('Construction extraction maintenance and repair occupations')
    if i == 22:
        col_names.append('Production transportation and material moving occupations')
    if i == 23:
        col_names.append('White')
    if i == 24:
        col_names.append('Black')
    if i == 25:
        col_names.append('Hispanic')
    if i == 26:
        col_names.append('Asian')
    if i == 27:
        col_names.append('Amerindian')
    if i == 28:
        col_names.append('Other Race')
    if i == 29:
        col_names.append('Median Age')
    if i == 30:
        col_names.append('Uninsured')
    if i == 31:
        col_names.append('Unemployment')
    if i == 32:
        col_names.append('Violent crime')
        
data = pd.read_csv("2016_election_dataset.csv", names = col_names, index_col=False)
print(data.shape)
data.head(10)

(3114, 33)


Unnamed: 0,State,Fips,County,Votes,Republicans 2016,Democrats 2016,Republicans 2012,Republicans 2008,Democrats 2012,Democrats 2008,...,White,Black,Hispanic,Asian,Amerindian,Other Race,Median Age,Uninsured,Unemployment,Violent crime
0,MN,27017,"Carlton County, Minnesota",18059.0,45.185226,46.846448,35.723584,35.497859,61.775873,62.339422,...,89.5,1.35,1.4,0.55,5.4,1.9,40.1,0.112,0.071,124.41
1,KS,20127,"Morris County, Kansas",2568.0,69.70405,22.819315,69.203747,65.997888,28.02498,31.925378,...,94.2,0.2,3.65,0.55,0.6,0.8,45.9,0.148,0.066,178.58
2,OK,40107,"Okfuskee County, Oklahoma",3933.0,70.963641,23.976608,65.02367,64.103808,34.97633,35.896192,...,64.0,9.05,2.6,0.05,15.95,8.25,40.3,0.246,0.072,246.62
3,MT,30085,"Roosevelt County, Montana",3502.0,49.171902,42.946887,41.230937,35.468336,56.808279,61.738502,...,36.0,0.05,1.1,0.4,53.55,8.9,32.0,0.264,0.074,314.33
4,NY,36055,"Monroe County, New York",320164.0,40.251559,54.366824,39.95111,40.474828,57.966886,58.180987,...,74.25,14.1,6.6,3.0,0.2,1.8,38.1,0.101,0.08,363.42
5,VA,51117,"Mecklenburg County, Virginia",15177.0,54.576003,43.295777,52.881873,51.829996,45.904358,47.255006,...,58.95,36.95,2.3,0.65,0.1,1.05,45.1,0.187,0.098,193.67
6,GA,13143,"Haralson County, Georgia",11317.0,84.642573,13.024653,81.399383,78.021087,17.241712,20.257727,...,91.5,4.95,1.15,0.35,0.3,1.8,38.0,0.204,0.095,522.03
7,ME,23017,"Oxford County, Maine",31094.0,52.145108,39.145816,40.774983,40.64139,55.506458,56.682464,...,96.55,0.4,0.95,0.5,0.4,1.25,43.6,0.138,0.091,119.42
8,KY,21123,"Larue County, Kentucky",6367.0,75.373017,20.072248,67.852186,67.222402,30.065926,30.964714,...,92.85,3.45,2.15,0.15,0.35,1.1,39.1,0.185,0.075,47.32
9,NC,37021,"Buncombe County, North Carolina",134507.0,41.142097,55.712342,42.836894,42.403974,55.307136,56.315683,...,85.2,6.6,5.1,1.0,0.3,1.8,40.6,0.201,0.075,259.2


## Labeling the Data

Each county is labeled 0 for Republican or 1 for Democrat based on which party had a higher voting percentage in 2016. After these labels are added, the 2016 Democratic and Republican voting percentages are then removed. This is because when we are given a particular county with certain characteristics that we want to predict the political affiliation of, those percentages won't be known.

## Cleaning the Data

For counties that are missing information in any of the features selected (particularly a few lowly populated counties in Alaska that are too small to have information on all of these fields), we have decided to drop these records from the dataset (List wise deletion). Only a few records have to be dropped under this scenario, and we still have more than enough records to build an accurate and coherent model.

In [5]:
#Republican is 0, Democrat is 1
def label_party (row):
    if row['Republicans 2016'] > row['Democrats 2016']:
        return 0
    if row['Republicans 2016'] < row['Democrats 2016']:
        return 1
    if row['Republicans 2016'] == row['Democrats 2016']:
        return None
    
def convertToNumber (s):
    return int.from_bytes(s.encode(), 'little')

# add label
data['Party Label'] = data.apply (lambda row: label_party(row), axis=1)

states = {
        'AK': 1,'AL': 2,'AR': 3,'AS': 4,'AZ': 5,'CA': 6,'CO': 7,'CT': 8,'DC': 9,'DE': 10,'FL': 11,'GA': 12,'GU': 13,'HI': 14,
        'IA': 15,'ID': 16,'IL': 17,'IN': 18,'KS': 19,'KY': 20,'LA': 21,'MA': 22,'MD': 23,'ME': 24,'MI': 25,'MN': 26,'MO': 27,
        'MP': 28,'MS': 29,'MT': 30,'NA': 31,'NC': 32,'ND': 33,'NE': 34,'NH': 35,'NJ': 36,'NM': 37,'NV': 38,'NY': 39,'OH': 40,
        'OK': 41,'OR': 42,'PA': 43,'PR': 44,'RI': 45,'SC': 46,'SD': 47,'TN': 48,'TX': 49,'UT': 50,'VA': 51,'VI': 52, 'VT': 53,
        'WA': 54,'WI': 55,'WV': 56,'WY': 57
}
data['State']=data['State'].map(states)

data = data.drop('County', axis=1)
data = data.drop('Republicans 2016', axis=1)
data = data.drop('Democrats 2016', axis=1)

#just some test, accuracy drops to 89%, so not much really.
# data = data.drop('Republicans 2012', axis=1)
# data = data.drop('Democrats 2012', axis=1)
# data = data.drop('Republicans 2008', axis=1)
# data = data.drop('Democrats 2008', axis=1)

data = data.dropna()

data.head(10)

Unnamed: 0,State,Fips,Votes,Republicans 2012,Republicans 2008,Democrats 2012,Democrats 2008,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors Degree,...,Black,Hispanic,Asian,Amerindian,Other Race,Median Age,Uninsured,Unemployment,Violent crime,Party Label
0,26.0,27017,18059.0,35.723584,35.497859,61.775873,62.339422,9.7,90.3,21.4,...,1.35,1.4,0.55,5.4,1.9,40.1,0.112,0.071,124.41,1.0
1,19.0,20127,2568.0,69.203747,65.997888,28.02498,31.925378,9.9,90.1,16.6,...,0.2,3.65,0.55,0.6,0.8,45.9,0.148,0.066,178.58,0.0
2,41.0,40107,3933.0,65.02367,64.103808,34.97633,35.896192,21.2,78.8,10.9,...,9.05,2.6,0.05,15.95,8.25,40.3,0.246,0.072,246.62,0.0
3,30.0,30085,3502.0,41.230937,35.468336,56.808279,61.738502,10.9,89.1,17.3,...,0.05,1.1,0.4,53.55,8.9,32.0,0.264,0.074,314.33,0.0
4,39.0,36055,320164.0,39.95111,40.474828,57.966886,58.180987,11.6,88.4,34.8,...,14.1,6.6,3.0,0.2,1.8,38.1,0.101,0.08,363.42,1.0
5,51.0,51117,15177.0,52.881873,51.829996,45.904358,47.255006,24.8,75.2,13.4,...,36.95,2.3,0.65,0.1,1.05,45.1,0.187,0.098,193.67,0.0
6,12.0,13143,11317.0,81.399383,78.021087,17.241712,20.257727,30.4,69.6,11.0,...,4.95,1.15,0.35,0.3,1.8,38.0,0.204,0.095,522.03,0.0
7,24.0,23017,31094.0,40.774983,40.64139,55.506458,56.682464,12.5,87.5,18.5,...,0.4,0.95,0.5,0.4,1.25,43.6,0.138,0.091,119.42,0.0
8,20.0,21123,6367.0,67.852186,67.222402,30.065926,30.964714,23.2,76.8,12.1,...,3.45,2.15,0.15,0.35,1.1,39.1,0.185,0.075,47.32,0.0
9,32.0,37021,134507.0,42.836894,42.403974,55.307136,56.315683,12.8,87.2,31.2,...,6.6,5.1,1.0,0.3,1.8,40.6,0.201,0.075,259.2,1.0


In [6]:
labels = data['Party Label']
features = data.drop(['Party Label'], axis=1)

print("Features shape: " + str(features.shape))
print("Labels shape: " + str(labels.shape))
features.head()

Features shape: (2944, 30)
Labels shape: (2944,)


Unnamed: 0,State,Fips,Votes,Republicans 2012,Republicans 2008,Democrats 2012,Democrats 2008,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors Degree,...,White,Black,Hispanic,Asian,Amerindian,Other Race,Median Age,Uninsured,Unemployment,Violent crime
0,26.0,27017,18059.0,35.723584,35.497859,61.775873,62.339422,9.7,90.3,21.4,...,89.5,1.35,1.4,0.55,5.4,1.9,40.1,0.112,0.071,124.41
1,19.0,20127,2568.0,69.203747,65.997888,28.02498,31.925378,9.9,90.1,16.6,...,94.2,0.2,3.65,0.55,0.6,0.8,45.9,0.148,0.066,178.58
2,41.0,40107,3933.0,65.02367,64.103808,34.97633,35.896192,21.2,78.8,10.9,...,64.0,9.05,2.6,0.05,15.95,8.25,40.3,0.246,0.072,246.62
3,30.0,30085,3502.0,41.230937,35.468336,56.808279,61.738502,10.9,89.1,17.3,...,36.0,0.05,1.1,0.4,53.55,8.9,32.0,0.264,0.074,314.33
4,39.0,36055,320164.0,39.95111,40.474828,57.966886,58.180987,11.6,88.4,34.8,...,74.25,14.1,6.6,3.0,0.2,1.8,38.1,0.101,0.08,363.42


## Classifying the Data

In attempting to build the best model possible to classify our records, we will first test several different classification algorithms on our data and assess the performance of each of them. We will assess each of these algorithms on how accurate they are, and how well they handle class imbalance, and display the results of each of those algorithms. Below, we discuss these two meaures in more detail.

### 1) Accuracy

Each algorithm will be tested on the data using K-Fold Cross Validation with 10 folds. In the first iteration, the first tenth of the data set will serve as the testing data and have the labels removed. The rest of the dataset will serve as the training data which will be used to build the algorithm. Once the algorithm is built, the testing records will be fed into the algorithm and classified. The alrogithm's classification of those testing records will be compared to the actual labels of those testing records to determine the accuracy of that iteration. In the second iteration, the second tenth of the data set will serve as the testing data while the rest serves as the training data. And so forth until all 10 folds are completed. The accuracy of that algorithm will be the average accuracy among those 10 folds.

### 2) Class Imbalance

Our dataset contains significantly more Republican than Democratic records given that the Democratic population tends to be more concentrated in urban areas. If our model were to be making mostly Republican predictions and have thus a high accuracy rate, we would want to know if this is because the model can properly distinguish what a Republican county looks like or the model is simply giving Republican labels most of the time without making that distinction. Creating a confusion matrix would be a more effective way to analyze the performance of a model instead of simplying looking at it's accuracy rate.

In the matrix, the top left value is the number of True Positive Classifications (number of records classified Republican that were actually Republican. The Bottom left is the number of False Positive Classifications (number of records classified Republican that were actually Democratic). The top right is the number of False Negative Classifications (number of records classified Democratic that were actually Republican). And the bottom right is the nuumber of True Negative Classifications (number of records classified Democratic that were actually Democratic).

Thus, in this scenario, we are particulary interested in achieveing a low rate of False Positive Classifications to indicate that the algorithm isn't making mostly Republican predictions simply because the majority class is Republican.

Another way to analyze the algorithms performance under this Class Imbalance is calculating Precision and Recall. Precision is the probability that given a Repbulican classification, how likely is it to be correct. And Recall is the probability that given a Republican record, will it be classified as such. In this scenario, we are particularly interested in a high recall rate. The F-measure is also calculated which is a combination of both the precision and recall metric.

### Naive Bayes Algorithm

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
clf = GaussianNB()
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of Naive Bayes Classifier:", nested_score.mean()*100)

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of Naive Bayes Classifier: 94.29436181252161
[[2411   66]
 [ 102  365]]

              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97      2477
         1.0       0.85      0.78      0.81       467

    accuracy                           0.94      2944
   macro avg       0.90      0.88      0.89      2944
weighted avg       0.94      0.94      0.94      2944



### Decision Tree Algorithm

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of Decision Tree Classifier:", nested_score.mean()*100)

# measuring class imbalance
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of Decision Tree Classifier: 96.67220108382337
[[2433   44]
 [  53  414]]

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2477
         1.0       0.90      0.89      0.90       467

    accuracy                           0.97      2944
   macro avg       0.94      0.93      0.94      2944
weighted avg       0.97      0.97      0.97      2944



## K-Nearest Neighbor Algorithm

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = KNeighborsClassifier(n_neighbors=7)
pipeline = Pipeline(steps = [('standard_scaler', scaler), ('pca', pca), ('knn', knn)])
param_grid = {
    'pca__n_components': list(range(5, 20)),
    'knn__n_neighbors': list(range(1, 26))
}
grid = sk.model_selection.GridSearchCV(pipeline, param_grid, cv = 5)
grid.fit(features, labels)

k_val = grid.best_params_['knn__n_neighbors']
clf = sk.neighbors.KNeighborsClassifier(n_neighbors = k_val)
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of KNN Classifier:", nested_score.mean()*100)

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of KNN Classifier: 86.82013144240747
[[2404   73]
 [ 315  152]]

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.93      2477
         1.0       0.68      0.33      0.44       467

    accuracy                           0.87      2944
   macro avg       0.78      0.65      0.68      2944
weighted avg       0.85      0.87      0.85      2944



## SVM Algorithm

In [10]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.20)


#THIS SHOULD ALREADY BE DONE   
# data_Y = data['Party Label']
# data_X = data.drop(['Party Label'],axis=1)
# x_train, x_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.20)


pca = PCA()
scaler = StandardScaler()
svc = SVC(gamma='auto')

pipeline_svm = Pipeline([('scaler', scaler), ('pca', pca), ('svc', svc)])
param_grid = {
    'svc__kernel': ['linear', 'rbf', 'poly']
}

grid_search = GridSearchCV(pipeline_svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(grid_search, features, labels, cv=5)
y_preds = cross_val_predict(pipeline_svm, features_test, labels_test, cv=10) 
print("Accuracy: ", nested_score.mean()*100, '\n')

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy:  98.1657484725639 

[[2404   73]
 [ 315  152]]

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.93      2477
         1.0       0.68      0.33      0.44       467

    accuracy                           0.87      2944
   macro avg       0.78      0.65      0.68      2944
weighted avg       0.85      0.87      0.85      2944



## Neural Net Algorithm

In [11]:
from sklearn.neural_network import MLPClassifier

mlp_class = MLPClassifier()
pipeline_nn = Pipeline([('scaler', scaler), ('mlp_classifier', mlp_class)])

param_grid = {
    'mlp_classifier__hidden_layer_sizes': list(range(30, 60, 10)),
    'mlp_classifier__activation': ['logistic', 'tanh', 'relu']
}

grid_search = GridSearchCV(pipeline_nn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(grid_search, features, labels, cv=5)
print("Accuracy: ", nested_score.mean()*100, '\n')

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy:  98.23360243927792 

[[2404   73]
 [ 315  152]]

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.93      2477
         1.0       0.68      0.33      0.44       467

    accuracy                           0.87      2944
   macro avg       0.78      0.65      0.68      2944
weighted avg       0.85      0.87      0.85      2944



## Feature Engineering

Feature Engineering is the process of creating new features in our dataset from a combination of exisiting features. Feature Engineering may improve the performance of the model by creating a feature that is more inditicative of political affiliation than exisiting features, reducing the total number of features that the algorithm has to work with, or a combination of both of these factors.

One feature we added was the percentage of residents in Professional industries by adding the percentage of residents in both management and sales occuptions. Residents working in both of these types of professions tend to be of similar economic status. We then deleted the two individual columns in exchange for this new addition, reducing the total number of features used by the algorithm.

In [12]:
def voter_turnout (row):
    return (row['Votes'] / row['Total Population']) * 100

def minority(row):
    return (row['Black'] + row['Hispanic'] + row['Asian'] + row['Amerindian'] + row['Other Race'])

def professional(row):
    return (row['Management professional and related occupations'] + row['Sales and office occupations'])
    
def manufacturing(row):
    return (row['Farming fishing and forestry occupations'] + row['Construction extraction maintenance and repair occupations'] + row['Production transportation and material moving occupations'])
    
data1 = data.pop('Party Label')    
    
data['Voter Turnout'] = data.apply (lambda row: voter_turnout(row), axis=1)
data['Minority'] = data.apply (lambda row: minority(row), axis=1)
data['Professional'] = data.apply (lambda row: professional(row), axis=1)
data['Manufacturing'] = data.apply (lambda row: manufacturing(row), axis=1)

data = data.drop('Black', axis=1)
data = data.drop('Hispanic', axis=1)
data = data.drop('Asian', axis=1)
data = data.drop('Amerindian', axis=1)
data = data.drop('Other Race', axis=1)
data = data.drop('Management professional and related occupations', axis=1)
data = data.drop('Sales and office occupations', axis=1)
data = data.drop('Farming fishing and forestry occupations', axis=1)
data = data.drop('Construction extraction maintenance and repair occupations', axis=1)
data = data.drop('Production transportation and material moving occupations', axis=1)

data['Party Label'] = data1

data.head(10)

Unnamed: 0,State,Fips,Votes,Republicans 2012,Republicans 2008,Democrats 2012,Democrats 2008,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors Degree,...,White,Median Age,Uninsured,Unemployment,Violent crime,Voter Turnout,Minority,Professional,Manufacturing,Party Label
0,26.0,27017,18059.0,35.723584,35.497859,61.775873,62.339422,9.7,90.3,21.4,...,89.5,40.1,0.112,0.071,124.41,52.148426,10.6,53.4,26.0,1.0
1,19.0,20127,2568.0,69.203747,65.997888,28.02498,31.925378,9.9,90.1,16.6,...,94.2,45.9,0.148,0.066,178.58,43.20323,5.8,49.35,37.75,0.0
2,41.0,40107,3933.0,65.02367,64.103808,34.97633,35.896192,21.2,78.8,10.9,...,64.0,40.3,0.246,0.072,246.62,33.724919,35.9,50.3,28.55,0.0
3,30.0,30085,3502.0,41.230937,35.468336,56.808279,61.738502,10.9,89.1,17.3,...,36.0,32.0,0.264,0.074,314.33,33.907823,64.0,59.5,18.75,0.0
4,39.0,36055,320164.0,39.95111,40.474828,57.966886,58.180987,11.6,88.4,34.8,...,74.25,38.1,0.101,0.08,363.42,43.383714,25.7,66.5,17.35,1.0
5,51.0,51117,15177.0,52.881873,51.829996,45.904358,47.255006,24.8,75.2,13.4,...,58.95,45.1,0.187,0.098,193.67,46.904843,41.05,52.45,30.75,0.0
6,12.0,13143,11317.0,81.399383,78.021087,17.241712,20.257727,30.4,69.6,11.0,...,91.5,38.0,0.204,0.095,522.03,39.474694,8.55,49.5,35.3,0.0
7,24.0,23017,31094.0,40.774983,40.64139,55.506458,56.682464,12.5,87.5,18.5,...,96.55,43.6,0.138,0.091,119.42,54.433416,3.5,47.2,32.8,0.0
8,20.0,21123,6367.0,67.852186,67.222402,30.065926,30.964714,23.2,76.8,12.1,...,92.85,39.1,0.185,0.075,47.32,45.851937,7.2,45.35,40.8,0.0
9,32.0,37021,134507.0,42.836894,42.403974,55.307136,56.315683,12.8,87.2,31.2,...,85.2,40.6,0.201,0.075,259.2,57.953924,14.8,59.75,21.75,1.0


In [13]:
labels = data['Party Label']
features = data.drop(['Party Label'], axis=1)

print("Features shape: " + str(features.shape))
print("Labels shape: " + str(labels.shape))
features.head()

Features shape: (2944, 24)
Labels shape: (2944,)


Unnamed: 0,State,Fips,Votes,Republicans 2012,Republicans 2008,Democrats 2012,Democrats 2008,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors Degree,...,Service occupations,White,Median Age,Uninsured,Unemployment,Violent crime,Voter Turnout,Minority,Professional,Manufacturing
0,26.0,27017,18059.0,35.723584,35.497859,61.775873,62.339422,9.7,90.3,21.4,...,20.55,89.5,40.1,0.112,0.071,124.41,52.148426,10.6,53.4,26.0
1,19.0,20127,2568.0,69.203747,65.997888,28.02498,31.925378,9.9,90.1,16.6,...,12.95,94.2,45.9,0.148,0.066,178.58,43.20323,5.8,49.35,37.75
2,41.0,40107,3933.0,65.02367,64.103808,34.97633,35.896192,21.2,78.8,10.9,...,21.15,64.0,40.3,0.246,0.072,246.62,33.724919,35.9,50.3,28.55
3,30.0,30085,3502.0,41.230937,35.468336,56.808279,61.738502,10.9,89.1,17.3,...,21.75,36.0,32.0,0.264,0.074,314.33,33.907823,64.0,59.5,18.75
4,39.0,36055,320164.0,39.95111,40.474828,57.966886,58.180987,11.6,88.4,34.8,...,16.15,74.25,38.1,0.101,0.08,363.42,43.383714,25.7,66.5,17.35


## Optimizing Classification Algorithms

We will now run our classification algorithms again with the new features in our dataset to assess to what degree adding or removing certain attributes will affect the accuracy of classifying our data. We will thus display the accuracy and class imblance for each algorithm under these new conditions. Furthermore, there are potential ways that each individual classification algorithm can be optimized based on their unique characteristics and we will explore that as well.

### Naive Bayes Algorithm

In [14]:
clf = GaussianNB()
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of Naive Bayes Classifier:", nested_score.mean()*100)

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of Naive Bayes Classifier: 94.32814481724894
[[2403   74]
 [  93  374]]

              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97      2477
         1.0       0.83      0.80      0.82       467

    accuracy                           0.94      2944
   macro avg       0.90      0.89      0.89      2944
weighted avg       0.94      0.94      0.94      2944



Naive Bayes has the assumption that all the features in the dataset are indepedent. However, some features are depedent on each other, for example, all the percentages of race will add up to 100%. The following will attempt to remove some features that are dependent on each other to see if it gives a different accuracy. As it turned out, the accuracy decreased.

In [15]:
data = data.drop('Fips', axis=1)
data = data.drop('Democrats 2012', axis=1)
data = data.drop('Democrats 2008', axis=1)
data = data.drop('Less Than High School Diploma', axis=1)
data = data.drop('Minority', axis=1)
data = data.drop('Voter Turnout', axis=1)
data = data.drop('Professional', axis=1)

data.head(10)

Unnamed: 0,State,Votes,Republicans 2012,Republicans 2008,At Least High School Diploma,At Least Bachelors Degree,Graduate Degree,Median Earnings 2010,Total Population,Poverty Rate below federal poverty threshold,Service occupations,White,Median Age,Uninsured,Unemployment,Violent crime,Manufacturing,Party Label
0,26.0,18059.0,35.723584,35.497859,90.3,21.4,7.2,30427.26762,34630,10.7,20.55,89.5,40.1,0.112,0.071,124.41,26.0,1.0
1,19.0,2568.0,69.203747,65.997888,90.1,16.6,7.7,25341.94984,5944,11.15,12.95,94.2,45.9,0.148,0.066,178.58,37.75,0.0
2,41.0,3933.0,65.02367,64.103808,78.8,10.9,3.1,22072.27842,11662,24.15,21.15,64.0,40.3,0.246,0.072,246.62,28.55,0.0
3,30.0,3502.0,41.230937,35.468336,89.1,17.3,4.7,27894.54699,10328,23.2,21.75,36.0,32.0,0.264,0.074,314.33,18.75,0.0
4,39.0,320164.0,39.95111,40.474828,88.4,34.8,15.1,29746.92032,737982,13.4,16.15,74.25,38.1,0.101,0.08,363.42,17.35,1.0
5,51.0,15177.0,52.881873,51.829996,75.2,13.4,4.8,22376.19207,32357,17.95,16.7,58.95,45.1,0.187,0.098,193.67,30.75,0.0
6,12.0,11317.0,81.399383,78.021087,69.6,11.0,3.2,25390.72349,28669,19.6,15.2,91.5,38.0,0.204,0.095,522.03,35.3,0.0
7,24.0,31094.0,40.774983,40.64139,87.5,18.5,6.2,24005.21238,57123,13.4,20.1,96.55,43.6,0.138,0.091,119.42,32.8,0.0
8,20.0,6367.0,67.852186,67.222402,76.8,12.1,4.9,22324.71429,13886,14.55,13.9,92.85,39.1,0.185,0.075,47.32,40.8,0.0
9,32.0,134507.0,42.836894,42.403974,87.2,31.2,11.1,25470.16032,232093,14.25,18.45,85.2,40.6,0.201,0.075,259.2,21.75,1.0


In [16]:
labels = data['Party Label']
features = data.drop(['Party Label'], axis=1)
data.head()

print("Features shape: " + str(features.shape))
print("Labels shape: " + str(labels.shape))
features.head()

clf = GaussianNB()
clf.fit(features, labels)

nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy:", nested_score.mean()*100)

predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Features shape: (2944, 17)
Labels shape: (2944,)
Accuracy: 91.84838002997807
[[2400   77]
 [ 163  304]]

              precision    recall  f1-score   support

         0.0       0.94      0.97      0.95      2477
         1.0       0.80      0.65      0.72       467

    accuracy                           0.92      2944
   macro avg       0.87      0.81      0.83      2944
weighted avg       0.91      0.92      0.92      2944



### Decision Tree Algorithm

In [17]:
# TODO Leon: Run Decision Tree Accuracy & Classification Report with new features

TODO Leon: Discuss problem with overfitting and using different tree sizes

In [18]:
# TODO Leon: Run Decision Tree Accuracy & Classification Report with some different max tree sizes

### KNN Algorithm

In [19]:
# TODO Tiffany: Run KNN Accuracy & Classification Report with new features
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = KNeighborsClassifier(n_neighbors=7)
pipeline = Pipeline(steps = [('standard_scaler', scaler), ('pca', pca), ('knn', knn)])
param_grid = {
    'pca__n_components': list(range(1, 15)),
    'knn__n_neighbors': list(range(1, 26))
}
grid = sk.model_selection.GridSearchCV(pipeline, param_grid, cv = 5)
grid.fit(features, labels)

k_val = grid.best_params_['knn__n_neighbors']
clf = sk.neighbors.KNeighborsClassifier(n_neighbors = k_val)
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of KNN Classifier:", nested_score.mean()*100)

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of KNN Classifier: 86.4470194857604
[[2409   68]
 [ 331  136]]

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92      2477
         1.0       0.67      0.29      0.41       467

    accuracy                           0.86      2944
   macro avg       0.77      0.63      0.66      2944
weighted avg       0.85      0.86      0.84      2944



The K-Nearest Neighbor algorithm is affected by class imbalances - if there is an uneven ratio of one class label to another, it may predict that the majority class is more likely, simply because it has more records. To combat this, we will weight points by the inverse of their distance. In this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. As it so happens, this caused the accuracy to decrease.

In [20]:
# TODO Tiffany: Run KNN Accuracy & Classification Report with weighted votes
scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')
pipeline = Pipeline(steps = [('standard_scaler', scaler), ('pca', pca), ('knn', knn)])
param_grid = {
    'pca__n_components': list(range(1, 15)),
    'knn__n_neighbors': list(range(1, 26))
}
grid = sk.model_selection.GridSearchCV(pipeline, param_grid, cv = 5)
grid.fit(features, labels)

k_val = grid.best_params_['knn__n_neighbors']
clf = sk.neighbors.KNeighborsClassifier(n_neighbors = k_val, weights='distance')
clf.fit(features, labels)

# measuring accuracy
nested_score = cross_val_score(clf, features, labels, cv=10)
print("Accuracy of Weighted KNN Classifier:", nested_score.mean()*100)

# measuring class imblanace
predicted_score = cross_val_predict(clf, features, labels, cv = 10)
print(confusion_matrix(labels, predicted_score))
print()
print(classification_report(labels, predicted_score))

Accuracy of Weighted KNN Classifier: 86.07263922518159
[[2400   77]
 [ 333  134]]

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92      2477
         1.0       0.64      0.29      0.40       467

    accuracy                           0.86      2944
   macro avg       0.76      0.63      0.66      2944
weighted avg       0.84      0.86      0.84      2944



### SVM Algorithm

In [21]:
# TODO Emily: Run SVM Accuracy & Classification Report with new features

TODO Emily: Discuss problem with selecting right kernel

In [22]:
# TODO Emily: Run SVM Accuracy & Classification Report with different kernel

### Neural Net Algorithm

In [23]:
# TODO Emily: Run NN Accuracy & Classification Report with new features

TODO Emily or Leon: Discuss problem with training it correctly and getting stuck at local minima

In [24]:
# TODO Emily or Leon: Run NN Accuracy & Classification with different Hidden Layer Values

### Ensembling

TODO Leon: Write an explanation

### Homogenous Random Forest Ensemble With Decision Trees: 

In [28]:
# your code goes here
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps = [('rfc', RandomForestClassifier())])
param_grid = {
    'rfc__max_depth': list(range(35, 55)),
    'rfc__min_samples_leaf': [8,10,12],
    'rfc__max_features': ["sqrt", "log2"]
    
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(features, labels)
nested_score = cross_val_score(grid_search, features, labels, cv = 5)
print("Accuracy:", nested_score.mean()*100)

KeyboardInterrupt: 

### Heterogeneous Stacked Ensemble Using Decision Trees, Naive Bayes, and SVM:

In [30]:
from sklearn.ensemble import StackingClassifier

# creating the different classifiers
clf_1 = DecisionTreeClassifier(criterion='entropy')
clf_2 = GaussianNB()
clf_3 = SVC(gamma='auto')

estimators = [('dt', clf_1), ('nb', clf_2), ('svm', clf_3)]


clf = StackingClassifier(estimators=estimators)
acc = clf.fit(features_train, labels_train).score(features_test, labels_test)
print("Accuracy:", acc*100)

Accuracy: 96.26485568760611


In [None]:
### TODO Everyone else: add in your ensembling algorithms

## ROC Curve

A ROC Curve illustrates the alrogithm's trade off between its True Positive Rate and True Negative Rate. The higher percentage of the graph area that is under the curve (AUC or Area Under the Curve), the more accurate the algorithm is. An AUC value of 50% indicates that the algorithm is about as good as random guessing, and the algorithm clearly performs well beyond that.

In [None]:
# your code goes here
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.20)

clf = GaussianNB()
clf.fit(features_train, labels_train)

proba = clf.predict_proba(features_test)
fpr_result, tpr_result, thresholds = roc_curve(labels_test, proba[:,1])
#replace these fpr and tpr with the results of your roc_curve
fpr, tpr = fpr_result, tpr_result

print(roc_auc_score(labels_test, proba[:,1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()

In [None]:
# your code goes here
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.20)

clf = GaussianNB()
clf.fit(features_train, labels_train)

proba = clf.predict_proba(features_test)
fpr_result, tpr_result, thresholds = roc_curve(labels_test, proba[:,1])
#replace these fpr and tpr with the results of your roc_curve
fpr, tpr = fpr_result, tpr_result

print(roc_auc_score(labels_test, proba[:,1]))

# Do not change this code! This plots the ROC curve.
# Just replace the fpr and tpr above with the values from your roc_curve
plt.plot([0,1],[0,1],'k--') #plot the diagonal line
plt.plot(fpr, tpr, label='NB') #plot the ROC curve
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.show()