In [5]:
######### FILTER PACKAGE WARNINGS ########

import warnings  # Turn Off Frequent Warnings from Packages
warnings.filterwarnings("ignore", category=DeprecationWarning)


###### PYTHON VERSION COMPATIBILITY ######

from __future__ import print_function  # print() Function in Python 2

try:
    input = raw_input  # 'input' Function in Python 2 & 3
except NameError:
    pass


########## MACHINE LEARNING APIs ##########

# SCIKIT-LEARN
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2, f_classif, f_regression, RFECV, SelectKBest
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import Imputer, LabelEncoder



############ DATA & MANIPULATION ##########

import numpy as np
from random import setstate, shuffle

# Turn Off Deprecation & Future Warnings from Pandas
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    warnings.filterwarnings("ignore",category=FutureWarning)
    import pandas as pd

# Display All Rows & Columns in DataFrame
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None)

# Pandas for Geographical Data
import geopandas
from shapely.geometry import Point

################ PLOTTING ################

# Opening & Manipulating Images
import PIL.Image
from scipy.ndimage.interpolation import rotate

# SEABORN
import seaborn as sns

# MATPLOTLIB
import matplotlib
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from matplotlib.colorbar import ColorbarBase
from matplotlib.ticker import NullFormatter
from matplotlib.patches import Ellipse, Circle
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from IPython.display import HTML, Image
%matplotlib inline

# equivalent to rcParams['animation.html'] = 'html5'
rc('animation', html='html5')

font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 22}

matplotlib.rc('font', **font)
#import matplotlib.patches as patches


############# FILE MANAGEMENT #############

import errno
import io
import os
import urllib.request

#### Define file name & location.

In [6]:
directory    = 'data'
dict_file    = 'county_facts_dictionary.csv'
county_file  = 'county_facts.csv'
results_file = '2016_US_County_Level_Presidential_Results.csv'

#### Load data into a pandas dataframe.

In [7]:
dict_data    = pd.read_csv(os.path.join(directory, dict_file),    sep=',')
county_data  = pd.read_csv(os.path.join(directory, county_file),  sep=',')
results_data = pd.read_csv(os.path.join(directory, results_file), sep=',')

#### Available Features

In [8]:
dict_data

Unnamed: 0,column_name,description
0,PST045214,"Population, 2014 estimate"
1,PST040210,"Population, 2010 (April 1) estimates base"
2,PST120214,"Population, percent change - April 1, 2010 to ..."
3,POP010210,"Population, 2010"
4,AGE135214,"Persons under 5 years, percent, 2014"
5,AGE295214,"Persons under 18 years, percent, 2014"
6,AGE775214,"Persons 65 years and over, percent, 2014"
7,SEX255214,"Female persons, percent, 2014"
8,RHI125214,"White alone, percent, 2014"
9,RHI225214,"Black or African American alone, percent, 2014"


#### Preview the Population Demographics Data

In [9]:
county_data.head(10)

Unnamed: 0,fips,area_name,state_abbreviation,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,SEX255214,RHI125214,RHI225214,RHI325214,RHI425214,RHI525214,RHI625214,RHI725214,RHI825214,POP715213,POP645213,POP815213,EDU635213,EDU685213,VET605213,LFE305213,HSG010214,HSG445213,HSG096213,HSG495213,HSD410213,HSD310213,INC910213,INC110213,PVY020213,BZA010213,BZA110213,BZA115213,NES010213,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210
0,0,United States,,318857056,308758105,3.3,308745538,6.2,23.1,14.5,50.8,77.4,13.2,1.2,5.4,0.2,2.5,17.4,62.1,84.9,12.9,20.7,86.0,28.8,21263779,25.5,133957180,64.9,26.0,176700,115610216,2.63,28155,53046,15.4,7488353,118266253,2.0,23005620,27092908,7.1,0.9,5.7,0.1,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,1046363,3531905.43,87.4
1,1000,Alabama,,4849377,4780127,1.4,4779736,6.1,22.8,15.3,51.5,69.7,26.7,0.7,1.3,0.1,1.5,4.1,66.2,85.0,3.5,5.2,83.1,22.6,388865,24.2,2207912,69.7,15.9,122500,1838683,2.55,23680,43253,18.6,97578,1603100,1.1,311578,382350,14.8,0.8,1.8,0.1,1.2,28.1,112858843,52252752,57344851,12364,6426342,13369,50645.33,94.4
2,1001,Autauga County,AL,55395,54571,1.5,54571,6.0,25.2,13.8,51.4,77.9,18.7,0.5,1.1,0.1,1.8,2.7,75.6,85.0,1.6,3.5,85.6,20.9,5922,26.2,22751,76.8,8.3,136200,20071,2.71,24571,53682,12.1,817,10120,2.1,2947,4067,15.2,0.0,1.3,0.0,0.7,31.7,0,0,598175,12003,88157,131,594.44,91.8
3,1003,Baldwin County,AL,200111,182265,9.8,182265,5.6,22.2,18.7,51.2,87.1,9.6,0.7,0.9,0.1,1.6,4.6,83.0,82.1,3.6,5.5,89.1,27.7,19346,25.9,107374,72.6,24.4,168600,73283,2.52,26766,50221,13.9,4871,54988,3.7,16508,19035,2.7,0.4,1.0,0.0,1.3,27.3,1410273,0,2966489,17166,436955,1384,1589.78,114.6
4,1005,Barbour County,AL,26887,27457,-2.1,27457,5.7,21.2,16.5,46.6,50.2,47.6,0.6,0.5,0.2,0.9,4.5,46.6,84.8,2.9,5.0,73.7,13.4,2120,24.6,11799,67.7,10.6,89200,9200,2.66,16829,32911,26.7,464,6611,-5.6,1546,1667,0.0,0.0,0.0,0.0,0.0,27.0,0,0,188337,6334,0,8,884.88,31.0
5,1007,Bibb County,AL,22506,22919,-1.8,22915,5.3,21.0,14.8,45.9,76.3,22.1,0.4,0.2,0.1,0.9,2.1,74.5,86.6,1.2,2.1,77.5,12.1,1327,27.6,8978,79.0,7.3,90500,7091,3.03,17427,36447,18.1,275,3145,7.5,1126,1385,14.9,0.0,0.0,0.0,0.0,0.0,0,0,124707,5804,10757,19,622.58,36.8
6,1009,Blount County,AL,57719,57322,0.7,57322,6.1,23.6,17.0,50.5,96.0,1.8,0.6,0.3,0.1,1.2,8.7,87.8,88.7,4.3,7.3,77.0,12.1,4540,33.9,23826,81.0,4.5,117100,21108,2.7,20730,44145,15.8,660,6798,3.4,3563,4458,0.0,0.0,0.0,0.0,0.0,23.2,341544,0,319700,5622,20941,3,644.78,88.9
7,1011,Bullock County,AL,10764,10915,-1.4,10914,6.3,21.4,14.9,45.3,26.9,70.1,0.8,0.3,0.7,1.1,7.5,22.1,84.7,5.4,5.2,67.8,12.5,636,26.9,4461,74.3,8.7,70600,3741,2.73,18628,32033,21.6,112,0,0.0,470,417,0.0,0.0,0.0,0.0,0.0,38.8,0,0,43810,3995,3670,1,622.81,17.5
8,1013,Butler County,AL,20296,20946,-3.1,20947,6.1,23.6,18.0,53.6,53.9,44.0,0.4,0.9,0.0,0.8,1.2,53.1,94.6,0.8,1.7,76.3,14.0,1497,24.0,9916,70.3,13.3,74700,8235,2.47,17403,29918,28.4,393,5711,2.7,1095,1769,0.0,0.0,3.3,0.0,0.0,0.0,399132,56712,229277,11326,28427,2,776.83,27.0
9,1015,Calhoun County,AL,115916,118586,-2.3,118572,5.7,22.2,16.0,51.8,75.8,21.1,0.5,0.9,0.1,1.7,3.5,72.9,83.6,2.4,4.5,78.6,16.1,11385,22.5,53289,68.7,13.8,100600,45196,2.54,20828,39962,21.9,2311,34871,0.6,6352,8713,7.2,0.0,1.6,0.0,0.5,24.7,2679991,0,1542981,13678,186533,114,605.87,195.7


#### Preview the Election Results Data

In [10]:
results_data.head(10)

Unnamed: 0.1,Unnamed: 0,votes_dem,votes_gop,total_votes,per_dem,per_gop,diff,per_point_diff,state_abbr,county_name,combined_fips
0,0,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2013
1,1,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2016
2,2,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2020
3,3,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2050
4,4,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2060
5,5,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2068
6,6,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2070
7,7,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2090
8,8,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2100
9,9,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2105


In [25]:
results_data['candidate'] = 'Trump'
mask = results_data['per_dem'] > results_data['per_gop']
results_data.loc[mask, 'candidate']  = 'Clinton'

results_data['vote'] = 0
results_data.loc[mask, 'vote']  = 1

In [11]:
results_data.head()

Unnamed: 0.1,Unnamed: 0,votes_dem,votes_gop,total_votes,per_dem,per_gop,diff,per_point_diff,state_abbr,county_name,combined_fips
0,0,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2013
1,1,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2016
2,2,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2020
3,3,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2050
4,4,93003.0,130413.0,246588.0,0.377159,0.52887,37410,15.17%,AK,Alaska,2060


#### Join Population Demographics & Election Results Dataframes on FIPS

In [12]:
results = results_data.drop(["Unnamed: 0", "state_abbr", "county_name"], axis=1)
results = county_data.set_index('fips').join(results.set_index('combined_fips'))

In [13]:
results.head(100)

Unnamed: 0_level_0,area_name,state_abbreviation,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,SEX255214,RHI125214,RHI225214,RHI325214,RHI425214,RHI525214,RHI625214,RHI725214,RHI825214,POP715213,POP645213,POP815213,EDU635213,EDU685213,VET605213,LFE305213,HSG010214,HSG445213,HSG096213,HSG495213,HSD410213,HSD310213,INC910213,INC110213,PVY020213,BZA010213,BZA110213,BZA115213,NES010213,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210,votes_dem,votes_gop,total_votes,per_dem,per_gop,diff,per_point_diff
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
0,United States,,318857056,308758105,3.3,308745538,6.2,23.1,14.5,50.8,77.4,13.2,1.2,5.4,0.2,2.5,17.4,62.1,84.9,12.9,20.7,86.0,28.8,21263779,25.5,133957180,64.9,26.0,176700,115610216,2.63,28155,53046,15.4,7488353,118266253,2.0,23005620,27092908,7.1,0.9,5.7,0.1,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,1046363,3531905.43,87.4,,,,,,,
1000,Alabama,,4849377,4780127,1.4,4779736,6.1,22.8,15.3,51.5,69.7,26.7,0.7,1.3,0.1,1.5,4.1,66.2,85.0,3.5,5.2,83.1,22.6,388865,24.2,2207912,69.7,15.9,122500,1838683,2.55,23680,43253,18.6,97578,1603100,1.1,311578,382350,14.8,0.8,1.8,0.1,1.2,28.1,112858843,52252752,57344851,12364,6426342,13369,50645.33,94.4,,,,,,,
1001,Autauga County,AL,55395,54571,1.5,54571,6.0,25.2,13.8,51.4,77.9,18.7,0.5,1.1,0.1,1.8,2.7,75.6,85.0,1.6,3.5,85.6,20.9,5922,26.2,22751,76.8,8.3,136200,20071,2.71,24571,53682,12.1,817,10120,2.1,2947,4067,15.2,0.0,1.3,0.0,0.7,31.7,0,0,598175,12003,88157,131,594.44,91.8,5908.0,18110.0,24661.0,0.239569,0.734358,12202.0,49.48%
1003,Baldwin County,AL,200111,182265,9.8,182265,5.6,22.2,18.7,51.2,87.1,9.6,0.7,0.9,0.1,1.6,4.6,83.0,82.1,3.6,5.5,89.1,27.7,19346,25.9,107374,72.6,24.4,168600,73283,2.52,26766,50221,13.9,4871,54988,3.7,16508,19035,2.7,0.4,1.0,0.0,1.3,27.3,1410273,0,2966489,17166,436955,1384,1589.78,114.6,18409.0,72780.0,94090.0,0.195653,0.773515,54371.0,57.79%
1005,Barbour County,AL,26887,27457,-2.1,27457,5.7,21.2,16.5,46.6,50.2,47.6,0.6,0.5,0.2,0.9,4.5,46.6,84.8,2.9,5.0,73.7,13.4,2120,24.6,11799,67.7,10.6,89200,9200,2.66,16829,32911,26.7,464,6611,-5.6,1546,1667,0.0,0.0,0.0,0.0,0.0,27.0,0,0,188337,6334,0,8,884.88,31.0,4848.0,5431.0,10390.0,0.466603,0.522714,583.0,5.61%
1007,Bibb County,AL,22506,22919,-1.8,22915,5.3,21.0,14.8,45.9,76.3,22.1,0.4,0.2,0.1,0.9,2.1,74.5,86.6,1.2,2.1,77.5,12.1,1327,27.6,8978,79.0,7.3,90500,7091,3.03,17427,36447,18.1,275,3145,7.5,1126,1385,14.9,0.0,0.0,0.0,0.0,0.0,0,0,124707,5804,10757,19,622.58,36.8,1874.0,6733.0,8748.0,0.21422,0.769662,4859.0,55.54%
1009,Blount County,AL,57719,57322,0.7,57322,6.1,23.6,17.0,50.5,96.0,1.8,0.6,0.3,0.1,1.2,8.7,87.8,88.7,4.3,7.3,77.0,12.1,4540,33.9,23826,81.0,4.5,117100,21108,2.7,20730,44145,15.8,660,6798,3.4,3563,4458,0.0,0.0,0.0,0.0,0.0,23.2,341544,0,319700,5622,20941,3,644.78,88.9,2150.0,22808.0,25384.0,0.084699,0.898519,20658.0,81.38%
1011,Bullock County,AL,10764,10915,-1.4,10914,6.3,21.4,14.9,45.3,26.9,70.1,0.8,0.3,0.7,1.1,7.5,22.1,84.7,5.4,5.2,67.8,12.5,636,26.9,4461,74.3,8.7,70600,3741,2.73,18628,32033,21.6,112,0,0.0,470,417,0.0,0.0,0.0,0.0,0.0,38.8,0,0,43810,3995,3670,1,622.81,17.5,3530.0,1139.0,4701.0,0.750904,0.242289,2391.0,50.86%
1013,Butler County,AL,20296,20946,-3.1,20947,6.1,23.6,18.0,53.6,53.9,44.0,0.4,0.9,0.0,0.8,1.2,53.1,94.6,0.8,1.7,76.3,14.0,1497,24.0,9916,70.3,13.3,74700,8235,2.47,17403,29918,28.4,393,5711,2.7,1095,1769,0.0,0.0,3.3,0.0,0.0,0.0,399132,56712,229277,11326,28427,2,776.83,27.0,3716.0,4891.0,8685.0,0.427864,0.563155,1175.0,13.53%
1015,Calhoun County,AL,115916,118586,-2.3,118572,5.7,22.2,16.0,51.8,75.8,21.1,0.5,0.9,0.1,1.7,3.5,72.9,83.6,2.4,4.5,78.6,16.1,11385,22.5,53289,68.7,13.8,100600,45196,2.54,20828,39962,21.9,2311,34871,0.6,6352,8713,7.2,0.0,1.6,0.0,0.5,24.7,2679991,0,1542981,13678,186533,114,605.87,195.7,13197.0,32803.0,47376.0,0.278559,0.692397,19606.0,41.38%


#### Separate County-Level & State-Level Results

In [14]:
county_results = results[results['state_abbreviation'].notnull()]

In [15]:
state_results = results[results['state_abbreviation'].isnull()]

#### Label Encode Categorical Variables

In [16]:
le = LabelEncoder()
le.fit(county_results['state_abbreviation'])
county_results.loc[:,'state_idx'] = le.transform(county_results['state_abbreviation'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Features

In [17]:
all_features = county_data.columns

In [18]:
# Selected Features
sf = all_features[3:].append(pd.Index(['state_idx']))

In [19]:
# Label
label = 'vote'

In [20]:
county_results[sf]

Unnamed: 0_level_0,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,SEX255214,RHI125214,RHI225214,RHI325214,RHI425214,RHI525214,RHI625214,RHI725214,RHI825214,POP715213,POP645213,POP815213,EDU635213,EDU685213,VET605213,LFE305213,HSG010214,HSG445213,HSG096213,HSG495213,HSD410213,HSD310213,INC910213,INC110213,PVY020213,BZA010213,BZA110213,BZA115213,NES010213,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210,state_idx
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
1001,55395,54571,1.5,54571,6.0,25.2,13.8,51.4,77.9,18.7,0.5,1.1,0.1,1.8,2.7,75.6,85.0,1.6,3.5,85.6,20.9,5922,26.2,22751,76.8,8.3,136200,20071,2.71,24571,53682,12.1,817,10120,2.1,2947,4067,15.2,0.0,1.3,0.0,0.7,31.7,0,0,598175,12003,88157,131,594.44,91.8,1
1003,200111,182265,9.8,182265,5.6,22.2,18.7,51.2,87.1,9.6,0.7,0.9,0.1,1.6,4.6,83.0,82.1,3.6,5.5,89.1,27.7,19346,25.9,107374,72.6,24.4,168600,73283,2.52,26766,50221,13.9,4871,54988,3.7,16508,19035,2.7,0.4,1.0,0.0,1.3,27.3,1410273,0,2966489,17166,436955,1384,1589.78,114.6,1
1005,26887,27457,-2.1,27457,5.7,21.2,16.5,46.6,50.2,47.6,0.6,0.5,0.2,0.9,4.5,46.6,84.8,2.9,5.0,73.7,13.4,2120,24.6,11799,67.7,10.6,89200,9200,2.66,16829,32911,26.7,464,6611,-5.6,1546,1667,0.0,0.0,0.0,0.0,0.0,27.0,0,0,188337,6334,0,8,884.88,31.0,1
1007,22506,22919,-1.8,22915,5.3,21.0,14.8,45.9,76.3,22.1,0.4,0.2,0.1,0.9,2.1,74.5,86.6,1.2,2.1,77.5,12.1,1327,27.6,8978,79.0,7.3,90500,7091,3.03,17427,36447,18.1,275,3145,7.5,1126,1385,14.9,0.0,0.0,0.0,0.0,0.0,0,0,124707,5804,10757,19,622.58,36.8,1
1009,57719,57322,0.7,57322,6.1,23.6,17.0,50.5,96.0,1.8,0.6,0.3,0.1,1.2,8.7,87.8,88.7,4.3,7.3,77.0,12.1,4540,33.9,23826,81.0,4.5,117100,21108,2.7,20730,44145,15.8,660,6798,3.4,3563,4458,0.0,0.0,0.0,0.0,0.0,23.2,341544,0,319700,5622,20941,3,644.78,88.9,1
1011,10764,10915,-1.4,10914,6.3,21.4,14.9,45.3,26.9,70.1,0.8,0.3,0.7,1.1,7.5,22.1,84.7,5.4,5.2,67.8,12.5,636,26.9,4461,74.3,8.7,70600,3741,2.73,18628,32033,21.6,112,0,0.0,470,417,0.0,0.0,0.0,0.0,0.0,38.8,0,0,43810,3995,3670,1,622.81,17.5,1
1013,20296,20946,-3.1,20947,6.1,23.6,18.0,53.6,53.9,44.0,0.4,0.9,0.0,0.8,1.2,53.1,94.6,0.8,1.7,76.3,14.0,1497,24.0,9916,70.3,13.3,74700,8235,2.47,17403,29918,28.4,393,5711,2.7,1095,1769,0.0,0.0,3.3,0.0,0.0,0.0,399132,56712,229277,11326,28427,2,776.83,27.0,1
1015,115916,118586,-2.3,118572,5.7,22.2,16.0,51.8,75.8,21.1,0.5,0.9,0.1,1.7,3.5,72.9,83.6,2.4,4.5,78.6,16.1,11385,22.5,53289,68.7,13.8,100600,45196,2.54,20828,39962,21.9,2311,34871,0.6,6352,8713,7.2,0.0,1.6,0.0,0.5,24.7,2679991,0,1542981,13678,186533,114,605.87,195.7,1
1017,34076,34170,-0.3,34215,5.9,21.4,18.3,52.3,58.3,39.5,0.3,0.8,0.1,1.1,2.0,56.8,85.8,1.1,1.3,75.1,11.8,2691,24.6,16894,67.9,11.1,81200,13722,2.46,19291,32402,24.1,515,6431,-0.2,2354,1981,0.0,0.0,0.0,0.0,0.0,29.3,667283,0,264650,7620,23237,8,596.53,57.4,1
1019,26037,25986,0.2,25989,4.8,20.4,20.9,50.2,93.0,4.6,0.5,0.3,0.0,1.6,1.5,91.6,90.6,0.7,1.1,78.3,12.8,2174,26.9,16241,76.1,4.6,99400,11656,2.2,22030,34907,21.2,379,3864,5.5,1560,2180,0.0,0.0,0.0,0.0,0.0,14.5,307439,62293,186321,7613,13948,2,553.7,46.9,1


In [21]:
sf

Index(['PST045214', 'PST040210', 'PST120214', 'POP010210', 'AGE135214',
       'AGE295214', 'AGE775214', 'SEX255214', 'RHI125214', 'RHI225214',
       'RHI325214', 'RHI425214', 'RHI525214', 'RHI625214', 'RHI725214',
       'RHI825214', 'POP715213', 'POP645213', 'POP815213', 'EDU635213',
       'EDU685213', 'VET605213', 'LFE305213', 'HSG010214', 'HSG445213',
       'HSG096213', 'HSG495213', 'HSD410213', 'HSD310213', 'INC910213',
       'INC110213', 'PVY020213', 'BZA010213', 'BZA110213', 'BZA115213',
       'NES010213', 'SBO001207', 'SBO315207', 'SBO115207', 'SBO215207',
       'SBO515207', 'SBO415207', 'SBO015207', 'MAN450207', 'WTN220207',
       'RTN130207', 'RTN131207', 'AFN120207', 'BPS030214', 'LND110210',
       'POP060210', 'state_idx'],
      dtype='object')

In [23]:
county_results = county_results.reset_index(drop=True)

In [1]:
def plot_feature_scores(feature_labels, fscores, plot_name="feature_scores.pdf"):

    def compute_upper_bound(fmax):
        if fmax == 0:
            return 0
        return 10**(int(round(np.log10(fmax)))-1) + round(fmax, -int(np.floor(np.log10(fmax))))
    
    
    bar_size    = 0.25
    pad_dist    = 3.5
    num_line    = 5
    score_min   = 0
    score_max   = compute_upper_bound(max(fscores))
    score_range = np.arange(0, score_max, score_max/num_line)
    
    
    fig, ax = plt.subplots(figsize=(8, len(fscores) * bar_size))
    plt.margins(y=0)

    rects = ax.barh(range(len(val)), val[::-1], color='blue')

    for rect in rects:
        ax.text(rect.get_width() + pad_dist, rect.get_y() + rect.get_height()/2.,
                    '%.1f' % rect.get_width(),
                    ha='center', va='center', size=12)

    for x in score_range:
        plt.axvline(x=x, color='g', linestyle='--')

    plt.axhline(y=len(val)-int(np.sqrt(len(val)))-0.5, color='r', linestyle='-')

    ax.set_xlim(0,score_max)
    ax.xaxis.set_label_position('top')
    ax.set_xticklabels(score_range, fontsize=12)
    ax.tick_params(labeltop=True, labelbottom=True)

    ax.set_yticks(range(len(val)))
    ax.set_yticklabels(tick_labels[::-1], rotation='horizontal', fontsize=12)

    ax.set_xlabel("Feature Scores", fontsize=18)
    ax.set_ylabel("Features", fontsize=18)

    plt.tight_layout()
    plt.savefig(plot_name, dpi=300)
    plt.show()
    plt.close()

In [2]:
def train_rfc(x_train, y_train, n_estimators=1000, plot_importance=False, plot_name='feature_scores.pdf'):
    
    """
    train_nearest_neighbor: Training the Nearest Neighbor Algorithm with the Training Set
    
    :params: (1) x_train - Independent Variable of the Training Set
             (2) y_train - Dependent Variable of the Training Set
             (3) n_estimators - Number of Decision Trees in Random Forest
   
    :returns: model  - Model Trained on x_train and y_train.
    
    """
    
    # Defining the Nearest Neighbor Algorithm for k-Neighbors
    rfc   = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators)

    # Training the Nearest Neighbor Algorithm with the Training Set
    model = rfc.fit(x_train, y_train)
    
    if plot_importance:
        feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = x_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
    
        plot_feature_scores(feature_importances['importance'].index.values,
                            feature_importances['importance'].values,
                            plot_name)
    

    return model

In [3]:
def test_rfc(model, x_test, y_test=None, plot=True, file='rfc_results.pdf'):
    
    """
    test_nearest_neighbor: Computes the Predicted Values and Loss Function Associated with the k-Nearest Neighbors Algorithm.
    
    :params: (1) model   - Trained Model
             (2) x_test  - Independent Variable of the Test/Validation Set
             (3) y_test  - Dependent (True) Variable of the Test/Validation Set
             (4) plot    - Flag for Plotting Results of the Prediction
    
    :returns: (1) y_pred - Prediction / Values when x_test is applied to the model.
              (2) loss   - Value of Loss Function of y_test (i.e., true value) and the predicted value (2).
    """
        
        
    # Predicting the Redshift Values of the Test or Validation Set
    y_pred = model.predict(x_test)
    
    if y_test is None:
        return y_pred
    
    loss = log_loss(y_test, y_pred)
    
    ac   = accuracy_score(y_test, y_pred)
    print('Accuracy: {}'.format(np.round(ac,3)))
    
    if plot:
    
        cm = confusion_matrix(y_test, model.predict(x_test))
        ax = sns.heatmap(cm, annot=True, fmt="d")
        ax.set(xlabel='Predicted UDGs', ylabel='True UDGs', title='Confusion Matrix')

        figure = ax.get_figure()    
        figure.savefig(file, dpi=400)
        plt.show()
        plt.close()

    return y_pred, loss, ac

In [4]:
def cross_validation_rfc(X, y, n_estimators=1000, plot=True, file='cv_rfc.pdf',  # Data
                        kfolds=10, shuffle=True, state=0, verbose=True):         # Cross-Validation Parameters

    """
    cross_validation: Determines the Optimal Number of Nearest Neighbors & Evaluates the Model Performance
    
    :params: (1) X - 
             (2) y -
             (3) n_estimators -
             (4) plot -
             (5) file -
             (6) kfolds -
             (7) shuffle -
             (8) state - 
             (9) verbose -
    
    :returns: (1) y_true
              (2) y_pred
    """
    
    
    # Defines the Type of Cross-Validation
    kf = KFold(n_splits=kfolds, shuffle=shuffle, random_state=state)
    
    # Stores Model Selection & Model Performance Results
    y_true = pd.DataFrame()
    y_pred = pd.DataFrame()

    # Splits the Dataset into the Training & Validation/Test Sets
    # Cross Validation: Iterates through Each Fold
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X=X, y=y)):

        # Prints Assignment for Training and Validation/Testing Set
        if verbose:
            print("\nFold {0}\n\tTEST: {1}\n".format(fold_idx, test_idx))
            
        # Computes
        model = train_rfc(pd.DataFrame(X, index=train_idx), pd.DataFrame(y, index=train_idx).values.ravel(),
                          n_estimators=n_estimators,
                          plot_importance=False,
                          plot_name='feature_scores'+str(fold_idx)+'.pdf')
        
        pred, loss, ac = test_rfc(model, x_test=pd.DataFrame(X, index=test_idx), y_test=pd.DataFrame(y, index=test_idx),
                                  plot=plot, file='rfc_results_{0}.pdf'.format(fold_idx))
        
        y_true = y_true.append(pd.DataFrame(y, index=test_idx))
        y_pred = y_pred.append(pd.DataFrame(pred))
        
        #show_tree(model, fold_idx)
        
    
    cm = confusion_matrix(y_true, y_pred)
    ax = sns.heatmap(cm, annot=True, fmt="d")
    ax.set(xlabel='Predicted UDGs', ylabel='True UDGs', title='Confusion Matrix')
    figure = ax.get_figure()    
    figure.savefig(file, dpi=400)
    plt.show()
    plt.close()

    print('Overall Accuracy: {}'.format(accuracy_score(y_true, y_pred)))
    
    return y_true, y_pred    

In [26]:
y_true, y_pred = cross_validation_rfc(county_results[sf].dropna(), county_results[label], plot=False, verbose=False)

KeyError: 'vote'

In [None]:
county_results[sf].dtypes

In [None]:
county_results[label].dtypes

In [None]:
np.any(np.isnan(county_results[sf]))

In [None]:
np.all(np.isfinite(county_results[sf]))

In [None]:
np.where(np.isnan(county_results[sf]))