# Introduction
Voter Analysis exploration

## Imports
Import libraries and write settings here.

In [1]:
from fastai.tabular import *
from fastai import *

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 60
pd.options.display.max_rows = 60

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')



# Data Import and Cleaning

In [2]:
#find the data
path = './data/'
df = pd.read_csv('./data/ccesplus.csv',encoding = "ISO-8859-1")

In [3]:
#get rid of spaces and caps in column names (if any)
df.columns = [col.replace(' ','_').lower() for col in df.columns]

In [4]:
# df.head()
# df.columns
# df.describe()

#any duplicate rows?
df.duplicated().sum()

0

In [5]:
#lets see how many columns, how many have no and some missing values
print (f"Total columns={len(df.columns)}")
print(f"Total columns with no Nans= {len(df.columns[~df.isnull().any()])}") 
print(f"Total columns with Nans= {len(df.columns[df.isnull().any()])}") #add ~ to get columns with no missing values

Total columns=866
Total columns with no Nans= 268
Total columns with Nans= 598


The columns with no NaNs (missing values) are the easiest to use since we dont have to impute missing values

## Get a subset of the rows of the data to work on if desired

In [6]:
#get first thousand rows
# df=df[:1000]

## See what the summary columns do
looks like the summaries (..sum) are just the average

In [7]:
columns=['cc.CC16_422c','cc.CC16_422d','cc.CC16_422e','cc.CC16_422f', 'cc.raceviewsum']
columns = [col.lower() for col in columns]
tmp_df=df[columns]
tmp_df.head()

Unnamed: 0,cc.cc16_422c,cc.cc16_422d,cc.cc16_422e,cc.cc16_422f,cc.raceviewsum
0,1.0,2.0,2.0,2.0,1.75
1,1.0,3.0,1.0,2.0,1.75
2,,,,,
3,,,,,
4,1.0,1.0,1.0,1.0,1.0


## Get the columns of interest

In [68]:
#contains all the columns that original logits were run on
#use these to start to compare neural network solution
columns = pd.read_excel('./data/results cces.xlsx',encoding = "ISO-8859-1", skiprows=3)
columns = list(columns.iloc[:,0])

#get the dependant variable(s)
# columns_dep_var= ['cc.TrumpGEVote','cc.TrumpPVote', 'cc.vote12.gop']
columns_dep_var= ['cc.trumpgevote']

#combine
columns.extend(columns_dep_var)

#strip rubbish
columns = [col.replace(' ','_').lower() for col in columns]

## Create dataframe for model training

In [186]:
#select out the columns of interest
df_s=df[columns]

In [187]:
len(df_s.iloc[0])

32

### Output some info

In [11]:
# how many unique values there are per column, 
# use that to guide which columns are categorical and which are continuous
#pick largest value that looks continuous, for instance raceviewsum=17 
#(but see the docs many of theses fields are floats which meand fastai ignores cardinality)
for col in df_s.columns:
    print(f"{col}:{df[col].nunique()}")

cc.blackdum:2
econ.mhi:2646
cc.sex:2
cc.maxeduc.4yr:2
econ.hhpov.p:1030
mort.ucd.despair.disc95.pdpy:2660
cc.union:2
cc.age:80
demo.popdense:2668
cc.emp.nojob:2
rustpc:2437
taa.wrks.disc95.pcpy:2120
cc.faminc:16
crashpc:2626
cc.newsint:5
cc.isimmigrant:2
cc.catholic:2
cc.cc16_305_2:2
job.uer:761
cc.child18:2
cc.evanprot:2
cc.whitedum:2
cc.religiosity:15
cc.cc16_351b:2
cc.cc16_307:4
cc.immviewsum:5
cc.cc16_304:5
cc.inddum:2
cc.ideo7:7
cc.raceviewsum:17
cc.repdum:2
cc.trumpgevote:2


In [12]:
#what types are the columns
df_s.dtypes

cc.blackdum                       int64
econ.mhi                        float64
cc.sex                            int64
cc.maxeduc.4yr                    int64
econ.hhpov.p                    float64
mort.ucd.despair.disc95.pdpy    float64
cc.union                        float64
cc.age                            int64
demo.popdense                   float64
cc.emp.nojob                      int64
rustpc                          float64
taa.wrks.disc95.pcpy            float64
cc.faminc                       float64
crashpc                         float64
cc.newsint                      float64
cc.isimmigrant                    int64
cc.catholic                     float64
cc.cc16_305_2                     int64
job.uer                         float64
cc.child18                      float64
cc.evanprot                     float64
cc.whitedum                       int64
cc.religiosity                  float64
cc.cc16_351b                    float64
cc.cc16_307                     float64


In [13]:
#the percentage of NaNs in each column, note the large percentage of missing values in the bottom columns
#I would guess that cc.faminc and cc.raceviewsum are critical
df_s.isna().sum().sort_values(ascending = True)/len(df)

cc.blackdum                     0.000000
cc.immviewsum                   0.000000
cc.whitedum                     0.000000
cc.cc16_305_2                   0.000000
cc.emp.nojob                    0.000000
cc.age                          0.000000
cc.isimmigrant                  0.000000
cc.sex                          0.000000
cc.maxeduc.4yr                  0.000000
cc.evanprot                     0.000666
cc.newsint                      0.000820
cc.catholic                     0.001130
demo.popdense                   0.001300
econ.hhpov.p                    0.001300
econ.mhi                        0.001300
taa.wrks.disc95.pcpy            0.001300
job.uer                         0.001300
cc.cc16_307                     0.001517
mort.ucd.despair.disc95.pdpy    0.001548
cc.child18                      0.001827
cc.cc16_351b                    0.001950
cc.cc16_304                     0.002059
cc.union                        0.002136
crashpc                         0.002229
rustpc          

In [42]:
#BUMMER! the dependant variable has a lot of missing values (36.7%)
df_s['cc.trumpgevote'].unique()

array([nan,  1.,  0.])

## Remove NaN dependent variable rows
whaddaya going to do?  You have no way of knowing how these people voted

In [188]:
print(f"There are {len(df_s)} rows in df_s")

There are 64600 rows in df_s


In [189]:
#dump the NaN rows
df_s = df_s[pd.notnull(df_s['cc.trumpgevote'] )]

In [190]:
#convert trumpgevote to long (otherwise fit fails)
df_s['cc.trumpgevote'] = df_s['cc.trumpgevote'].astype('int64');

In [191]:
#causes nan for learner
# df_s['cc.trumpgevote'] = df_s['cc.trumpgevote'].astype('category');
# df_s['cc.trumpgevote'].cat.categories

In [192]:
#is the dataset balanced?
df_s['cc.trumpgevote'].value_counts()

0    22136
1    18754
Name: cc.trumpgevote, dtype: int64

## Split out categorical and continuous variables
see if fastai can auto do it

In [193]:
#give fastai a shot at splitting cat and cont variables
res_cont, res_cat = cont_cat_split(df_s,max_card=18)

In [194]:
# ??cont_cat_split

In [195]:
print(res_cont)
print(res_cat)

['econ.mhi', 'econ.hhpov.p', 'mort.ucd.despair.disc95.pdpy', 'cc.union', 'cc.age', 'demo.popdense', 'rustpc', 'taa.wrks.disc95.pcpy', 'cc.faminc', 'crashpc', 'cc.newsint', 'cc.catholic', 'job.uer', 'cc.child18', 'cc.evanprot', 'cc.religiosity', 'cc.cc16_351b', 'cc.cc16_307', 'cc.immviewsum', 'cc.cc16_304', 'cc.inddum', 'cc.ideo7', 'cc.raceviewsum', 'cc.repdum']
['cc.blackdum', 'cc.sex', 'cc.maxeduc.4yr', 'cc.emp.nojob', 'cc.isimmigrant', 'cc.cc16_305_2', 'cc.whitedum', 'cc.trumpgevote']


In [196]:
#remove the dependant variable
res_cont = [x for x in res_cont if x not in columns_dep_var]
res_cat = [x for x in res_cat if x not in columns_dep_var]

In [201]:
df_s.head()
len(df_s)

Unnamed: 0,cc.blackdum,econ.mhi,cc.sex,cc.maxeduc.4yr,econ.hhpov.p,mort.ucd.despair.disc95.pdpy,cc.union,cc.age,demo.popdense,cc.emp.nojob,rustpc,taa.wrks.disc95.pcpy,cc.faminc,crashpc,cc.newsint,cc.isimmigrant,cc.catholic,cc.cc16_305_2,job.uer,cc.child18,cc.evanprot,cc.whitedum,cc.religiosity,cc.cc16_351b,cc.cc16_307,cc.immviewsum,cc.cc16_304,cc.inddum,cc.ideo7,cc.raceviewsum,cc.repdum,cc.trumpgevote,econ.mhi_na,econ.hhpov.p_na,mort.ucd.despair.disc95.pdpy_na,cc.union_na,demo.popdense_na,rustpc_na,taa.wrks.disc95.pcpy_na,cc.faminc_na,crashpc_na,cc.newsint_na,cc.catholic_na,job.uer_na,cc.child18_na,cc.evanprot_na,cc.religiosity_na,cc.cc16_351b_na,cc.cc16_307_na,cc.cc16_304_na,cc.inddum_na,cc.ideo7_na,cc.raceviewsum_na,cc.repdum_na
1,0,53.3485,1,1,10.683333,0.048055,1.0,64,574.943647,0,0.035499,0.000284,6.0,0.154968,1.0,0,0.0,0,8.533333,0.0,1.0,1,16.0,1.0,4.0,0.5,3.0,0.0,7.0,1.75,1.0,1,True,True,True,False,True,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False
4,0,53.3485,0,1,10.683333,0.048055,0.0,75,574.943647,0,0.035499,0.000284,8.0,0.154968,1.0,0,0.0,0,8.533333,0.0,0.0,1,11.0,0.0,4.0,0.0,1.0,0.0,1.0,1.0,0.0,0,True,True,True,False,True,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False
7,0,53.3485,1,1,10.683333,0.048055,1.0,81,574.943647,0,0.035499,0.000284,6.0,0.154968,1.0,0,0.0,0,8.533333,0.0,0.0,1,16.0,1.0,4.0,0.75,4.0,1.0,6.0,2.5,0.0,1,True,True,True,False,True,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False
9,0,53.3485,0,0,10.683333,0.048055,0.0,39,574.943647,0,0.035499,0.000284,10.0,0.154968,1.0,0,0.0,0,8.533333,1.0,0.0,1,7.0,0.0,3.0,0.0,3.0,1.0,4.0,1.75,0.0,0,True,True,True,False,True,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False
11,0,53.3485,1,0,10.683333,0.048055,0.0,66,574.943647,0,0.035499,0.000284,8.0,0.154968,1.0,0,0.0,0,8.533333,0.0,0.0,1,15.0,0.0,3.0,0.75,3.0,1.0,4.0,2.0,0.0,0,True,True,True,False,True,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False


40890

## Categorify and Fill Missing

In [198]:
#from docs https://docs.fast.ai/tabular.transform.html
tfm = Categorify(cat_names=res_cat, cont_names=res_cont)
tfm(df_s)

In [199]:
df_s['cc.blackdum'].cat.categories

Int64Index([0, 1], dtype='int64')

In [200]:
tfm1 = FillMissing(cat_names=res_cat, cont_names=res_cont)
tfm1(df_s)

## Split out train/test

In [202]:
# split out data
from sklearn.model_selection import train_test_split
# train,val_test = train_test_split(df_s, test_size=0.2)
# val, test = train_test_split(val_test, test_size=0.5)
# print(str(len(train)))
# print(str(len(val)))
# print(str(len(test)))   
train,tst = train_test_split(df_s, test_size=0.1)
print(str(len(train)))
# print(str(len(val)))
print(str(len(tst)))

36801
4089


In [203]:
len(df_s)

40890

# RandomForest

In [204]:
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics
m_rf = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)

In [205]:
y=train['cc.trumpgevote'].copy()
trn = train.copy()
trn=trn.drop('cc.trumpgevote',axis=1)

In [182]:
# trn.T
y

32366    0
35897    0
60875    0
1439     1
4023     0
45456    0
46657    1
34215    0
43568    1
32435    1
33860    0
35551    0
28278    1
63451    1
23313    1
34613    1
18853    0
35379    0
56910    0
44611    1
34310    0
18203    1
33443    0
1760     0
38570    0
39855    0
10291    1
14266    1
17810    1
13098    0
        ..
48192    1
7766     0
33516    1
29494    0
19273    0
48979    0
1425     1
51430    1
6018     1
43794    0
39360    0
7251     0
44913    0
45234    1
13141    1
44744    1
24738    0
4992     0
27632    1
14155    1
59687    1
17765    0
9237     0
61998    0
58793    1
42480    0
10205    0
3370     1
39067    1
20582    1
Name: cc.trumpgevote, Length: 36801, dtype: int64

In [206]:
print(f"Total columns with no Nans= {len(trn.columns[~trn.isnull().any()])}") 
print(f"Total columns with Nans= {len(trn.columns[trn.isnull().any()])}") #add ~ to get columns with no missing values

Total columns with no Nans= 53
Total columns with Nans= 0


In [207]:

m_rf.fit(trn, y)

def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [208]:
m_rf.score(trn,y)

0.9609484081009083

In [209]:
print_score(m_rf)

NameError: name 'X_train' is not defined

# DATA CLEANING ABOVE

In [170]:
procs = [FillMissing, Categorify, Normalize]

In [171]:
test_long = TabularList.from_df(tst.copy(),  cat_names=res_cat, cont_names=res_cont,procs=procs)
test_vec = TabularList.from_df(tst.copy(),  cat_names=res_cat, cont_names=res_cont,procs=procs)

len(test_long)
len(test_vec)

4089

4089

In [172]:
#all this to get a random list of validation indexes
from numpy import random

#generate a list of all indexes
i = list(range(len(train)))

#shuffle it
random.shuffle(i)

#get number of indexes corresponding to val percentage
number = int(len(i)*.1)

#select validation indexes
val_idx=i[:number]
max(val_idx)

36796

In [173]:
# data = (TabularList.from_df(train,  cat_names=res_cat, cont_names=res_cont, procs=procs)
#                            .split_by_idx(val_idx)
#                            .label_from_df(cols=columns_dep_var)
#                            .databunch())

data = (TabularList.from_df(train,cat_names=res_cat, cont_names=res_cont,  procs=procs)
                           .split_by_idx(val_idx)
                           .label_from_df(cols=columns_dep_var)
                           .add_test(test_vec)
                           .databunch())

In [174]:
data.show_batch(rows=50)

cc.blackdum,cc.sex,cc.maxeduc.4yr,cc.emp.nojob,cc.isimmigrant,cc.cc16_305_2,cc.whitedum,econ.mhi_na,econ.hhpov.p_na,mort.ucd.despair.disc95.pdpy_na,cc.union_na,demo.popdense_na,rustpc_na,taa.wrks.disc95.pcpy_na,cc.faminc_na,crashpc_na,cc.newsint_na,cc.catholic_na,job.uer_na,cc.child18_na,cc.evanprot_na,cc.religiosity_na,cc.cc16_351b_na,cc.cc16_307_na,cc.cc16_304_na,cc.inddum_na,cc.ideo7_na,cc.raceviewsum_na,cc.repdum_na,econ.mhi,econ.hhpov.p,mort.ucd.despair.disc95.pdpy,cc.union,cc.age,demo.popdense,rustpc,taa.wrks.disc95.pcpy,cc.faminc,crashpc,cc.newsint,cc.catholic,job.uer,cc.child18,cc.evanprot,cc.religiosity,cc.cc16_351b,cc.cc16_307,cc.immviewsum,cc.cc16_304,cc.inddum,cc.ideo7,cc.raceviewsum,cc.repdum,target
0,1,0,0,1,1,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,-1.0759,2.2757,-0.0619,-0.6276,-0.5312,-0.1894,1.8939,2.1998,-0.5356,-0.0481,0.3704,1.8764,-0.1803,1.8173,1.6226,0.4055,-1.0188,-1.7888,-0.6291,1.0865,1.6821,-0.0337,0.0778,-0.6111,1
1,1,1,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.0733,0.5168,-0.3167,-0.6276,0.3488,0.4458,0.6592,-0.3092,0.1153,-0.0667,-0.6605,-0.5329,1.0004,-0.5503,-0.6163,0.4055,0.9815,-1.7888,-0.6291,1.0865,-0.5945,-0.0337,-0.8449,-0.6111,0
0,1,0,0,1,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.6367,0.3223,0.7215,1.5934,0.7887,-0.2801,-0.2065,0.7546,-0.2101,0.2127,-0.6605,-0.5329,1.5085,-0.5503,-0.6163,-1.1159,0.9815,0.8133,0.7259,0.0270,1.6821,0.5155,-0.2297,-0.6111,1
0,1,1,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5791,0.7273,0.2468,-0.6276,-0.6569,-0.2615,-0.2134,-0.1671,0.4408,-0.6621,0.3704,-0.5329,1.2652,-0.5503,-0.6163,-0.2465,0.9815,-0.4878,-1.3065,0.0270,-0.5945,-1.1319,-0.8449,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.0733,0.5168,-0.3167,-0.6276,1.6058,0.4458,0.6592,-0.3092,-0.2101,-0.0667,-0.6605,-0.5329,1.0004,-0.5503,-0.6163,-1.3333,-1.0188,0.8133,-1.3065,-1.0325,-0.5945,-1.1319,-1.1524,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1629,0.2151,4.6268,-0.6276,-1.5996,-0.2876,-0.7932,-0.7469,-0.2101,-0.0441,-0.6605,-0.5329,-0.1517,1.8173,1.6226,1.2749,-1.0188,0.8133,1.4033,1.0865,-0.5945,1.6138,1.6156,1.6363,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-1.5371,1.1719,-1.1903,1.5934,0.4745,-0.2945,-0.4294,-0.8275,0.7662,4.6838,0.3704,-0.5329,0.7285,-0.5503,-0.6163,-1.1159,-1.0188,0.8133,-0.6291,0.0270,-0.5945,0.5155,1.3080,-0.6111,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.8770,-0.1144,0.9855,-0.6276,0.1602,-0.2933,-0.7790,-0.4842,-0.2101,0.2809,0.3704,-0.5329,-0.3449,-0.5503,-0.6163,-0.6812,0.9815,-1.7888,-1.3065,0.0270,1.6821,-0.0337,-0.2297,-0.6111,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.3135,-0.1184,0.0627,-0.6276,-0.0912,-0.2598,1.8156,1.4551,1.7425,1.1037,0.3704,-0.5329,0.6569,1.8173,-0.6163,0.4055,0.9815,-0.4878,1.4033,0.0270,-0.5945,1.0646,0.3854,1.6363,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.7139,0.6161,0.2869,1.5934,-1.5996,-0.2294,-0.2877,0.0228,0.1153,1.4939,-0.6605,-0.5329,0.4136,-0.5503,-0.6163,-1.5506,-1.0188,-1.7888,-1.3065,-1.0325,1.6821,-0.5828,0.0778,-0.6111,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.7778,-0.9919,2.1162,1.5934,1.0401,-0.2312,-1.0496,0.3944,0.1153,-1.1182,-0.6605,-0.5329,-1.1964,-0.5503,-0.6163,-1.1159,-1.0188,-0.4878,-1.3065,0.0270,-0.5945,-1.6811,-0.8449,-0.6111,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1636,0.9536,0.2766,-0.6276,0.6002,0.0367,-0.9019,-0.4077,-0.2101,-1.4386,-0.6605,-0.5329,-0.3950,-0.5503,-0.6163,-1.5506,0.9815,0.8133,0.7259,1.0865,-0.5945,1.6138,0.3854,1.6363,1
0,1,1,0,1,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5980,0.2032,-0.4292,-0.6276,-1.8511,-0.2128,0.0228,0.4227,-0.5356,-0.1993,0.3704,-0.5329,-0.0157,-0.5503,1.6226,1.0576,-1.0188,-1.7888,-1.3065,0.0270,1.6821,-0.5828,-1.4600,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.8492,0.5526,0.1962,-0.6276,-0.6569,-0.2312,-1.0496,-0.6010,0.7662,1.2122,0.3704,-0.5329,1.9950,1.8173,1.6226,1.4923,0.9815,0.8133,0.7259,0.0270,-0.5945,1.6138,0.0778,1.6363,1
0,1,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4766,-0.4559,0.7049,1.5934,0.1602,-0.2831,0.2763,0.9500,-0.8610,-0.3032,-0.6605,-0.5329,-0.3663,-0.5503,-0.6163,0.1882,0.9815,0.8133,1.4033,2.1461,1.6821,-0.0337,1.9231,-0.6111,1
0,0,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1200,0.3024,1.3651,-0.6276,1.7944,-0.2403,-0.7752,-0.3611,0.4408,-0.7034,-0.6605,-0.5329,-0.1731,-0.5503,-0.6163,-1.5506,0.9815,0.8133,1.4033,0.0270,-0.5945,1.0646,2.5382,1.6363,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2.8799,-1.5716,-1.3665,1.5934,0.2859,0.3396,-0.3669,-0.6305,-0.5356,-0.4944,-0.6605,-0.5329,-0.9030,-0.5503,-0.6163,-1.5506,0.9815,0.8133,0.0484,-1.0325,1.6821,-0.5828,-1.1524,-0.6111,0
0,0,0,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-1.2783,0.1397,-1.0409,-0.6276,0.6002,-0.2965,0.0146,0.9602,-0.5356,0.7357,-0.6605,-0.5329,-0.7098,-0.5503,-0.6163,-1.3333,-1.0188,-0.4878,-0.6291,-1.0325,1.6821,-0.0337,-0.8449,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5501,-0.1581,1.2895,-0.6276,1.7315,-0.1815,0.0589,0.4088,-0.8610,-0.6694,-0.6605,-0.5329,-0.9459,-0.5503,1.6226,1.0576,0.9815,0.8133,1.4033,0.0270,-0.5945,1.6138,1.6156,1.6363,1
0,0,0,0,0,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.1696,-0.2852,-0.3893,1.5934,-1.0968,0.7885,-0.9426,-0.7616,-0.2101,-0.7619,-0.6605,1.8764,-0.7026,1.8173,-0.6163,-0.4639,0.9815,0.8133,-0.6291,1.0865,-0.5945,0.5155,0.6929,1.6363,1
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.2328,-0.0628,-0.3884,1.5934,0.4745,-0.2840,0.2202,0.8808,0.7662,0.1453,-0.6605,1.8764,0.7643,-0.5503,-0.6163,1.2749,0.9815,0.8133,0.7259,0.0270,-0.5945,1.0646,-0.2297,1.6363,1
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.0606,-0.8410,-1.3025,1.5934,-0.8454,-0.2229,0.7818,-0.1954,0.7662,-0.3113,-0.6605,-0.5329,-0.9960,1.8173,-0.6163,-1.5506,0.9815,0.8133,-0.6291,-1.0325,-0.5945,-1.6811,-1.4600,-0.6111,0
1,1,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2.9929,-1.7740,-1.0537,1.5934,1.1658,-0.1518,-0.2626,0.2333,0.7662,-0.3178,-0.6605,-0.5329,-1.0175,-0.5503,-0.6163,0.6229,-1.0188,-0.4878,-0.6291,-1.0325,1.6821,-0.0337,-0.2297,-0.6111,0
0,1,0,1,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.7943,-0.1939,0.4620,-0.6276,0.6630,-0.2126,-0.7225,-0.8215,-0.8610,-0.6438,-0.6605,-0.5329,0.8144,-0.5503,-0.6163,0.4055,-1.0188,-0.4878,0.0484,0.0270,-0.5945,-0.5828,-0.8449,-0.6111,0
0,0,1,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2.8799,-1.5716,-1.3665,1.5934,0.2231,0.3396,-0.3669,-0.6305,1.0916,-0.4944,-0.6605,1.8764,-0.9030,1.8173,-0.6163,1.4923,-1.0188,0.8133,1.4033,0.0270,1.6821,1.0646,0.6929,-0.6111,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5794,-0.0708,0.8186,-0.6276,0.5373,-0.2673,-0.6794,-0.7948,-0.2101,-0.3806,-0.6605,-0.5329,0.0630,-0.5503,-0.6163,0.4055,-1.0188,0.8133,-0.6291,-1.0325,-0.5945,-1.1319,-0.8449,-0.6111,0
1,1,0,0,0,1,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,1.3814,-1.1269,0.3454,1.5934,-1.2854,-0.1883,-1.0496,-0.4413,-1.1864,-1.0627,0.3704,-0.5329,-0.1230,1.8173,-0.6163,0.6229,-1.0188,-1.7888,-1.3065,-1.0325,-0.5945,-0.0337,-1.4600,-0.6111,0
1,1,0,0,0,1,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-1.3551,1.8390,-1.2579,-0.6276,-1.4111,-0.2815,2.5288,0.7527,3.0443,2.0190,-0.6605,-0.5329,2.7607,-0.5503,-0.6163,-0.2465,-1.0188,-3.0899,-0.6291,2.1461,-0.5945,-1.1319,-0.8449,-0.6111,0
0,1,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.9924,-0.3368,-1.6437,-0.6276,-0.0284,-0.0523,-0.1193,1.0099,2.0679,-0.5693,-0.6605,-0.5329,-0.5238,1.8173,-0.6163,-0.8986,-1.0188,-0.4878,-0.6291,0.0270,1.6821,-0.0337,-1.1524,-0.6111,0
0,1,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.6136,-1.5914,-0.6539,1.5934,-0.0284,-0.0729,0.6813,0.1171,0.1153,-0.4224,-0.6605,-0.5329,-0.9889,1.8173,-0.6163,-1.5506,-1.0188,-3.0899,-1.3065,2.1461,-0.5945,0.5155,-0.5373,-0.6111,0
0,0,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.3612,-1.0792,0.5193,-0.6276,-1.8511,-0.2211,-0.5458,-0.5988,-0.8610,-0.0594,-0.6605,1.8764,-1.1105,-0.5503,-0.6163,1.0576,-1.0188,0.8133,-1.3065,-2.0921,-0.5945,-0.0337,-0.5373,1.6363,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5309,0.7829,3.9546,1.5934,1.5429,-0.2204,-0.4985,-0.3317,0.7662,-0.0790,-0.6605,1.8764,-0.3162,-0.5503,-0.6163,0.4055,0.9815,-0.4878,-0.6291,0.0270,-0.5945,-0.0337,0.0778,1.6363,0
1,1,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4409,0.6955,-1.0930,1.5934,0.9773,-0.1647,0.3796,0.5943,-1.1864,0.2818,1.4014,-0.5329,0.8430,-0.5503,1.6226,1.0576,0.9815,-0.4878,-0.6291,0.0270,-0.5945,1.0646,-0.2297,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.9122,-1.2341,1.5806,-0.6276,-1.4739,-0.2019,-0.4630,-0.4971,0.7662,-0.0058,0.3704,-0.5329,-0.8601,-0.5503,-0.6163,-1.5506,-1.0188,-0.4878,0.7259,1.0865,-0.5945,0.5155,1.0005,1.6363,1
0,1,0,1,0,1,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5188,0.1675,-0.6955,-0.6276,0.2231,-0.2845,1.0155,-0.4217,-1.1864,0.4366,-0.6605,-0.5329,0.1274,-0.5503,-0.6163,-1.1159,0.9815,-0.4878,1.4033,2.1461,1.6821,-0.0337,1.6156,-0.6111,1
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.9761,1.2514,0.1007,-0.6276,-0.2798,-0.1244,0.1592,-0.4510,-0.2101,0.5881,0.3704,-0.5329,1.4941,-0.5503,-0.6163,-1.5506,0.9815,-0.4878,-0.6291,1.0865,-0.5945,-0.0337,-0.5373,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2.1904,-1.4802,-0.3740,-0.6276,-0.9083,-0.0768,-0.3783,-0.5679,1.0916,-0.9630,-0.6605,-0.5329,-0.8744,1.8173,1.6226,0.8402,0.9815,0.8133,1.4033,0.0270,-0.5945,1.0646,0.3854,1.6363,1
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.6824,-0.2971,-1.0847,1.5934,1.6686,-0.2542,-0.4744,-0.8275,-1.5119,-0.2393,-0.6605,-0.5329,0.6569,-0.5503,1.6226,1.2749,-1.0188,0.8133,-0.6291,0.0270,-0.5945,-0.5828,-0.8449,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1551,-0.1541,2.5813,-0.6276,0.9773,-0.2888,-0.8358,-0.7185,-0.5356,0.1170,-0.6605,-0.5329,0.4351,-0.5503,-0.6163,-1.1159,-1.0188,0.8133,0.0484,-1.0325,-0.5945,-0.5828,-0.8449,-0.6111,0
0,1,1,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.1200,0.3024,1.3651,-0.6276,-0.0912,-0.2403,-0.7752,-0.3611,1.7425,-0.7034,0.3704,-0.5329,-0.1731,-0.5503,-0.6163,-1.5506,0.9815,-0.4878,-1.3065,-1.0325,-0.5945,-1.6811,-1.4600,-0.6111,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4848,0.3144,1.5364,-0.6276,-0.5940,-0.1542,-0.4437,-0.1358,0.7662,-0.9671,-0.6605,-0.5329,-0.5882,1.8173,-0.6163,-0.8986,-1.0188,-1.7888,1.4033,0.0270,-0.5945,-1.6811,0.6929,-0.6111,0
0,0,0,0,0,0,1,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.5517,-0.1899,-0.8598,1.5934,0.7887,-0.2822,2.3491,2.6423,-0.2101,0.7851,-0.6605,-0.5329,0.1417,-0.5503,1.6226,1.0576,0.9815,0.8133,0.0484,0.0270,-0.5945,1.0646,0.6929,1.6363,1
0,1,1,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.4341,-0.7060,0.5667,1.5934,1.5429,-0.2498,-0.6604,-0.6758,1.7425,-0.6280,-0.6605,-0.5329,-0.0729,-0.5503,-0.6163,-1.1159,0.9815,0.8133,-0.6291,0.0270,-0.5945,-1.1319,-0.5373,-0.6111,0
1,0,1,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.6967,-0.3527,0.3453,1.5934,1.0401,0.1576,-0.4670,-0.4294,-0.8610,1.6219,-0.6605,-0.5329,0.1632,-0.5503,1.6226,0.4055,0.9815,-1.7888,-1.3065,0.0270,-0.5945,-1.1319,-1.4600,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4930,-0.2137,-1.2964,-0.6276,-0.4683,-0.2869,1.3940,-0.2127,1.7425,0.6047,0.3704,-0.5329,-0.2232,1.8173,1.6226,1.0576,0.9815,0.8133,0.0484,1.0865,-0.5945,1.6138,-0.2297,1.6363,1
0,1,0,0,1,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.1957,0.8106,-0.5210,1.5934,0.8516,1.5968,0.6391,-0.2934,-1.1864,-0.1081,0.3704,1.8764,0.4494,-0.5503,-0.6163,0.4055,-1.0188,0.8133,1.4033,0.0270,-0.5945,-0.5828,0.6929,-0.6111,1
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4818,-0.2336,-1.3659,-0.6276,0.6630,-0.2941,-0.2259,-0.8275,-0.8610,1.1293,0.3704,-0.5329,-0.0873,-0.5503,-0.6163,0.4055,0.9815,0.8133,0.7259,2.1461,-0.5945,0.5155,1.6156,1.6363,1
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0.3750,-0.2058,0.2735,-0.6276,-1.9139,-0.1908,-0.5689,-0.8275,-1.1864,-0.9724,-0.6605,-0.5329,-0.3735,-0.5503,-0.6163,-0.8986,0.9815,-0.4878,-1.3065,1.0865,1.6821,-1.6811,-1.1524,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1.3272,-1.5954,-0.2928,-0.6276,-1.2225,-0.1395,-0.7156,-0.4060,1.0916,-0.4405,0.3704,-0.5329,-1.6758,1.8173,-0.6163,-1.5506,-1.0188,0.8133,0.0484,0.0270,-0.5945,-1.1319,-0.8449,-0.6111,0
0,1,0,0,0,0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,-0.4994,-0.3288,0.3303,-0.6276,-2.0396,-0.2651,-0.4990,-0.4640,-0.5356,-0.9749,0.3704,-0.5329,-1.3538,-0.5503,-0.6163,0.8402,-1.0188,-0.4878,-1.3065,-1.0325,-0.5945,-1.6811,-1.1524,-0.6111,0


## Create a learner

In [175]:
#create a learner
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

## Fit the data

In [176]:
learn.fit(epochs=5, lr=1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.214401,0.224406,0.909511
2,0.208332,0.213334,0.914402
3,0.222388,0.204547,0.919565
4,0.213244,0.251306,0.919022
5,0.21214,0.215613,0.9125


# Test the model on test data

## Get a list of correct labels

In [157]:
#lets get a correct list of all the test labels
targets = [int(item['cc.trumpgevote']) for item in test_long]
# targets1 = [int(item[31]) for item in test_long]
# print(f"targets length={len(targets)}, targets1 length={len(targets1)}")
# print(f"Matches in targets and targets1={len([1 for i,x in zip(targets,targets1) if i==x])}")

In [158]:
def eval_accuracy(preds,targs):
    totals = len(preds)
    matches = 0
    for x in zip(preds,targs):
        if x[0]==x[1]:
            matches+=1
    print(f"Got {matches} right out of {totals} samples, Accuracy is {matches/totals}")

## The long, very slow way.  One at a time

In [160]:
#do it the hard way one at a time
preds1=[]
# for item in test_long[:100]:
for item in test_long:
    _,pred, _ = learn.predict(item)
    preds1.append(pred.item())
# len(preds1)
# preds1

In [161]:
eval_accuracy(preds1,targets)

Got 3739 right out of 4089 samples, Accuracy is 0.9144044998777208


## The easy, fast batched way
However note that get_preds does not return the labels as its second param as it indicates in documentation, bug in FastaAI?  

In [162]:
#WARNING THE FOLLOWING CALL, get_preds DOES NOT RETURN CORRECT LABELS AS DOCS IMPLY
predictions, _ = learn.get_preds(DatasetType.Test)

# type(predictions)
# predictions.tolist()[:10]
#targets.tolist()[:100]

In [163]:
#find index of largest output
preds=[torch.argmax(x).item() for x in predictions]
# preds

In [164]:
eval_accuracy(preds,targets)

Got 3739 right out of 4089 samples, Accuracy is 0.9144044998777208


In [165]:
#all the same size?
len(test_vec)
len(test_long)
len(preds)
len(preds1)
len(targets)

4089

4089

4089

4089

4089

## Check outputs of both methods

In [167]:
print(f"Targets same values are ={len([1 for x,y in zip(targets,targets1) if x==y])}")
print(f"Predictions same values are ={len([1 for x,y in zip(preds,preds1) if x==y])}")

Targets same values are =4089
Predictions same values are =4089


In [None]:
from fastai.callbacks import *
from fastai.callbacks.hooks import *
print(model_summary(learn))

# Results
Show graphs and stats here

# Conclusions and Next Steps
This model is 92% accurate with no data tweaks

# Scratch

In [None]:
data = test[0]
data[31]
type(data)
data.shape
tmp=learn.predict(data)
len(tmp)
out,pred,res=tmp

out
pred
res

# len(out)
# len(pred)
len(res)