# Introduction
This is the logistic regression learner associated with Voter Analysis.
Please be sure to run DataCleaning.ipynb first to prepare the data.

## Imports
Import libraries and write settings here.

In [1]:
from fastai.tabular import *
from fastai import *

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas: show up to 60 columns / 60 rows before truncating the display
pd.options.display.max_columns = 60
pd.options.display.max_rows = 60

# Display all cell outputs: every top-level expression in a cell is rendered,
# not only the last one (several cells below rely on this to show two values)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell, used below to inspect loaded extensions
from IPython import get_ipython
ipython = get_ipython()

# autoreload extension: load it only once so re-running this cell does not reload it
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: re-import all modules before executing each cell
%autoreload 2

# Visualizations
# NOTE(review): `py` and `go` are not used in any visible cell; `plotly.plotly` was
# presumably moved to the separate chart_studio package in newer plotly releases -
# confirm against the installed version before a fresh run.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# cufflinks configuration: offline mode, white theme
import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')



# Load train/test and categorical and continuous column names
If either of the following cells fails, please be sure to run DataCleaning.ipynb first to prepare the data.

In [2]:
# Read the train and test splits from their feather files under tmp/.
trn, tst = map(pd.read_feather, ('tmp/train', 'tmp/tst'))

In [3]:
# Imported explicitly: `json` is otherwise only in scope as a side effect of the
# fastai star imports at the top of the notebook.
import json

# tmp/catcont.json is read as a list laid out as
#   [continuous column names, categorical column names, [dependent variable, ...]]
# (presumably written by DataCleaning.ipynb - confirm there).
with open('tmp/catcont.json') as f:
    data1 = json.load(f)

# The file is no longer needed once parsed, so unpack outside the `with` block.
res_cont = data1[0]
res_cat = data1[1]
columns_dep_var = data1[2][0]  # first entry of the third list is the target column

In [4]:
#remove the dependent variable from both feature lists
# columns_dep_var is a single column name (str).  `x not in columns_dep_var` on a str is a
# *substring* test, so any feature whose name happens to be contained in the target's name
# would be dropped as well.  Compare whole names instead; a list/tuple of target names is
# also accepted.
dep_var_names = {columns_dep_var} if isinstance(columns_dep_var, str) else set(columns_dep_var)
res_cont = [x for x in res_cont if x not in dep_var_names]
res_cat = [x for x in res_cat if x not in dep_var_names]

In [5]:
#cast the dependent variable to int64 in both splits (otherwise fit fails),
#then report how many rows each split holds
for frame in (trn, tst):
    frame[columns_dep_var] = frame[columns_dep_var].astype('int64')

for frame in (trn, tst):
    print(len(frame))

36801
4089


## Categorify and Fill Missing

In [6]:
#from docs https://docs.fast.ai/tabular.transform.html
# Fit the category levels on the training split, then apply those same levels to the
# test split.  Calling tfm(tst) without test=True re-fits the proc on tst, so the two
# splits could end up with different category sets for the same column.
# Note: with test=True a level that never occurs in trn becomes NaN in tst; the NaN
# check further down will surface that.
tfm = Categorify(cat_names=res_cat, cont_names=res_cont)
tfm(trn)
tfm(tst, test=True)

In [7]:
#spot-check on the first categorical column: both splits should expose .cat.categories
first_cat = res_cat[0]
trn[first_cat].cat.categories
tst[first_cat].cat.categories

Int64Index([0, 1], dtype='int64')

Int64Index([0, 1], dtype='int64')

In [8]:
# Fill missing values in the continuous columns of both splits with one FillMissing proc.
# NOTE(review): the proc is applied to tst without test=True, so it is presumably re-fitted
# on tst (fill values and `_na` indicator columns derived from tst itself).  Switching to
# tfm1(tst, test=True) looks like the intended usage, but a later cell patches a missing
# `_na` column into tst by hand and would then need to change too - confirm before editing.
tfm1 = FillMissing(cat_names=res_cat, cont_names=res_cont)
for frame in (trn, tst):
    tfm1(frame)

## Clean up any missing columns that result from unfortunate test selection

In [9]:
# Separate the target (trn_y / tst_y) from the features (trn / tst) for each split.
trn_y, tst_y = trn[columns_dep_var].copy(), tst[columns_dep_var].copy()

trn = trn.drop(columns=columns_dep_var)
tst = tst.drop(columns=columns_dep_var)

In [20]:
# Row counts of the train and test feature frames
trn.shape[0]
tst.shape[0]

36801

4089

In [10]:
#No NaN (missing data) fields may remain or the model fit will not work,
#so report the column counts and how many columns still contain NaNs.
n_trn_cols, n_tst_cols = len(trn.columns), len(tst.columns)
print(f"Total trn columns = {n_trn_cols}, total tst columns = {n_tst_cols}")

# Boolean mask per column: True where at least one value is null (prefix with ~ for the complement)
trn_nan_cols = trn.columns[trn.isnull().any()]
tst_nan_cols = tst.columns[tst.isnull().any()]
print(f"Total trn columns with Nans= {len(trn_nan_cols)}")
print(f"Total tst columns with Nans= {len(tst_nan_cols)}")

Total trn columns = 53, total tst columns = 52
Total trn columns with Nans= 0
Total tst columns with Nans= 0


In [11]:
#if one split has more columns than the other, show which ones:
#first the columns only in tst, then the columns only in trn
trn_cols, tst_cols = set(trn.columns), set(tst.columns)
tst_cols.difference(trn_cols)
trn_cols.difference(tst_cols)

set()

{'cc.catholic_na'}

In [33]:
# List the training feature columns, including the `_na` indicator columns
trn.columns

Index(['cc.blackdum', 'econ.mhi', 'cc.sex', 'cc.maxeduc.4yr', 'econ.hhpov.p',
       'mort.ucd.despair.disc95.pdpy', 'cc.union', 'cc.age', 'demo.popdense',
       'cc.emp.nojob', 'rustpc', 'taa.wrks.disc95.pcpy', 'cc.faminc',
       'crashpc', 'cc.newsint', 'cc.isimmigrant', 'cc.catholic',
       'cc.cc16_305_2', 'job.uer', 'cc.child18', 'cc.evanprot', 'cc.whitedum',
       'cc.religiosity', 'cc.cc16_351b', 'cc.cc16_307', 'cc.immviewsum',
       'cc.cc16_304', 'cc.inddum', 'cc.ideo7', 'cc.raceviewsum', 'cc.repdum',
       'econ.mhi_na', 'econ.hhpov.p_na', 'mort.ucd.despair.disc95.pdpy_na',
       'cc.union_na', 'demo.popdense_na', 'rustpc_na',
       'taa.wrks.disc95.pcpy_na', 'cc.faminc_na', 'crashpc_na',
       'cc.newsint_na', 'cc.catholic_na', 'job.uer_na', 'cc.child18_na',
       'cc.evanprot_na', 'cc.religiosity_na', 'cc.cc16_351b_na',
       'cc.cc16_307_na', 'cc.cc16_304_na', 'cc.inddum_na', 'cc.ideo7_na',
       'cc.raceviewsum_na', 'cc.repdum_na'],
      dtype='object')

In [12]:
#tst is missing one of the _na indicator columns. A <column>_na column is added to a frame
#to flag the rows where <column> was NaN, so a split with no NaNs in that column never gets it.

In [13]:
#position of the indicator column within trn, so it can be inserted at the same place in tst
trn_col_names = list(trn.columns)
idx = trn_col_names.index('cc.catholic_na')
idx
type(trn.columns)

41

pandas.core.indexes.base.Index

In [14]:
# Align tst with trn: add every `_na` indicator column that trn has and tst lacks
# (here 'cc.catholic_na') as all-False, at the position it occupies in trn.
# The membership guard makes the cell safe to re-run; an unconditional insert raises
# "cannot insert ..., already exists" the second time.
for na_col in trn.columns:
    if na_col.endswith('_na') and na_col not in tst.columns:
        tst.insert(loc=min(trn.columns.get_loc(na_col), len(tst.columns)), column=na_col, value=False)

# trn and tst are passed to the same estimator below, so they should expose the same
# columns in the same order.
assert list(tst.columns) == list(trn.columns), "trn and tst feature columns differ"

# Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression with the lbfgs solver, up to 2000 iterations, using all cores.
# NOTE(review): the recorded cross-validation output still shows "lbfgs failed to converge";
# possibly the features need scaling - confirm.
lr_params = dict(C=100, solver='lbfgs', max_iter=2000, n_jobs=-1)
lr = LogisticRegression(**lr_params).fit(trn, trn_y)

## Cross validate scores

In [34]:
# Pooled train+test rows, kept for reference only (the CV call below does not use them).
# Named all_X rather than `all` so the built-in all() is not shadowed for the rest of the
# session; pd.concat already returns new objects, so no explicit .copy() is needed.
all_X = pd.concat([trn, tst])
all_y = pd.concat([trn_y, tst_y])

# 5-fold cross-validation on the training split only; tst stays held out for the final check
scores = cross_val_score(lr, trn, trn_y, cv=5)

print("CV scores: {}".format(scores))
print("CV scores mean: {}".format(scores.mean()))


lbfgs failed to converge. Increase the number of iterations.



CV scores: [0.913735 0.905176 0.909647 0.910326 0.913168]
CV scores mean: 0.9104101700292342



lbfgs failed to converge. Increase the number of iterations.



# Test the model on test data

In [35]:
def eval_accuracy(preds,targs):
    """Print how many predictions equal their targets and the resulting accuracy.

    Args:
        preds: sized sequence of predicted labels; its length is the sample count.
        targs: iterable of true labels, compared element-wise to ``preds`` with ``==``.

    Raises:
        ValueError: if ``targs`` has a length and it differs from ``len(preds)``.
            Without this check ``zip`` would silently truncate to the shorter input
            while the accuracy was still divided by ``len(preds)``.

    Returns:
        None. The result is printed.
    """
    totals = len(preds)
    if hasattr(targs, '__len__') and len(targs) != totals:
        raise ValueError(f"preds has {totals} items but targs has {len(targs)}")
    if totals == 0:
        # Avoid ZeroDivisionError: accuracy is undefined without samples.
        print("Got 0 right out of 0 samples, Accuracy is undefined (no samples)")
        return
    matches = sum(1 for pred, targ in zip(preds, targs) if pred == targ)
    print(f"Got {matches} right out of {totals} samples, Accuracy is {100*matches/totals} percent")

In [39]:
#predict a label for every row of the held-out test frame in a single call
preds1 = lr.predict(tst)

In [40]:
# Compare the test-set predictions with the true labels and print the accuracy
eval_accuracy(preds1,tst_y)

Got 3729 right out of 4089 samples, Accuracy is 91.19589141599413 percent


## Check outputs of both methods

In [None]:
# Count the positions at which two target sequences / two prediction sequences agree.
# NOTE(review): `targets`, `targets1` and `preds` are not defined in any visible cell and this
# cell has never been executed (In [None]); it looks like a leftover from another notebook
# and would presumably raise NameError on Restart & Run All - confirm and remove or define them.
print(f"Targets same values are ={len([1 for x,y in zip(targets,targets1) if x==y])}")
print(f"Predictions same values are ={len([1 for x,y in zip(preds,preds1) if x==y])}")

In [None]:
# Print a summary of the fastai learner `learn`.
# NOTE(review): `learn` is not defined in any visible cell and this cell has never been
# executed (In [None]); presumably a leftover from a fastai-learner notebook - confirm.
from fastai.callbacks import *
from fastai.callbacks.hooks import *
print(model_summary(learn))

# Results
Show graphs and stats here

## Conclusions and Next Steps
The logistic regression model reaches 91.2% accuracy on the held-out test set (3729 of 4089 samples) with no data tweaks.

# Scratch

In [None]:
# Scratch: inspect a single item and unpack the three values returned by learn.predict.
# NOTE(review): `test` and `learn` are not defined in any visible cell and this cell has
# never been executed (In [None]); it would presumably raise NameError on a fresh run -
# confirm whether it can be deleted.
data = test[0]
data[31]
type(data)
data.shape
tmp=learn.predict(data)
len(tmp)
# predict's result is unpacked into three parts
out,pred,res=tmp

out
pred
res

# len(out)
# len(pred)
len(res)