# Introduction
This is the random forest learner associated with Voter Analysis.
Please be sure to run DataCleaning.ipynb first to prepare the data.

## Imports
Import libraries and write settings here.

In [2]:
from fastai.tabular import *
from fastai import *

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 60
pd.options.display.max_rows = 60

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')



# Load df_s and categorical and continuous column names
If either of the following cells fails, please be sure to run DataCleaning.ipynb first to prepare the data.

In [35]:
df_s = pd.read_feather('tmp/df_s')
# df_s.T

In [36]:
# Load the column-name lists stored in tmp/catcont.json.
# json is imported explicitly here instead of relying on it being pulled in
# as a side effect of the fastai wildcard imports.
import json

with open('tmp/catcont.json') as f:
    data1 = json.load(f)

# The file is read as a list: [0] -> continuous column names,
# [1] -> categorical column names, [2][0] -> dependent-variable column name.
res_cont = data1[0]
res_cat = data1[1]
columns_dep_var = data1[2][0]

# res_cont
# res_cat

In [37]:
# Same lookup as in the loading cell; repeated so the value is echoed as cell output.
columns_dep_var = data1[2][0]
columns_dep_var

'cc.trumpgevote'

## Categorify and Fill Missing

In [38]:
#from docs https://docs.fast.ai/tabular.transform.html
# Build the Categorify transform over the categorical/continuous column lists and
# call it on df_s. The result is not assigned, so this presumably modifies df_s
# in place — TODO confirm against the fastai docs linked above.
tfm = Categorify(cat_names=res_cat, cont_names=res_cont)
tfm(df_s)

In [39]:
#just checking to see if it works on any old variable
# Reads the pandas `.cat.categories` of the first categorical column in df_s.
df_s[res_cat[0]].cat.categories

Int64Index([0, 1], dtype='int64')

In [40]:
# Build the FillMissing transform over the same column lists and call it on df_s.
# The return value is discarded, so this relies on the transform modifying df_s
# in place — TODO confirm.
tfm1 = FillMissing(cat_names=res_cat, cont_names=res_cont)
tfm1(df_s)

## Split out train/test

In [41]:
# split out data
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. A fixed random_state makes the
# train/test partition reproducible; without it the rows are reshuffled
# differently on every run of this cell.
trn, tst = train_test_split(df_s, test_size=0.1, random_state=42)
print(len(trn))
print(len(tst))

# Total row count, for comparison with the two split sizes printed above.
len(df_s)

36801
4089


40890

# RandomForest

In [42]:
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

#create a random forest object
# 40 trees, n_jobs=-1 (use all available cores), out-of-bag scoring enabled.
# NOTE(review): no random_state is passed, so the fitted forest (and its scores)
# will differ from run to run.
m_rf = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)

In [43]:
# Separate the dependent variable from the features for both splits:
# copy the target column first, then rebind the frame without that column.
trn_y=trn[columns_dep_var].copy()
trn=trn.drop(columns_dep_var,axis=1)

tst_y=tst[columns_dep_var].copy()
tst=tst.drop(columns_dep_var,axis=1)

# trn.T
# trn_y

In [44]:
#You cannot have any Nan (missing data) fields or random forest will not work.
# Report the column counts of both splits, then how many columns in each split
# contain at least one null value.
print(f"Total trn columns = {len(trn.columns)}, total tst columns = {len(tst.columns)}") 
print(f"Total trn columns with Nans= {len(trn.columns[trn.isnull().any()])}") #add ~ to get columns with no missing values
print(f"Total tst columns with Nans= {len(tst.columns[tst.isnull().any()])}") #add ~ to get columns with no missing values

Total trn columns = 53, total tst columns = 53
Total trn columns with Nans= 0
Total tst columns with Nans= 0


In [45]:
#hmmm if tst has 1 more column than train then see what it is
# Set difference: column names present in tst but missing from trn.
# An empty set means tst has no extra columns.
set(tst.columns)-set(trn.columns)

set()

In [46]:
#train the random forest 
# Fit on the training features and training targets prepared above.
m_rf.fit(trn, trn_y)

def rmse(x, y):
    """Return the root-mean-square error between `x` and `y`.

    The difference `x - y` must support `** 2` and `.mean()`
    (e.g. a numpy array or pandas Series).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of evaluation numbers for the fitted model `m`.

    The list holds, in order: RMSE on the training split, RMSE on the test
    split, `m.score` on the training split, `m.score` on the test split and,
    if the model has an `oob_score_` attribute, that value last.
    Reads the notebook-level `trn`, `trn_y`, `tst` and `tst_y`.
    """
    train_rmse = rmse(m.predict(trn), trn_y)
    test_rmse = rmse(m.predict(tst), tst_y)
    scores = [train_rmse, test_rmse, m.score(trn, trn_y), m.score(tst, tst_y)]
    # Only some models expose an out-of-bag score; include it when present.
    if hasattr(m, 'oob_score_'):
        scores = scores + [m.oob_score_]
    print(scores)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [47]:
# How good is it? score() returns R**2: 1 means perfect, 0 means no better than
# always predicting the mean, and a negative value means worse than predicting the mean.
# This is the score on the training data only.
m_rf.score(trn,trn_y)

0.9610694979455728

In [48]:
# Prints [train RMSE, test RMSE, train score, test score, OOB score] for the fitted forest.
print_score(m_rf)

[0.09830229574847503, 0.24587884706615393, 0.9610694979455728, 0.7570594976612659, 0.7205749887677839]


# Test the model on test data

## Get a list of correct labels

In [None]:
#lets get a correct list of all the test labels
# Look the label up through columns_dep_var (loaded from tmp/catcont.json) instead of
# repeating the column name as a string literal, so this cell stays in sync with the
# dependent variable used everywhere else in the notebook.
# NOTE(review): test_long is not defined in any cell shown here.
targets = [int(item[columns_dep_var]) for item in test_long]
# targets1 = [int(item[31]) for item in test_long]
# print(f"targets length={len(targets)}, targets1 length={len(targets1)}")
# print(f"Matches in targets and targets1={len([1 for i,x in zip(targets,targets1) if i==x])}")

In [None]:
def eval_accuracy(preds,targs):
    """Print how many predictions equal their targets and the resulting accuracy.

    Args:
        preds: sized sequence of predicted labels.
        targs: iterable of true labels, compared position by position with `preds`.

    Predictions are paired with targets via zip(), so the comparison stops at
    the shorter of the two, while the reported total is always len(preds).
    An empty `preds` is reported explicitly instead of raising ZeroDivisionError.
    """
    totals = len(preds)
    if totals == 0:
        # Accuracy is undefined without samples; avoid dividing by zero.
        print("Got 0 right out of 0 samples, Accuracy is undefined (no predictions)")
        return
    matches = sum(1 for pred, targ in zip(preds, targs) if pred == targ)
    print(f"Got {matches} right out of {totals} samples, Accuracy is {matches/totals}")

## The long, very slow way.  One at a time

In [None]:
#do it the hard way one at a time
# NOTE(review): `learn` and `test_long` are not defined in any cell shown here.
preds1=[]
# for item in test_long[:100]:
for item in test_long:
    # learn.predict returns a 3-tuple; only the middle element is kept.
    _,pred, _ = learn.predict(item)
    preds1.append(pred.item())
# len(preds1)
# preds1

In [None]:
# Accuracy of the one-at-a-time predictions against the true labels.
eval_accuracy(preds1,targets)

## The easy, fast batched way
However, note that get_preds does not return the labels as its second return value, even though the documentation indicates that it does. Possibly a bug in fastai?  

In [None]:
#WARNING THE FOLLOWING CALL, get_preds DOES NOT RETURN CORRECT LABELS AS DOCS IMPLY
# Only the predictions are kept; the second return value is discarded for that reason.
# NOTE(review): presumably the test set carries no labels, so the second value is
# only a placeholder — confirm against the fastai documentation.
predictions, _ = learn.get_preds(DatasetType.Test)

# type(predictions)
# predictions.tolist()[:10]
#targets.tolist()[:100]

In [None]:
#find index of largest output
# For each row of `predictions`, take the position of its maximum as the predicted class.
preds=[torch.argmax(x).item() for x in predictions]
# preds

In [None]:
# Accuracy of the batched predictions against the true labels.
eval_accuracy(preds,targets)

In [None]:
#all the same size?
# Each bare expression is echoed because ast_node_interactivity is set to 'all'.
# NOTE(review): test_vec and test_long are not defined in any cell shown here.
len(test_vec)
len(test_long)
len(preds)
len(preds1)
len(targets)

## Check outputs of both methods

In [None]:
# Count the positions at which the two label lists, and the two prediction lists, agree.
# NOTE(review): targets1 is only assigned in a commented-out line of the
# "Get a list of correct labels" cell, so the first print raises NameError as written.
print(f"Targets same values are ={len([1 for x,y in zip(targets,targets1) if x==y])}")
print(f"Predictions same values are ={len([1 for x,y in zip(preds,preds1) if x==y])}")

In [None]:
# Print a summary of the model held by `learn`.
# NOTE(review): `learn` is not defined in any cell shown here.
from fastai.callbacks import *
from fastai.callbacks.hooks import *
print(model_summary(learn))

# Results
Show graphs and stats here

# Conclusions and Next Steps
This model is 92% accurate with no data tweaks

# Scratch

In [None]:
# Scratch: inspect one test item and the 3-tuple that learn.predict returns for it.
# NOTE(review): `test` and `learn` are not defined in any cell shown here.
data = test[0]
data[31]
type(data)
data.shape
tmp=learn.predict(data)
len(tmp)
out,pred,res=tmp

# Echo each element of the prediction tuple.
out
pred
res

# len(out)
# len(pred)
len(res)