# Introduction
This notebook calculates p-values for the independent variables via bootstrapping. Note that we are only looking for p-values. For each column: train the random forest (RF) on the entire dataset with that one column scrambled, then compute the MEMs (marginal effects at means); repeat this many times. Then train the RF on the entire dataset with that column unscrambled and compute the MEMs again. Finally, plot these MEM values.
Please be sure to run DataCleaning.ipynb first to prepare the data.

This notebook is based on the fastai v1 ML course.

## Imports
Import libraries and write settings here.

In [None]:
# NOTE(review): no later cell imports os, json, pickle or plt explicitly, so they
# are presumably brought into scope by the fastai star imports below — verify.
from fastai.tabular import *
from fastai import *

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 60
pd.options.display.max_rows = 60

# Display all cell outputs
# (every bare expression in a cell is echoed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
# Only load it when it is not already loaded, so re-running this cell is harmless.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

# Load train/test
If either of the following cells fails, please be sure to run DataCleaning.ipynb first to prepare the data.

In [None]:
# The first column of each prepared file is the dependent variable;
# all remaining columns are independent variables.
csv_files = ['vote12gop', 'TrumpGEVote', 'TrumpPVote']

# Index into csv_files selecting the dataset analysed in this run.
whichfle = 2
fn = csv_files[whichfle]

# Input files for the selected dataset.
trnfile = f'tmp/{fn}_train'
tstfile = f'tmp/{fn}_tst'
jsonfile = f'tmp/{fn}_catcont.json'

# Output directory and file names; the directory is created if it is missing.
outdir = 'outBElectionResultsPVALSs'
filename = 'results.csv'
filename_all = 'results_complete.csv'
filename_model_params = "RF_model_params.sav"
os.makedirs(outdir, exist_ok=True)

In [None]:
# Echo the training-file path as a quick sanity check.
trnfile

In [None]:
# Load the processed training and test frames from their feather files.
trn, tst = (pd.read_feather(path) for path in (trnfile, tstfile))
# columns_dep_var= 'cc.vote16'

In [None]:
# Preview the first rows of the freshly loaded training frame.
trn.head()

In [None]:
# Drop the 'index' column from both frames.
# Re-binding with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError once the column
# has already been removed.
trn = trn.drop(columns='index', errors='ignore')
tst = tst.drop(columns='index', errors='ignore')

## Get continuous and categorical variables, convert dependent variable to int64

In [None]:
# Read the continuous / categorical variable names from the JSON file:
# element 0 holds the continuous names, element 1 the categorical names.
with open(jsonfile) as f:
    data1 = json.load(f)

res_cont, res_cat = data1[0], data1[1]

# res_cont
# res_cat

In [None]:
# The dependent variable is the first column of the training frame.
columns_dep_var = trn.columns[0]
columns_dep_var

In [None]:
# Remove the dependent variable from both feature-name lists.
# columns_dep_var is a single column name (a string), so it must be compared
# with != ; `x not in columns_dep_var` is a substring test and would also drop
# any feature whose name merely occurs inside the dependent variable's name.
res_cont = [x for x in res_cont if x != columns_dep_var]
res_cat = [x for x in res_cat if x != columns_dep_var]

In [None]:
# Preview the training frame again before converting the dependent variable.
trn.head()

In [None]:
# Cast the dependent variable to int64 in both frames (otherwise fit fails).
trn[columns_dep_var] = trn[columns_dep_var].astype('int64')
tst[columns_dep_var] = tst[columns_dep_var].astype('int64')

# Report the number of rows in the training and test frames, one per line.
print(len(trn), len(tst), sep='\n')

In [None]:
# Split the dependent variable out of each frame as an int64 Series.
# astype() returns a new Series, so the cast is applied at assignment; calling
# it as a bare statement discards the result, and the test target needs the
# cast as well as the training target.
trn_y = trn[columns_dep_var].astype('int64')
tst_y = tst[columns_dep_var].astype('int64')

# Leave only the independent variables in the feature frames.
trn.drop(columns_dep_var, axis=1, inplace=True)
tst.drop(columns_dep_var, axis=1, inplace=True)

## Let's see which features are correlated with each other

In [None]:
# from rfpimp import plot_corr_heatmap
# viz = plot_corr_heatmap(trn, figsize=(50,30))
# viz.view()

## Categorify and Fill Missing

In [None]:
#from docs https://docs.fast.ai/tabular.transform.html
# tfm = Categorify(cat_names=res_cat, cont_names=res_cont)
# tfm(trn)
# tfm(tst)
# #just checking to see if it works on any old variable
# trn[res_cat[0]].cat.categories
# tst[res_cat[0]].cat.categories

In [None]:
# Fill missing values in both frames. The return values of the calls are
# ignored, so the transform is presumably applied in place — verify.
# add_col=False presumably suppresses the extra "<col>_na" marker columns.
tfm1 = FillMissing(cat_names=res_cat, cont_names=res_cont, add_col=False)
tfm1(trn)
# NOTE(review): fastai v1's documented usage for a validation/test frame is
# tfm(df, test=True), so that the fill values learned on the training frame are
# reused; as written, tst is processed like a training frame — confirm intended.
tfm1(tst)

## Clean up any missing columns that result from unfortunate test selection

In [None]:
# Echo the name of the dependent variable.
columns_dep_var

In [None]:
# Random forest will not work if any field is NaN (missing data), so report the
# column counts and how many columns in each frame still contain NaNs.
print(f"Total trn columns = {trn.shape[1]}, total tst columns = {tst.shape[1]}")
# Negate the .any() mask with ~ to count the columns with no missing values instead.
print(f"Total trn columns with Nans= {trn.isnull().any().sum()}")
print(f"Total tst columns with Nans= {tst.isnull().any().sum()}")

In [None]:
# If one frame has more columns than the other, show which ones:
# first the columns only in tst, then the columns only in trn.
set(tst.columns).difference(trn.columns)
set(trn.columns).difference(tst.columns)

#missing one of the _na columns.  This is added, and set to 1, when a variable has an NaN value to mark
#columns that have NaNs

#find the index of the column in trn
# idx=trn.columns.tolist().index('cc.catholic_na')
# idx
# type(trn.columns)

# tst.insert(loc=idx, column='cc.catholic_na', value=False)

In [None]:
# Preview the training frame after missing values were filled.
trn.head()

## Generate MEM row 
Should this run on the whole dataset or just the test set? I'm thinking the whole dataset.
The model has some idea of how voters will vote based on the input features; let's use that knowledge
to see what happens when we start changing variables.

In [None]:
# Work on copies of the training features and targets.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the session; a later cell reads `all.columns`, so renaming needs both places.
all = trn.copy()
all_y = trn_y.copy()
# Show the row count and the first rows of the copy.
len(all)
all.head()

# Generate pValue predictions

In [None]:
#this directory contains symlink created at command line like this
# ln -s ../Marginal_Effects_at_Means ./Marginal_Effects_at_Means
#it allows this directory to find Marginal_Effects_at_Means, a directory 1 above this one
#this dir contains a file called mem.py which contains MEMs
# NOTE(review): the comments above describe Marginal_Effects_at_Means, but the
# import below comes from p_value_calculation.pvals — presumably that package is
# made importable the same way; confirm and update the comments.
from p_value_calculation.pvals import pValue,pValInfo

In [None]:
#run the test cases
# from p_value_calculation.test_pvals import Test_pvals
# tst = Test_pvals()
# tst.setUp()
# tst.test_get_pval_bin()

In [None]:
# #get something small to train on
# NUMBERSAMPS = 1000
# trn_sm = trn[:NUMBERSAMPS]
# trn_sm_y= trn_y[:NUMBERSAMPS]
# tst_sm = tst[:NUMBERSAMPS]
# tst_sm_y = tst[:NUMBERSAMPS]
# columns=['cc.raceviewsum']

# len(trn_sm),len(trn_sm_y),len(tst_sm) 
# columns = ['cc.raceviewsum', 'cc.CC16_307']

In [None]:
# Select every column of the feature frame for the p-value calculation.
columns=all.columns
columns
# columns1=['cc.emp.nojob']

In [None]:
# Value passed to pValue as numb_iter — presumably the number of scrambled
# refits per column described in the introduction; confirm in pvals.py.
NUMBERITERATIONS=500
# NOTE(review): trn/trn_y are passed here rather than the `all`/`all_y` copies
# made earlier, and no random seed is set in the visible cells, so results may
# differ between runs — confirm both are intended.
pv=pValue(trn, trn_y,columns,numb_iter=NUMBERITERATIONS, verbose=True)
res = pv.get_all_pvals()

# Save, print and plot 

In [None]:
# File name for the saved results of the selected dataset.
# NOTE(review): this file is written with pickle.dump in the next cell, so its
# content is a pickle despite the .csv extension.
outfile = f'PVAL_results_{fn}.csv'
outfile

In [None]:
# Save the p-value results to disk as a pickle.
# The context manager closes the file handle deterministically, so the data is
# fully flushed before the next cell reads the file back.
with open(outdir+"/"+outfile, 'wb') as fh:
    pickle.dump(res, fh)

In [None]:
# Load the saved p-value results back from disk (closing the file afterwards).
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook wrote itself.
with open(outdir+"/"+outfile, 'rb') as fh:
    res = pickle.load(fh)

In [None]:
#used to convert old pValInfo objects to new ones with comparisons (lt, eq) and self.pVal
# res2=[]
# for c in res1:
#     a=pValInfo(c.col)
#     a.correct_pred = c.correct_pred[:]
#     a.permuted_preds = c.permuted_preds[:]
#     res2.append(a)

In [None]:
# Sort the results; this relies on the ordering defined by the result objects
# themselves (presumably pValInfo's lt/eq comparisons — confirm in pvals.py).
res=sorted(res)

#print values
# One line per column with its p-value.
for item in res:
    print(f'Column {item.col}, p-value={item.get_pval()}')

In [None]:
# Name of the plain-text CSV that receives one "column,p-value" line per result.
filenamecsv = f'pvals{fn}.csv'

In [None]:
# Write one "column,p-value" line per result to the CSV file.
with open(outdir+"/"+filenamecsv, 'w' ) as f:
    f.writelines(f'{item.col},{item.get_pval()}\n' for item in res)

In [None]:
def draw_histograms(res, variables, n_rows, n_cols, figsize=(15, 30)):
    """Plot one histogram of permuted predictions per result on a subplot grid.

    Args:
        res: sized, indexable sequence of result objects; each must expose
            ``col``, ``get_pval()``, ``permuted_preds`` and ``correct_pred``.
        variables: unused; kept so existing calls keep working.
        n_rows: number of subplot rows.
        n_cols: number of subplot columns.
        figsize: figure size handed to ``plt.subplots``; defaults to the
            previously hard-coded (15, 30).

    Results are laid out row by row. Grid slots beyond ``len(res)`` are hidden
    instead of repeating the last result, and results beyond the grid capacity
    are not drawn.
    """
    # squeeze=False always yields a 2-D axes array, so single-row and
    # single-column grids can be handled the same way as larger ones.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize, squeeze=False)
    fig.subplots_adjust(hspace=.5, wspace=.1)
    n_results = len(res)
    # axes.flat walks the grid row by row, i.e. slot == row * n_cols + column.
    for slot, axis in enumerate(axes.flat):
        if slot >= n_results:
            # No result for this slot: hide the empty axes rather than
            # re-plotting a stale result.
            axis.set_visible(False)
            continue
        info = res[slot]
        axis.set_title(f'{info.col}, pval={info.get_pval()}')
        # Distribution of the values obtained with the column permuted ...
        axis.hist(info.permuted_preds, bins=50, ec='red', label='permuted')
        # ... and a dashed vertical line at the value from the unpermuted column.
        axis.axvline(info.correct_pred, color='k', linestyle='dashed', linewidth=1, label='not-permuted')
    plt.show()

In [None]:
# Explain the plot: the dashed vertical line is the value from the unpermuted
# column, the histogram bars are the values from the permuted runs.
print ("--- line is not-permuted calc, others are permuted calcs")
# 11 rows x 3 columns gives 33 subplot slots.
# NOTE(review): the grid size is hard-coded — confirm it matches len(res).
draw_histograms(res, columns, 11,3)