# Introduction
This is the Marginal Effects at Means (MEMs) portion of the project.
Please be sure to run `DataCleaning.ipynb` first to prepare the data.

This notebook is based on the fastai v1 ML course.

## Imports
Import libraries and write settings here.

In [18]:
from fastai.tabular import *
from fastai import *

# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas: show up to 60 columns and 60 rows before truncating output
pd.options.display.max_columns = 60
pd.options.display.max_rows = 60

# Display all cell outputs: every bare expression in a cell is echoed, not
# only the last one (a trailing `;` suppresses the echo for a statement)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension: only load it if it has not been loaded yet, so this
# cell can be re-run safely
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# mode 2: re-import edited modules automatically before a cell executes
%autoreload 2

# Visualizations
# NOTE(review): `plotly.plotly` only exists in older plotly releases (it later
# moved to the separate chart_studio package) - confirm the pinned version.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

# cufflinks is switched to offline mode with a white theme
import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

# DETERMINE IF YOU ARE USING RAW OR SCALED
Scaled is Zhao's data, continuous variables divided by standard deviation
<br>Raw is unscaled raw data

In [19]:
# ---- run configuration ----
# True  -> load the pickled random forest from `outdir`
# False -> build a new RandomForestClassifier
useSavedModel=True
# True -> raw data (ccesplus.csv), False -> scaled data (ccesplus_fscaled.csv)
useRaw=False

# ---- output locations ----
outdir = 'outBElectionResultsScaled2'
filename='results.csv'
filename_all = 'results_complete.csv'
# defined once only: a second, identical assignment used to follow below, which
# made it easy to edit one copy and have the other silently win
filename_model_params = "RF_model_params.sav"
os.makedirs(outdir, exist_ok=True)

# the dependent variable(s)
columns_dep_var= ['cc.TrumpGEVote','cc.TrumpPVote', 'cc.vote12.gop']

#get the data
data_path = './data/ccesplus.csv' if useRaw else './data/ccesplus_fscaled.csv'
df = pd.read_csv(data_path, encoding="ISO-8859-1")

## Get the columns of interest, including all dependent vars

In [20]:
# Spreadsheet describing the variables; the columns used here are 'regid',
# 'exclude' and the third column (the variable name).
# NOTE(review): the name `all` shadows Python's builtin all() from here on; it
# is kept unchanged because other cells may refer to it.
all = pd.read_excel('./data/CCESplusVariables.xlsx',encoding = "ISO-8859-1")

# keep the variable names of every row with regid == 1.1 whose 'exclude' flag is not 't'
# NOTE(review): an earlier comment here said regid=3.1 while the code selects 1.1 - confirm.
wanted_rows = (all['regid']==1.1) & (all['exclude']!='t')
selected_vars = all[wanted_rows]
columns = list(selected_vars.iloc[:,2])

# The dependent variables are deliberately left in `columns` at this point.

In [21]:
#WARNING! 'cc.i.white.educhs' is not in df! No worries though:
#it is correlated with / composed of cc.maxeduc.hs and cc.WhiteDum,
#so drop it.
#The membership test keeps this cell re-runnable: a bare list.remove() raises
#ValueError the second time the cell is executed.
if 'cc.i.white.educhs' in columns:
    columns.remove('cc.i.white.educhs')

## Extract just those columns from orig dataframe

In [22]:
# Keep only the selected columns (the dependent variables are among them).
# NOTE: this rebinds `df`, so the full frame read from the CSV is no longer
# reachable under that name afterwards.
df = df[columns]

# Choose which dependent variable to operate on

In [23]:
# The target for this run is the first entry of columns_dep_var; all the other
# candidate targets are collected so they can be dropped from the frame later.
dep_var = columns_dep_var[0]
dump_these_dep_var_columns = [name for name in columns_dep_var if name != dep_var]

## Remove NaN dependent variable rows


In [24]:
def remove_dep_var_nan_rows(df, col_dep_var, dump_these_dep_var_columns ,silent = False):
    """
    Return a new frame that keeps only the rows where df.col_dep_var is not NaN.

    df: dataframe to pull NaN rows out of (it is not modified)
    col_dep_var: name of the dependent variable column
    dump_these_dep_var_columns: other dep_var columns that may be highly correlated
        with dep var; they are removed from the returned frame
    silent: when falsy, print how many rows are being dropped

    returns: a new DataFrame without the NaN rows and without the dumped columns
    """
    has_dep_var = pd.notnull(df[col_dep_var])

    if not silent:
        print(f'dropping {(~has_dep_var).sum()} rows where {col_dep_var}=NaN')

    # Filter first, then drop without inplace: drop() hands back a fresh frame,
    # so there is no need to copy the whole input up front and no in-place
    # mutation of a filtered slice (which can raise SettingWithCopyWarning).
    return df[has_dep_var].drop(dump_these_dep_var_columns, axis=1)

# pull out the nulls from the column of interest
# dep_var is passed (rather than columns_dep_var[0]) so the filtered column
# always matches the chosen target and the complementary list in
# dump_these_dep_var_columns, even when a different target is selected
dftmp= remove_dep_var_nan_rows(df, dep_var, dump_these_dep_var_columns )

dropping 23710 rows where cc.TrumpGEVote=NaN


## Split out categorical and continuous variables
See if fastai can do the split automatically.

In [25]:
# give fastai a shot at splitting cat and cont variables
# max_card=18: presumably integer columns with more than 18 distinct values are
# treated as continuous - confirm against the fastai docs.
# NOTE(review): dftmp still contains dep_var at this point, so the target ends
# up in one of the two lists as well.
res_cont, res_cat = cont_cat_split(dftmp,max_card=18)

## Categorify and Fill Missing

In [26]:
#from docs https://docs.fast.ai/tabular.transform.html
# Build the Categorify transform for the column lists found above and apply it
# to dftmp. The return value is not used, so dftmp is presumably modified in
# place - confirm in the linked docs.
tfm = Categorify(cat_names=res_cat, cont_names=res_cont)
tfm(dftmp)

In [27]:
# Same pattern for FillMissing: build the transform for the two column lists
# and apply it to dftmp. The return value is not used, so dftmp is presumably
# modified in place - confirm in the fastai docs.
tfm1 = FillMissing(cat_names=res_cat, cont_names=res_cont)
tfm1(dftmp)

## Generate train and test set

In [28]:
# split out train/test sets
from sklearn.model_selection import train_test_split

# Fixed seed so that "Restart & Run All" reproduces the same split, and with it
# the same scores further down. 999 mirrors the random_state visible in the
# recorded forest repr; any constant would do.
SPLIT_SEED = 999

# hold out 10% of the rows as the test set
trn, tst = train_test_split(dftmp, test_size=0.1, random_state=SPLIT_SEED)
print(len(trn))
print(len(tst))

36801
4089


In [29]:
#split out trn_y and tst_y
#this is the dep_var, converted to an int
# astype() returns a new Series, so its result has to be assigned; calling it
# as a bare statement leaves the labels unconverted. Both the train and the
# test labels are converted.
trn_y = trn[dep_var].astype('int64')
tst_y = tst[dep_var].astype('int64')

# remove the target from the feature frames; rebinding (instead of
# inplace=True) avoids mutating frames that were sliced out of dftmp
trn = trn.drop(dep_var, axis=1)
tst = tst.drop(dep_var, axis=1)

# Train a RandomForest on all data

In [30]:
import pickle  # explicit: otherwise only available through the fastai star import

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
from sklearn import metrics


if (useSavedModel):
    # load the model from disk
    # SECURITY: pickle.load can execute arbitrary code contained in the file -
    # only load model files that you created yourself.
    # The `with` block closes the file handle even if unpickling fails.
    with open(outdir+"/"+filename_model_params, 'rb') as model_file:
        m_rf = pickle.load(model_file)
else:
    #create a random forest object
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
    # deprecated in newer scikit-learn releases.
    # NOTE(review): the recorded output of the fit cell shows min_samples_leaf=10
    # and random_state=999, which differ from the settings below - presumably
    # that repr comes from the pickled model; confirm which values are intended.
    m_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, max_features='sqrt', min_samples_leaf=5)

here


In [31]:

def rmse(x,y):
    '''
    Root-mean-square error between x and y.

    Together with R**2 this is the metric used for continuous variables.
    x, y: operands supporting `x - y`, `** 2` and `.mean()` on the result
    '''
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m, trn, trn_y, tst, tst_y):
    '''
    Print the model's scores as one list: [train score, test score], followed
    by the out-of-bag score when the model has an `oob_score_` attribute.

    m: fitted model providing .score(X, y)
    trn, trn_y: training features and labels
    tst, tst_y: test features and labels
    '''
    scores = [m.score(features, labels)
              for features, labels in ((trn, trn_y), (tst, tst_y))]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

def eval_accuracy(preds,targs, silent=True):
    '''
    Percentage of positions at which preds and targs hold equal values.

    preds: predictions (must support len(); its length is the sample count)
    targs: targets, compared with preds pairwise
    silent: pass False to also print a one-line summary
    returns: accuracy as a percentage
    '''
    n_samples = len(preds)
    n_correct = sum(1 for guess, truth in zip(preds, targs) if guess == truth)
    accuracy = 100 * n_correct / n_samples
    if silent == False:
        print(f"Got {n_correct} right out of {n_samples} samples, Accuracy is {accuracy} percent")
    return accuracy

In [32]:
#train the random forest
# NOTE(review): fit() runs regardless of useSavedModel, so a model loaded from
# disk is re-trained on the current split - confirm that only its
# hyperparameters are meant to be reused.
m_rf.fit(trn, trn_y)
# prints [train score, test score] plus the OOB score when the model has one
print_score(m_rf, trn, trn_y, tst, tst_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=999, verbose=0, warm_start=False)

[0.9385614521344529, 0.9151381755930545, 0.9162794489280183]


In [33]:
#just another way to see stuff
# preds1 = m_rf.predict(tst)
# eval_accuracy(preds1,tst_y,silent=False);

# Now run MEMs on columns of interest

In [34]:
#this directory contains symlink created at command line like this
# ln -s ../Marginal_Effects_at_Means ./Marginal_Effects_at_Means
#it allows this directory to find Marginal_Effects_at_Means, a directory 1 above this one
#this dir contains a file called mem.py which contains MEMs
from Marginal_Effects_at_Means.mem import MEMs

In [35]:
# Run on the whole dataset or just the test set? Current choice: the whole dataset.
# The model has some idea of how voters will vote based on the input features; let's use that knowledge
# to see what happens when we start changing variables.
# NOTE(review): `df` is the column-selected frame from before the NaN-row
# removal, Categorify and FillMissing steps (those were applied to dftmp), and
# it still contains every dependent-variable column - confirm this is the
# frame MEMs expects.
mems = MEMs(df)

## TODO: For categorical variables, increment to the next integer. For continuous variables, get the standard deviation of each column of interest and use increment = std/1000. See the paper.