# Data Models

### Import Libraries

In [86]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

#import pydotplus
#import io
from sklearn.tree import export_graphviz
from IPython.display import Image
from IPython.display import display
import seaborn as sns
%matplotlib inline

### Define Functions

In [87]:
def despine():
    sns.despine(left=True, bottom=True)
    
def get_axs(rows, columns, fig_size_width, fig_size_height):
    dims = (fig_size_width, fig_size_height)
    fig, axs = plt.subplots(rows, columns, figsize=dims)
    if(rows*columns>1):
         axs = axs.ravel()
    return axs

def get_accuracy_model(X, Y, model):
    Y_pred = model.predict(X)
    misclassification_rate = np.mean([int(x) for x in Y_pred != Y])
    return 1 - misclassification_rate

def get_accuracy_pred(Y, Y_pred):
    misclassification_rate = np.mean([int(x) for x in Y_pred != Y])
    return 1 - misclassification_rate

def split_dataset(data, train_size_pc, y_col):
    np.random.seed(9001)
    msk = np.random.rand(len(data)) < train_size_pc
    data_train = data[msk]
    data_test = data[~msk]

    x_train = data_train.iloc[:,0:y_col]
    y_train = data_train.iloc[:,y_col]

    x_test = data_test.iloc[:,0:y_col]
    y_test = data_test.iloc[:,y_col]
    return x_train, y_train, x_test, y_test

def set_title_xlabel_ylabel(ax, title, xlabel, ylabel):
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

In [88]:
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.set_style("whitegrid")
sns.set(font_scale=1.3)

## Import Dataset

In [89]:
census_data = pd.read_csv("crime_data.csv", index_col=0)
results = pd.DataFrame([], columns = ["model", "train_score", "test_score"])

### Dropping all rows with missing values

In [90]:
census_data = census_data.dropna(how='any')

### Hot One Encoding Categorical Variables

In [91]:
#categorical
cat_vars = ['year']

split = {}
split_test = {}

def hot_one_encoding(data, cat_vars):
    for var in cat_vars:
        s_var = pd.Series(data[var])
        split[var] = pd.get_dummies(s_var)

        func = lambda x: var + '_'+ str(x)

        cols = list(map(func, list(split[var].columns)[1:]))
        split[var] = split[var].drop(split[var].columns[0], axis=1)
        split[var].columns = cols

        data = data.join(split[var])

        del data[var]
    return data

### Normalizing all quantitative variables

In [92]:
quant_vars = ['msa', 'pop', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'm1', 'm2',
       'm3', 'm4', 'm5', 'i1', 'i2', 'e1', 'e2', 'e3', 'e4', 'e5', 'a1', 'a2',
       'a3', 'a4', 'a5', 'a6', 'a7', 'e6', 'vr', 'mtof', 'firearms',
       'murder_rate']

In [93]:
census_data = census_data.drop(['msa', 'year'], axis=1)
# census_data = hot_one_encoding(census_data, cat_vars)
quant_vars = list(census_data.columns)
quant_vars = ['pop', 'i1', 'i2', 'firearms']
for var in quant_vars:
    var_mean = np.mean(census_data[var])
    var_std = np.std(census_data[var])
    census_data[var + '_std'] = (census_data[var]-var_mean)/var_std
    del census_data[var]

### Train and Test Split

In [94]:
np.random.seed(9001)
msk = np.random.rand(len(census_data)) < 0.75
census_train = census_data[msk]
census_test = census_data[~msk]
features = list(census_data.columns)
features.remove('murder_rate')
end = len(census_data.columns)
x_train = census_train[features]
y_train = census_train['murder_rate']

x_test = census_test[features]
y_test = census_test['murder_rate']

### Baseline Model

In [95]:
#LINEAR REGRESSION
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
y_pred_train = lin_reg.predict(x_train)
y_pred_test = lin_reg.predict(x_test)

In [96]:
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Linear Regression", "train_score":train_score, "test_score":test_score}, ignore_index=True)

In [97]:
x_train_with_constants = sm.add_constant(x_train)
est = sm.OLS(y_train, x_train_with_constants)
est = est.fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:            murder_rate   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     92.87
Date:                Thu, 07 Dec 2017   Prob (F-statistic):               0.00
Time:                        20:21:27   Log-Likelihood:                -6359.6
No. Observations:                2642   AIC:                         1.278e+04
Df Residuals:                    2611   BIC:                         1.296e+04
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            7.2141      4.847      1.488   

### Multiple Models - Ridge, Lasso and Polynomial

#### Ridge with Cross Validation

In [98]:
ridge = RidgeCV()
ridge.fit(x_train, y_train)
y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Ridge Cross Validated", "train_score":train_score, "test_score":test_score}, ignore_index=True)

In [99]:
RidgeCoefficients = pd.DataFrame(list(zip([np.abs(coef) for coef in ridge.coef_], [np.abs(coef)/coef for coef in ridge.coef_], x_train.columns)), columns=['Value', 'Sign', 'Coef'])
RidgeCoefficients = RidgeCoefficients[RidgeCoefficients['Value']>0.1].sort_values(['Value'], ascending=[0])
RidgeCoefficients.index = list(range(0, RidgeCoefficients.shape[0]))
RidgeCoefficients

Unnamed: 0,Value,Sign,Coef
0,31.491006,1.0,r7
1,26.751318,-1.0,r5
2,20.278714,-1.0,m1
3,20.041086,-1.0,a2
4,14.669455,-1.0,r4
5,12.64554,1.0,e1
6,12.203959,1.0,e4
7,11.208201,1.0,r6
8,10.548848,1.0,m3
9,10.152961,1.0,r2


#### Lasso with Cross Validation

In [100]:
lasso = LassoCV()
lasso.fit(x_train, y_train)
y_pred_train = lasso.predict(x_train)
y_pred_test = lasso.predict(x_test)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Lasso Cross Validated", "train_score":train_score, "test_score":test_score}, ignore_index=True)

In [101]:
LassoCoefficients = pd.DataFrame(list(zip([np.abs(coef) for coef in lasso.coef_], [np.abs(coef)/coef for coef in lasso.coef_], x_train.columns)), columns=['Value', 'Sign', 'Coef'])
LassoCoefficients = LassoCoefficients[LassoCoefficients['Value']>0.1].sort_values(['Value'], ascending=[0])
LassoCoefficients.index = list(range(0, LassoCoefficients.shape[0]))
LassoCoefficients

  """Entry point for launching an IPython kernel.


Unnamed: 0,Value,Sign,Coef
0,35.319289,1.0,r7
1,26.304059,-1.0,a2
2,19.096901,-1.0,m1
3,15.19822,1.0,r6
4,14.014587,1.0,r2
5,10.903004,-1.0,r4
6,8.586073,1.0,e4
7,7.662272,1.0,e1
8,4.758945,1.0,m3
9,4.409728,-1.0,r5


#### Polynomial Features - Linear, Ridge and Lasso

In [102]:
poly = PolynomialFeatures(degree = 2)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

lin_reg = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV()

lin_reg.fit(x_train_poly, y_train)
y_pred_train = lin_reg.predict(x_train_poly)
y_pred_test = lin_reg.predict(x_test_poly)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Linear Regression with Polynomial Features", "train_score":train_score, "test_score":test_score}, ignore_index=True)

ridge.fit(x_train_poly, y_train)
y_pred_train = ridge.predict(x_train_poly)
y_pred_test = ridge.predict(x_test_poly)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Lasso Cross Validated with Polynomial Features", "train_score":train_score, "test_score":test_score}, ignore_index=True)

lasso.fit(x_train_poly, y_train)
y_pred_train = lasso.predict(x_train_poly)
y_pred_test = lasso.predict(x_test_poly)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Ridge Cross Validated with Polynomial Features", "train_score":train_score, "test_score":test_score}, ignore_index=True)



#### Interaction Terms

From our EDA we hypothesized that multi-ethnic communnities might have higher murder rates. To test this hypothesis we will use a interaction term that multiplies all the race proportions and add to our train and test models

In [103]:
x_train['multi_ethnic'] = x_train['r1'] * x_train['r2'] * x_train['r3'] * x_train['r4'] * x_train['r5'] * x_train['r6'] * x_train['r7'] 
x_test['multi_ethnic'] = x_test['r1'] * x_test['r2'] * x_test['r3'] * x_test['r4'] * x_test['r5'] * x_test['r6'] * x_test['r7'] 

#### Normalizing

In [104]:
quant_vars = ['multi_ethnic']
for var in quant_vars:
    var_mean = np.mean(x_train[var])
    var_std = np.std(x_train[var])
    x_train[var + '_std'] = (x_train[var]-var_mean)/var_std
    del x_train[var]
for var in quant_vars:
    var_mean = np.mean(x_test[var])
    var_std = np.std(x_test[var])
    x_test[var + '_std'] = (x_test[var]-var_mean)/var_std
    del x_test[var]

#### Modeling

In [105]:
lin_reg.fit(x_train, y_train)
y_pred_train = lin_reg.predict(x_train)
y_pred_test = lin_reg.predict(x_test)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Linear Regression with Interaction", "train_score":train_score, "test_score":test_score}, ignore_index=True)

ridge.fit(x_train, y_train)
y_pred_train = ridge.predict(x_train)
y_pred_test = ridge.predict(x_test)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Lasso Cross Validated with Interaction", "train_score":train_score, "test_score":test_score}, ignore_index=True)

lasso.fit(x_train, y_train)
y_pred_train = lasso.predict(x_train)
y_pred_test = lasso.predict(x_test)
train_score = r2_score(y_train, y_pred_train)
test_score = r2_score(y_test, y_pred_test)
results = results.append({"model":"Ridge Cross Validated with Interaction", "train_score":train_score, "test_score":test_score}, ignore_index=True)

#### Checking for Significance

In [106]:
x_train_with_constants = sm.add_constant(x_train)
est = sm.OLS(y_train, x_train_with_constants)
est = est.fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:            murder_rate   R-squared:                       0.517
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     90.20
Date:                Thu, 07 Dec 2017   Prob (F-statistic):               0.00
Time:                        20:21:32   Log-Likelihood:                -6356.8
No. Observations:                2642   AIC:                         1.278e+04
Df Residuals:                    2610   BIC:                         1.297e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                8.4276      4.871  

## Modelling Results

In [107]:
results.index = results.model

In [108]:
results.drop(['model'], axis=1)

Unnamed: 0_level_0,train_score,test_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Linear Regression,0.516235,0.506611
Ridge Cross Validated,0.507715,0.491949
Lasso Cross Validated,0.50052,0.485036
Linear Regression with Polynomial Features,0.773114,0.546192
Lasso Cross Validated with Polynomial Features,0.662992,0.647014
Ridge Cross Validated with Polynomial Features,0.62649,0.61803
Linear Regression with Interaction,0.517228,0.507337
Lasso Cross Validated with Interaction,0.509099,0.493297
Ridge Cross Validated with Interaction,0.504211,0.489468
