# Modeling
**Author: Jaclyn Dwyer**

**Project Goal**: The goal of this project is to predict percentage of Low Birth Weights based on California census tracts' environmental health hazard factors in order to determine how to allocate resources for low birth weight newborns in CA.

## Overview

Models are created in order to get the best predictions for LBW percentages. The models begin by predicting the LBW percentages using only environmental health hazards. Population characteristics are added to later models to see if the predictions for LBW percentages improve. The models are evaluated using the root mean squared error (RMSE) to measure how accurately each model predicts the LBW percentages.

In [24]:
import pandas as pd
pd.set_option('display.max_columns', 100)
from sklearn.model_selection import train_test_split
import numpy as np
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from itertools import combinations
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
%matplotlib inline
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score

In [25]:
#load the merged dataset and discard its 'Unnamed: 0' column in one step
df18 = pd.read_csv('data/merged/df18').drop(columns=['Unnamed: 0'])

In [29]:
#number of distinct counties present in the data
df18.groupby('california_county').ngroups

56

## Environmental Health Hazard Models

### Final Data Preparation

Some final data preparation is done before running our models including creating dummy variables and dropping columns.

In [3]:
#preview the first two rows before any encoding or column removal
df18.head(2)

Unnamed: 0,total_population,california_county,sb_535_disadvantaged,ozone,pm2_5,diesel_pm,drinking_water,pesticides,tox_release,traffic,cleanup_sites,groundwater_threats,haz_waste,imp_water_bodies,solid_waste,pollution_burden_score,lbw,education,linguistic_isolation,unemployment,housing_burden,Pop. Char. Score,less_10_yrs,yrs_11_64,greater_65,hispanic,white,african_american,native_american,asian_american,other,prev_ozone,prev_pm2_5,prev_diesel_pm,prev_drinking_water,prev_tox_release,prev_traffic,prev_groundwater_threats,prev_haz_waste,prev_imp_water_bodies,prev_solid_waste,prev_lbw
0,3174,Fresno,Yes,0.065,15.4,48.524,681.2,2.75,18551.95719,909.14,80.5,45.75,0.795,0,21.75,9.85,7.44,53.3,16.2,17.6,26.0,9.55,18.8,73.6,7.6,65.3,4.2,24.6,0.5,3.5,1.8,0.255228,14.746087,44.23,519.88237,96414.45837,1217.53568,55.75,0.52,0,5.0,5.80253
1,6133,San Bernardino,Yes,0.062,13.31,38.556,904.66,1.37,7494.236622,782.26,66.2,36.0,1.25,5,12.0,10.0,7.04,53.3,33.4,12.3,34.1,9.07,19.7,76.1,4.2,91.1,5.8,0.7,0.3,1.4,0.7,0.465401,13.888224,47.08,604.311803,8122.687693,1232.874128,49.0,1.845,5,2.0,6.38952


In [4]:
#one-hot encode the two categorical columns, dropping the first level of each
cc_dummies, disadvantaged_dummies = (
    pd.get_dummies(df18[column], prefix=prefix, drop_first=True)
    for column, prefix in [('california_county', 'cc'),
                           ('sb_535_disadvantaged', 'disadvantaged')]
)

#append the indicator columns to the right of the existing columns
df18 = pd.concat([df18, cc_dummies, disadvantaged_dummies], axis='columns')

In [5]:
#remove the two raw categorical columns (now one-hot encoded), the population
#characteristic columns and prev_lbw
df18 = df18.drop(columns=[
    'california_county', 'sb_535_disadvantaged',
    'education', 'linguistic_isolation', 'unemployment', 'housing_burden',
    'Pop. Char. Score',
    'less_10_yrs', 'yrs_11_64', 'greater_65',
    'hispanic', 'white', 'african_american', 'native_american',
    'asian_american', 'other',
    'prev_lbw',
])

In [6]:
#preview the first two rows after encoding and dropping columns
df18.head(2)

Unnamed: 0,total_population,ozone,pm2_5,diesel_pm,drinking_water,pesticides,tox_release,traffic,cleanup_sites,groundwater_threats,haz_waste,imp_water_bodies,solid_waste,pollution_burden_score,lbw,prev_ozone,prev_pm2_5,prev_diesel_pm,prev_drinking_water,prev_tox_release,prev_traffic,prev_groundwater_threats,prev_haz_waste,prev_imp_water_bodies,prev_solid_waste,cc_Amador,cc_Butte,cc_Calaveras,cc_Colusa,cc_Contra Costa,cc_Del Norte,cc_El Dorado,cc_Fresno,cc_Glenn,cc_Humboldt,cc_Imperial,cc_Inyo,cc_Kern,cc_Kings,cc_Lake,cc_Lassen,cc_Los Angeles,cc_Madera,cc_Marin,cc_Mariposa,cc_Mendocino,cc_Merced,cc_Mono,cc_Monterey,cc_Napa,cc_Nevada,cc_Orange,cc_Placer,cc_Plumas,cc_Riverside,cc_Sacramento,cc_San Benito,cc_San Bernardino,cc_San Diego,cc_San Francisco,cc_San Joaquin,cc_San Luis Obispo,cc_San Mateo,cc_Santa Barbara,cc_Santa Clara,cc_Santa Cruz,cc_Shasta,cc_Sierra,cc_Siskiyou,cc_Solano,cc_Sonoma,cc_Stanislaus,cc_Sutter,cc_Tehama,cc_Trinity,cc_Tulare,cc_Tuolumne,cc_Ventura,cc_Yolo,cc_Yuba,disadvantaged_Yes
0,3174,0.065,15.4,48.524,681.2,2.75,18551.95719,909.14,80.5,45.75,0.795,0,21.75,9.85,7.44,0.255228,14.746087,44.23,519.88237,96414.45837,1217.53568,55.75,0.52,0,5.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6133,0.062,13.31,38.556,904.66,1.37,7494.236622,782.26,66.2,36.0,1.25,5,12.0,10.0,7.04,0.465401,13.888224,47.08,604.311803,8122.687693,1232.874128,49.0,1.845,5,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### Train Test Split

In [7]:
#every column except the target is a feature
df18_features = df18.drop(columns=['lbw'])

In [8]:
#hold out 20% of the rows as a test set; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df18_features, df18['lbw'], test_size=0.2, random_state=20)

### Baseline Model

In [9]:
#baseline: a plain linear regression fitted on the training split
#(fit returns the estimator itself, so creation and fitting can be chained)
baseline = LinearRegression().fit(X_train, y_train)

In [10]:
#baseline predictions for the train split and then the test split
y_train_pred, y_test_pred = (baseline.predict(split) for split in (X_train, X_test))

In [11]:
#mean squared error of each split (true values first, predictions second) ...
mse, mse_test = (mean_squared_error(actual, predicted)
                 for actual, predicted in [(y_train, y_train_pred), (y_test, y_test_pred)])

#... and its square root, the rmse used to compare models
rmse, rmse_test = np.sqrt(mse), np.sqrt(mse_test)

In [12]:
#r2 of the baseline on the train and test splits (true values first, predictions second)
r2, r2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

In [13]:
#report rmse and r2 for both splits; sep='' lets print do the string conversion
print('baseline train: ', rmse, ' , baseline r2 train: ', r2, sep='')
print('baseline test: ', rmse_test, ' , baseline r2 test: ', r2_test, sep='')

baseline train: 1.4119133882821457 , baseline r2 train: 0.17980185351679834
baseline test: 1.4166620543421815 , baseline r2 test: 0.1297182281465753


The train and test sets have very similar RMSE scores, indicating that overfitting is not a concern at this time.

In [21]:
#rename the county dummy columns to lowercase snake_case without spaces
#(presumably so the names can be joined into a formula string - confirm)
#NOTE(review): keys for single-word counties carry a trailing space, presumably matching
#trailing whitespace in the raw county names - verify against the data, since rename
#silently skips keys that do not match a column
#NOTE(review): 'Pop. Char. Score' was already dropped above, so that entry has no effect here
#NOTE(review): 'cc_Los_Angeles' keeps its capital letters, unlike every other target name
df18.rename(columns = {'Pop. Char. Score': 'pop_char_score', 'cc_Amador ': 'cc_amador', 'cc_Butte ' : 'cc_butte', 
                       'cc_Calaveras ': 'cc_calaveras', 'cc_Colusa ': 'cc_colusa', 'cc_Contra Costa': 'cc_contra_costa', 
                       'cc_Del Norte': 'cc_del_norte', 'cc_El Dorado': 'cc_el_dorado', 'cc_Fresno ': 'cc_fresno', 
                       'cc_Glenn ': 'cc_glenn', 'cc_Humboldt ': 'cc_humboldt', 'cc_Imperial ': 'cc_imperial', 'cc_Inyo ': 'cc_inyo', 
                       'cc_Kern ': 'cc_kern', 'cc_Kings ': 'cc_kings', 'cc_Lake ': 'cc_lake', 'cc_Lassen ': 'cc_lassen', 
                       'cc_Los Angeles ': 'cc_Los_Angeles', 'cc_Madera ': 'cc_madera', 'cc_Marin ': 'cc_marin', 
                       'cc_Mariposa ': 'cc_mariposa', 'cc_Mendocino ': 'cc_mendocino', 'cc_Merced ': 'cc_merced', 'cc_Mono ': 'cc_mono', 
                       'cc_Monterey ': 'cc_monterey', 'cc_Napa ': 'cc_napa', 'cc_Nevada ': 'cc_nevada', 'cc_Orange ': 'cc_orange', 
                       'cc_Placer ': 'cc_placer', 'cc_Plumas ': 'cc_plumas', 'cc_Riverside ': 'cc_riverside', 'cc_Sacramento ': 'cc_sacramento', 
                       'cc_San Benito': 'cc_san_benito', 'cc_San Bernardino': 'cc_san_bernardino', 
                       'cc_San Diego': 'cc_san_diego', 'cc_San Francisco': 'cc_san_francisco', 'cc_San Joaquin': 'cc_san_joaquin', 
                       'cc_San Luis Obispo': 'cc_san_luis_obispo', 'cc_San Mateo': 'cc_san_mateo', 
                       'cc_Santa Barbara': 'cc_santa_barbara', 'cc_Santa Clara': 'cc_santa_clara', 'cc_Santa Cruz': 'cc_santa_cruz', 
                       'cc_Shasta ': 'cc_shasta', 'cc_Sierra ': 'cc_sierra', 'cc_Siskiyou ': 'cc_siskiyou', 'cc_Solano ': 'cc_solano', 
                       'cc_Sonoma ': 'cc_sonoma', 'cc_Stanislaus ': 'cc_stanislaus', 'cc_Sutter ': 'cc_sutter', 'cc_Tehama ': 'cc_tehama', 
                       'cc_Trinity ': 'cc_trinity', 'cc_Tulare ': 'cc_tulare', 'cc_Tuolumne ': 'cc_tuolumne', 'cc_Ventura ': 'cc_ventura', 
                       'cc_Yolo ': 'cc_yolo', 'cc_Yuba ': 'cc_yuba'}, inplace = True)

In [22]:
#check the renamed columns on the first five rows
df18.head()

Unnamed: 0,total_population,ozone,pm2_5,diesel_pm,drinking_water,pesticides,tox_release,traffic,cleanup_sites,groundwater_threats,haz_waste,imp_water_bodies,solid_waste,pollution_burden_score,lbw,prev_ozone,prev_pm2_5,prev_diesel_pm,prev_drinking_water,prev_tox_release,prev_traffic,prev_groundwater_threats,prev_haz_waste,prev_imp_water_bodies,prev_solid_waste,cc_amador,cc_butte,cc_calaveras,cc_colusa,cc_contra_costa,cc_del_norte,cc_el_dorado,cc_fresno,cc_glenn,cc_humboldt,cc_imperial,cc_inyo,cc_kern,cc_kings,cc_lake,cc_lassen,cc_Los_Angeles,cc_madera,cc_marin,cc_mariposa,cc_mendocino,cc_merced,cc_mono,cc_monterey,cc_napa,cc_nevada,cc_orange,cc_placer,cc_plumas,cc_riverside,cc_sacramento,cc_san_benito,cc_san_bernardino,cc_san_diego,cc_san_francisco,cc_san_joaquin,cc_san_luis_obispo,cc_san_mateo,cc_santa_barbara,cc_santa_clara,cc_santa_cruz,cc_shasta,cc_sierra,cc_siskiyou,cc_solano,cc_sonoma,cc_stanislaus,cc_sutter,cc_tehama,cc_trinity,cc_tulare,cc_tuolumne,cc_ventura,cc_yolo,cc_yuba,disadvantaged_Yes
0,3174,0.065,15.4,48.524,681.2,2.75,18551.95719,909.14,80.5,45.75,0.795,0,21.75,9.85,7.44,0.255228,14.746087,44.23,519.88237,96414.45837,1217.53568,55.75,0.52,0,5.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6133,0.062,13.31,38.556,904.66,1.37,7494.236622,782.26,66.2,36.0,1.25,5,12.0,10.0,7.04,0.465401,13.888224,47.08,604.311803,8122.687693,1232.874128,49.0,1.845,5,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3167,0.062,15.4,47.445,681.2,3.03,12454.94841,576.52,22.0,30.25,0.2,0,2.5,8.76,10.16,0.304446,14.554656,46.91,519.88237,11826.72494,1012.283892,49.75,0.21,0,0.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,6692,0.046,12.54,24.117,278.76,12.93,2387.782922,1305.01,50.1,132.1,0.795,19,27.0,9.17,6.23,0.022731,9.29311,24.24,262.834254,1115.957173,1582.729128,213.75,6.625,19,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,2206,0.065,15.4,18.846,1000.24,3518.41,21790.70672,435.16,60.0,54.2,13.1,0,50.8,9.88,4.5,0.225687,14.937649,15.66,753.540501,5846.924529,785.580102,100.5,12.76,0,22.2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [23]:
#statsmodels ols: regress lbw on every other column of df18
outcome = 'lbw'
predictors = df18.drop(columns=[outcome])

#formula of the form 'lbw~col_1+col_2+...'
pred_sum = '+'.join(predictors.columns)
formula = f'{outcome}~{pred_sum}'

model = ols(formula=formula, data=df18).fit()
model.summary()

0,1,2,3
Dep. Variable:,lbw,R-squared:,0.176
Model:,OLS,Adj. R-squared:,0.168
Method:,Least Squares,F-statistic:,20.67
Date:,"Tue, 04 May 2021",Prob (F-statistic):,6.869999999999999e-262
Time:,12:18:28,Log-Likelihood:,-13758.0
No. Observations:,7812,AIC:,27680.0
Df Residuals:,7731,BIC:,28240.0
Df Model:,80,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.7873,0.287,20.175,0.000,5.225,6.350
total_population,2.266e-05,8.77e-06,2.583,0.010,5.47e-06,3.99e-05
ozone,-3.7772,6.247,-0.605,0.545,-16.024,8.469
pm2_5,-0.0305,0.023,-1.347,0.178,-0.075,0.014
diesel_pm,0.0059,0.002,2.725,0.006,0.002,0.010
drinking_water,-5.045e-05,0.000,-0.331,0.740,-0.000,0.000
pesticides,8.505e-07,6.2e-06,0.137,0.891,-1.13e-05,1.3e-05
tox_release,-4.651e-06,2.16e-06,-2.150,0.032,-8.89e-06,-4.11e-07
traffic,3.135e-05,3.97e-05,0.790,0.429,-4.64e-05,0.000

0,1,2,3
Omnibus:,329.751,Durbin-Watson:,1.733
Prob(Omnibus):,0.0,Jarque-Bera (JB):,532.96
Skew:,0.371,Prob(JB):,1.86e-116
Kurtosis:,4.043,Cond. No.,7140000.0


### Interactions Model

**Create Interactions**

To try to achieve a lower RMSE score, every pairwise interaction between features is created and evaluated with cross-validation. If an interaction improves on the cross-validated score of the baseline model, it is stored in an interactions list.

In [None]:
#Screen every pairwise feature interaction with 10-fold cross validation and keep
#the ones that beat the score obtained without any interaction.
#NOTE(review): the screening uses all rows of df18, including rows that later land in
#the test split, so the selected interactions have seen the test data - consider
#screening on the training rows only.
regression = LinearRegression()

X = df18.drop('lbw', axis=1)
y = df18['lbw']

crossvalidation = KFold(n_splits=10, shuffle=True, random_state=1)

#reference score without interactions; stored under its own name so the fitted
#`baseline` model from the Baseline Model section is not replaced by a float
baseline_score = np.mean(cross_val_score(regression, X, y, scoring='neg_root_mean_squared_error', cv=crossvalidation))


interactions = []

feat_combinations = combinations(X.columns, 2)

#the single 'interaction' column is overwritten on every iteration, so each candidate
#is evaluated on the original features plus exactly one interaction term
data = X.copy()
for i, (a, b) in enumerate(feat_combinations):
    data['interaction'] = data[a] * data[b]
    score = np.mean(cross_val_score(regression, data, y, scoring='neg_root_mean_squared_error', cv=crossvalidation))
    #scores are negative rmse, so a larger value means a smaller error
    if score > baseline_score:
        interactions.append((a, b, round(score,3)))
    
    #progress indicator
    if i % 50 == 0:
        print(i)

In [None]:
#list of (feature_a, feature_b, rounded cv score) tuples that beat the reference score
interactions

In [None]:
#how many interactions were kept
len(interactions)

In [None]:
def create_interaction(i, dataframe, interactions):
    """Add the i-th interaction to ``dataframe`` in place.

    ``interactions[i]`` supplies two column names in its first two positions.
    Their element-wise product is stored in a new column named
    ``'<first>_and_<second>'``. Nothing is returned.
    """
    first, second = interactions[i][0], interactions[i][1]
    product = dataframe[first] * dataframe[second]
    dataframe[first + '_and_' + second] = product

In [None]:
#add one product column to df18 for every stored interaction
for i, _pair in enumerate(interactions):
    create_interaction(i, df18, interactions)

**Train Test Split**

A second train test split is conducted in order to include the interactions.

In [None]:
#features now include the interaction columns; only the target is removed
df18_features_i = df18.drop(columns=['lbw'])

In [None]:
#same seed and test size as the first split, now on the features with interactions
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    df18_features_i, df18['lbw'], test_size=0.2, random_state=20)

**Create Interaction Model**

In [None]:
#fit to train data

#instantiate a linear regression object
interactions = LinearRegression()

#fit the linear regression to the data
interactions = interactions.fit(X_train_i, y_train_i)

In [None]:
#predict on train and test set
y_train_pred_i = interactions.predict(X_train_i)

y_test_pred_i = interactions.predict(X_test_i)

In [None]:
#mean squared error of each split (true values first, predictions second) ...
mse_i, mse_test_i = (mean_squared_error(actual, predicted)
                     for actual, predicted in [(y_train_i, y_train_pred_i), (y_test_i, y_test_pred_i)])

#... and the corresponding rmse
rmse_i, rmse_test_i = np.sqrt(mse_i), np.sqrt(mse_test_i)

In [None]:
#r2 of the interactions model on the train and test splits
r2_i, r2_test_i = r2_score(y_train_i, y_train_pred_i), r2_score(y_test_i, y_test_pred_i)

In [None]:
#compare the interactions model with the baseline; sep='' lets print convert the numbers
print('baseline train: ', rmse, ' , baseline r2 train: ', r2, sep='')
print('baseline test: ', rmse_test, ' , baseline r2 test: ', r2_test, sep='')
print('interactions train: ', rmse_i, ' , interactions r2 train: ', r2_i, sep='')
print('interactions test: ', rmse_test_i, ' , interactions r2 test: ', r2_test_i, sep='')

Adding the interactions resulted in a lowered rmse for the train set. However, the rmse of the test set increased substantially, indicating that the model is overfitted. In order to try and account for overfitting some features are dropped using feature elimination techniques.

### Select K Best Model

Select K Best is used to eliminate some features in order to account for overfitting and improve the predictions. This model was previously run with k equal to 300, 220, 200, 175, 150, and 100 in order to find the best train and test scores. The results are shown in the graph below.

In [None]:
#train and test scores recorded from earlier runs of the kbest model at each k
f, axes = plt.subplots(1, figsize=(15,5))
k_values = [300, 220, 200, 175, 150, 100]

#label each line when it is drawn, so the legend takes its colours from the lines
#themselves instead of from separately coloured proxy handles
line = sns.lineplot(x=k_values,
                    y=[1.35, 1.37, 1.37, 1.38, 1.39, 1.40],
                    label='Train', ax=axes)
line2 = sns.lineplot(x=k_values,
                     y=[1.82, 1.38, 1.38, 1.38, 1.39, 1.40],
                     label='Test', ax=axes)
line.axes.set_title("Kbest Train vs Test Results",fontsize=18)
line.set_xlabel("K Values",fontsize=15)
line.set_ylabel("Train & Test Scores",fontsize=15)
#build the legend from the labelled lines
line.legend();

The K value of 175 is selected as any values over 175 start to indicate signs of overfitting. While values less than 175 don't show signs of overfitting the train and test rmse scores start to increase slightly compared to 175.

In [None]:
#score each feature against the target with f_regression and keep the 175 best
selector = SelectKBest(score_func=f_regression, k=175)

#the scores are computed on the training split only
selector.fit(X=X_train_i, y=y_train_i)

In [None]:
#boolean mask over the columns of X_train_i marking the selected features
#(the trailing semicolon suppresses the notebook output)
selector.get_support();

In [None]:
#split the column names into kept and discarded using the selector's mask
support_mask = selector.get_support()
selected_k_columns = X_train_i.columns[support_mask]
removed_k_columns = X_train_i.columns[~support_mask]

#linear regression fitted on the selected columns only
kbest = LinearRegression().fit(X_train_i[selected_k_columns], y_train_i)

In [None]:
#kbest predictions for the train split and then the test split, on the selected columns
y_train_pred_k, y_test_pred_k = (kbest.predict(split[selected_k_columns])
                                 for split in (X_train_i, X_test_i))

In [None]:
#mean squared error of each split (true values first, predictions second) ...
mse_k, mse_test_k = (mean_squared_error(actual, predicted)
                     for actual, predicted in [(y_train_i, y_train_pred_k), (y_test_i, y_test_pred_k)])

#... and the corresponding rmse
rmse_k, rmse_test_k = np.sqrt(mse_k), np.sqrt(mse_test_k)

In [None]:
#r2 of the kbest model on the train and test splits
r2_k, r2_test_k = r2_score(y_train_i, y_train_pred_k), r2_score(y_test_i, y_test_pred_k)

In [None]:
#compare all models so far; sep='' lets print convert the numbers
print('baseline train: ', rmse, ' , baseline r2 train: ', r2, sep='')
print('baseline test: ', rmse_test, ' , baseline r2 test: ', r2_test, sep='')
print('interactions train: ', rmse_i, ' , interactions r2 train: ', r2_i, sep='')
print('interactions test: ', rmse_test_i, ' , interactions r2 test: ', r2_test_i, sep='')
print('kbest train: ', rmse_k, ' , r2: ', r2_k, sep='')
print('kbest test: ', rmse_test_k, ' , r2: ', r2_test_k, sep='')

The results from the kbest model show a decrease in the rmse score compared to the baseline model and do not show the signs of overfitting seen in the interactions model.

### Recursive Feature Elimination

One more feature elimination technique is run on the features selected in the kbest model.
A best subset of features is created by a process of eliminating underperforming features of a model one by one.

In [None]:
#plain linear regression used as the estimator inside the eliminator; it has its own
#name so it does not shadow the `ols` formula function imported from statsmodels
rfe_estimator = LinearRegression()
# Create recursive feature eliminator that removes one feature per step and scores
# each feature subset by 7-fold cross-validated negative root mean squared error
selector = RFECV(estimator=rfe_estimator, step=1, cv=7,  scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)

# Fit recursive feature eliminator on the kbest columns, taking the target from the
# same train/test split as the features
selector.fit(X_train_i[selected_k_columns], y_train_i)

In [None]:
#create variables for features selected for model and removed
selected_rfe = X_train_i[selected_k_columns].columns[selector.support_]
removed_rfe = X_train_i[selected_k_columns].columns[~selector.support_]

In [None]:
#number of features kept by recursive feature elimination
len(selected_rfe)

In [None]:
#names of the features kept by recursive feature elimination
selected_rfe

In [None]:
#linear regression fitted on the columns kept by recursive feature elimination
rfe = LinearRegression().fit(X_train_i[selected_rfe], y_train_i)

In [None]:
#rfe predictions for the train split and then the test split, on the kept columns
y_train_pred_rfe, y_test_pred_rfe = (rfe.predict(split[selected_rfe])
                                     for split in (X_train_i, X_test_i))

In [None]:
#mean squared error of each split (true values first, predictions second) ...
mse_rfe, mse_test_rfe = (mean_squared_error(actual, predicted)
                         for actual, predicted in [(y_train_i, y_train_pred_rfe), (y_test_i, y_test_pred_rfe)])

#... and the corresponding rmse
rmse_rfe, rmse_test_rfe = np.sqrt(mse_rfe), np.sqrt(mse_test_rfe)

In [None]:
#r2 of the rfe model on the train and test splits
r2_rfe, r2_test_rfe = r2_score(y_train_i, y_train_pred_rfe), r2_score(y_test_i, y_test_pred_rfe)

In [None]:
#one (label, rmse, r2) row per model and split, printed in a common format
results = [
    ('baseline train', rmse, r2),
    ('baseline test', rmse_test, r2_test),
    ('interactions train', rmse_i, r2_i),
    ('interactions test', rmse_test_i, r2_test_i),
    ('kbest train', rmse_k, r2_k),
    ('kbest test', rmse_test_k, r2_test_k),
    ('rfe train', rmse_rfe, r2_rfe),
    ('rfe test', rmse_test_rfe, r2_test_rfe),
]
for label, error, fit in results:
    print(label, ': ', error, ', r2: ', fit, sep='')

### Random Forest Model

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
#instantiate and fit model
#instantiate the random forest regressor (it is not fitted here; it is handed to the grid search)
#NOTE(review): the parameter grid defined for the grid search also lists random_state,
#which replaces this seed in the searched models - confirm which seed is intended
forest = RandomForestRegressor(random_state = 20)

In [None]:
#name of the sklearn scoring metric used by the grid search (negative rmse, so larger is better)
custom_scorer = 'neg_root_mean_squared_error'

In [None]:
#hyperparameter values to search for the random forest
#NOTE(review): random_state is part of the grid, so the searched forests use seed 0
#rather than the seed given when `forest` was created - confirm which seed is intended
rf_param_grid = dict(
    bootstrap=[True],
    max_depth=[5, 10],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[50, 100, 200, 300],
    random_state=[0],
)

#3-fold grid search over every combination, scored with custom_scorer on all cores
forest_grid_search = GridSearchCV(forest, rf_param_grid, scoring=custom_scorer,
                                  cv=3, n_jobs=-1, verbose=2)

In [None]:
#fit grid search: runs the cross-validated search over rf_param_grid on the training split
forest_grid_search.fit(X_train_i, y_train_i)

In [None]:
#take the model with the best cross-validated score from the grid search
#(refit is left at its default, so this estimator has already been refitted on the
#data passed to the grid search)
gs_forest = forest_grid_search.best_estimator_

In [None]:
#fit the best-parameter model on the training split
#NOTE(review): best_estimator_ is already refitted by the grid search when refit is left
#at its default, so this second fit looks redundant - confirm before removing
gs_forest.fit(X_train_i, y_train_i)

In [None]:
#random forest predictions for the train split and then the test split
y_train_pred_gsrf, y_test_pred_gsrf = (gs_forest.predict(split)
                                       for split in (X_train_i, X_test_i))

In [None]:
#mean squared error of each split (true values first, predictions second) ...
mse_gsrf, mse_test_gsrf = (mean_squared_error(actual, predicted)
                           for actual, predicted in [(y_train_i, y_train_pred_gsrf), (y_test_i, y_test_pred_gsrf)])

#... and the corresponding rmse
rmse_gsrf, rmse_test_gsrf = np.sqrt(mse_gsrf), np.sqrt(mse_test_gsrf)

In [None]:
#r2 of the grid-searched random forest on the train and test splits
r2_gsrf, r2_test_gsrf = r2_score(y_train_i, y_train_pred_gsrf), r2_score(y_test_i, y_test_pred_gsrf)

In [None]:
#one (label, rmse, r2) row per model and split, printed in a common format
results = [
    ('baseline train', rmse, r2),
    ('baseline test', rmse_test, r2_test),
    ('interactions train', rmse_i, r2_i),
    ('interactions test', rmse_test_i, r2_test_i),
    ('kbest train', rmse_k, r2_k),
    ('kbest test', rmse_test_k, r2_test_k),
    ('rfe train', rmse_rfe, r2_rfe),
    ('rfe test', rmse_test_rfe, r2_test_rfe),
    ('GridSearch RF train', rmse_gsrf, r2_gsrf),
    ('GridSearch RF test', rmse_test_gsrf, r2_test_gsrf),
]
for label, error, fit in results:
    print(label, ': ', error, ', r2: ', fit, sep='')