# Exploratory Modeling with Feature Reduction

In [251]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### Split Data into Training and Test Sets

In [252]:
filename = '../../data/processed/feature-eng.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,...,recpi_zip_change_4,recpi_msa_change_1,recpi_msa_change_2,recpi_msa_change_3,recpi_msa_change_4,recpi_state_change_1,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5
0,1001,1992,0.001287,0.001549,0.002111,0.002216,1.176877,28.052156,0.048744,0.001266,...,2.333669,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.00217
1,1001,1997,0.001853,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,...,1.378256,0.851011,2.201297,0.567565,0.995583,1.0739,1.245044,1.448812,0.69367,0.001287
2,1001,2002,0.000602,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,...,4.531509,0.74046,1.353943,0.642606,1.292821,0.971759,0.905151,1.007785,1.046863,0.001853
3,1001,2007,0.000898,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,...,0.672958,1.030793,0.855337,1.170414,1.02507,1.089005,0.801518,1.08881,1.051707,0.000602
4,1002,1992,0.002931,0.001549,0.002111,0.000959,1.176877,28.052156,0.020148,0.001266,...,0.392888,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.003141


In [253]:
X = df['zipcode'].unique()
y = df['zipcode'].unique()

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [255]:
X = df.drop('avg_eqi_year_5-10', axis=1)
y = df[['zipcode','start_year','avg_eqi_year_5-10']]

In [256]:
X.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_zip_change_4,recpi_msa_change_1,recpi_msa_change_2,recpi_msa_change_3,recpi_msa_change_4,recpi_state_change_1,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5
0,1001,1992,0.001549,0.002111,0.002216,1.176877,28.052156,0.048744,0.001266,0.002057,...,2.333669,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.00217
1,1001,1997,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,0.002573,...,1.378256,0.851011,2.201297,0.567565,0.995583,1.0739,1.245044,1.448812,0.69367,0.001287
2,1001,2002,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,0.002257,...,4.531509,0.74046,1.353943,0.642606,1.292821,0.971759,0.905151,1.007785,1.046863,0.001853
3,1001,2007,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,0.002125,...,0.672958,1.030793,0.855337,1.170414,1.02507,1.089005,0.801518,1.08881,1.051707,0.000602
4,1002,1992,0.001549,0.002111,0.000959,1.176877,28.052156,0.020148,0.001266,0.002057,...,0.392888,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.003141


In [257]:
y.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1001,1992,0.001287
1,1001,1997,0.001853
2,1001,2002,0.000602
3,1001,2007,0.000898
4,1002,1992,0.002931


In [258]:
X_train = pd.DataFrame(X_train)
X_train.columns = ['zipcode']

X_test = pd.DataFrame(X_test)
X_test.columns = ['zipcode']

y_train = pd.DataFrame(y_train)
y_train.columns = ['zipcode']

y_test = pd.DataFrame(y_test)
y_test.columns = ['zipcode']

In [259]:
X_train = X_train.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
X_test = X_test.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_train = y_train.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_test = y_test.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)

In [260]:
X_train.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_zip_change_4,recpi_msa_change_1,recpi_msa_change_2,recpi_msa_change_3,recpi_msa_change_4,recpi_state_change_1,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5
0,1001,1992,0.001549,0.002111,0.002216,1.176877,28.052156,0.048744,0.001266,0.002057,...,2.333669,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.00217
1,1001,1997,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,0.002573,...,1.378256,0.851011,2.201297,0.567565,0.995583,1.0739,1.245044,1.448812,0.69367,0.001287
2,1001,2002,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,0.002257,...,4.531509,0.74046,1.353943,0.642606,1.292821,0.971759,0.905151,1.007785,1.046863,0.001853
3,1001,2007,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,0.002125,...,0.672958,1.030793,0.855337,1.170414,1.02507,1.089005,0.801518,1.08881,1.051707,0.000602
4,1002,1992,0.001549,0.002111,0.000959,1.176877,28.052156,0.020148,0.001266,0.002057,...,0.392888,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.003141


In [261]:
y_train.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1001,1992,0.001287
1,1001,1997,0.001853
2,1001,2002,0.000602
3,1001,2007,0.000898
4,1002,1992,0.002931


In [262]:
X_test.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_zip_change_4,recpi_msa_change_1,recpi_msa_change_2,recpi_msa_change_3,recpi_msa_change_4,recpi_state_change_1,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5
0,1013,1992,0.001549,0.002111,0.000836,1.176877,28.052156,0.02006,0.001266,0.002057,...,3.247644,0.886418,1.074673,1.215163,0.883213,1.034792,1.02056,1.03639,1.251689,0.001165
1,1013,1997,0.001233,0.002386,0.000986,1.112023,41.1745,0.013807,0.001164,0.002573,...,0.804941,0.851011,2.201297,0.567565,0.995583,1.0739,1.245044,1.448812,0.69367,0.001317
2,1013,2002,0.001494,0.00245,0.000854,1.746593,50.71338,0.016218,0.001073,0.002257,...,2.654792,0.74046,1.353943,0.642606,1.292821,0.971759,0.905151,1.007785,1.046863,0.001629
3,1013,2007,0.000702,0.001817,0.000439,1.081845,45.171143,0.018446,0.000758,0.002125,...,0.803385,1.030793,0.855337,1.170414,1.02507,1.089005,0.801518,1.08881,1.051707,0.000743
4,1029,1992,0.001124,0.002111,0.000168,0.175382,28.052156,0.000168,0.001097,0.002057,...,1.458515,1.056878,1.25089,1.024236,0.923484,1.034792,1.02056,1.03639,1.251689,0.000978


In [263]:
y_test.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1013,1992,0.001317
1,1013,1997,0.001629
2,1013,2002,0.000743
3,1013,2007,0.000804
4,1029,1992,0.000924


In [264]:
X_test = X_test.drop(['zipcode','start_year'], axis=1)
X_train = X_train.drop(['zipcode','start_year'], axis=1)
y_test = y_test[['avg_eqi_year_5-10']].to_numpy().ravel()
y_train = y_train[['avg_eqi_year_5-10']].to_numpy().ravel()

### Data Pipeline

In [265]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import r2_score, mean_gamma_deviance

In [266]:
column_trans = ColumnTransformer([
    ('scale', StandardScaler(), make_column_selector(dtype_include=np.float64))
    ], remainder='drop')

column_trans.fit(X_train)

ColumnTransformer(transformers=[('scale', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])

## Linear Models

In [267]:
from sklearn import linear_model

### OLS

In [268]:
pipe = Pipeline([('transformer', column_trans), ('ols', linear_model.LinearRegression())])

In [269]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('ols', LinearRegression())])

In [270]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.10604068427541535

In [271]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.2412324319252005

### Ridge

In [272]:
pipe = Pipeline([('transformer', column_trans), ('ridge', linear_model.Ridge())])

In [273]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('ridge', Ridge())])

In [274]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.4090779819745761

In [275]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.4203981293465048

### Lasso

In [276]:
pipe = Pipeline([('transformer', column_trans), ('lasso', linear_model.Lasso())])

In [277]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('lasso', Lasso())])

In [278]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.0

In [279]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-0.0002776847582237618

## KNN Regressor

In [280]:
from sklearn.neighbors import KNeighborsRegressor

In [281]:
pipe = Pipeline([('transformer', column_trans), ('knn', KNeighborsRegressor())])

In [282]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('knn', KNeighborsRegressor())])

In [283]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.5930726693934081

In [284]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.3759002557687914

## SVM Regressor

In [285]:
from sklearn.svm import SVR

In [286]:
pipe = Pipeline([('transformer', column_trans), ('svr', SVR())])

In [287]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('svr', SVR())])

In [288]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-397.20368992952837

In [289]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-316.652141134662

## Ensemble Methods

### Random Forest

In [290]:
from sklearn.ensemble import RandomForestRegressor

In [291]:
pipe = Pipeline([('transformer', column_trans), ('rf', RandomForestRegressor(n_jobs=-1))])

In [292]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('rf', RandomForestRegressor(n_jobs=-1))])

In [293]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.9240955981126345

In [294]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.46787602338840084

### Gradient Boosting

In [295]:
from sklearn.ensemble import GradientBoostingRegressor

In [296]:
pipe = Pipeline([('transformer', column_trans), ('gbr', GradientBoostingRegressor())])

In [297]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('gbr', GradientBoostingRegressor())])

In [298]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.6349046086338193

In [299]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5242195244568959

### AdaBoost

In [300]:
from sklearn.ensemble import AdaBoostRegressor

In [301]:
pipe = Pipeline([('transformer', column_trans), ('adaboost', AdaBoostRegressor())])

In [302]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('adaboost', AdaBoostRegressor())])

In [303]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-13.086833659867962

In [304]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-10.350265124867452

### Tree Bagging

In [305]:
from sklearn.ensemble import BaggingRegressor

In [306]:
pipe = Pipeline([('transformer', column_trans), ('bagging', BaggingRegressor(n_jobs=-1))])

In [307]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('bagging', BaggingRegressor(n_jobs=-1))])

In [308]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.8735032581120307

In [309]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.4199717121570723

## NN Model

In [310]:
from sklearn.neural_network import MLPRegressor

In [311]:
pipe = Pipeline([('transformer', column_trans), ('perceptron', MLPRegressor())])

In [312]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('perceptron', MLPRegressor())])

In [313]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-112018.87349521568

In [314]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-30956.003691179016

## Voting Regressor

In [315]:
from sklearn.ensemble import VotingRegressor

In [316]:
vote = VotingRegressor(estimators=[('knn', KNeighborsRegressor()), ('rf', RandomForestRegressor(n_jobs=-1))])

In [317]:
pipe = Pipeline([('transformer', column_trans), ('vote', vote)])

In [318]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000002530F3216A0>)])),
                ('vote',
                 VotingRegressor(estimators=[('knn', KNeighborsRegressor()),
                                             ('rf',
                                              RandomForestRegressor(n_jobs=-1))]))])

In [319]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.80296870264815

In [320]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.4572948236139843