# Exploratory Modeling with Feature Reduction

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

### Split Data into Training and Test Sets

In [2]:
filename = '../../data/processed/feature-eng-clustered.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,...,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5,dataset_cluster,zip_cluster,msa_cluster,state_cluster,eqi_cluster,recpi_cluster
0,1001,1992,0.001287,0.001549,0.002111,0.002216,1.176876,28.052156,0.048744,0.001266,...,1.02056,1.03639,1.251689,0.00217,3,0,0,1,0,0
1,1001,1997,0.001853,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,...,1.245044,1.448812,0.69367,0.001287,3,0,0,1,0,0
2,1001,2002,0.000602,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,...,0.905151,1.007785,1.046863,0.001853,3,0,0,1,0,0
3,1001,2007,0.000898,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,...,0.801518,1.08881,1.051707,0.000602,3,0,0,1,0,0
4,1002,1992,0.002931,0.001549,0.002111,0.000959,1.176876,28.052156,0.020148,0.001266,...,1.02056,1.03639,1.251689,0.003141,3,1,0,1,0,0


In [3]:
X = df['zipcode'].unique()
y = df['zipcode'].unique()

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X = df.drop('avg_eqi_year_5-10', axis=1)
y = df[['zipcode','start_year','avg_eqi_year_5-10']]

In [6]:
X.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5,dataset_cluster,zip_cluster,msa_cluster,state_cluster,eqi_cluster,recpi_cluster
0,1001,1992,0.001549,0.002111,0.002216,1.176876,28.052156,0.048744,0.001266,0.002057,...,1.02056,1.03639,1.251689,0.00217,3,0,0,1,0,0
1,1001,1997,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,0.002573,...,1.245044,1.448812,0.69367,0.001287,3,0,0,1,0,0
2,1001,2002,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,0.002257,...,0.905151,1.007785,1.046863,0.001853,3,0,0,1,0,0
3,1001,2007,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,0.002125,...,0.801518,1.08881,1.051707,0.000602,3,0,0,1,0,0
4,1002,1992,0.001549,0.002111,0.000959,1.176876,28.052156,0.020148,0.001266,0.002057,...,1.02056,1.03639,1.251689,0.003141,3,1,0,1,0,0


In [7]:
y.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1001,1992,0.001287
1,1001,1997,0.001853
2,1001,2002,0.000602
3,1001,2007,0.000898
4,1002,1992,0.002931


In [8]:
X_train = pd.DataFrame(X_train)
X_train.columns = ['zipcode']

X_test = pd.DataFrame(X_test)
X_test.columns = ['zipcode']

y_train = pd.DataFrame(y_train)
y_train.columns = ['zipcode']

y_test = pd.DataFrame(y_test)
y_test.columns = ['zipcode']

In [9]:
X_train = X_train.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
X_test = X_test.merge(X, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_train = y_train.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)
y_test = y_test.merge(y, on='zipcode').sort_values(['zipcode','start_year'], ignore_index=True)

In [10]:
X_train.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5,dataset_cluster,zip_cluster,msa_cluster,state_cluster,eqi_cluster,recpi_cluster
0,1002,1992,0.001549,0.002111,0.000959,1.176876,28.052156,0.020148,0.001266,0.002057,...,1.02056,1.03639,1.251689,0.003141,3,1,0,1,0,0
1,1002,1997,0.001233,0.002386,0.004272,1.112023,41.1745,0.128172,0.001164,0.002573,...,1.245044,1.448812,0.69367,0.002931,3,1,0,1,0,0
2,1002,2002,0.001494,0.00245,0.005398,1.746593,50.71338,0.29147,0.001073,0.002257,...,0.905151,1.007785,1.046863,0.00264,3,1,0,1,0,0
3,1002,2007,0.000702,0.001817,0.000895,1.081845,45.171143,0.068894,0.000758,0.002125,...,0.801518,1.08881,1.051707,0.001622,3,0,0,1,0,0
4,1005,1992,0.000529,0.002111,0.000714,0.039692,28.052156,0.004995,0.000527,0.002057,...,1.02056,1.03639,1.251689,0.001013,3,0,0,1,0,0


In [11]:
y_train.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1002,1992,0.002931
1,1002,1997,0.00264
2,1002,2002,0.001622
3,1002,2007,0.002022
4,1005,1992,0.001055


In [12]:
X_test.head()

Unnamed: 0,zipcode,start_year,year_1_EQI_MSA,year_1_EQI_state,year_1_EQI_zip,year_1_RECPI_MSA,year_1_RECPI_state,year_1_RECPI_zip,year_2_EQI_MSA,year_2_EQI_state,...,recpi_state_change_2,recpi_state_change_3,recpi_state_change_4,avg_eqi_year_1-5,dataset_cluster,zip_cluster,msa_cluster,state_cluster,eqi_cluster,recpi_cluster
0,1001,1992,0.001549,0.002111,0.002216,1.176876,28.052156,0.048744,0.001266,0.002057,...,1.02056,1.03639,1.251689,0.00217,3,0,0,1,0,0
1,1001,1997,0.001233,0.002386,0.00084,1.112023,41.1745,0.035283,0.001164,0.002573,...,1.245044,1.448812,0.69367,0.001287,3,0,0,1,0,0
2,1001,2002,0.001494,0.00245,0.001504,1.746593,50.71338,0.054131,0.001073,0.002257,...,0.905151,1.007785,1.046863,0.001853,3,0,0,1,0,0
3,1001,2007,0.000702,0.001817,0.000611,1.081845,45.171143,0.036063,0.000758,0.002125,...,0.801518,1.08881,1.051707,0.000602,3,0,0,1,0,0
4,1007,1992,0.001549,0.002111,0.001419,1.176876,28.052156,0.017027,0.001266,0.002057,...,1.02056,1.03639,1.251689,0.001102,3,0,0,1,0,0


In [13]:
y_test.head()

Unnamed: 0,zipcode,start_year,avg_eqi_year_5-10
0,1001,1992,0.001287
1,1001,1997,0.001853
2,1001,2002,0.000602
3,1001,2007,0.000898
4,1007,1992,0.001195


In [14]:
X_test = X_test.drop(['zipcode','start_year'], axis=1)
X_train = X_train.drop(['zipcode','start_year'], axis=1)
y_test = y_test[['avg_eqi_year_5-10']].to_numpy().ravel()
y_train = y_train[['avg_eqi_year_5-10']].to_numpy().ravel()

### Data Pipeline

In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import r2_score, mean_gamma_deviance

In [20]:
column_trans = ColumnTransformer([
    ('scale', StandardScaler(), make_column_selector(dtype_include=np.float64)),
    ('dataset_cluster', OneHotEncoder(dtype='int'), ['dataset_cluster']),
    ('zip_cluster', OneHotEncoder(dtype='int'), ['zip_cluster']),
    ('msa_cluster', OneHotEncoder(dtype='int'), ['msa_cluster']),
    ('state_cluster', OneHotEncoder(dtype='int'), ['state_cluster']),
    ('eqi_cluster', OneHotEncoder(dtype='int'), ['eqi_cluster']),
    ('recpi_cluster', OneHotEncoder(dtype='int'), ['recpi_cluster'])
    ], remainder='drop')

column_trans.fit(X_train)

ColumnTransformer(transformers=[('scale', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                ('dataset_cluster', OneHotEncoder(dtype='int'),
                                 ['dataset_cluster']),
                                ('zip_cluster', OneHotEncoder(dtype='int'),
                                 ['zip_cluster']),
                                ('msa_cluster', OneHotEncoder(dtype='int'),
                                 ['msa_cluster']),
                                ('state_cluster', OneHotEncoder(dtype='int'),
                                 ['state_cluster']),
                                ('eqi_cluster', OneHotEncoder(dtype='int'),
                                 ['eqi_cluster']),
                                ('recpi_cluster', OneHotEncoder(dtype='int'),
                                 ['recpi_cluster'])])

## Linear Models

In [21]:
from sklearn import linear_model

### OLS

In [22]:
pipe = Pipeline([('transformer', column_trans), ('ols', linear_model.LinearRegression())])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [24]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.4768521391268796

In [25]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5426951820488843

### Ridge

In [26]:
pipe = Pipeline([('transformer', column_trans), ('ridge', linear_model.Ridge())])

In [27]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [28]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.47689666605293124

In [29]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5426943229350225

### Lasso

In [30]:
pipe = Pipeline([('transformer', column_trans), ('lasso', linear_model.Lasso())])

In [31]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [32]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.0

In [33]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-0.00021962686999033032

## KNN Regressor

In [34]:
from sklearn.neighbors import KNeighborsRegressor

In [35]:
pipe = Pipeline([('transformer', column_trans), ('knn', KNeighborsRegressor())])

In [36]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [37]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.6294739568880735

In [38]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.49400275605401067

## SVM Regressor

In [39]:
from sklearn.svm import SVR

In [40]:
pipe = Pipeline([('transformer', column_trans), ('svr', SVR())])

In [41]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [42]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-1433.5621528159181

In [43]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-1350.4368767212102

## Ensemble Methods

### Random Forest

In [44]:
from sklearn.ensemble import RandomForestRegressor

In [45]:
pipe = Pipeline([('transformer', column_trans), ('rf', RandomForestRegressor(n_jobs=-1))])

In [46]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [47]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.9178528120402455

In [48]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.598658352528512

### Gradient Boosting

In [49]:
from sklearn.ensemble import GradientBoostingRegressor

In [50]:
pipe = Pipeline([('transformer', column_trans), ('gbr', GradientBoostingRegressor())])

In [51]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [52]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.7310372530504303

In [53]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5727063917198174

### AdaBoost

In [54]:
from sklearn.ensemble import AdaBoostRegressor

In [55]:
pipe = Pipeline([('transformer', column_trans), ('adaboost', AdaBoostRegressor())])

In [56]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [57]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-5.495531193428755

In [58]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-5.34837045544623

### Tree Bagging

In [59]:
from sklearn.ensemble import BaggingRegressor

In [60]:
pipe = Pipeline([('transformer', column_trans), ('bagging', BaggingRegressor(n_jobs=-1))])

In [61]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [62]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.8673635737756299

In [63]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5887937255214739

## NN Model

In [64]:
from sklearn.neural_network import MLPRegressor

In [65]:
pipe = Pipeline([('transformer', column_trans), ('perceptron', MLPRegressor())])

In [66]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [67]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

-1233.8478930040455

In [68]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

-381.44443466057834

## Voting Regressor

In [69]:
from sklearn.ensemble import VotingRegressor

In [70]:
vote = VotingRegressor(estimators=[('knn', KNeighborsRegressor()), ('rf', RandomForestRegressor(n_jobs=-1))])

In [71]:
pipe = Pipeline([('transformer', column_trans), ('vote', vote)])

In [72]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('scale', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FB6E8822E0>),
                                                 ('dataset_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['dataset_cluster']),
                                                 ('zip_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['zip_cluster']),
                                                 ('msa_cluster',
                                                  OneHotEncoder(dtype='int'),
                                                  ['msa_cluster']),
                                                 ('state_cluster',
                                           

In [73]:
y_pred = pipe.predict(X_train)
r2_score(y_train, y_pred)

0.8157165865950646

In [74]:
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

0.5754750524016146