In [1]:
### Before we submit, review which modules we actually use !!!

%load_ext autoreload
%autoreload 2

import warnings
import calendar

import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.pylab as pltlab

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split

import pipeline as pipe
pd.set_option('display.max_columns', None)

warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(rc={'figure.figsize':(11, 4)})

# Data Import & Exploration

In [2]:
data = pd.read_pickle('data/final_dataset.pk1')

In [3]:
pipe.find_outliers(data, 'median_income', 0, 10000000)

Number of outliers found: 0
Outlier values found: []


Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,log_med_income,prop_no_internet,prop_ba,prop_services,pop_density,FIPS,covid_cases,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change


In [4]:
data.head()

Unnamed: 0,name,total_pop,median_income,state,county,county_name,state_name,prop_white,prop_black,prop_hisp,log_med_income,prop_no_internet,prop_ba,prop_services,pop_density,FIPS,covid_cases,Testing_Rate,gov_party,election_diff,Apr-19,Mar-19,Feb-20,Mar-20,days_closed,yearly_change,monthly_change
0,"Washington County, Mississippi",47086,30834.0,28,151,Washington,Mississippi,0.256913,0.721701,0.015482,10.336373,0.336958,0.187937,0.100274,62.178441,28151.0,77,2291.374085,1,-0.364695,6.4,7.5,7.4,7.0,17.0,-0.066667,-0.054054
1,"Perry County, Mississippi",12028,39007.0,28,111,Perry,Mississippi,0.787745,0.196874,0.015048,10.571496,0.310103,0.109539,0.07885,18.434583,28111.0,27,2291.374085,1,0.536635,5.7,6.2,6.8,6.3,17.0,0.016129,-0.073529
2,"Choctaw County, Mississippi",8321,37203.0,28,19,Choctaw,Mississippi,0.676722,0.311982,0.003966,10.524145,0.368837,0.176582,0.014207,19.794546,28019.0,13,2291.374085,1,0.386224,4.8,5.3,5.4,5.0,17.0,-0.056604,-0.074074
3,"Itawamba County, Mississippi",23480,40510.0,28,57,Itawamba,Mississippi,0.909114,0.071593,0.015332,10.609304,0.226281,0.134468,0.057524,44.138916,28057.0,57,2291.374085,1,0.755161,4.1,4.4,4.7,4.7,17.0,0.068182,0.0
4,"Carroll County, Mississippi",10129,43060.0,28,15,Carroll,Mississippi,0.643992,0.345839,0.002863,10.67035,0.332969,0.145006,0.057415,15.774563,28015.0,38,2291.374085,1,0.383321,5.5,5.8,6.2,6.0,17.0,0.034483,-0.032258


In [5]:
pipe.get_summary_stats(data)

          total_pop  median_income   prop_white   prop_black    prop_hisp  \
count  2.826000e+03    2825.000000  2826.000000  2826.000000  2826.000000   
mean   1.117590e+05   51714.241416     0.827329     0.098748     0.091962   
std    3.414925e+05   13821.224638     0.164131     0.149708     0.135249   
min    4.180000e+02   20188.000000     0.093534     0.000000     0.000000   
25%    1.412200e+04   42491.000000     0.757614     0.008110     0.021500   
50%    3.013550e+04   49936.000000     0.889050     0.028182     0.041568   
75%    7.725500e+04   57886.000000     0.947572     0.117748     0.095686   
max    1.009805e+07  136268.000000     0.997743     0.874123     0.990688   

       log_med_income  prop_no_internet      prop_ba  prop_services  \
count     2825.000000       2826.000000  2826.000000    2825.000000   
mean        10.821023          0.225514     0.217787       0.083571   
std          0.251809          0.086132     0.096556       0.034479   
min          9.912844 

After reviewing the summary statistics, we confirmed that the median income is non-negative. We also looked into the potential outlier on population density and confirmed that that value is New York City.

In [6]:
data.dtypes

name                 object
total_pop             int64
median_income       float64
state                object
county               object
county_name          object
state_name           object
prop_white          float64
prop_black          float64
prop_hisp           float64
log_med_income      float64
prop_no_internet    float64
prop_ba             float64
prop_services       float64
pop_density         float64
FIPS                float64
covid_cases           int64
Testing_Rate        float64
gov_party             int64
election_diff       float64
Apr-19              float64
Mar-19              float64
Feb-20              float64
Mar-20              float64
days_closed         float64
yearly_change       float64
monthly_change      float64
dtype: object

# Model Fitting & Evaluation

In [7]:
best_models = {}

## Monthly Data

### With States

In [8]:
# Select variables from full dataframe
df_mo_st = data.drop(['Apr-19','Mar-19','Feb-20','Mar-20','median_income',
                  'yearly_change','FIPS','name','state_name','county_name','county','total_pop'], axis=1)

In [9]:
# One-hot encode state
df_mo_st = pipe.hot_encode(df_mo_st, ['state'])

# Create training and testing sets
train, test = pipe.split_data(df_mo_st)

# Impute/nomralize continuous variables
numeric_cols = train.select_dtypes(include=['float64']).columns
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets
train_features = train.drop('monthly_change', axis=1)
test_features = test.drop('monthly_change', axis=1)

train_target = train['monthly_change']
test_target = test['monthly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


### Ridge

In [10]:
# Set up Pipeline for Ridge
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge(alpha=0.1))
])

alpha_range = np.arange(0.1, 1.1, 0.1)
params = {
          'ridge__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.2min finished


Unnamed: 0,param_ridge__alpha,param_poly__degree,rank_test_score,mean_test_score
0,0.1,1,1,-0.173317
1,0.2,1,2,-0.173346
2,0.3,1,3,-0.173451
3,0.4,1,4,-0.173619
4,0.5,1,5,-0.173839
5,0.6,1,6,-0.174103
6,0.7,1,7,-0.174404
7,0.8,1,8,-0.174737
8,0.9,1,9,-0.175099
9,1.0,1,10,-0.175485


In [11]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['state_ridge'] = d

# Rate coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.1))]) 

Mean Squared Error: 0.19068663539029224
R-Squared: 0.8133947444400503
Adjusted R-Squared: 0.7895569473226116


Unnamed: 0,feature,coefficient,coefficient (absolute)
18,state_08,2.74562,2.745624
41,state_32,2.49338,2.493379
31,state_22,1.73959,1.739590
24,state_15,-1.64312,1.643118
40,state_31,1.61774,1.617736
...,...,...,...
11,election_diff,-0.00676653,0.006767
2,prop_hisp,-0.00434714,0.004347
7,pop_density,-0.00157847,0.001578
8,covid_cases,5.973e-06,0.000006


### Lasso

In [12]:
# Lasso
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(alpha=0.1))
])

params = {
          'lasso__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.2min finished


Unnamed: 0,param_lasso__alpha,param_poly__degree,rank_test_score,mean_test_score
0,0.1,1,1,-0.953522
2,0.2,1,2,-0.991177
18,1.0,1,3,-1.009114
16,0.9,1,4,-1.009132
14,0.8,1,5,-1.009151
12,0.7,1,6,-1.00917
10,0.6,1,7,-1.009188
8,0.5,1,8,-1.009207
6,0.4,1,9,-1.009225
4,0.3,1,10,-1.009244


In [13]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['state_lasso'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('lasso', Lasso(alpha=0.1))]) 

Mean Squared Error: 0.9293975623279228
R-Squared: 0.0904948882231652
Adjusted R-Squared: -0.025689397512797862


Unnamed: 0,feature,coefficient,coefficient (absolute)
2,prop_hisp,0.140502,0.140502
9,Testing_Rate,-0.0447084,0.044708
8,covid_cases,1.83762e-06,0.000002
41,state_32,0,0.000000
47,state_38,-0,0.000000
...,...,...,...
28,state_19,0,0.000000
29,state_20,-0,0.000000
30,state_21,0,0.000000
31,state_22,0,0.000000


### Linear Regression

In [14]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)), ('linreg', LinearRegression())
])

params = {
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.8s finished


Unnamed: 0,param_poly__degree,rank_test_score,mean_test_score
0,1,1,-0.173385
1,2,2,-26581.232478


In [15]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['state_linear'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('linreg', LinearRegression())]) 

Mean Squared Error: 0.19073508819751872
R-Squared: 0.8133473286971663
Adjusted R-Squared: 0.7895034744788403


Unnamed: 0,feature,coefficient,coefficient (absolute)
18,state_08,2.75726,2.757260e+00
41,state_32,2.51579,2.515794e+00
31,state_22,1.74584,1.745843e+00
24,state_15,-1.70401,1.704007e+00
40,state_31,1.62663,1.626633e+00
...,...,...,...
3,log_med_income,-0.00794765,7.947650e-03
2,prop_hisp,-0.00626239,6.262394e-03
7,pop_density,-0.00186651,1.866510e-03
8,covid_cases,6.06054e-06,6.060543e-06


### Elastic Net

In [16]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)), ('en',ElasticNet())
])

params = {'en__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.5min finished


Unnamed: 0,param_en__alpha,param_poly__degree,rank_test_score,mean_test_score
0,0.1,1,1,-0.926325
2,0.2,1,2,-0.956329
4,0.3,1,3,-0.978176
6,0.4,1,4,-0.99394
8,0.5,1,5,-1.008761
18,1.0,1,6,-1.009207
16,0.9,1,7,-1.009216
14,0.8,1,8,-1.009225
12,0.7,1,9,-1.009235
10,0.6,1,10,-1.009244


In [17]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['state_en'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('en', ElasticNet(alpha=0.1))]) 

Mean Squared Error: 0.8875966940733764
R-Squared: 0.13140106755399283
Adjusted R-Squared: 0.020442321692626608


Unnamed: 0,feature,coefficient,coefficient (absolute)
2,prop_hisp,0.187891,0.187891
9,Testing_Rate,-0.0861573,0.086157
1,prop_black,0.0446405,0.044640
5,prop_ba,0.0366324,0.036632
12,days_closed,-0.0243419,0.024342
...,...,...,...
30,state_21,0,0.000000
31,state_22,0,0.000000
33,state_24,-0,0.000000
34,state_25,-0,0.000000


### Without States

In [18]:
# Select variables from full dataframe
df_mo = data.drop(['Apr-19','Mar-19','Feb-20','Mar-20','median_income',
                  'yearly_change','FIPS','name','state_name','county_name','county','state','total_pop'], axis=1)

In [19]:
# Create training and testing sets
train, test = pipe.split_data(df_mo)

# Impute/normalize continuous variables
numeric_cols = train.select_dtypes(include=['float64']).columns #Normalizes state one-hots as well
train, test = pipe.impute_missing(train, test, numeric_cols)
train, test = pipe.normalize(train, test, numeric_cols)

# Separate features and targets                              
train_features = train.drop('monthly_change', axis=1)
test_features = test.drop('monthly_change', axis=1)

train_target = train['monthly_change']
test_target = test['monthly_change']

Training set contains 2260 observations
Testing set contains 566 observation

Imputing log_med_income missing values with median 10.819318178577827
Imputing prop_services missing values with median 0.0790869921304704


### Ridge

In [20]:
# Set up Pipeline for Ridge
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge(alpha=0.1))
])

alpha_range = np.arange(0,1,0.1)
params = {
          'ridge__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.3s finished


Unnamed: 0,param_ridge__alpha,param_poly__degree,rank_test_score,mean_test_score
9,0.9,1,1,-0.86212
8,0.8,1,2,-0.862122
7,0.7,1,3,-0.862124
6,0.6,1,4,-0.862126
5,0.5,1,5,-0.862129
4,0.4,1,6,-0.862131
3,0.3,1,7,-0.862133
2,0.2,1,8,-0.862135
1,0.1,1,9,-0.862137
0,0.0,1,10,-0.862139


In [21]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['ridge'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('ridge', Ridge(alpha=0.9))]) 

Mean Squared Error: 0.8228599553540296
R-Squared: 0.19475220722938713
Adjusted R-Squared: 0.17578803819674593


Unnamed: 0,feature,coefficient,coefficient (absolute)
1,prop_black,0.420519,0.420519
2,prop_hisp,0.356983,0.356983
11,election_diff,0.297739,0.297739
5,prop_ba,0.244667,0.244667
0,prop_white,0.198083,0.198083
10,gov_party,-0.121706,0.121706
9,Testing_Rate,-0.113902,0.113902
6,prop_services,0.101941,0.101941
12,days_closed,-0.0705661,0.070566
7,pop_density,0.0580875,0.058087


### Lasso

In [22]:
# Lasso
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(alpha=0.1))
])

params = {
          'lasso__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.8s finished


Unnamed: 0,param_lasso__alpha,param_poly__degree,rank_test_score,mean_test_score
0,0.0,1,1,-0.862139
2,0.1,1,2,-0.953522
4,0.2,1,3,-0.991177
18,0.9,1,4,-1.009132
16,0.8,1,5,-1.009151
14,0.7,1,6,-1.00917
12,0.6,1,7,-1.009188
10,0.5,1,8,-1.009207
8,0.4,1,9,-1.009225
6,0.3,1,10,-1.009244


In [23]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['lasso'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('lasso', Lasso(alpha=0.0))]) 

Mean Squared Error: 0.8228498973383429
R-Squared: 0.1947620499673629
Adjusted R-Squared: 0.17579811273833346


Unnamed: 0,feature,coefficient,coefficient (absolute)
1,prop_black,0.421956,0.421956
2,prop_hisp,0.357502,0.357502
11,election_diff,0.298371,0.298371
5,prop_ba,0.245248,0.245248
0,prop_white,0.199147,0.199147
10,gov_party,-0.122274,0.122274
9,Testing_Rate,-0.11389,0.11389
6,prop_services,0.102084,0.102084
12,days_closed,-0.0706407,0.070641
7,pop_density,0.0582246,0.058225


### Linear Regression

In [24]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)), ('linreg', LinearRegression())
])

params = {
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


Unnamed: 0,param_poly__degree,rank_test_score,mean_test_score
0,1,1,-0.862139
1,2,2,-9.451854


In [25]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['linear'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('linreg', LinearRegression())]) 

Mean Squared Error: 0.8228498973383425
R-Squared: 0.19476204996736335
Adjusted R-Squared: 0.1757981127383339


Unnamed: 0,feature,coefficient,coefficient (absolute)
1,prop_black,0.421956,0.421956
2,prop_hisp,0.357502,0.357502
11,election_diff,0.298371,0.298371
5,prop_ba,0.245248,0.245248
0,prop_white,0.199147,0.199147
10,gov_party,-0.122274,0.122274
9,Testing_Rate,-0.11389,0.11389
6,prop_services,0.102084,0.102084
12,days_closed,-0.0706407,0.070641
7,pop_density,0.0582246,0.058225


### Elastic Net

In [26]:
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)), ('en',ElasticNet())
])

params = {'en__alpha': alpha_range,
          'poly__degree': (1,2)
         }

grid_model, grid_model_result, cv_results = pipe.run_gridsearch(pipeline, params, train_features, train_target, verbose=1)

cv_results

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.7s finished


Unnamed: 0,param_en__alpha,param_poly__degree,rank_test_score,mean_test_score
0,0.0,1,1,-0.862139
2,0.1,1,2,-0.927865
4,0.2,1,3,-0.956329
6,0.3,1,4,-0.978176
8,0.4,1,5,-0.99394
10,0.5,1,6,-1.008761
18,0.9,1,7,-1.009216
16,0.8,1,8,-1.009225
14,0.7,1,9,-1.009235
12,0.6,1,10,-1.009244


In [27]:
print(grid_model_result.best_estimator_, '\n')

# Evaluate model
pipe.evaluate_model(grid_model, test_features, test_target)

mse, r2, adj = pipe.evaluate_model(grid_model, test_features, test_target)

d = {'model': grid_model_result,
     'mse': mse,
     'r2': r2,
     'adj': adj}

best_models['en'] = d

# Rank coefficients
pipe.rank_coefs(grid_model_result, train_features.columns)

Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=False)),
                ('en', ElasticNet(alpha=0.0))]) 

Mean Squared Error: 0.8228498973383429
R-Squared: 0.1947620499673629
Adjusted R-Squared: 0.17579811273833346
Mean Squared Error: 0.8228498973383429
R-Squared: 0.1947620499673629
Adjusted R-Squared: 0.17579811273833346


Unnamed: 0,feature,coefficient,coefficient (absolute)
1,prop_black,0.421956,0.421956
2,prop_hisp,0.357502,0.357502
11,election_diff,0.298371,0.298371
5,prop_ba,0.245248,0.245248
0,prop_white,0.199147,0.199147
10,gov_party,-0.122274,0.122274
9,Testing_Rate,-0.11389,0.11389
6,prop_services,0.102084,0.102084
12,days_closed,-0.0706407,0.070641
7,pop_density,0.0582246,0.058225


In [28]:
best = pd.DataFrame.from_dict(best_models).transpose()
best.columns = ['Model Objects', 'Mean Squared Error', 'R-Squared', 'Adjusted R-Squared']
best.sort_values(by='Adjusted R-Squared',ascending=False)


Unnamed: 0,Model Objects,Mean Squared Error,R-Squared,Adjusted R-Squared
state_ridge,"GridSearchCV(cv=5,\n estimator=Pip...",0.190687,0.813395,0.789557
state_linear,"GridSearchCV(cv=5,\n estimator=Pip...",0.190735,0.813347,0.789503
linear,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
lasso,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
en,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
ridge,"GridSearchCV(cv=5,\n estimator=Pip...",0.82286,0.194752,0.175788
state_en,"GridSearchCV(cv=5,\n estimator=Pip...",0.887597,0.131401,0.0204423
state_lasso,"GridSearchCV(cv=5,\n estimator=Pip...",0.929398,0.0904949,-0.0256894


In [29]:
best.sort_values(by='Mean Squared Error')

Unnamed: 0,Model Objects,Mean Squared Error,R-Squared,Adjusted R-Squared
state_ridge,"GridSearchCV(cv=5,\n estimator=Pip...",0.190687,0.813395,0.789557
state_linear,"GridSearchCV(cv=5,\n estimator=Pip...",0.190735,0.813347,0.789503
linear,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
lasso,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
en,"GridSearchCV(cv=5,\n estimator=Pip...",0.82285,0.194762,0.175798
ridge,"GridSearchCV(cv=5,\n estimator=Pip...",0.82286,0.194752,0.175788
state_en,"GridSearchCV(cv=5,\n estimator=Pip...",0.887597,0.131401,0.0204423
state_lasso,"GridSearchCV(cv=5,\n estimator=Pip...",0.929398,0.0904949,-0.0256894
