In [31]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import statsmodels.api as sm
import pylab as py
import scipy.stats as stats 
from scipy.stats import norm, skew, probplot
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.decomposition import PCA

In [32]:
from Feat_eng import Y_train, X_train, Y_test, X_test, c1_train, c2_train, c3_train, c1_test, c2_test, c3_test, Y_c1train, Y_c2train, Y_c3train, Y_c1test, Y_c2test ,Y_c3test  

## Run a Lasso Regression

In [33]:
# Run a Lasso Regression on all variables, with the penalty picked by cross-validation.

# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
features = scaler.transform(X_train)
X_std = pd.DataFrame(features, columns = X_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(X_std, Y_train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(X_std,Y_train)

print('Score for training data:', las.score(X_std,Y_train))

# Targets and predictions are passed through np.exp before the RMSE is taken
# (presumably the target is log-transformed upstream -- confirm in Feat_eng).
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = las.predict(X_std)
pred_unlog = np.exp(pred)
Y_unlog = np.exp(Y_train)
train_RMSE = np.sqrt(mean_squared_error(Y_unlog, pred_unlog))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(X_test)
X_test_std = pd.DataFrame(test_features, columns = X_test.columns)

print('Score for test data:', las.score(X_test_std,Y_test))

pred_test = las.predict(X_test_std)
pred_test_unlog = np.exp(pred_test)
Y_test_unlog = np.exp(Y_test)
Test_RMSE = np.sqrt(mean_squared_error(Y_test_unlog, pred_test_unlog))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept,
# so it shows each feature's position in the input columns.
las_imp = pd.DataFrame()
las_imp['Feature'] = X_test_std.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9879606121800647
(not log) RMSE for training data is: 76.62370321766204
Score for test data: 0.9869116433228605
(not log) RMSE for test data is: 83.51169109456808


Unnamed: 0,Feature,Importance,Abs Importance,ranking
128,log_value,0.409725,0.409725,1
126,interest,-0.008158,0.008158,2
40,owner_occupied_housing_units_lower_value_quartile,0.003129,0.003129,3
125,gdp,0.002229,0.002229,4
21,vacant_housing_units,-0.000556,0.000556,5
...,...,...,...,...
41,owner_occupied_housing_units_median_value,0.000000,0.000000,133
39,renter_occupied_housing_units_paying_cash_medi...,0.000000,0.000000,134
38,aggregate_travel_time_to_work,0.000000,0.000000,135
37,commute_45_59_mins,0.000000,0.000000,136


In [34]:
# Start the comparison table with the all-features model:
# RMSE (after np.exp) and R^2 for the training and test splits.
r2_train = las.score(X_std, Y_train)
r2_test = las.score(X_test_std, Y_test)
lasso_output = pd.DataFrame(
    {'all': [train_RMSE, Test_RMSE, r2_train, r2_test]},
    index = ['RMSE_train', 'RMSE_test', 'R2_train', 'R2_test'],
)
lasso_output

Unnamed: 0,all
RMSE_train,76.623703
RMSE_test,83.511691
R2_train,0.987961
R2_test,0.986912


In [35]:
# Persist the all-features Lasso model (`las`) to disk in the working directory.
import pickle
Pkl_Filename = "Pickle_lasso_all.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

## Run a PCA 

In [36]:
# Standardise the training features before the PCA; the fitted scaler is kept in scaler_pca.
scaler_pca = StandardScaler()
features = scaler_pca.fit_transform(X_train)
df_scal = pd.DataFrame(data = features, columns = X_train.columns)

In [37]:
# Fit a PCA on the standardised training features, keeping as many components
# as n_components = 0.95 selects.
pca = PCA(n_components = 0.95)
df_reduced = pca.fit_transform(df_scal)

print(df_reduced.shape)

# From here on df_reduced holds the absolute component loadings instead:
# one row per retained component (labelled 1..k), one column per feature.
component_labels = list(range(1, df_reduced.shape[1] + 1))
df_reduced = pd.DataFrame(pca.components_, columns = X_train.columns, index = component_labels)
df_reduced = df_reduced.abs()


(5122, 35)


In [38]:
# Try to reduce features by finding most important one for each of the dimensions:
# for every component (row), take the feature with the largest absolute loading
# among the features not already chosen for an earlier component.
#
# The search runs on a working copy so that df_reduced itself is not mutated;
# re-running this cell therefore always produces the same `feat` list.
feat = []
remaining = df_reduced.copy()
for i in range(0, remaining.shape[0]):
    ff = remaining.iloc[i].idxmax()
    remaining = remaining.drop(columns = ff)
    feat.append(ff)

In [39]:
# Always keep 'log_value' among the selected features. The membership check keeps
# the cell idempotent on re-run and avoids a duplicated column when the
# loading-based selection already picked 'log_value'.
if 'log_value' not in feat:
    feat.append('log_value')
feat

['total_pop',
 'income_per_capita',
 'dwellings_1_units_detached',
 'vacant_housing_units_for_rent',
 'hispanic_ratio',
 'cpi',
 'in_undergrad_college',
 'white_ratio',
 'median_age',
 'median_year_structure_built',
 'asian_including_hispanic',
 'housing_built_2005_or_later',
 'dwellings_1_units_attached',
 'vacant_housing_units',
 'gdp',
 'interest',
 'amerindian_including_hispanic',
 'male_ratio',
 'gini_index',
 'dwellings_10_to_19_units',
 'employed_public_administration',
 'est',
 'commute_20_24_mins',
 'commute_5_9_mins',
 'percent_income_spent_on_rent',
 'housing_built_2000_to_2004',
 'renter_occupied_housing_units_paying_cash_median_gross_rent',
 'different_house_year_ago_same_city',
 'dwellings_5_to_9_units',
 'different_house_year_ago_different_city',
 'housing_built_1939_or_earlier',
 'dwellings_3_to_4_units',
 'employed_construction',
 'Cluster',
 'employed_information',
 'log_value']

In [40]:
# Restrict both splits to the features selected via the PCA loadings.
X_red_train, X_red_test = X_train[feat], X_test[feat]

In [41]:
# Run Lasso Regression on PCA variables
# (the original features selected through the PCA loadings, not the component scores):

# Standardise using statistics learned from the reduced training split only.
scaler_red = StandardScaler().fit(X_red_train)
features_red = scaler_red.transform(X_red_train)
X_red_std = pd.DataFrame(features_red, columns = X_red_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv_red = LassoCV(alphas = alphas, random_state=0)
lassocv_red.fit(X_red_std, Y_train)
lassocv_red_alpha = lassocv_red.alpha_

# Refit a plain Lasso at the selected penalty.
las_red = Lasso(alpha = lassocv_red_alpha, random_state = 0)
las_red.fit(X_red_std,Y_train)

print('Score for training data:', las_red.score(X_red_std,Y_train))

# Targets and predictions are passed through np.exp before the RMSE is taken.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred_red = np.exp(las_red.predict(X_red_std))
Y_train_red = np.exp(Y_train)
train_RMSE_red = np.sqrt(mean_squared_error(Y_train_red, pred_red))
print('(not log) RMSE for training data is:', train_RMSE_red)

# The test split is scaled with the scaler fitted on the training split.
test_features_red = scaler_red.transform(X_red_test)
X_red_test_std = pd.DataFrame(test_features_red, columns = X_red_test.columns)

print('Score for test data:', las_red.score(X_red_test_std,Y_test))

pred_test_red = np.exp(las_red.predict(X_red_test_std))
Y_test_red = np.exp(Y_test)
Test_RMSE_red = np.sqrt(mean_squared_error(Y_test_red, pred_test_red))
print('(not log) RMSE for test data is:', Test_RMSE_red)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp_red = pd.DataFrame()
las_imp_red['Feature'] = X_red_test_std.columns
las_imp_red['Importance'] = list(las_red.coef_)
las_imp_red['Abs Importance'] = abs(las_imp_red['Importance'])
las_imp_red = las_imp_red.sort_values(by = 'Abs Importance', ascending = False)

ranking_red = list(range(1,las_imp_red.shape[0]+1))
las_imp_red['ranking'] = ranking_red
las_imp_red

  return f(*args, **kwargs)


Score for training data: 0.9879161855443958
(not log) RMSE for training data is: 76.57667893090586
Score for test data: 0.9862853629013361
(not log) RMSE for test data is: 86.46023940864983


Unnamed: 0,Feature,Importance,Abs Importance,ranking
35,log_value,0.412199,0.412199,1
15,interest,-0.008393,0.008393,2
14,gdp,0.002033,0.002033,3
13,vacant_housing_units,-0.000747,0.000747,4
26,renter_occupied_housing_units_paying_cash_medi...,0.0,0.0,5
22,commute_20_24_mins,0.0,0.0,6
23,commute_5_9_mins,-0.0,0.0,7
24,percent_income_spent_on_rent,-0.0,0.0,8
25,housing_built_2000_to_2004,0.0,0.0,9
27,different_house_year_ago_same_city,-0.0,0.0,10


In [42]:
# Add the reduced-feature model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las_red.score(X_red_std, Y_train),
    las_red.score(X_red_test_std, Y_test),
)
lasso_output['all_PCA'] = [train_RMSE_red, Test_RMSE_red, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA
RMSE_train,76.623703,76.576679
RMSE_test,83.511691,86.460239
R2_train,0.987961,0.987916
R2_test,0.986912,0.986285


In [43]:
# Persist the reduced-feature Lasso model. This must be `las_red`: at this point
# `las` is still the all-features model, which already has its own pickle file.
Pkl_Filename = "Pickle_lasso_all_PCA.pkl"

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(las_red, file)

## Run a Lasso Regression on the Clusters

In [44]:
# Run Lasso Regression on Cluster 1:

# Standardise using statistics learned from the cluster's training split only.
scaler = StandardScaler().fit(c1_train)
features = scaler.transform(c1_train)
c1_strain = pd.DataFrame(features, columns = c1_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c1_strain, Y_c1train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c1_strain,Y_c1train)

print('Score for training data:', las.score(c1_strain,Y_c1train))

# Targets and predictions are passed through np.exp before the RMSE is taken.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c1_strain))
Y_c1_train = np.exp(Y_c1train)
train_RMSE = np.sqrt(mean_squared_error(Y_c1_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c1_test)
c1_stest = pd.DataFrame(test_features, columns = c1_test.columns)

print('Score for test data:', las.score(c1_stest,Y_c1test))

pred_test = np.exp(las.predict(c1_stest))
Y_c1_test = np.exp(Y_c1test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c1_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c1_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9856466179233528
(not log) RMSE for training data is: 90.9518031091713
Score for test data: 0.9659316461252685
(not log) RMSE for test data is: 139.52754313775404


Unnamed: 0,Feature,Importance,Abs Importance,ranking
128,log_value,0.337409,0.337409,1
126,interest,-0.012264,0.012264,2
40,owner_occupied_housing_units_lower_value_quartile,0.003498,0.003498,3
56,housing_built_1939_or_earlier,0.003458,0.003458,4
85,employed_public_administration,-0.002901,0.002901,5
...,...,...,...,...
39,renter_occupied_housing_units_paying_cash_medi...,0.000000,0.000000,133
38,aggregate_travel_time_to_work,-0.000000,0.000000,134
37,commute_45_59_mins,-0.000000,0.000000,135
36,commute_30_34_mins,-0.000000,0.000000,136


In [45]:
# Add the Cluster 1 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c1_strain, Y_c1train),
    las.score(c1_stest, Y_c1test),
)
lasso_output['Clust_1'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1
RMSE_train,76.623703,76.576679,90.951803
RMSE_test,83.511691,86.460239,139.527543
R2_train,0.987961,0.987916,0.985647
R2_test,0.986912,0.986285,0.965932


In [46]:
# Persist the current `las` (last fitted on the Cluster 1 data) to disk.
Pkl_Filename = "Pickle_lasso_c1.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

In [47]:
# Run a PCA on Cluster 1:
scaler_pca = StandardScaler().fit(c1_train)
features = scaler_pca.transform(c1_train)
df1_scal = pd.DataFrame(features, columns = c1_train.columns)

pca = PCA(n_components = 0.95)
df_reduced = pca.fit_transform(df1_scal)

# Absolute component loadings: one row per retained component, one column per feature.
df_reduced = pd.DataFrame(pca.components_, columns = c1_train.columns, index = list(range(1,df_reduced.shape[1]+1)))
df_reduced = df_reduced.abs()

# For every component take the feature with the largest absolute loading among
# the features not already chosen for an earlier component.
feat1 = []
for i in range(0,df_reduced.shape[0]):
    ff = df_reduced.iloc[i].idxmax()
    df_reduced = df_reduced.drop(columns = ff)
    feat1.append(ff)

# Always keep 'log_value'; the check avoids a duplicated column if it was already selected.
if 'log_value' not in feat1:
    feat1.append('log_value')

# Reduced frames get their own names so that c1_train / c1_test keep all columns;
# overwriting them would make this cell (and the full Cluster 1 cell) give
# different results when re-run.
c1_red_train = c1_train[feat1]
c1_red_test = c1_test[feat1]

# Run Lasso Regression on Cluster 1 with PCA variables
# (the original features selected through the loadings, not the component scores):

scaler = StandardScaler().fit(c1_red_train)
features = scaler.transform(c1_red_train)
c1_strain = pd.DataFrame(features, columns = c1_red_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c1_strain, Y_c1train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c1_strain,Y_c1train)

print('Score for training data:', las.score(c1_strain,Y_c1train))

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c1_strain))
Y_c1_train = np.exp(Y_c1train)
train_RMSE = np.sqrt(mean_squared_error(Y_c1_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c1_red_test)
c1_stest = pd.DataFrame(test_features, columns = c1_red_test.columns)

print('Score for test data:', las.score(c1_stest,Y_c1test))

pred_test = np.exp(las.predict(c1_stest))
Y_c1_test = np.exp(Y_c1test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c1_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c1_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.985351366864904
(not log) RMSE for training data is: 92.05015780140334
Score for test data: 0.9873532085456018
(not log) RMSE for test data is: 79.43623622378539


Unnamed: 0,Feature,Importance,Abs Importance,ranking
33,log_value,0.340175,0.340175,1
6,cpi,-0.011492,0.011492,2
5,employed_public_administration,-0.003192,0.003192,3
23,housing_built_1939_or_earlier,0.002843,0.002843,4
14,gdp,0.001771,0.001771,5
10,asian_ratio,0.00082,0.00082,6
26,housing_built_2000_to_2004,-0.0,0.0,7
21,est,0.0,0.0,8
22,dwellings_3_to_4_units,0.0,0.0,9
24,in_undergrad_college,-0.0,0.0,10


In [48]:
# Add the reduced-feature Cluster 1 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c1_strain, Y_c1train),
    las.score(c1_stest, Y_c1test),
)
lasso_output['Clust_1_PCA'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1,Clust_1_PCA
RMSE_train,76.623703,76.576679,90.951803,92.050158
RMSE_test,83.511691,86.460239,139.527543,79.436236
R2_train,0.987961,0.987916,0.985647,0.985351
R2_test,0.986912,0.986285,0.965932,0.987353


In [49]:
# Persist the current `las` (last fitted on the reduced Cluster 1 features) to disk.
Pkl_Filename = "Pickle_lasso_c1_PCA.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

In [50]:
# Run Lasso Regression on Cluster 2:

# Standardise using statistics learned from the cluster's training split only.
scaler = StandardScaler().fit(c2_train)
features = scaler.transform(c2_train)
c2_strain = pd.DataFrame(features, columns = c2_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c2_strain, Y_c2train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c2_strain,Y_c2train)

print('Score for training data:', las.score(c2_strain,Y_c2train))

# Targets and predictions are passed through np.exp before the RMSE is taken.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c2_strain))
Y_c2_train = np.exp(Y_c2train)
train_RMSE = np.sqrt(mean_squared_error(Y_c2_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c2_test)
c2_stest = pd.DataFrame(test_features, columns = c2_test.columns)

print('Score for test data:', las.score(c2_stest,Y_c2test))

pred_test = np.exp(las.predict(c2_stest))
Y_c2_test = np.exp(Y_c2test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c2_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c2_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9850989331994094
(not log) RMSE for training data is: 67.44935946833942
Score for test data: 0.9887094467042553
(not log) RMSE for test data is: 64.80163947749534


Unnamed: 0,Feature,Importance,Abs Importance,ranking
128,log_value,0.375639,0.375639,1
126,interest,-0.005490,0.005490,2
23,median_rent,0.004420,0.004420,3
40,owner_occupied_housing_units_lower_value_quartile,0.004377,0.004377,4
125,gdp,0.003698,0.003698,5
...,...,...,...,...
42,owner_occupied_housing_units_upper_value_quartile,0.000000,0.000000,133
41,owner_occupied_housing_units_median_value,0.000000,0.000000,134
39,renter_occupied_housing_units_paying_cash_medi...,0.000000,0.000000,135
38,aggregate_travel_time_to_work,0.000000,0.000000,136


In [51]:
# Add the Cluster 2 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c2_strain, Y_c2train),
    las.score(c2_stest, Y_c2test),
)
lasso_output['Clust_2'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1,Clust_1_PCA,Clust_2
RMSE_train,76.623703,76.576679,90.951803,92.050158,67.449359
RMSE_test,83.511691,86.460239,139.527543,79.436236,64.801639
R2_train,0.987961,0.987916,0.985647,0.985351,0.985099
R2_test,0.986912,0.986285,0.965932,0.987353,0.988709


In [52]:
# Persist the current `las` (last fitted on the Cluster 2 data) to disk.
Pkl_Filename = "Pickle_lasso_c2.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

In [53]:
# Run a PCA on Cluster 2:
scaler_pca = StandardScaler().fit(c2_train)
features = scaler_pca.transform(c2_train)
df2_scal = pd.DataFrame(features, columns = c2_train.columns)

pca = PCA(n_components = 0.95)
df_reduced = pca.fit_transform(df2_scal)

# Absolute component loadings: one row per retained component, one column per feature.
df_reduced = pd.DataFrame(pca.components_, columns = c2_train.columns, index = list(range(1,df_reduced.shape[1]+1)))
df_reduced = df_reduced.abs()

# For every component take the feature with the largest absolute loading among
# the features not already chosen for an earlier component.
feat2 = []
for i in range(0,df_reduced.shape[0]):
    ff = df_reduced.iloc[i].idxmax()
    df_reduced = df_reduced.drop(columns = ff)
    feat2.append(ff)

# Always keep 'log_value'; the check avoids a duplicated column if it was already selected.
if 'log_value' not in feat2:
    feat2.append('log_value')

# Reduced frames get their own names so that c2_train / c2_test keep all columns;
# overwriting them would make this cell (and the full Cluster 2 cell) give
# different results when re-run.
c2_red_train = c2_train[feat2]
c2_red_test = c2_test[feat2]

# Run Lasso Regression on Cluster 2 with PCA variables
# (the original features selected through the loadings, not the component scores):

scaler = StandardScaler().fit(c2_red_train)
features = scaler.transform(c2_red_train)
c2_strain = pd.DataFrame(features, columns = c2_red_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c2_strain, Y_c2train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c2_strain,Y_c2train)

print('Score for training data:', las.score(c2_strain,Y_c2train))

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c2_strain))
Y_c2_train = np.exp(Y_c2train)
train_RMSE = np.sqrt(mean_squared_error(Y_c2_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c2_red_test)
c2_stest = pd.DataFrame(test_features, columns = c2_red_test.columns)

print('Score for test data:', las.score(c2_stest,Y_c2test))

pred_test = np.exp(las.predict(c2_stest))
Y_c2_test = np.exp(Y_c2test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c2_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c2_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9847068444022079
(not log) RMSE for training data is: 67.87959306518874
Score for test data: 0.9879513814489449
(not log) RMSE for test data is: 66.66946137859512


Unnamed: 0,Feature,Importance,Abs Importance,ranking
40,log_value,0.383265,0.383265,1
15,interest,-0.006485,0.006485,2
14,gdp,0.003018,0.003018,3
3,median_income,0.001437,0.001437,4
23,employed_arts_entertainment_recreation_accommo...,0.000834,0.000834,5
18,dwellings_2_units,-0.000585,0.000585,6
35,worked_at_home,0.000479,0.000479,7
16,asian_ratio,1.5e-05,1.5e-05,8
29,employed_manufacturing,-0.0,0.0,9
25,dwellings_5_to_9_units,0.0,0.0,10


In [54]:
# Add the reduced-feature Cluster 2 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c2_strain, Y_c2train),
    las.score(c2_stest, Y_c2test),
)
lasso_output['Clust_2_PCA'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1,Clust_1_PCA,Clust_2,Clust_2_PCA
RMSE_train,76.623703,76.576679,90.951803,92.050158,67.449359,67.879593
RMSE_test,83.511691,86.460239,139.527543,79.436236,64.801639,66.669461
R2_train,0.987961,0.987916,0.985647,0.985351,0.985099,0.984707
R2_test,0.986912,0.986285,0.965932,0.987353,0.988709,0.987951


In [55]:
# Persist the current `las` (last fitted on the reduced Cluster 2 features) to disk.
Pkl_Filename = "Pickle_lasso_c2_PCA.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

In [56]:
# Run Lasso Regression on Cluster 3:

# Standardise using statistics learned from the cluster's training split only.
scaler = StandardScaler().fit(c3_train)
features = scaler.transform(c3_train)
c3_strain = pd.DataFrame(features, columns = c3_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c3_strain, Y_c3train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c3_strain,Y_c3train)

print('Score for training data:', las.score(c3_strain,Y_c3train))

# Targets and predictions are passed through np.exp before the RMSE is taken.
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c3_strain))
Y_c3_train = np.exp(Y_c3train)
train_RMSE = np.sqrt(mean_squared_error(Y_c3_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c3_test)
c3_stest = pd.DataFrame(test_features, columns = c3_test.columns)

print('Score for test data:', las.score(c3_stest,Y_c3test))

pred_test = np.exp(las.predict(c3_stest))
Y_c3_test = np.exp(Y_c3test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c3_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c3_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9866795286221771
(not log) RMSE for training data is: 68.53397539099598
Score for test data: 0.9862607098004216
(not log) RMSE for test data is: 76.0035178545983


Unnamed: 0,Feature,Importance,Abs Importance,ranking
128,log_value,0.340628,0.340628,1
126,interest,-0.006348,0.006348,2
125,gdp,0.004728,0.004728,3
23,median_rent,0.003719,0.003719,4
40,owner_occupied_housing_units_lower_value_quartile,0.003375,0.003375,5
...,...,...,...,...
43,married_households,-0.000000,0.000000,133
42,owner_occupied_housing_units_upper_value_quartile,0.000000,0.000000,134
41,owner_occupied_housing_units_median_value,0.000000,0.000000,135
39,renter_occupied_housing_units_paying_cash_medi...,0.000000,0.000000,136


In [57]:
# Add the Cluster 3 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c3_strain, Y_c3train),
    las.score(c3_stest, Y_c3test),
)
lasso_output['Clust_3'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1,Clust_1_PCA,Clust_2,Clust_2_PCA,Clust_3
RMSE_train,76.623703,76.576679,90.951803,92.050158,67.449359,67.879593,68.533975
RMSE_test,83.511691,86.460239,139.527543,79.436236,64.801639,66.669461,76.003518
R2_train,0.987961,0.987916,0.985647,0.985351,0.985099,0.984707,0.98668
R2_test,0.986912,0.986285,0.965932,0.987353,0.988709,0.987951,0.986261


In [58]:
# Persist the current `las` (last fitted on the Cluster 3 data) to disk.
Pkl_Filename = "Pickle_lasso_c3.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)

In [59]:
# Run a PCA on Cluster 3:
scaler_pca = StandardScaler().fit(c3_train)
features = scaler_pca.transform(c3_train)
df3_scal = pd.DataFrame(features, columns = c3_train.columns)

pca = PCA(n_components = 0.95)
df_reduced = pca.fit_transform(df3_scal)

# Absolute component loadings: one row per retained component, one column per feature.
df_reduced = pd.DataFrame(pca.components_, columns = c3_train.columns, index = list(range(1,df_reduced.shape[1]+1)))
df_reduced = df_reduced.abs()

# For every component take the feature with the largest absolute loading among
# the features not already chosen for an earlier component.
feat3 = []
for i in range(0,df_reduced.shape[0]):
    ff = df_reduced.iloc[i].idxmax()
    df_reduced = df_reduced.drop(columns = ff)
    feat3.append(ff)

# Always keep 'log_value'; the check avoids a duplicated column if it was already selected.
if 'log_value' not in feat3:
    feat3.append('log_value')

# Reduced frames get their own names so that c3_train / c3_test keep all columns;
# overwriting them would make this cell (and the full Cluster 3 cell) give
# different results when re-run.
c3_red_train = c3_train[feat3]
c3_red_test = c3_test[feat3]

# Run Lasso Regression on Cluster 3 with PCA variables
# (the original features selected through the loadings, not the component scores):

scaler = StandardScaler().fit(c3_red_train)
features = scaler.transform(c3_red_train)
c3_strain = pd.DataFrame(features, columns = c3_red_train.columns)

# Candidate penalties: 500 log-spaced values from 0.5 * 10**10 down to 0.5 * 10**-2.
alphas = 10**np.linspace(10,-2,500)*0.5

lassocv = LassoCV(alphas = alphas, random_state=0)
lassocv.fit(c3_strain, Y_c3train)
lassocv_alpha = lassocv.alpha_

# Refit a plain Lasso at the selected penalty.
las = Lasso(alpha = lassocv_alpha, random_state = 0)
las.fit(c3_strain,Y_c3train)

print('Score for training data:', las.score(c3_strain,Y_c3train))

# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in newer scikit-learn releases.
pred = np.exp(las.predict(c3_strain))
Y_c3_train = np.exp(Y_c3train)
train_RMSE = np.sqrt(mean_squared_error(Y_c3_train, pred))
print('(not log) RMSE for training data is:', train_RMSE)

# The test split is scaled with the scaler fitted on the training split.
test_features = scaler.transform(c3_red_test)
c3_stest = pd.DataFrame(test_features, columns = c3_red_test.columns)

print('Score for test data:', las.score(c3_stest,Y_c3test))

pred_test = np.exp(las.predict(c3_stest))
Y_c3_test = np.exp(Y_c3test)
Test_RMSE = np.sqrt(mean_squared_error(Y_c3_test, pred_test))
print('(not log) RMSE for test data is:', Test_RMSE)

# Coefficient table sorted by absolute size; the original row index is kept.
las_imp = pd.DataFrame()
las_imp['Feature'] = c3_strain.columns
las_imp['Importance'] = list(las.coef_)
las_imp['Abs Importance'] = abs(las_imp['Importance'])
las_imp = las_imp.sort_values(by = 'Abs Importance', ascending = False)

ranking = list(range(1,las_imp.shape[0]+1))
las_imp['ranking'] = ranking
las_imp

  return f(*args, **kwargs)


Score for training data: 0.9856957034724503
(not log) RMSE for training data is: 72.21554996275913
Score for test data: 0.9818225128890798
(not log) RMSE for test data is: 89.638352256496


Unnamed: 0,Feature,Importance,Abs Importance,ranking
34,log_value,0.345485,0.345485,1
15,gdp,0.005761,0.005761,2
6,cpi,-0.002223,0.002223,3
22,renter_occupied_housing_units_paying_cash_medi...,0.001392,0.001392,4
3,households_retirement_income,-0.000667,0.000667,5
29,housing_built_1939_or_earlier,0.000272,0.000272,6
26,employed_education_health_social,-0.0,0.0,7
21,commute_5_9_mins,-0.0,0.0,8
23,commute_25_29_mins,-0.0,0.0,9
24,housing_built_2005_or_later,-0.0,0.0,10


In [60]:
# Add the reduced-feature Cluster 3 model to the comparison table
# (row order: RMSE train, RMSE test, R^2 train, R^2 test).
r2_train, r2_test = (
    las.score(c3_strain, Y_c3train),
    las.score(c3_stest, Y_c3test),
)
lasso_output['Clust_3_PCA'] = [train_RMSE, Test_RMSE, r2_train, r2_test]
lasso_output

Unnamed: 0,all,all_PCA,Clust_1,Clust_1_PCA,Clust_2,Clust_2_PCA,Clust_3,Clust_3_PCA
RMSE_train,76.623703,76.576679,90.951803,92.050158,67.449359,67.879593,68.533975,72.21555
RMSE_test,83.511691,86.460239,139.527543,79.436236,64.801639,66.669461,76.003518,89.638352
R2_train,0.987961,0.987916,0.985647,0.985351,0.985099,0.984707,0.98668,0.985696
R2_test,0.986912,0.986285,0.965932,0.987353,0.988709,0.987951,0.986261,0.981823


In [61]:
# Persist the current `las` (last fitted on the reduced Cluster 3 features) to disk.
Pkl_Filename = "Pickle_lasso_c3_PCA.pkl"  

# Binary mode is required for pickle output.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(las, file)