In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import r2_score
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.tree import DecisionTreeRegressor

In [2]:
dataset_path_USA = os.path.join('../../datasets/USA_houseprices_2014/')
file_name_USA = 'preprocessed_data_USAhouseprices2014.csv'
df = pd.read_csv(dataset_path_USA + file_name_USA)

X = df.drop(['price'], axis = 1)
y = df['price']

# Random Forest

In [3]:
from sklearn.model_selection import RepeatedKFold
import eli5
from eli5.sklearn import PermutationImportance
from eli5 import show_prediction, show_weights

Xfeature_names = X.columns

regressor =  RandomForestRegressor(n_estimators = 5, random_state = 42)

# K-fold cross validation with permutation testing
cv = RepeatedKFold(n_splits=5, n_repeats = 5, random_state = 42)
perm = PermutationImportance(regressor, cv = cv)
perm.fit(X.values,y)

show_weights(perm, feature_names = X.columns.tolist())

Weight,Feature
1.0022  ± 0.3002,sqft_living
0.1743  ± 0.0771,statezip
0.1620  ± 0.0620,city
0.0307  ± 0.0512,sqft_lot
0.0291  ± 0.0336,view
0.0281  ± 0.0539,bathrooms
0.0110  ± 0.0271,condition
0.0106  ± 0.0404,year_since_1st_renovation
0.0100  ± 0.0320,sqft_basement
0.0082  ± 0.0253,bedrooms


In [4]:
df_fi = pd.DataFrame(dict(feature_names=X.columns.tolist(),
                          feat_imp=perm.feature_importances_, 
                          std=perm.feature_importances_std_,
                          ))
df_fi = df_fi.round(4)
df_fi.sort_values('feat_imp', ascending=True)

Unnamed: 0,feature_names,feat_imp,std
12,have_basement,0.0012,0.004
5,waterfront,0.0019,0.0084
4,floors,0.0065,0.0107
0,bedrooms,0.0082,0.0126
8,sqft_basement,0.01,0.016
9,year_since_1st_renovation,0.0106,0.0202
7,condition,0.011,0.0136
1,bathrooms,0.0281,0.0269
6,view,0.0291,0.0168
3,sqft_lot,0.0307,0.0256


In [5]:
from sklearn.model_selection import cross_val_score

baseline_cv_results = cross_val_score(regressor, X, y, cv=6)  # use the same number of cvs
baseline_score = baseline_cv_results.mean()

import plotly.express as px
import plotly.graph_objects as go

# create df with columns = each feature, rows = score for each permutation each cv (600 in your case)
df_results = pd.DataFrame(data=perm.results_, columns=X.columns)
# feat_imps values will be same as perm.feature_importances_, but as a pd.Series with index labels corresponding to the feature names
feat_imps = df_results.mean().sort_values(ascending=False)
# reorder columns from most to least important
df_results = df_results[feat_imps.index]
# create boxplots of full results. pd.melt() is a handy way to reformat into a longform dataframe that plays well with plotly express.
fig = px.box(df_results.melt(), x='variable', y='value', orientation='v')
# add a marker showing the mean feature importance for each
fig.add_trace(go.Scatter(x=feat_imps.index, y=feat_imps.values, mode='markers', marker=dict(color='red'), name = 'Mean'))
fig

# CART

In [7]:
Xfeature_names = X.columns

regressor =  DecisionTreeRegressor(random_state = 42)

# K-fold cross validation with permutation testing
cv = RepeatedKFold(n_splits=5, n_repeats = 5, random_state = 42)
perm = PermutationImportance(regressor, cv = cv)
perm.fit(X.values,y)

show_weights(perm, feature_names = X.columns.tolist())

Weight,Feature
1.0962  ± 0.5690,sqft_living
0.3075  ± 0.1724,statezip
0.2818  ± 0.1718,city
0.0771  ± 0.1419,bathrooms
0.0747  ± 0.1193,sqft_lot
0.0345  ± 0.0747,view
0.0326  ± 0.0905,floors
0.0283  ± 0.0883,year_since_1st_renovation
0.0262  ± 0.0733,bedrooms
0.0167  ± 0.0583,condition


In [8]:
df_fi = pd.DataFrame(dict(feature_names=X.columns.tolist(),
                          feat_imp=perm.feature_importances_, 
                          std=perm.feature_importances_std_,
                          ))
df_fi = df_fi.round(4)
df_fi.sort_values('feat_imp', ascending=True)

Unnamed: 0,feature_names,feat_imp,std
5,waterfront,0.0002,0.0158
12,have_basement,0.005,0.01
8,sqft_basement,0.0166,0.0342
7,condition,0.0167,0.0291
0,bedrooms,0.0262,0.0367
9,year_since_1st_renovation,0.0283,0.0442
4,floors,0.0326,0.0452
6,view,0.0345,0.0373
3,sqft_lot,0.0747,0.0597
1,bathrooms,0.0771,0.071


In [9]:
from sklearn.model_selection import cross_val_score

baseline_cv_results = cross_val_score(regressor, X, y, cv=6)  # use the same number of cvs
baseline_score = baseline_cv_results.mean()

import plotly.express as px
import plotly.graph_objects as go

# create df with columns = each feature, rows = score for each permutation each cv (600 in your case)
df_results = pd.DataFrame(data=perm.results_, columns=X.columns)
# feat_imps values will be same as perm.feature_importances_, but as a pd.Series with index labels corresponding to the feature names
feat_imps = df_results.mean().sort_values(ascending=False)
# reorder columns from most to least important
df_results = df_results[feat_imps.index]
# create boxplots of full results. pd.melt() is a handy way to reformat into a longform dataframe that plays well with plotly express.
fig = px.box(df_results.melt(), x='variable', y='value', orientation='v')
# add a marker showing the mean feature importance for each
fig.add_trace(go.Scatter(x=feat_imps.index, y=feat_imps.values, mode='markers', marker=dict(color='red'), name = 'Mean'))
fig

# Elastic Net

In [11]:
cv_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, .995, 1], eps=0.001, n_alphas=100, fit_intercept=True, 
                        precompute='auto', max_iter=2500, tol=0.0001, cv=5, copy_X=True, verbose=0, n_jobs=-1, positive=False, 
                        random_state=42, selection='cyclic')

cv_model.fit(X, y)

# K-fold cross validation with permutation testing
cv = RepeatedKFold(n_splits=5, n_repeats = 5, random_state = 42)
perm = PermutationImportance(cv_model, cv = cv)
perm.fit(X.values,y)

show_weights(perm, feature_names = X.columns.tolist())

Weight,Feature
1.0038  ± 0.2055,sqft_living
0.0466  ± 0.0197,view
0.0277  ± 0.0196,city
0.0209  ± 0.0172,bedrooms
0.0175  ± 0.0111,condition
0.0106  ± 0.0129,waterfront
0.0091  ± 0.0131,sqft_lot
0.0058  ± 0.0114,sqft_basement
0.0054  ± 0.0124,have_basement
0.0032  ± 0.0048,statezip


In [12]:
df_fi = pd.DataFrame(dict(feature_names=X.columns.tolist(),
                          feat_imp=perm.feature_importances_, 
                          std=perm.feature_importances_std_,
                          ))
df_fi = df_fi.round(4)
df_fi.sort_values('feat_imp', ascending=True)

Unnamed: 0,feature_names,feat_imp,std
9,year_since_1st_renovation,0.0019,0.0019
1,bathrooms,0.0023,0.0026
4,floors,0.0024,0.0023
11,statezip,0.0032,0.0024
12,have_basement,0.0054,0.0062
8,sqft_basement,0.0058,0.0057
3,sqft_lot,0.0091,0.0065
5,waterfront,0.0106,0.0065
7,condition,0.0175,0.0056
0,bedrooms,0.0209,0.0086


In [13]:
from sklearn.model_selection import cross_val_score

baseline_cv_results = cross_val_score(cv_model, X, y, cv=6)  # use the same number of cvs
baseline_score = baseline_cv_results.mean()

import plotly.express as px
import plotly.graph_objects as go

# create df with columns = each feature, rows = score for each permutation each cv (600 in your case)
df_results = pd.DataFrame(data=perm.results_, columns=X.columns)
# feat_imps values will be same as perm.feature_importances_, but as a pd.Series with index labels corresponding to the feature names
feat_imps = df_results.mean().sort_values(ascending=False)
# reorder columns from most to least important
df_results = df_results[feat_imps.index]
# create boxplots of full results. pd.melt() is a handy way to reformat into a longform dataframe that plays well with plotly express.
fig = px.box(df_results.melt(), x='variable', y='value', orientation='v')
# add a marker showing the mean feature importance for each
fig.add_trace(go.Scatter(x=feat_imps.index, y=feat_imps.values, mode='markers', marker=dict(color='red'), name = 'Mean'))
fig

# Linear Regression

In [14]:
lr = LinearRegression()
model = lr.fit(X,y)

# K-fold cross validation with permutation testing
cv = RepeatedKFold(n_splits=5, n_repeats = 5, random_state = 42)
perm = PermutationImportance(model, cv = cv)
perm.fit(X.values,y)

show_weights(perm, feature_names = X.columns.tolist())

Weight,Feature
1.0426  ± 0.2132,sqft_living
0.0463  ± 0.0209,view
0.0284  ± 0.0175,city
0.0237  ± 0.0188,bedrooms
0.0188  ± 0.0121,condition
0.0098  ± 0.0149,waterfront
0.0098  ± 0.0107,sqft_lot
0.0091  ± 0.0111,sqft_basement
0.0068  ± 0.0121,have_basement
0.0033  ± 0.0059,statezip


In [15]:
df_fi = pd.DataFrame(dict(feature_names=X.columns.tolist(),
                          feat_imp=perm.feature_importances_, 
                          std=perm.feature_importances_std_,
                          ))
df_fi = df_fi.round(4)
df_fi.sort_values('feat_imp', ascending=True)

Unnamed: 0,feature_names,feat_imp,std
9,year_since_1st_renovation,0.0017,0.0019
4,floors,0.0018,0.0019
1,bathrooms,0.0022,0.0028
11,statezip,0.0033,0.003
12,have_basement,0.0068,0.006
8,sqft_basement,0.0091,0.0056
3,sqft_lot,0.0098,0.0054
5,waterfront,0.0098,0.0074
7,condition,0.0188,0.0061
0,bedrooms,0.0237,0.0094


In [39]:
from sklearn.model_selection import cross_val_score

baseline_cv_results = cross_val_score(model, X, y, cv=6)  # use the same number of cvs
baseline_score = baseline_cv_results.mean()
baseline_score

0.5435980076711925

In [16]:
import plotly.express as px
import plotly.graph_objects as go

# create df with columns = each feature, rows = score for each permutation each cv (600 in your case)
df_results = pd.DataFrame(data=perm.results_, columns=X.columns)
# feat_imps values will be same as perm.feature_importances_, but as a pd.Series with index labels corresponding to the feature names
feat_imps = df_results.mean().sort_values(ascending=False)
# reorder columns from most to least important
df_results = df_results[feat_imps.index]
# create boxplots of full results. pd.melt() is a handy way to reformat into a longform dataframe that plays well with plotly express.
fig = px.box(df_results.melt(), x='variable', y='value', orientation='v')
# add a marker showing the mean feature importance for each
fig.add_trace(go.Scatter(x=feat_imps.index, y=feat_imps.values, mode='markers', marker=dict(color='red'), name = 'Mean'))
fig