In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.cluster import KMeans
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import plotly.express as px


import plotly.graph_objects as go

from scipy.linalg import toeplitz, block_diag

from spe.mse_estimator import ErrorComparer
from spe.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from spe.forest import BlurredForest
from spe.estimators import kfoldcv, kmeanscv, better_test_est_split, cp_rf_train_test, cp_general_train_test, bag_kfoldcv, bag_kmeanscv

import os

In [41]:


n_estimators = 100
max_depth = 6

In [135]:
df = pd.read_csv('~/Downloads/housing.csv')#, header=None)

In [136]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [137]:
df['median_house_value'] = df.median_house_value / 1000

In [138]:
# df = df.drop('ocean_proximity',axis=1)
df = pd.get_dummies(df)

In [139]:
df.isna().mean()

longitude                     0.000000
latitude                      0.000000
housing_median_age            0.000000
total_rooms                   0.000000
total_bedrooms                0.010029
population                    0.000000
households                    0.000000
median_income                 0.000000
median_house_value            0.000000
ocean_proximity_<1H OCEAN     0.000000
ocean_proximity_INLAND        0.000000
ocean_proximity_ISLAND        0.000000
ocean_proximity_NEAR BAY      0.000000
ocean_proximity_NEAR OCEAN    0.000000
dtype: float64

In [140]:
mean_bdr = df.total_bedrooms.mean()
df['total_bedrooms'] = df.total_bedrooms.fillna(value=mean_bdr)

In [141]:
df.isna().mean()

longitude                     0.0
latitude                      0.0
housing_median_age            0.0
total_rooms                   0.0
total_bedrooms                0.0
population                    0.0
households                    0.0
median_income                 0.0
median_house_value            0.0
ocean_proximity_<1H OCEAN     0.0
ocean_proximity_INLAND        0.0
ocean_proximity_ISLAND        0.0
ocean_proximity_NEAR BAY      0.0
ocean_proximity_NEAR OCEAN    0.0
dtype: float64

In [142]:
X = df.drop('median_house_value',axis=1)
y = df['median_house_value']

In [143]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=.2)

In [144]:
X_tr.shape, X_ts.shape, y_tr.shape, y_ts.shape

((16512, 13), (4128, 13), (16512,), (4128,))

In [145]:
rf = RandomForestRegressor(n_estimators=n_estimators,
                          max_depth=max_depth)
rf.fit(X_tr, y_tr)

In [146]:
# rf_mse = rf.score(X_ts, y_ts)
rf_preds = rf.predict(X_ts)
rf_mae = np.mean(np.fabs(y_ts.values - rf_preds))
rf_mse = np.mean((y_ts.values - rf_preds)**2)

In [147]:
rf_mse, rf_mae

(4184.466175063796, 45.33389695258656)

In [148]:
bf = BlurredForest(n_estimators=n_estimators,  
                  max_depth=max_depth, 
                  bootstrap_type=None)
bf.fit(X_tr, y_tr)

In [149]:
bf_preds = bf.predict(X_ts, full_refit=True)

In [150]:
bf_mae = np.mean(np.fabs(y_ts.values - bf_preds))
bf_mse = np.mean((y_ts.values - bf_preds)**2)

In [151]:
bf_mse, bf_mae

(6807.784268947851, 64.9454099115738)

In [152]:
mae_df = pd.DataFrame({
    'RF': rf_mae,
    'RF_FR': bf_mae
}, index=[0])

In [153]:
mse_df = pd.DataFrame({
    'RF': rf_mse,
    'RF_FR': bf_mse
}, index=[0])

In [154]:
fig = go.Figure()
fig.add_trace(go.Bar(
#     x=['VRF', 'BF_WR', 'BF_FR'], 
    x = mae_df.columns,
    y=mae_df.values.flatten(),
    marker_color=px.colors.qualitative.Plotly,
    text=np.around(mae_df.values.flatten(),3),
    textposition='outside',
))
# fig.add_trace(go.Bar(
#     name='Experimental',
#     x=['GenCp', 'KFCV', 'SPCV'], y=(df).mean(),
#     error_y=dict(type='data', array=[1, 2])
# ))
# fig.update_layout(barmode='group')
# fig.add_hline(y=1., line_color='red')
fig.update_layout(
#     title=f"FC_\u03B4{delta}_snr{snr}_nk{noise_kernel}_nls{noise_length_scale}_nv{noise_nu}_Xk{X_kernel}_Xls{X_length_scale}_Xv{X_nu}",
    title=f"CA Housing Data: Random Forest vs Full Refit Random Forest, n_trees={n_estimators}, depth={max_depth}",
    xaxis_title="Method",
    yaxis_title="MAE",
#     legend_title="Legend Title",
#     font=dict(
#         family="Courier New, monospace",
#         size=18,
#         color="RebeccaPurple"
#     )
)
# barfp = os.path.expanduser(savedir + fig.layout.title['text'] + ".jpeg")
# fig.write_image(os.path.expanduser(barfp))
fig.show()

In [155]:
fig = go.Figure()
fig.add_trace(go.Bar(
#     x=['VRF', 'BF_WR', 'BF_FR'], 
    x = mse_df.columns,
    y=mse_df.values.flatten(),
    marker_color=px.colors.qualitative.Plotly,
    text=np.around(mse_df.values.flatten(),3),
    textposition='outside',
))
# fig.add_trace(go.Bar(
#     name='Experimental',
#     x=['GenCp', 'KFCV', 'SPCV'], y=(df).mean(),
#     error_y=dict(type='data', array=[1, 2])
# ))
# fig.update_layout(barmode='group')
# fig.add_hline(y=1., line_color='red')
fig.update_layout(
#     title=f"FC_\u03B4{delta}_snr{snr}_nk{noise_kernel}_nls{noise_length_scale}_nv{noise_nu}_Xk{X_kernel}_Xls{X_length_scale}_Xv{X_nu}",
    title=f"CA Housing Data: Random Forest vs Full Refit Random Forest, n_trees={n_estimators}, depth={max_depth}",
    xaxis_title="Method",
    yaxis_title="MSE",
#     legend_title="Legend Title",
#     font=dict(
#         family="Courier New, monospace",
#         size=18,
#         color="RebeccaPurple"
#     )
)
# barfp = os.path.expanduser(savedir + fig.layout.title['text'] + ".jpeg")
# fig.write_image(os.path.expanduser(barfp))
fig.show()