# Part III: Regression Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the preprocessed housing table produced by the previous step.
# NOTE(review): the path ends in ".csv" but is read with read_pickle; the recorded
# output below shows the load succeeded, so the file is presumably a pickle with a
# misleading extension — confirm against the notebook that writes it.
# Pickle files can execute arbitrary code when loaded; only load trusted files.
df = pd.read_pickle("../data/housing_step2.csv")

In [3]:
# Show the loaded frame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,log_total_rooms,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-1.149930,-0.049597,-1.129255,0.0,0.0,0.0,1.0,0.0,452600.0
1,-1.322844,1.043185,-0.607019,2.045890,1.357143,0.861439,1.669961,2.332238,0.327041,-0.990381,-0.092512,1.648839,0.0,0.0,0.0,1.0,0.0,358500.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.155620,-1.445865,-0.025843,-0.449227,0.0,0.0,0.0,1.0,0.0,352100.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.156966,-0.493627,-0.050329,-0.636925,0.0,0.0,0.0,1.0,0.0,341300.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.344711,-0.707889,-0.085616,-0.311482,0.0,0.0,0.0,1.0,0.0,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758826,1.801647,-0.289187,-0.444985,-0.388283,-0.512592,-0.443449,-1.216128,-0.155023,0.202517,-0.049110,-0.280761,0.0,1.0,0.0,0.0,0.0,78100.0
20636,-0.818722,1.806329,-0.845393,-0.888704,-0.922403,-0.944405,-1.008420,-0.691593,0.276881,0.039312,0.005021,-1.439476,0.0,1.0,0.0,0.0,0.0,77100.0
20637,-0.823713,1.778237,-0.924851,-0.174995,-0.123608,-0.369537,-0.174042,-1.142593,-0.090318,0.038706,-0.071735,0.122265,0.0,1.0,0.0,0.0,0.0,92300.0
20638,-0.873626,1.778237,-0.845393,-0.355600,-0.304827,-0.604429,-0.393753,-1.054583,-0.040211,0.120501,-0.091225,-0.133391,0.0,1.0,0.0,0.0,0.0,84700.0


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Rows and columns of the training split (the target column is still included here).
train_set.shape

(16512, 18)

In [6]:
# Rows and columns of the held-out test split (the target column is still included here).
test_set.shape

(4128, 18)

In [7]:
# Separate the regression target from the predictors for the training split.
y_train = train_set['median_house_value']
X_train = train_set.drop(columns=['median_house_value'])

In [8]:
# Separate the regression target from the predictors for the test split.
y_test = test_set['median_house_value']
X_test = test_set.drop(columns=['median_house_value'])

## Model Selection

### Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression

# Baseline model: scikit-learn's LinearRegression with default settings.
lin_reg = LinearRegression()
# Fit on the training split only; the test split is used for evaluation further down.
lin_reg.fit(X_train, y_train)

In [10]:
# Sanity check: compare predictions with the true values on the first five test rows.
# .iloc makes the positional slice explicit.
some_data = X_test.iloc[:5]
some_labels = y_test.iloc[:5]
print("Predictions:", lin_reg.predict(some_data))
# .tolist() yields plain Python floats, so the printed list stays readable;
# list() on a Series yields NumPy scalars, which NumPy >= 2 prints as np.float64(...).
print("Labels:", some_labels.tolist())

Predictions: [ 35415.45231364 141310.2633874  297567.35984304 285256.64686991
 265167.96488489]
Labels: [47700.0, 45800.0, 500001.0, 218600.0, 278000.0]


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean absolute error of the linear model on the held-out test split.
# NOTE(review): every candidate in this "Model Selection" section is scored on the
# test split; if the model is chosen from these numbers, the test score is no longer
# an unbiased estimate — consider cross-validation on the training split instead.
lin_test_predictions = lin_reg.predict(X_test)
lin_mae = mean_absolute_error(y_test, lin_test_predictions)
lin_mae

49569.84570677133

In [17]:
# Root-mean-squared error of the linear model on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root explicitly; this works on every version.
# float() keeps the displayed value a plain number regardless of the NumPy version.
lin_rmse = float(np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test))))
lin_rmse

69025.25137863508

### Decision Tree Regression

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with default settings; the fixed seed makes the fit repeatable.
tree_reg = DecisionTreeRegressor(random_state=42)
# Fit on the training split only.
tree_reg.fit(X_train, y_train)

In [19]:
# Mean absolute error of the decision tree on the held-out test split.
tree_test_predictions = tree_reg.predict(X_test)
tree_mae = mean_absolute_error(y_test, tree_test_predictions)
tree_mae

43046.99055232558

In [20]:
# Root-mean-squared error of the decision tree on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root explicitly; this works on every version.
# float() keeps the displayed value a plain number regardless of the NumPy version.
tree_rmse = float(np.sqrt(mean_squared_error(y_test, tree_reg.predict(X_test))))
tree_rmse

69917.6428609465

### Random Forest Regression

In [21]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; the fixed seed makes the fit repeatable.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit on the training split only.
forest_reg.fit(X_train, y_train)

In [22]:
# Mean absolute error of the random forest on the held-out test split.
forest_test_predictions = forest_reg.predict(X_test)
forest_mae = mean_absolute_error(y_test, forest_test_predictions)
forest_mae

31911.19426841085

In [23]:
# Root-mean-squared error of the random forest on the test split.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root explicitly; this works on every version.
# float() keeps the displayed value a plain number regardless of the NumPy version.
forest_rmse = float(np.sqrt(mean_squared_error(y_test, forest_reg.predict(X_test))))
forest_rmse

49834.638872024654

## Fine Tuning 

### Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids are searched exhaustively:
#   * default bootstrap setting: 3 x 4 = 12 combinations
#   * bootstrap disabled:        2 x 3 = 6 combinations
default_bootstrap_grid = {
    'n_estimators': [3, 10, 30],
    'max_features': [2, 4, 6, 8],
}
no_bootstrap_grid = {
    'bootstrap': [False],
    'n_estimators': [3, 10],
    'max_features': [2, 3, 4],
}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

# A fresh, unfitted forest; the seed keeps every candidate fit repeatable.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validation on the training split, scored by negated MSE
# (scikit-learn maximises scores, so errors are reported with a minus sign).
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [25]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [26]:
# Estimator refit by GridSearchCV with the best parameters (refit is left at its default).
grid_search.best_estimator_

In [29]:
# Report the cross-validated RMSE of every grid candidate.
# Scores are negated MSE values, so flip the sign before taking the square root.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

63402.74635396081 {'max_features': 2, 'n_estimators': 3}
55392.15009342948 {'max_features': 2, 'n_estimators': 10}
52913.810452917896 {'max_features': 2, 'n_estimators': 30}
60331.20000447827 {'max_features': 4, 'n_estimators': 3}
53022.31400261046 {'max_features': 4, 'n_estimators': 10}
50574.23551774511 {'max_features': 4, 'n_estimators': 30}
59449.51310116705 {'max_features': 6, 'n_estimators': 3}
51830.877457991126 {'max_features': 6, 'n_estimators': 10}
49789.01187344432 {'max_features': 6, 'n_estimators': 30}
59422.22908066376 {'max_features': 8, 'n_estimators': 3}
51927.51490681802 {'max_features': 8, 'n_estimators': 10}
49841.32044123972 {'max_features': 8, 'n_estimators': 30}
63635.15185026304 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54367.84715627053 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60813.196349698665 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52945.854083730745 {'bootstrap': False, 'max_features': 3, 'n_estimators'

### Randomized Search

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sample hyper-parameters at random instead of enumerating a grid.
# scipy's randint excludes `high`: n_estimators is drawn from 1..199
# and max_features from 1..7.
param_distribs = dict(
    n_estimators=randint(1, 200),
    max_features=randint(1, 8),
)

# A fresh, unfitted forest; the seed keeps every candidate fit repeatable.
forest_reg = RandomForestRegressor(random_state=42)

# Ten sampled candidates, each scored with 5-fold cross-validation (negated MSE).
# The search's own random_state fixes which candidates are drawn.
rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(X_train, y_train)

In [28]:
# Report the cross-validated RMSE of every sampled candidate.
# Scores are negated MSE values, so flip the sign before taking the square root.
cvres = rnd_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

49075.89452977815 {'max_features': 7, 'n_estimators': 180}
51223.798934508086 {'max_features': 5, 'n_estimators': 15}
50817.40867284987 {'max_features': 3, 'n_estimators': 72}
50410.16140666042 {'max_features': 5, 'n_estimators': 21}
49157.578286618154 {'max_features': 7, 'n_estimators': 122}
50765.8007866212 {'max_features': 3, 'n_estimators': 75}
50666.028775530125 {'max_features': 3, 'n_estimators': 88}
49375.31425397575 {'max_features': 5, 'n_estimators': 100}
50425.9133140494 {'max_features': 3, 'n_estimators': 150}
65388.09259201296 {'max_features': 5, 'n_estimators': 2}
