In [59]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

In [60]:
from sklearn.base import TransformerMixin, BaseEstimator

rooms_ix, bedrooms_ix, population_ix, households_ix=3,4,5,6

class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self,X,y=None):
        
        return self
    
    def transform(self,X,y=None):
        rooms_per_household=X[:,rooms_ix]/X[:,households_ix]
        population_per_household=X[:,population_ix]/X[:,households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X, rooms_per_household,
                         population_per_household,
                        bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household,
                         population_per_household]
        

In [70]:
housing = pd.read_csv('../data_for_ml/housing.csv')
housing_y = housing['median_house_value']
housing_x = housing.drop(columns = ['median_house_value'])
housing_numeric_col = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income']
housing_cat_col = ['ocean_proximity']

In [71]:
numerical_transformers = Pipeline(steps=[
    ('simple_imputer',SimpleImputer()),
    ('attrib_adder', CombinedAttributeAdder()),
    ('scaling', StandardScaler())
])

fullpipeline = ColumnTransformer([
    ('num', numerical_transformers, housing_numeric_col),
    ('cat', OneHotEncoder(), housing_cat_col)
])

In [119]:
# running on whole first
housing_pross = fullpipeline.fit_transform(housing_x)

In [53]:
dtr = DecisionTreeRegressor()
dtr.fit(housing_pross, housing_y)

DecisionTreeRegressor()

In [55]:
some_data = housing_pross[:100]
some_label = housing_y[:100]
preds = dtr.predict(some_data)

In [61]:
tree_mse = mean_squared_error(some_label, preds)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [22]:
housing['income_cat'] = pd.cut(housing['median_income'],
                    bins=[0.,1.5,3.0,4.5,6.0,np.inf],
                    labels = [1,2,3,4,5])

In [27]:
housing['income_cat'].value_counts()/housing.shape[0]

3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: income_cat, dtype: float64

In [36]:
# splitting with stratified shuffling the data
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index,test_index in split.split(housing, housing['income_cat']):
    strait_train_set = housing.loc[train_index] 
    strait_test_set  = housing.loc[test_index]

In [39]:
strait_test_set['income_cat'].value_counts()/strait_test_set.shape[0]

3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64

In [62]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dtr, housing_pross, housing_y,scoring="neg_mean_squared_error", cv=10)


#scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
#scoring="neg_mean_squared_error", cv=10)
#tree_rmse_scores = np.sqrt(-scores)
#

In [64]:
np.sqrt(-scores)

array([119687.88503312,  72863.40731142,  83488.60047963,  76187.7107169 ,
        89571.32929015,  79865.12472443,  67531.88473692, 102574.00709472,
        95123.81881365,  72220.11020519])

### Using Random Forest Regressor

It uses ensemble of various trees and averages over their predictions.


In [73]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_pross, housing_y)

RandomForestRegressor()

#### Scoring the random forest with

In [74]:
score_r_for = cross_val_score(forest_reg, housing_pross, housing_y,
                             scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-score_r_for)

In [75]:
tree_rmse_scores

array([97558.90957667, 47698.75252643, 65492.87769314, 56616.68224428,
       61311.08443543, 59576.08876532, 46483.25294373, 79553.79577879,
       74393.68322745, 49224.93702995])

In [76]:
def display_scored(scores):
    print("Scores:" , scores)
    print("Mean:", scores.mean())
    print("Standard deviation", scores.std())
display_scored(tree_rmse_scores)

Scores: [97558.90957667 47698.75252643 65492.87769314 56616.68224428
 61311.08443543 59576.08876532 46483.25294373 79553.79577879
 74393.68322745 49224.93702995]
Mean: 63791.00642211883
Standard deviation 15330.119855342624


## Techniques For Fine-Tuning Models

### Grid Search
Fiddle with the hyperparameters manually, we find a great combination of hyperparameter values.

### Randomized Search
The grid search approach is fine when you are exploring relatively few combinations,
like in the previous example, but when the hyperparameter search space is large, it is
often preferable to use RandomizedSearchCV instead. This class can be used in much
the same way as the GridSearchCV class, but instead of trying out all possible combi‐
nations, it evaluates a given number of random combinations by selecting a random
value for each hyperparameter at every iteration.

In [77]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features':[2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3,10], 'max_features':[2,3,4]},
]
# n_estimators: The number of trees in the forest.
#
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring = 'neg_mean_squared_error',
                          return_train_score = True)
grid_search.fit(housing_pross, housing_y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [78]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [84]:
crves = grid_search.cv_results_
for mean_score, params in zip(crves['mean_test_score'],crves['params']):
    print(np.sqrt(-mean_score), params)

85665.87447465396 {'max_features': 2, 'n_estimators': 3}
74426.31480774049 {'max_features': 2, 'n_estimators': 10}
70335.76747652866 {'max_features': 2, 'n_estimators': 30}
77690.3752731141 {'max_features': 4, 'n_estimators': 3}
69225.60862305058 {'max_features': 4, 'n_estimators': 10}
68043.8634421903 {'max_features': 4, 'n_estimators': 30}
78447.75109243182 {'max_features': 6, 'n_estimators': 3}
71404.96006910324 {'max_features': 6, 'n_estimators': 10}
68644.53052182941 {'max_features': 6, 'n_estimators': 30}
78284.03821698007 {'max_features': 8, 'n_estimators': 3}
72259.70156933276 {'max_features': 8, 'n_estimators': 10}
68972.3230549032 {'max_features': 8, 'n_estimators': 30}
80294.22429324695 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
72873.1251990916 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
80723.20730028646 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
74063.47006288648 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
80

In [86]:
important_feat = grid_search.best_estimator_.feature_importances_
important_feat

(16,)

16

In [150]:
arrx = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
        'rooms_per_household','population_per_household','bedrooms_per_room']
arrx.extend(
    fullpipeline.named_transformers_['cat'].categories_[0].tolist()
)

In [160]:
"""
'median_income'
'INLAND'

These affect the prediction the most

"""
list(zip(arrx,important_feat))

[('longitude', 0.08020316816710923),
 ('latitude', 0.0761056571934163),
 ('housing_median_age', 0.04051633369611144),
 ('total_rooms', 0.023479810416991726),
 ('total_bedrooms', 0.021125626738222422),
 ('population', 0.023916943195943962),
 ('households', 0.01956604179520294),
 ('median_income', 0.3146426956888565),
 ('rooms_per_household', 0.06372173775252479),
 ('population_per_household', 0.09549174044917362),
 ('bedrooms_per_room', 0.07772014647366733),
 ('<1H OCEAN', 0.018062678146430786),
 ('INLAND', 0.13008751571195715),
 ('ISLAND', 0.0001449959036030404),
 ('NEAR BAY', 0.007498031979024085),
 ('NEAR OCEAN', 0.007716876691764648)]

In [107]:
from sklearn.preprocessing import OneHotEncoder

array = np.array(['cat','cat','dog','dog','goat','goat','lion','cat','lion','dog','lion','goat'])
array = array.reshape(-1,1)

ohe = OneHotEncoder()
ohe.fit(array)

OneHotEncoder()

In [108]:
ohe.categories_

[array(['cat', 'dog', 'goat', 'lion'], dtype='<U4')]

In [111]:
ohe.transform(array).toarray()

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.]])