In [92]:
#importing libraries
import numpy as np
import pandas as pd

In [93]:
#loading housing data
housing = pd.read_csv('housing.csv')

In [94]:
#spliting data into test and training set
from sklearn.model_selection import train_test_split

train, test = train_test_split(housing,test_size=0.2,random_state=42)

In [95]:
#display the dimensions of the test and train data
train.shape,test.shape

((16512, 10), (4128, 10))

In [96]:
# importing the SimpleImputer library and using the median strategy to fill missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')

In [97]:
#dropping the 'ocean_promixity' column from the training dataset 
train_num = train.drop('ocean_proximity',axis =1)

In [98]:

# fit the median strategy unto the numerical columns
imputer.fit(train_num)

SimpleImputer(strategy='median')

In [99]:
# display median value of various numerical columns
imputer.statistics_

array([-1.1851e+02,  3.4260e+01,  2.9000e+01,  2.1290e+03,  4.3700e+02,
        1.1670e+03,  4.1000e+02,  3.5458e+00,  1.7985e+05])

In [100]:
#tranform numerical columns with median strategy
X = imputer.transform(train_num)

In [101]:
#converting transformed numerical columns back to a dataframe
train_tr = pd.DataFrame(X,columns=train_num.columns)

In [102]:
# copying the 'ocean_proximity' column into another variable
train_cat =train[['ocean_proximity']]
train_cat.head(10)

Unnamed: 0,ocean_proximity
14196,NEAR OCEAN
8267,NEAR OCEAN
17445,NEAR OCEAN
14265,NEAR OCEAN
2271,INLAND
17848,<1H OCEAN
6252,<1H OCEAN
9389,NEAR BAY
6113,<1H OCEAN
6061,<1H OCEAN


In [103]:
# importing OneHotEncoder to encode categorical data
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
train_cat_1hot = cat_encoder.fit_transform(train_cat)
train_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [104]:
# converting the resulting sparse matrix to array
train_cat_1hot.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [105]:
#feature engineering
train['rooms_per_household'] = train['total_rooms']/train['households']
train['bedrooms_per_room'] = train['total_bedrooms']/train['total_rooms']
train['population_per_household'] = train['population']/train['households']

In [106]:

corr_matrix = train.corr()
corr_matrix['median_house_value'].sort_values(ascending = False)

median_house_value          1.000000
median_income               0.690647
rooms_per_household         0.158485
total_rooms                 0.133989
housing_median_age          0.103706
households                  0.063714
total_bedrooms              0.047980
population_per_household   -0.022030
population                 -0.026032
longitude                  -0.046349
latitude                   -0.142983
bedrooms_per_room          -0.257419
Name: median_house_value, dtype: float64

In [107]:
train_labels =train['median_house_value'].copy()
#drop the labels from the train set via the pandas drop function
train = train.drop('median_house_value',axis =1)
train_num = train.drop('ocean_proximity',axis =1)

train_labels.head()

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

In [108]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN,5.017657,0.200576,3.691814
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN,4.473545,0.232703,1.738095
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN,5.645833,0.174486,2.723214
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN,4.002817,0.258269,3.994366
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND,6.268421,0.18094,2.3


In [109]:
# Feature scaling with Standardardization
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([('imputer',SimpleImputer(strategy ='median')),('std_scaler',StandardScaler())])

train_num_tr = num_pipeline.fit_transform(train_num)

In [110]:
# combine separated numerical columns with categorical column
from sklearn.compose import ColumnTransformer
num_attribs = list(train_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([('num',num_pipeline,num_attribs),('cat',OneHotEncoder(),cat_attribs)])
train_prepared = full_pipeline.fit_transform(train)

In [111]:
train_prepared.shape

(16512, 16)

In [112]:
# importing the decision tree model
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_prepared,train_labels)

DecisionTreeRegressor()

In [113]:
#calculating the root mean square value of the decision tree model on training data
train_predictions = tree_reg.predict(train_prepared)
tree_mse = mean_squared_error(train_labels, train_predictions)
tree_rmse = np.sqrt(tree_mse)

tree_rmse

0.0

In [114]:
#cross validation for decision tree model
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg,train_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [115]:
#displaying the root mean squared error values of all cross validations, the mean of the scores and the standard deviation of all scores
def display_scores(scores):
    print('scores:',scores)
    print('mean:',scores.mean())
    print('standard deviation:',scores.std())
display_scores(tree_rmse_scores)

scores: [66273.86506455 72059.12519435 67695.2179732  71779.44148925
 72611.75008854 65685.27471234 67234.2128703  67421.37729482
 66758.62935786 70369.29778524]
mean: 68788.81918304403
standard deviation: 2496.252219701509


In [116]:

# Loading training data into the RandomForestRegressor model
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(train_prepared,train_labels)

training_predictions = forest_reg.predict(train_prepared)
forest_mse = mean_squared_error(train_labels,train_predictions)

forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.0

In [117]:
#Cross validation of training set
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(tree_reg,train_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

scores: [65222.78217435 69166.91754364 68977.13243318 67834.49278163
 73636.13863165 69374.13541    66766.99302888 67706.38388344
 68631.93313753 70023.1062499 ]
mean: 68734.00152742179
standard deviation: 2107.661149643172


In [118]:
#Hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[3,10,30],'max_features':[2,4,6,8]},{'bootstrap':[False],'n_estimators':[3,10],
'max_features':[2,3,4]}]

forest_reg =RandomForestRegressor()
grid_search = GridSearchCV(forest_reg,param_grid,cv=5,scoring ='neg_mean_squared_error',
return_train_score = True)
grid_search.fit(train_prepared,train_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [119]:
#the optimum hyperparameters for the random forest model
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30)

In [120]:
#display of optimization iterations and the root mean squared value at each step
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'],cvres['params']):
    print(np.sqrt(-mean_score),params)

63968.75448952913 {'max_features': 2, 'n_estimators': 3}
55043.839058982994 {'max_features': 2, 'n_estimators': 10}
52261.26798936238 {'max_features': 2, 'n_estimators': 30}
59561.289855778734 {'max_features': 4, 'n_estimators': 3}
52715.14648651876 {'max_features': 4, 'n_estimators': 10}
50281.69676057713 {'max_features': 4, 'n_estimators': 30}
59348.57296342453 {'max_features': 6, 'n_estimators': 3}
51333.177511957794 {'max_features': 6, 'n_estimators': 10}
49871.192334154446 {'max_features': 6, 'n_estimators': 30}
58648.73503785436 {'max_features': 8, 'n_estimators': 3}
51969.29494453594 {'max_features': 8, 'n_estimators': 10}
50071.95027406211 {'max_features': 8, 'n_estimators': 30}
62470.73233201347 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54500.01815838714 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60059.6777394648 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52522.83383280123 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [121]:
# saving the best model from the grid_search_CV iteration
import pickle
filename = 'forest_housing_model.pkl'
filename = 'forest_housing_model.sav'

pickle.dump(grid_search.best_estimator_,open(filename,'wb'))

In [122]:
# laoding training data into staochastic gradient regressor
from sklearn.linear_model import SGDRegressor
stochastic = SGDRegressor(random_state=25)
stochastic.fit(train_prepared,train_labels)
stochastic_train = stochastic.fit(train_prepared,train_labels)

In [123]:
# calculating the root mean squared value of the stochastic gradient descent algorithm
train_predictions =stochastic.predict(train_prepared)
stochastic_mse = mean_squared_error(train_labels,train_predictions)
stochastic_rmse = np.sqrt(stochastic_mse)
stochastic_rmse

67879.44998029819

In [124]:
#A Stochastic Gradient Descent Model without cross validation and fine tuning is chosen as the final model as it performed better
filename = 'stochastic_housing_model.pkl'
filename = 'stochastic_housing_model.sav'
pickle.dump(stochastic_train,open(filename,'wb'))

In [125]:
# cross validation of the SGD model and the results of the cross validation were very poor
from sklearn.model_selection import cross_val_score
stochastic_scores = cross_val_score(stochastic,train_prepared,train_labels,scoring='neg_mean_squared_error',cv=10)
stochastic_rmse_scores = np.sqrt(-stochastic_scores)
display_scores(stochastic_rmse_scores)

scores: [  102340.89808645   103369.98771779   388718.13796761   105769.91693549
    90029.75164964    64901.45970082   211593.5173794     68684.63067833
 28307762.66276588    69193.23624057]
mean: 2951236.4199121995
standard deviation: 8452700.785215924


In [126]:
#get the parameters of the SGDRegressor function
estimator = SGDRegressor()
estimator.get_params().keys()

dict_keys(['alpha', 'average', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [127]:
#Fine tuning the SGD algorithm with GridSearchCV 
from sklearn.model_selection import GridSearchCV

param_grid = [{'learning_rate':['optimal'],'max_iter':[300,500,700,1000], 'alpha':[0.001,0.002,0.003],'random_state':[10,20,25,35,65]},
{'warm_start':[True],'learning_rate':['optimal'],
'max_iter':[300,500,700,1000], 'alpha':[0.001,0.002,0.003],'random_state':[10,20,25,35,65]}]

stochastic = SGDRegressor(random_state = 42)
grid_search = GridSearchCV(stochastic,param_grid,cv=5,scoring ='neg_mean_squared_error',
return_train_score = True)
grid_search.fit(train_prepared,train_labels)

GridSearchCV(cv=5, estimator=SGDRegressor(random_state=42),
             param_grid=[{'alpha': [0.001, 0.002, 0.003],
                          'learning_rate': ['optimal'],
                          'max_iter': [300, 500, 700, 1000],
                          'random_state': [10, 20, 25, 35, 65]},
                         {'alpha': [0.001, 0.002, 0.003],
                          'learning_rate': ['optimal'],
                          'max_iter': [300, 500, 700, 1000],
                          'random_state': [10, 20, 25, 35, 65],
                          'warm_start': [True]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [128]:
grid_search.best_estimator_

SGDRegressor(alpha=0.001, learning_rate='optimal', max_iter=300,
             random_state=20)

In [129]:
#displaying the various iterations used by the grid_searchCV algorithm
kbay = grid_search.cv_results_
for mean_score, params in zip(kbay['mean_test_score'],kbay['params']):
    print(np.sqrt(-mean_score),params)

210110370162.7432 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 300, 'random_state': 10}
10130252105.209005 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 300, 'random_state': 20}
195492611857.90668 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 300, 'random_state': 25}
80397542693.687 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 300, 'random_state': 35}
82583249454.26189 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 300, 'random_state': 65}
210110370162.7432 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 500, 'random_state': 10}
10130252105.209005 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 500, 'random_state': 20}
195492611857.90668 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 500, 'random_state': 25}
80397542693.687 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 500, 'random_state': 35}
82583249454.26189 {'alpha': 0.001, 'learning_rate': 'optimal', 'max_iter': 500, 'random_state': 65}
