<a href="https://colab.research.google.com/github/julienbonin/MachineLearningApplications/blob/master/Chapters/Chapter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score


In [None]:
# Remote location of the dataset archive.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/'
HOUSING_URL = ''.join([DOWNLOAD_ROOT, 'datasets/housing/housing.tgz'])

# Local directory the archive is downloaded into and extracted in.
HOUSING_PATH = os.path.join('datasets', 'housing')

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  """Download the housing archive and extract it into ``housing_path``.

  The download is skipped when ``housing.csv`` (the file that
  ``load_housing_data`` reads) is already present, so re-running the
  notebook does not hit the network again. Unlike a check on the directory
  alone, this also recovers from an existing but empty/partial directory.

  Args:
    housing_url: URL of the ``.tgz`` archive to download.
    housing_path: Local directory that receives the archive and its contents.
  """
  csv_path = os.path.join(housing_path, "housing.csv")
  if os.path.isfile(csv_path):
    return
  os.makedirs(housing_path, exist_ok=True)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  # The context manager closes the archive even if extraction fails.
  with tarfile.open(tgz_path) as housing_tgz:
    housing_tgz.extractall(path=housing_path)

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
  """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
  return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Fetch the archive, load the CSV into a DataFrame, and preview the first rows.
fetch_housing_data()
housing = load_housing_data()
housing.head()

In [None]:
# Print a summary of the frame's columns.
housing.info()

In [None]:
# Number of rows per ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the frame.
housing.describe()

In [None]:
# Histogram (50 bins) of each plotted column on a 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
def split_train_test(data, test_ratio, random_state=None):
  """Randomly split ``data`` into a train part and a test part.

  Args:
    data: Object supporting ``len()`` and positional ``.iloc`` indexing
      (e.g. a pandas DataFrame).
    test_ratio: Fraction of the rows, in ``[0, 1]``, assigned to the test part.
    random_state: Optional seed. When given, the shuffle is reproducible;
      when ``None`` (the default) NumPy's global random state is used.

  Returns:
    A ``(train, test)`` tuple. The test part holds
    ``int(len(data) * test_ratio)`` rows, the train part the remainder.

  Raises:
    ValueError: If ``test_ratio`` is outside ``[0, 1]``.
  """
  if not 0 <= test_ratio <= 1:
    raise ValueError('test_ratio must be between 0 and 1, got %r' % (test_ratio,))
  if random_state is None:
    shuffled_indices = np.random.permutation(len(data))
  else:
    # A dedicated generator keeps the split repeatable without touching global state.
    shuffled_indices = np.random.RandomState(random_state).permutation(len(data))
  test_set_size = int(len(data) * test_ratio)
  test_indices = shuffled_indices[:test_set_size]
  train_indices = shuffled_indices[test_set_size:]
  return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Purely random split with 20% of the rows held out as the test set.
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# Row count of the random training split.
len(train_set)

In [None]:
# Row count of the random test split.
len(test_set)

In [None]:
# Bucket median_income into five labelled categories used for stratified sampling.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1,2,3,4,5]
housing['income_cat'] = pd.cut(
  housing['median_income'],
  bins=income_bins,
  labels=income_labels,
)
housing['income_cat'].hist()

In [None]:
# Stratify on income_cat so both sets keep the same income distribution.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing, housing['income_cat']))
# The splitter returns positional indices, so select rows with iloc; loc would
# only be correct while the frame happens to carry a default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# Share of each income category within the stratified test set.
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
# income_cat was only needed for stratification. Rebinding to a new frame (instead
# of dropping in place) with errors='ignore' keeps this cell safe to re-run.
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

In [None]:
# Work on a copy so the stratified training set itself is not modified.
housing = strat_train_set.copy()

In [None]:
# Scatter plot of longitude vs latitude; alpha=0.1 makes overlapping points visible.
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)

In [None]:
# Marker size follows population; marker colour follows median_house_value.
marker_sizes = housing['population']/100
housing.plot(
  kind='scatter',
  x='longitude',
  y='latitude',
  alpha=0.4,
  s=marker_sizes,
  label='population',
  figsize=(10,7),
  c='median_house_value',
  cmap=plt.get_cmap('jet'),
  colorbar=True,
)
plt.legend()

In [None]:
# Restrict to numeric columns: with pandas >= 2.0, corr() raises on text columns
# such as ocean_proximity instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
# Correlation of each attribute with median_house_value, highest first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots of the listed attributes.
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
# Scatter plot of median_income against median_house_value.
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)

In [None]:
# Derive three ratio attributes from the raw count columns.
housing = housing.assign(
  rooms_per_household=housing['total_rooms']/housing['households'],
  bedrooms_per_room=housing['total_bedrooms']/housing['total_rooms'],
  population_per_household=housing['population']/housing['households'],
)

In [None]:
# Recompute correlations including the derived ratio attributes. Only numeric
# columns are used: with pandas >= 2.0, corr() raises on text columns such as
# ocean_proximity instead of silently skipping them.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Separate predictors from the label; drop() returns a new frame, so
# strat_train_set itself keeps the median_house_value column.
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [None]:
# Fit a median imputer on the predictors with the text column ocean_proximity
# removed, then show the statistics it learned.
imputer = SimpleImputer(strategy='median')
housing_num = housing.drop('ocean_proximity', axis=1)
imputer.fit(housing_num)
imputer.statistics_

In [None]:
# Medians computed directly with pandas, for comparison with imputer.statistics_.
housing_num.median().values

In [None]:
# Fill the missing values, then wrap the resulting array back into a DataFrame.
X = imputer.transform(housing_num)
# Reuse the original row index: housing_num comes from a shuffled split, so a
# fresh 0..n-1 index would no longer line up with housing / housing_labels.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Double brackets keep a one-column DataFrame (not a Series) of the text attribute.
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

In [None]:
# Encode ocean_proximity with an OrdinalEncoder and show the first ten results.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

In [None]:
# Categories learned by the ordinal encoder during fit.
ordinal_encoder.categories_

In [None]:
# One-hot encode ocean_proximity; the next cell calls .toarray() on this result.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# Convert the encoded result to a regular array for display.
housing_cat_1hot.toarray()

In [None]:
# Categories learned by the one-hot encoder during fit.
cat_encoder.categories_

# Custom Transformers

In [None]:
# Column positions used by the custom transformer below.
# NOTE(review): presumably the positions of total_rooms, total_bedrooms, population
# and households in the numeric feature array — verify against the column order.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

In [None]:
class CombinedAttributesAddr(BaseEstimator, TransformerMixin):
  """Transformer that appends ratio features to a 2-D feature array.

  The columns are located through the module-level indices ``rooms_ix``,
  ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

  Args:
    add_bedrooms_per_room: When True (default), also append the
      bedrooms-per-room ratio.
  """

  def __init__(self, add_bedrooms_per_room = True):
    self.add_bedrooms_per_room = add_bedrooms_per_room

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns ``self``."""
    return self

  def transform(self, X, y=None):
    """Return ``X`` with the ratio columns appended on the right.

    The appended columns are, in order: rooms per household, population per
    household and, if enabled, bedrooms per room.
    """
    # Rooms (not bedrooms) divided by households.
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_room:
      bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    return np.c_[X, rooms_per_household, population_per_household]
  

In [None]:
# Apply the transformer directly to the raw values, without the bedrooms_per_room column.
attr_adder = CombinedAttributesAddr(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Transformation Pipelines

In [None]:
# Numeric preprocessing: impute medians, append the ratio attributes, standardize.
numeric_steps = [
  ('imputer', SimpleImputer(strategy='median')),
  ('attribs_adder', CombinedAttributesAddr()),
  ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
 # Column names routed to the numeric pipeline and to the one-hot encoder.
 num_attribs = list(housing_num)
 cat_attribs = ['ocean_proximity']

 # Apply num_pipeline to the numeric columns and OneHotEncoder to the text column.
 full_pipeline = ColumnTransformer([('num', num_pipeline, num_attribs),('cat', OneHotEncoder(), cat_attribs)])

 # Fit the whole preprocessing on the training predictors and transform them.
 housing_prepared = full_pipeline.fit_transform(housing)

# Training and Evaluating on the Training Set

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels) # housing_prepared & housing_labels are the training data (derived from strat_train_set)

In [None]:
# Sanity check on the first five training rows: compare predictions with labels.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# transform (not fit_transform): reuse the preprocessing already fitted above.
some_data_prepared = full_pipeline.transform(some_data)

print('Predictions', lin_reg.predict(some_data_prepared))
print('Labels:', list(some_labels))

In [None]:
# RMSE of the linear model on the training set itself.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# Fix the seed so the fitted tree (and the scores below) are reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

# RMSE on the training set itself (the model is scored on the data it was fit on).
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# Better Evaluation Using Cross-Validation

In [None]:
# The scorer returns negative MSE, so negate before the square root to get one RMSE per fold.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
  """Print the scores followed by their mean and standard deviation."""
  print('Scores: ', scores)
  for label, stat_name in (('Mean', 'mean'), ('Standard Deviation', 'std')):
    print(label, getattr(scores, stat_name)())
# Report the cross-validated RMSE scores of the decision tree.
display_scores(tree_rmse_scores)

In [None]:
# Same 10-fold evaluation for the linear model; negate the negative MSE before the root.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# Fine-Tune Your Model

# Grid Search


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Two sub-grids: 3x4 combinations with the default bootstrap setting, then
# 2x3 combinations with bootstrap disabled.
param_grid = [
  {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
  {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest so the search picks the same best parameters on every run.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
# Print the RMSE (root of the negated mean test score) for every parameter combination tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
  print(np.sqrt(-mean_score), params)

# Analyze the Best Models and Their Errors

In [None]:
# Importance scores reported by the best estimator found by the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Rebuild the feature names in the order full_pipeline emits them:
# numeric columns, the appended ratio attributes, then the one-hot categories.
extra_attribs = ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
cat_encoder = full_pipeline.named_transformers_['cat']
# categories_ (trailing underscore) holds the fitted categories; `categories`
# is only the constructor argument, so indexing it does not yield category names.
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

# Evaluate Your System on the Test Set

In [None]:
# Evaluate the model selected by the grid search on the held-out stratified test set.
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()

# transform only: the preprocessing stays as fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

# Chapter 2 - Exercises


## Problem 1

In [None]:
from sklearn.svm import SVR


# Support Vector Machine regressor with the default kernel.
sv_reg = SVR()
sv_reg.fit(housing_prepared, housing_labels)

# RMSE on the training set (the model is scored on the data it was fit on).
housing_predictions = sv_reg.predict(housing_prepared)
sv_mse = mean_squared_error(housing_labels, housing_predictions)
sv_rmse = np.sqrt(sv_mse)
sv_rmse



In [None]:
# Support Vector Machine with  'kernel = linear'
# Same evaluation with a linear kernel; sv_reg, sv_mse and sv_rmse are overwritten.
sv_reg = SVR(kernel='linear')
sv_reg.fit(housing_prepared, housing_labels)

# RMSE on the training set (the model is scored on the data it was fit on).
housing_predictions = sv_reg.predict(housing_prepared)
sv_mse = mean_squared_error(housing_labels, housing_predictions)
sv_rmse = np.sqrt(sv_mse)
sv_rmse


In [None]:
# Support Vector Machine with  'kernel = rbf' (I think this is the default)
# Same evaluation with kernel='rbf', which is SVR's default kernel, so this
# should reproduce the result of the first SVR cell.
sv_reg = SVR(kernel='rbf')
sv_reg.fit(housing_prepared, housing_labels)

# NOTE: this is the error on the training set (the model is scored on the data it was
# fit on). some_data_prepared is just the first five training rows, not a test set.
housing_predictions = sv_reg.predict(housing_prepared)
sv_mse = mean_squared_error(housing_labels, housing_predictions)
sv_rmse = np.sqrt(sv_mse)
sv_rmse


In [None]:
#from sklearn import metrics

#print('Acuracy: ', metrics.accuracy_score(some_data_prepared, )) 

# I was trying to use this metrics class, but I'm not sure what to pass into 'accuracy_score'. Is it test data or train data? I would assume test data.


## Problem 2

note: I thought the code below wasn't running properly, but it turns out that it takes over an hour to run.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal


# Distributions sampled by the randomized search (taken from the online solutions).
# The random-forest param_grid that used to be redefined here was never used by
# this search and has been removed.
param_dist = { 'kernel':['linear','rbf'], 'C': reciprocal(20, 200000), 'gamma': expon(scale=1.0) }
sv_reg = SVR()
# 50 sampled candidates x 5 folds = 250 fits; random_state makes the sampling repeatable.
rand_search = RandomizedSearchCV(sv_reg, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose=2, random_state=42)
rand_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   9.7s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s remaining:    0.0s


[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=  10.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=  10.0s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   9.9s
[CV] C=629.782329591372, gamma=3.010121430917521, kernel=linear ......
[CV]  C=629.782329591372, gamma=3.010121430917521, kernel=linear, total=   9.8s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, total=  19.2s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, total=  20.1s
[CV] C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf ......
[CV]  C=26290.206464300216, gamma=0.9084469696321253, kernel=rbf, total=  20.0s
[CV] C=26290.2

#### Note: For the remaining questions, I really didn't know where to start, so I had to refer to the online solutions. I get the big picture of what we're doing, but I'm not familiar enough with the details and tools from sklearn to accomplish the remaining questions by myself. However, after reviewing the solutions, I do feel like I have a better understanding of what sklearn and scipy have to offer, as well as the broader machine learning concepts.

## Problem 3

In [None]:
# feature_importances is defined in the "Analyze the Best Models and Their Errors" section (from grid_search.best_estimator_)

from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return, in ascending order, the positions of the ``k`` largest values of ``arr``."""
    values = np.array(arr)
    # argpartition places the k largest entries in the last k slots (unordered).
    partitioned = np.argpartition(values, -k)
    top_k = partitioned[-k:]
    return np.sort(top_k)

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the ``k`` columns with the highest importance scores."""

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute the indices of the columns to keep and return ``self``."""
        selected = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = selected
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns chosen in ``fit``."""
        keep = self.feature_indices_
        return X[:, keep]

# Positions of the five highest importance scores.
top_features = indices_of_top_k(feature_importances, 5)

In [None]:
# Preprocess, then keep only the top features. `k` was never defined in this
# notebook; use 5, the same number of features as the other Problem 3/4 cells.
pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k=5)),
])
housing_prep = pipeline.fit_transform(housing)

## Problem 4

In [None]:
# Preparation -> feature selection -> SVR using the best parameters found by the
# randomized search. Every step must be a (name, estimator) tuple, and the search
# object in this notebook is named rand_search.
complete_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, 5)),
    ('svm_reg', SVR(**rand_search.best_params_)),
])

complete_pipeline.fit(housing, housing_labels)

In [None]:
# Spot-check the end-to-end pipeline on the first four training rows.
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

# The fitted end-to-end pipeline in this notebook is named complete_pipeline.
print("Predictions:\t", complete_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

## Problem 5

In [None]:
# Search over preparation options: the imputer strategy inside the numeric
# pipeline and the number of features kept by the selector.
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

# The end-to-end pipeline in this notebook is named complete_pipeline.
grid_search_prep = GridSearchCV(complete_pipeline, param_grid, cv=5,
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(housing, housing_labels)