<a href="https://colab.research.google.com/github/joepeskett/tree-pixels/blob/master/tester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Basic Housing Examples

> Refresher

This is a basic walkthrough from *Hands-On Machine Learning with Scikit-Learn and TensorFlow*. The notebook is used as a refresher after doing lots of Ops/K8s work more recently, and not very much pandas/sklearn.


In [0]:
import os
import tarfile
import urllib.request


In [0]:
# Where the housing archive lives remotely, and where it is unpacked locally.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = os.path.join("datasets", "housing")

In [0]:
def fetch_housing(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive
        (``housing.tgz``) and its extracted contents. Created if missing.

    Returns
    -------
    None
        The function is used for its side effects on disk only.
    """
    os.makedirs(housing_path, exist_ok = True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive, so only point housing_url at a source you trust.
        housing_tgz.extractall(path = housing_path)


In [0]:
import pandas as pd
def load_housing_data(housing_path = HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))



In [0]:
# Only hit the network when the CSV is not already on disk, so that
# "Restart & Run All" does not re-download the archive every time.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing()
housing = load_housing_data()

In [0]:
# Preview the first few rows to confirm the data loaded as expected
housing.head()

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Purely random 80/20 split; the fixed random_state keeps it repeatable
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)


This is fine for regular splits, but what if we want to stratify our splits?

In [0]:
# Bucket median_income into five labelled bands (edges at 1.5, 3, 4.5 and 6)
# so the split below can be stratified on income
housing['income_cat'] = pd.cut(housing['median_income'], bins = [0., 1.5, 3.,4.5, 6., np.inf], labels = [1, 2, 3, 4, 5])

In [0]:
# Histogram of how many districts fall into each income band
housing['income_cat'].hist()

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified 80/20 split that preserves the income_cat proportions
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state=43)
# split() yields positional row numbers, so the rows are selected with .iloc;
# label-based .loc only gives the same rows while the index is the default 0..n-1.
for train_index, test_index in split.split(housing, housing['income_cat']):
  strat_train_set = housing.iloc[train_index]
  strat_test_set = housing.iloc[test_index]

In [0]:
# Share of each income band within the stratified test set
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [0]:
# Remove the income_cat helper column now that the split is done.
# Rebinding to new frames (rather than dropping in place) avoids mutating
# frames that were sliced out of `housing`.
strat_train_set = strat_train_set.drop("income_cat", axis = 1)
strat_test_set = strat_test_set.drop("income_cat", axis = 1)

# Visualisation in MatPlotLib

Because we need pretty pictures and colours. 

In [0]:
# Explore a copy of the *training* set so the test set stays unseen until the
# final evaluation (the copy is cheap because the data is small).

housing = strat_train_set.copy()

In [0]:
# Plain geographic scatter of the districts
housing.plot(kind = "scatter", x = "longitude", y = "latitude")

In [0]:
# Same scatter with a low alpha so densely populated areas stand out
housing.plot(kind = "scatter", x = "longitude", y = "latitude", alpha = 0.1)

In [0]:
# Bubble map: marker size follows population, colour follows median house value
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [0]:
# Correlations

# Restrict to numeric columns first: ocean_proximity is text, and recent pandas
# versions raise in corr() instead of silently skipping non-numeric columns.
corr_matrix=housing.select_dtypes(include = "number").corr()
corr_matrix['median_house_value'].sort_values(ascending = False)

In [0]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for the target and three numeric attributes
attributes = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(housing[attributes], figsize = (12, 8))

# Data Cleaning

*Yay...*

In [0]:
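# Missing values: three ways to handle the gaps in total_bedrooms
# (kept for reference only; none of these are executed)
# Option 1 - drop the rows that have a missing value
#housing.dropna(subset = ['total_bedrooms'])
# Option 2 - drop the whole column
#housing.drop('total_bedrooms', axis = 1)
# Option 3 - fill the gaps with the column median
#median = housing['total_bedrooms'].median()
#housing['total_bedrooms'].fillna(median, inplace = True)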


In [0]:
# Separate predictors from the target; drop() returns a new frame, so
# strat_train_set itself is left unchanged
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

In [0]:
from sklearn.impute import SimpleImputer
# Imputer that replaces missing values with each column's median
imputer = SimpleImputer(strategy = "median")

In [0]:
# A median only exists for numeric columns, so leave out the text attribute
# before fitting the imputer
housing_num = housing.drop("ocean_proximity", axis = 1)

In [0]:
# Learn the per-column medians from the numeric training data
imputer.fit(housing_num)

In [0]:
# Inspect the fill values the imputer learned during fit (one per column)
imputer.statistics_

In [0]:
# Apply the learned medians to fill the missing entries
x = imputer.transform(housing_num)

In [0]:
# Wrap the imputed values back into a DataFrame, restoring the column names
# and row index of housing_num
housing_tr = pd.DataFrame(x, columns=housing_num.columns, index = housing_num.index)

In [0]:
# Preview the imputed frame
housing_tr.head()

Sidenote - sklearn has a very simple and consistent API. This imputer is very similar to using any of the scaling tools. 

In [0]:
# Dealing with text attributes - the simple way

# Double brackets keep this a one-column DataFrame rather than a Series
housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

In [0]:
# Ordinal encoder: maps each category to a number
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
# Show the first ten encoded rows
housing_cat_encoded[:10]

In [0]:
# Ordinal encoding should only really be used if the categories have a sensible
# and meaningful order to them. In this instance one-hot encoding is better.
from sklearn.preprocessing import OneHotEncoder
cat_endcoder = OneHotEncoder()
housing_cat_1_hot = cat_endcoder.fit_transform(housing_cat)
housing_cat_1_hot

In [0]:
# Convert the encoded result to a dense array for inspection
housing_cat_1_hot.toarray()

In [0]:
# As with other sklearn objects, we can inspect what the encoder learned -
# here, its categories:
cat_endcoder.categories_

In [0]:
#Custom Transformers using the same fit and transform functionality can be 
#created - example shown below

from sklearn.base import BaseEstimator, TransformerMixin
# Column positions of the raw count features inside the array passed to transform()
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends per-household ratio columns to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    Columns are located through the module-level ``*_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room = True):
        # Option controlling whether the bedrooms-per-room column is appended
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, households_ix]
        extra_columns = [
            X[:, rooms_ix] / households,       # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # np.c_ stacks the original array and each new 1-D ratio as columns
        return np.c_[(X, *extra_columns)]
# This can now be used as you would any other fit/transform object in sklearn.
# Here the optional bedrooms-per-room column is switched off.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Transformation Pipelines

Because everything ends up as a pipeline - eventually.

This is something that I've not used before, but it seems very similar to the functionality that you get in the R package `recipes`.

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [0]:
# Numeric preprocessing chain, applied in order: fill missing values with the
# column median, append the ratio features, then standardise every column.
num_pipeline = Pipeline([
                         ('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [0]:
# What if we want to handle the numerical and categorical variables together?

from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)# list of the numerical attribute names
print(num_attribs)
cat_attribs = ['ocean_proximity']

# Route the numeric columns through num_pipeline and one-hot encode the text column
full_pipeline = ColumnTransformer([
                                   ("num", num_pipeline, num_attribs),
                                   ("cat", OneHotEncoder(), cat_attribs)
])
housing_prepared = full_pipeline.fit_transform(housing)

# Note that you can use `drop` and `passthrough` for columns that you don't want
# to transform in a pipeline.

# Pick a model and train it

In [0]:
#simple linear regression

from sklearn.linear_model import LinearRegression
# Baseline model: linear regression on the prepared training data
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Spot-check the first five training rows: run them through the already-fitted
# preprocessing pipeline and compare predictions with the true labels
sample_data, sample_label = housing.iloc[:5], housing_labels.iloc[:5]
sample_data_prepped = full_pipeline.transform(sample_data)
print("Predictions:", lin_reg.predict(sample_data_prepped))
print("Labels:", list(sample_label))

In [0]:
#How far off are we?
from sklearn.metrics import mean_squared_error
# RMSE on the same data the model was fitted on, i.e. the training error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [0]:
# Try a different model

from sklearn.tree import DecisionTreeRegressor
# Seed the tree so the fitted model and its scores are reproducible across runs
tree_reg = DecisionTreeRegressor(random_state = 42)
tree_reg.fit(housing_prepared, housing_labels)
# RMSE on the training data itself; a suspiciously low value here points to
# overfitting rather than to a good model
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [0]:
#Now we should really be using Cross Validation
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree on the training data
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, 
                         scoring = "neg_mean_squared_error", cv = 10)
# sklearn's CV expects a utility function (greater is better), so the scores are
# negated MSE values: flip the sign before taking the root to get RMSE per fold
tree_rmse_scores = np.sqrt(-scores)

In [0]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array of per-fold results).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard Deviations", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Summarise the per-fold RMSE values for the decision tree
display_scores(tree_rmse_scores)

In [0]:
# Same 10-fold cross-validation for the linear model, for a like-for-like comparison
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring = "neg_mean_squared_error", cv = 10)
# Scores are negated MSE; flip the sign and take the root to get RMSE per fold
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [0]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so the reported scores are reproducible across runs
forest_reg = RandomForestRegressor(random_state = 42)
forest_reg.fit(housing_prepared, housing_labels)

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring = "neg_mean_squared_error", cv = 10)
# Scores are negated MSE; flip the sign and take the root to get RMSE per fold
forest_rmse_scores = np.sqrt(-forest_scores)
# Report the RMSE values (not the raw negative-MSE scores) so the numbers are
# directly comparable with the tree and linear results
display_scores(forest_rmse_scores)

In [0]:
#Okay so we have a model which is now performing a little better. 

#TODO: Look at hyperparameter selection using sklearn gridsearch. 