# Fetch The Housing Data
    Fetch Housing Data from the web

In [None]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives both the downloaded archive
            and its extracted contents; created if it does not exist yet.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point this function at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


In [None]:
#fetch_housing_data()

# Load Housing Data
    Load CSV of housing data

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

# Plotting The Different Attributes

In [None]:
# Render figures inside the notebook, then draw histograms of the DataFrame's
# columns using 50 bins each on a 20x15 figure.
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()

# Train Test Splitting

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Explore a copy so that experiments below cannot modify train_set itself.
housing = train_set.copy()

In [None]:
# Geographic scatter plot; the low alpha makes densely populated areas stand out.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographic scatter plot where marker size follows the population column and
# marker colour follows median_house_value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [None]:
# Correlate the numeric columns only: ocean_proximity holds text, and recent
# pandas versions raise when DataFrame.corr() meets a non-numeric column.
corr_matrix = housing.select_dtypes(include=["number"]).corr()
# Rank every attribute by its linear correlation with the target column.
corr_matrix["median_house_value"].sort_values(ascending=False)

# Scatter matrix
Plots every numerical attribute against every other numerical attribute

In [None]:
from pandas.plotting import scatter_matrix
# Plot only these four attributes against each other.
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))

Median income clearly has the strongest correlation with the median house value, so let's plot the two against each other

In [None]:
# Scatter median_income against median_house_value; the low alpha keeps
# overlapping points readable.
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

# Adjust Attributes
For example, the total number of rooms in a district is not useful unless you take the number of households into account to get the 'rooms per household'. The same goes for the number of bedrooms, which should be compared with the total number of rooms. Population per household is also a good attribute to use.

In [None]:
# Derive per-household and per-room ratios from the raw totals and store them
# as new columns on the exploration copy.
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_room'] = housing['total_bedrooms'] / housing['total_rooms']
housing['population_per_household'] = housing['population']  / housing['households']

Now we can check the correlation matrix with the new values

In [None]:
# Recompute the correlations now that the ratio columns exist. Only numeric
# columns are used: ocean_proximity holds text, and recent pandas versions
# raise when DataFrame.corr() meets a non-numeric column.
corr_matrix = housing.select_dtypes(include=["number"]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

# Data Cleaning
Some values are missing in certain districts, so we calculate the median of each attribute and use it to fill in the missing data. sklearn does this with the Imputer class. The median can only be computed on numerical values, so we need a copy of the dataset without the text attribute ocean_proximity.

First we need to drop median_house_value, as that is the "answer" (label) column and we don't want it affected by the transformations

In [None]:
# Separate the predictors from the target. Both are taken from train_set, so
# the ratio columns added to the earlier exploration copy are not carried over.
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

Now we can impute

In [None]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. Bind whichever one exists to
# the name Imputer so the later cells keep working unchanged.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

# The median can only be computed on numerical columns, so work on a copy
# without the text attribute ocean_proximity.
housing_num = housing.drop("ocean_proximity", axis=1)

# Learn the per-column medians from the training data.
imputer.fit(housing_num)

Now all the medians are stored in imputer.statistics_

In [None]:
# Values learned by fit(): one median per column of housing_num.
imputer.statistics_

In [None]:
# Cross-check: the medians computed directly by pandas, for comparison with
# imputer.statistics_.
housing_num.median().values

We can now use the fitted imputer to transform housing_num, filling in the missing values

In [None]:
# transform() returns a plain NumPy array with the missing values filled in.
X = imputer.transform(housing_num)

# Wrap the array back into a DataFrame, restoring the column labels and the
# original row index. Without index=..., housing_tr would get a fresh
# 0..n-1 index and no longer line up row-for-row with housing and
# housing_labels, which keep the index they had in train_set.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

# Handling Text and Categorical Attributes
Encoding text to numbers for ML algo. Need this for ocean_proximity

Because there are more than two possible values, a single number per category is not enough, so we one-hot encode the text after first converting the categories to integers with LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
# Map each ocean_proximity category to an integer code and show the result.
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)

In [None]:
# One-hot encode the integer category codes.
# NOTE: fit_transform() returns a sparse matrix to save space; call toarray()
# to view it as a dense array.
# The encoder expects 2D input, so the 1D housing_cat_encoded array is
# reshaped into a single column first.
# LabelBinarizer can do both steps in one go, without running LabelEncoder first.

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
print(housing_cat_1hot.toarray())

# Custom Transformers
Building a custom transformer like the ones that scikit-learn uses. It needs to implement fit() (returning self) and transform().
Also add TransformerMixin as a base class to get fit_transform() automatically, and BaseEstimator (avoiding *args and **kwargs in the constructor) to get get_params() and set_params() automatically. Below is an example that adds the combined attributes.

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
import numpy as np

# Column positions of the raw attributes inside the array handed to
# CombinedAttributesAdder.transform().
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns (per household / per room) to a 2D array.

    ``add_bedrooms_per_room`` controls whether the bedrooms-per-room column
    is appended after the two per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        households = X[:, household_ix]
        extra_columns = [
            X[:, rooms_ix] / households,       # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        # Indexing np.c_ with a tuple is the same as listing the items inline.
        return np.c_[tuple([X] + extra_columns)]


attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
# Show the array returned by the transformer, including the appended columns.
print(housing_extra_attribs)

That transformer had one hyperparameter 'add_bedrooms_per_room'

# Feature Scaling
Scale all the attributes to the same range so that they are easily comparable. We won't use min-max normalization, as it is strongly influenced by outliers, whereas standardization scales each attribute according to its mean and standard deviation.
Use the StandardScaler transformer from scikit-learn. We'll add it to the pipeline.

# Transformation Pipelines
When a lot of transformations are needed, a pipeline chains them and performs them one after the other, which makes things easier. Here's one for all the numerical attributes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numerical preprocessing chain: fill gaps with the column medians, append
# the combined ratio attributes, then standardise every column.
num_pipeline = Pipeline(steps=[
    ("imputer", Imputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

We need a custom DataFrameSelector class, which is used below

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the named columns of a DataFrame and pass them on as an array."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: there is nothing to learn.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

We also need a custom encoder wrapper because the label encoder and one-hot encoder only expect one argument, while the pipeline passes in two.

In [None]:
class MyEncoder(BaseEstimator, TransformerMixin):
    """Adapter that lets a label-style encoder be used as a pipeline step.

    fit() and transform() accept the extra ``y`` argument and forward only
    ``x`` to the wrapped encoder. When the wrapped encoder is a
    OneHotEncoder, the input is reshaped into a single 2D column first and
    the transformed result is converted with ``toarray()``.
    """

    def __init__(self, encoder):
        self.encoder = encoder

    def _wraps_one_hot(self):
        # True when the wrapped encoder needs the single-column reshape.
        return isinstance(self.encoder, OneHotEncoder)

    def fit(self, x, y=None):
        features = x.reshape(-1, 1) if self._wraps_one_hot() else x
        self.encoder.fit(features)
        return self

    def transform(self, x, y=None):
        if not self._wraps_one_hot():
            return self.encoder.transform(x)
        column = x.reshape(-1, 1)
        return self.encoder.transform(column).toarray()

The pipeline must contain transformers with both a fit and a transform function, except for the last step in the pipeline, which only needs a fit() function.

So the numerical pipeline above handles the numerical attributes but not the categorical ones. The two can be combined by using the FeatureUnion class

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator

# Column groups routed to each sub-pipeline.
num_attribs = list(housing_num.columns)
cat_attributes = ['ocean_proximity']

# Numerical branch: select columns, impute medians, add ratios, standardise.
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", Imputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: select the text column and binarize it in one step.
# Alternative: chain MyEncoder(LabelEncoder()) and MyEncoder(OneHotEncoder())
# instead of the single LabelBinarizer step.
cat_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(cat_attributes)),
    ("label_binarizer", MyEncoder(LabelBinarizer())),
])

# Run both branches on the same input and join their outputs.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
# Sanity-check shapes: the full table, then the same values reshaped into a
# single column.
print(housing.values.shape)
print(housing.values.reshape(-1, 1).shape)

In [None]:
# Fit both sub-pipelines on the training predictors and keep the combined,
# fully preprocessed feature matrix.
housing_prepared = full_pipeline.fit_transform(housing)

# Training & Evaluating
Using the training and test sets. First we'll try Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Train a linear regression on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Spot-check the first five training rows. They go through transform(), not
# fit_transform(), so the already-fitted pipeline is reused as-is.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))

See how accurate the predictions are by measuring the RMSE (root mean squared error) on the whole training set

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model, measured on the same data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# Square root of the MSE puts the error back in the units of the label.
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

The model has a typical prediction error of about 67k, which is high. It could be underfitting. Try a non-linear regressor like a decision tree regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Train a decision tree on the prepared training features and score it on
# that same training data.
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
# Store the tree's score under its own names so it does not overwrite the
# linear model's lin_rmse.
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

A zero error on the training set means that the model is overfitting the data