In [None]:
#Main

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


In [None]:
#Download housing data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
        
fetch_housing_data(HOUSING_URL, HOUSING_PATH)

In [None]:
#convert the data to a more readable pandas format

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
print(housing.head(10))
#housing["ocean_proximity"].value_counts()

In [None]:
# only in a Jupyter notebook
%matplotlib inline 
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))

In [None]:
#function that splits the training set

import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]

In [None]:
#split the training set

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
print(len(train_set))
print(len(test_set))

# below is the easier way to do it with just column id's, however adding or taking out rows will change the test set
# housing_with_id = housing.reset_index() # adds an `index` column
# train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
#splitting the dataset and test set with Scikit-Learn. Better if you have multiple datasets, also simpler.

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
#splitting the test set with stratified dataset

# The following code creates
# an income category attribute by dividing the median income by 1.5 (to limit the number of income
# categories), and rounding up using ceil (to have discrete categories), and then merging all the categories
# greater than 5 into category 5


#create startum (catagories)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

#import
from sklearn.model_selection import StratifiedShuffleSplit

#split training set with startum in mind
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
    
#Now remove the income_cat attribute so the data is back to its original state
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population", figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
) 
plt.legend()

In [38]:
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

KeyError: 'median_house_value'

In [None]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
import numpy as np

arr = np.array(['cat','amsterdam','boy'])

In [None]:
print(arr)

In [None]:
#Main

import os, tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from six.moves import urllib
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt



#Download housing data
def fetch_housing_data(housing_url, housing_path):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


#convert the data to a more readable pandas format
def load_housing_data(housing_path):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

#stratum split
def stratum_split(housing):
	# The following code creates
	# an income category attribute by dividing the median income by 1.5 (to limit the number of income
	# categories), and rounding up using ceil (to have discrete categories), and then merging all the categories
	# greater than 5 into category 5


	#create startum (catagories)
	housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
	housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

	#split training set with startum in mind
	split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
	for train_index, test_index in split.split(housing, housing["income_cat"]):
	    strat_train_set = housing.loc[train_index]
	    strat_test_set = housing.loc[test_index]
	    
	#Now remove the income_cat attribute so the data is back to its original state
	for set_ in (strat_train_set, strat_test_set):
	    set_.drop("income_cat", axis=1, inplace=True)

	return strat_train_set, strat_test_set

def interpret_data(housing):
	housing.plot(kind="scatter", x="longitude", y="latitude")
	housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
	s=housing["population"]/100, label="population", figsize=(10,7),
	c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
	) 
	plt.legend()
	plt.show()

def main():
	DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
	HOUSING_PATH = os.path.join("datasets", "housing")
	HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

	fetch_housing_data(HOUSING_URL, HOUSING_PATH) #load data

	housing = load_housing_data(HOUSING_PATH) # convert to pandas dataframe
	
	housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
	housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
	housing["population_per_household"]=housing["population"]/housing["households"]
	# split test set with scikit-learn, instead we use the more acurate stratum split
	#train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) 

	train_set, test_set = stratum_split(housing)
	print(len(train_set))
	print(len(test_set))

	# interpret_data(housing)

main()























	# # --------------------- splitting the test set with stratified dataset ------------------------------------------

	# # The following code creates
	# # an income category attribute by dividing the median income by 1.5 (to limit the number of income
	# # categories), and rounding up using ceil (to have discrete categories), and then merging all the categories
	# # greater than 5 into category 5


	# #create startum (catagories)
	# housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
	# housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

	# #split training set with startum in mind
	# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
	# for train_index, test_index in split.split(housing, housing["income_cat"]):
	#     strat_train_set = housing.loc[train_index]
	#     strat_test_set = housing.loc[test_index]
	    
	# #Now remove the income_cat attribute so the data is back to its original state
	# for set_ in (strat_train_set, strat_test_set):
	#     set_.drop("income_cat", axis=1, inplace=True)

	# # -------------------------------------------------------------------------------------------------------------

In [None]:

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

fetch_housing_data(HOUSING_URL, HOUSING_PATH) #load data

housing = load_housing_data(HOUSING_PATH) # convert to pandas dataframe

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
# split test set with scikit-learn, instead we use the more acurate stratum split
#train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) 

train_set, test_set = stratum_split(housing)
print(len(train_set))
print(len(test_set))

interpret_data(housing)



In [None]:
interpret_data(housing)

In [None]:
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

In [None]:
#filling in missing values with imputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
imputer.statistics_

In [None]:
#secifically fill in the missing columns for the total bedrooms. NOT A GENERAL SOLUTION

median = housing["total_bedrooms"].median() # option 3
housing["total_bedrooms"].fillna(median, inplace=True)

In [None]:
# encode text features

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
ocean_labels = encoder.classes_
print(ocean_labels)

In [None]:
#Binarize a text column into many 

from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer(sparse_output=True) # make sparse_ouput=False for a dense NumPy Array. Boooooooo!
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

ocean_labels = encoder.classes_ # just to use for reference to know what is what.

In [None]:
encoder.classes_

In [59]:
#Main

import os, tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from six.moves import urllib
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression



#Download housing data
def fetch_housing_data(housing_url, housing_path):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


#convert the data to a more readable pandas format
def load_housing_data(housing_path):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

#stratum split
def stratum_split(housing):
	# The following code creates
	# an income category attribute by dividing the median income by 1.5 (to limit the number of income
	# categories), and rounding up using ceil (to have discrete categories), and then merging all the categories
	# greater than 5 into category 5


	#create startum (catagories)
	housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
	housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

	#split training set with stratum in mind
	split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
	for train_index, test_index in split.split(housing, housing["income_cat"]):
	    strat_train_set = housing.loc[train_index]
	    strat_test_set = housing.loc[test_index]
	    
	#Now remove the income_cat attribute so the data is back to its original state
	for set_ in (strat_train_set, strat_test_set, housing):
	    set_.drop("income_cat", axis=1, inplace=True)

	return strat_train_set, strat_test_set

def interpret_data(housing):
	housing.plot(kind="scatter", x="longitude", y="latitude")
	housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
	s=housing["population"]/100, label="population", figsize=(10,7),
	c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
	) 
	plt.legend()
	plt.show()


# #Binarize a text column into many 
# # from sklearn.preprocessing import LabelBinarizer
# def binarize_ocean_proximity(housing):

# 	encoder = LabelBinarizer(sparse_output=True) # make sparse_ouput=False for a dense NumPy Array. Boooooooo!
# 	housing_cat_1hot = encoder.fit_transform(housing_cat)
# 	housing_cat_1hot
# 	ocean_labels = encoder.classes_ # just to use for reference to know what is what.

class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)
    def transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)

#Custom transformer class to  add new attributes to house data
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):

	def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
		self.add_bedrooms_per_room = add_bedrooms_per_room
	def fit(self, X, y=None):
		return self # nothing else to do
	def transform(self, X, y=None):
		rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
		rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
		population_per_household = X[:, population_ix] / X[:, household_ix]
		if self.add_bedrooms_per_room:
			bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
			print("combined attributes:")
			print(np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room].shape)
			return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
		else:
			return np.c_[X, rooms_per_household, population_per_household]

# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)

class DataFrameSelector(BaseEstimator, TransformerMixin):
	def __init__(self, attribute_names):
		self.attribute_names = attribute_names
	def fit(self, X, y=None):
		return self
	def transform(self, X):
		print("DataFrameSelector:")
		print(X[self.attribute_names].values.shape)
		return X[self.attribute_names].values


DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

fetch_housing_data(HOUSING_URL, HOUSING_PATH) #load data

housing = load_housing_data(HOUSING_PATH) # convert to pandas dataframe


#housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
#housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
#housing["population_per_household"]=housing["population"]/housing["households"]

# split test set with scikit-learn, instead we use the more acurate stratum split
#train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42) 

train_set, test_set = stratum_split(housing)

housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

#interpret_data(housing)
training_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(training_num)
print(num_attribs)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
# 	housing_num_tr = num_pipeline.fit_transform(training_num)
# 	print(housing_num_tr.shape)
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', LabelBinarizerPipelineFriendly()),
    ])
# 	cat = cat_pipeline.fit_transform(train_set)
# 	print(cat.shape)
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
print(housing.shape)
housing_prepared = full_pipeline.fit_transform(housing)
#print(housing_prepared)
print(housing_prepared.shape)
print(housing_prepared.shape)



['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
(16512, 9)
DataFrameSelector:
(16512, 8)
combined attributes:
(16512, 11)
DataFrameSelector:
(16512, 1)
(16512, 16)
(16512, 16)


In [36]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [60]:
some_data = housing.iloc[:5]
print(some_data.shape)
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print(some_data_prepared.shape)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

(5, 9)
DataFrameSelector:
(5, 8)
combined attributes:
(5, 11)
DataFrameSelector:
(5, 1)
(5, 16)
('Predictions:', array([210644.60459286, 317768.80697211, 210956.43331178,  59218.98886849,
       189747.55849879]))
('Labels:', [286600.0, 340600.0, 196900.0, 46300.0, 254500.0])


In [63]:
#check mean squared error
from sklearn.metrics import mean_squared_error
def check_mse(data, labels, model):
    
    predictions = model.predict(data)
    model_mse = mean_squared_error(labels, predictions)
    model_rmse = np.sqrt(model_mse)
    print(model_rmse)

check_mse(housing_prepared, housing_labels, lin_reg)

68628.19819848923


In [80]:
from sklearn.model_selection import cross_val_score
def check_mse_cross_validation(data, labels, model):
    scores = cross_val_score(model, data, labels, scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    return tree_rmse_scores

In [81]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    

In [64]:
#make decision tree model
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [65]:
#check mse for decission tree
check_mse(housing_prepared, housing_labels, tree_reg)

0.0


In [84]:
tree_rmse_scores = check_mse_cross_validation(housing_prepared, housing_labels, tree_reg)
display_scores(tree_rmse_scores)

('Scores:', array([68412.16768296, 67320.32091185, 69136.82824501, 67778.09585112,
       71017.80203566, 75446.66825312, 70404.31147174, 71274.36683137,
       77502.15531233, 69932.6442633 ]))
('Mean:', 70822.53608584664)
('Standard deviation:', 3120.811186835279)


In [87]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_rmse_scores = check_mse_cross_validation(housing_prepared, housing_labels, forest_reg)
display_scores(forest_rmse_scores)

('Scores:', array([51761.59314537, 49519.06351825, 52218.51312455, 54695.67527694,
       52698.91723318, 55406.90596006, 51207.08033053, 50881.01775344,
       55517.98337045, 54048.16607546]))
('Mean:', 52795.49157882074)
('Standard deviation:', 1942.826702940512)


In [88]:
#find the best hyperparameter fit --- we should look up more information on this
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [90]:
final_model = grid_search.best_estimator_
X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse) # => evaluates to 47,766.0

DataFrameSelector:
(4128, 8)
combined attributes:
(4128, 11)
DataFrameSelector:
(4128, 1)


In [91]:
print(final_rmse)

47726.76421578336
