In [124]:
import os
import tarfile
import urllib 
import pandas as pd
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import joblib
from sklearn.model_selection import GridSearchCV
from scipy import stats


In [None]:
# Remote root of the dataset repository, the local directory the data is
# stored in, and the full URL of the archive to download.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives the downloaded ``housing.tgz`` and its
        extracted contents; it is created if it does not exist.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: reject members that would
            # be written outside ``housing_path`` where filters are supported.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# info() prints its own summary; describe() is the cell's displayed value.
# A bare expression between the two would be evaluated and thrown away,
# so only these two calls are kept.
housing.info()
housing.describe()

In [None]:
# Only the last expression of a cell is echoed, so the first table has to be
# printed explicitly or it is silently discarded.
print(housing["ocean_proximity"].value_counts())
housing["population"].value_counts()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# One 50-bin histogram per numeric column.
housing.hist(figsize=(20, 15), bins=50)
plt.show()


In [None]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a training and a test DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    test_ratio : float
        Fraction of rows assigned to the test set; the number of test rows
        is ``int(len(data) * test_ratio)``.
    seed : int, optional
        Seed for the shuffle. The default of 42 yields the same permutation
        as ``np.random.seed(42)`` followed by ``np.random.permutation``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    # A private generator keeps the split repeatable without reseeding NumPy's
    # global random state as a side effect of calling this function.
    rng = np.random.RandomState(seed)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set.
train_set, test_set = split_train_test(housing, 0.2)

In [None]:
# Preview the training rows.
train_set.head()

In [None]:
def test_set_cheak(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

In [None]:
def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``test_set_cheak``; rows
    for which it returns true form the test set, all others the training set.
    """
    test_mask = data[id_column].apply(
        lambda row_id: test_set_cheak(row_id, test_ratio)
    )
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows

In [None]:
# Expose the row index as an "index" column and use it as the identifier
# for the hash-based split.
housing_whit_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(
    data=housing_whit_id, test_ratio=0.2, id_column="index"
)
len(test_set)

In [None]:
# Preview the frame that now carries the identifier column.
housing_whit_id.head()


In [None]:
# Derive an identifier from the coordinates (longitude * 1000 + latitude)
# and split on it instead of on the row number.
housing_whit_id["id"] = housing["longitude"].mul(1000).add(housing["latitude"])
train_set, test_set = split_train_test_by_id(
    data=housing_whit_id, test_ratio=0.2, id_column="id"
)

In [None]:
# Check what kind of object the split returned.
type(train_set)

In [None]:
# Same 80/20 random split, this time with scikit-learn's helper.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

In [None]:
# Column dtypes and non-null counts of the identifier-augmented frame.
housing_whit_id.info()

In [None]:
# Bucket median_income into five labelled categories for stratification.
housing["income_cat"] = pd.cut(
    x=housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
)

In [None]:
# Distribution of rows across the income categories.
housing["income_cat"].hist()

In [None]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# The splitter yields row positions, not index labels, so select with .iloc;
# .loc only gives the same rows while the frame has a default 0..n-1 index.
# .copy() makes both sets independent frames that can be modified later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
# Share of test rows falling in each income category.
strat_test_set["income_cat"].value_counts()/len(strat_test_set)

In [None]:
# Reassign instead of dropping in place: this avoids mutating frames derived
# from `housing`, and errors="ignore" keeps the cell safe to run again after
# the helper column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# Explore on a copy so the training set itself is left unmodified.
housing = strat_train_set.copy()



In [None]:
# Scatter plot of the rows by longitude and latitude.
housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# Same plot with transparency, so overlapping points show up as darker areas.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# Marker size follows population; marker colour follows median_house_value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.5,
    s=housing["population"] / 60,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

In [None]:
# Restrict to numeric columns first: ocean_proximity holds strings, and
# DataFrame.corr() raises on non-numeric columns in recent pandas versions.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the target and three selected columns.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(frame=housing[attributes], figsize=(12, 8))

In [None]:
# Target plotted against median income.
housing.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    figsize=(10, 7),
    alpha=0.1,
)

In [None]:
# Ratio features built from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [None]:
# Confirm the new ratio columns are present.
housing.info()

In [None]:
# Preview the frame including the ratio columns.
housing.head()

In [None]:
# Recompute the correlations now that the ratio columns exist. Only numeric
# columns are kept: ocean_proximity holds strings, and DataFrame.corr()
# raises on non-numeric columns in recent pandas versions.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the predictors from the target column of the training set.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [None]:
# The median can only be computed on numeric columns, so the text column is
# left out before fitting the imputer.
housing_num = housing.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)

In [None]:
# Apply the fitted imputer to the numeric columns.
X = imputer.transform(housing_num)

In [None]:
# Wrap the transformed values back in a DataFrame, restoring the original
# column names and index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Double brackets keep the selection a one-column DataFrame rather than a Series.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

In [None]:
# Encode the category column with OrdinalEncoder and show the encoded values.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded

In [None]:
# Categories found by the ordinal encoder during fitting.
ordinal_encoder.categories_


In [None]:
# Encode the category column with OneHotEncoder.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot  # converted with .toarray() in the following cell

In [None]:
# Show the one-hot result as a dense array.
housing_cat_1hot.toarray()

In [None]:
# Categories found by the one-hot encoder during fitting.
cat_encoder.categories_

In [None]:
# Column positions read by CombinedAttributesAdder.transform.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D feature array.

    It always adds rooms-per-household and population-per-household and,
    when ``add_bedrooms_per_room`` is true, bedrooms-per-room as well. The
    source columns are located through the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix`` positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether the bedrooms-per-room column is appended.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` converts to a
        2-D array (for example a DataFrame). ``y`` is ignored.
        """
        # Positional X[:, i] slicing only works on arrays; coercing here lets
        # a DataFrame be passed directly. Arrays are returned unchanged.
        X = np.asarray(X)
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]
        
# Try the transformer on the raw values, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
# Numeric preprocessing: impute medians, append ratio features, standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Route numeric columns through num_pipeline and the text column through a
# one-hot encoder, then fit and apply both in one step.
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_num.columns)
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Inspect the fully preprocessed training matrix.
housing_prepared

In [None]:
# Fit a linear model, then compare its predictions with the true labels on
# the first five training rows.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data, some_labels = housing.iloc[:5], housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

In [None]:

# Training-set RMSE of the linear model.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


In [None]:
# Seed the estimator so the fitted tree, and every score derived from it,
# is the same on each run of the notebook.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# Training-set RMSE of the decision tree.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse


In [None]:
# 10-fold cross-validation of the tree. The scorer returns negated MSE
# values, so flip the sign before taking the square root.
scores = cross_val_score(
    tree_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(-scores)


In [None]:
def display_scores(scores):
    """Print ``scores`` followed by its mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods, as a NumPy
    array does.
    """
    print("Scores:", scores)
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, statistic)())

# Report the cross-validated RMSE of the decision tree.
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model, for comparison with the tree.
lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [111]:
# Seed the forest so the fitted model and its training RMSE are the same on
# each run of the notebook.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse


18659.544560304355

In [112]:
# 10-fold cross-validation of the random forest.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    cv=10,
    scoring="neg_mean_squared_error",
)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [50910.78350383 48790.40047072 46596.87049705 51940.71948009
 47872.38793752 51739.20539776 52639.92662818 49794.45515917
 48408.67370493 53748.88270408]
Mean: 50244.23054833275
Standard deviation: 2195.2978206399844


In [115]:
# Two grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest so the search returns the same scores and the same best
# estimator on each run of the notebook.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    forest_reg, param_grid, cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [117]:
# Print the cross-validated RMSE next to each parameter combination.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-mean_score), params)

65394.300106114504 {'max_features': 2, 'n_estimators': 3}
55182.86985104986 {'max_features': 2, 'n_estimators': 10}
52976.40902514194 {'max_features': 2, 'n_estimators': 30}
59946.6899372299 {'max_features': 4, 'n_estimators': 3}
52750.26918303858 {'max_features': 4, 'n_estimators': 10}
50295.26795658741 {'max_features': 4, 'n_estimators': 30}
59964.37215508855 {'max_features': 6, 'n_estimators': 3}
51786.22893791141 {'max_features': 6, 'n_estimators': 10}
50022.35259930357 {'max_features': 6, 'n_estimators': 30}
58488.09221170579 {'max_features': 8, 'n_estimators': 3}
52278.874672774 {'max_features': 8, 'n_estimators': 10}
49917.92762263737 {'max_features': 8, 'n_estimators': 30}
62978.35572964752 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53698.47443341907 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59649.973824350796 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52418.854484374046 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [116]:
# Feature importances reported by the best estimator found by the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.96642906e-02, 6.30448769e-02, 4.46724109e-02, 1.56328264e-02,
       1.44216399e-02, 1.50304187e-02, 1.43434668e-02, 3.65250782e-01,
       4.85162703e-02, 1.10195035e-01, 6.50476130e-02, 8.89956260e-03,
       1.60688483e-01, 9.30365225e-05, 1.90715213e-03, 2.59213523e-03])

In [118]:
# Pair each importance with a column name: the numeric columns, then the
# ratio features, then the one-hot categories. Listed largest first.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3652507818462352, 'median_income'),
 (0.16068848343360312, 'INLAND'),
 (0.1101950346702832, 'pop_per_hhold'),
 (0.06966429061511047, 'longitude'),
 (0.06504761300702576, 'bedrooms_per_room'),
 (0.06304487693800145, 'latitude'),
 (0.04851627031281158, 'rooms_per_hhold'),
 (0.044672410947744434, 'housing_median_age'),
 (0.015632826405299602, 'total_rooms'),
 (0.015030418664097236, 'population'),
 (0.014421639870756956, 'total_bedrooms'),
 (0.014343466798481553, 'households'),
 (0.008899562601294733, '<1H OCEAN'),
 (0.002592135232509077, 'NEAR OCEAN'),
 (0.0019071521342226064, 'NEAR BAY'),
 (9.303652252285839e-05, 'ISLAND')]

In [121]:
# Keep the best estimator from the grid search as the final model.
final_model = grid_search.best_estimator_
type(final_model)

sklearn.ensemble._forest.RandomForestRegressor

In [122]:
# Evaluate the final model on the held-out test set. The pipeline is only
# applied here (transform), not fitted again.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [126]:
# 95% t-interval for the mean squared error, reported on the RMSE scale.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(
    stats.t.interval(
        confidence,
        len(squared_errors) - 1,
        scale=stats.sem(squared_errors),
        loc=squared_errors.mean(),
    )
)

array([45915.35029311, 49854.64090923])