In [None]:
%load_ext autoreload
%autoreload 2

import os
import tarfile
import urllib
import urllib.request
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from constants import PROJECT_ROOT

# Remote location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
HOUSING_PATH = PROJECT_ROOT / "datasets" / "housing"

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created (including parents) when missing.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


# Download only when the extracted CSV is missing, so re-running the notebook
# does not hit the network every time. Delete the file to force a fresh download.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [None]:
def load_data(filename: str, housing_path: Path = HOUSING_PATH) -> pd.DataFrame:
    """Read a CSV file located in the housing data directory.

    Args:
        filename: Name of the CSV file inside ``housing_path``.
        housing_path: Directory containing the file. Plain strings are accepted
            as well as ``Path`` objects.

    Returns:
        The file contents as a DataFrame.
    """
    # Coerce to Path so a str directory does not break the ``/`` join.
    return pd.read_csv(Path(housing_path) / filename)


# Full housing table, read from the default data directory.
df = load_data(filename="housing.csv")

In [None]:
# Peek at the first five rows (five is the default for head()).
df.head()

In [None]:
# (rows, columns) followed by the per-column count of non-null values.
table_shape = df.shape
print(table_shape)
df.count()

In [None]:
# Number of rows per ocean_proximity value, largest count first.
proximity_counts = df["ocean_proximity"].value_counts()
proximity_counts.sort_values(ascending=False)

In [None]:
# Summary statistics of the table's columns.
df.describe()

In [None]:
%matplotlib inline

# Histogram grid for the table's columns, 50 bins each.
df.hist(bins=50, figsize=(20, 12))
plt.show()

In [None]:
# Upper bound of each income bucket -> bucket label, in ascending order.
# The last bucket is open-ended.
mapping = dict(zip((1.5, 3.0, 4.5, 6.0), range(1, 5)))
mapping[float("inf")] = 5


def map_median_income(value) -> int:
    """Return the label of the first bucket whose upper bound is >= ``value``.

    Buckets are scanned in the insertion order of ``mapping``. A value that
    compares false against every bound (e.g. NaN) yields ``None``.
    """
    matching_labels = (label for threshold, label in mapping.items() if value <= threshold)
    return next(matching_labels, None)


# Bucket every median_income value and store the label as a new column on df.
income_labels = df["median_income"].map(map_median_income)
df["income_category"] = income_labels

In [None]:
# Distribution of the income buckets.
income_ax = df["income_category"].plot(kind="hist", figsize=(10, 6))
income_ax.set_xlabel("Income Category")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Seed passed to the split so it is repeatable.
random_state = 42

# 80% train split, stratified on the income_category column.
income_strata = df["income_category"]
housing_train, housing_test = train_test_split(
    df,
    train_size=0.8,
    random_state=random_state,
    stratify=income_strata,
)

In [None]:
# income_category was only needed for stratification; remove it from both splits.
housing_train = housing_train.drop(columns="income_category")
housing_test = housing_test.drop(columns="income_category")

# Working copy for exploration, so the training split itself stays untouched.
housing = housing_train.copy()

In [None]:
# Longitude/latitude scatter; the low alpha lets overlapping points show as darker areas.
housing.plot.scatter(x="longitude", y="latitude", c="blue", alpha=0.1, figsize=(10, 6))
plt.show()

In [None]:
# Same map, with marker size taken from population and colour from median_house_value.
marker_sizes = housing["population"] / 100
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=marker_sizes,
    c="median_house_value",
    cmap="jet",
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()
plt.show()

In [None]:
# Pairwise correlations after removing the ocean_proximity column.
housing_without_proximity = housing.drop(columns="ocean_proximity")
corr_matrix = housing_without_proximity.corr()

In [None]:
# Five highest correlations with median_house_value.
house_value_corr = corr_matrix["median_house_value"]
house_value_corr.sort_values(ascending=False).head()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for a handful of selected columns.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(20, 12))
plt.show()

In [None]:
# median_income against median_house_value.
housing.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.1,
    figsize=(10, 6),
)
plt.show()

In [None]:
# Ratio features added to the exploration copy only.
n_households = housing["households"]
n_rooms = housing["total_rooms"]
housing["rooms_per_household"] = n_rooms / n_households
housing["bedrooms_per_room"] = housing["total_bedrooms"] / n_rooms
housing["population_per_household"] = housing["population"] / n_households

# Recompute the correlations so the new columns are included.
corr_matrix = housing.drop(columns="ocean_proximity").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the features from the regression target.
label_column = "median_house_value"
housing_X_train = housing_train.drop(columns=label_column)
housing_y_train = housing_train[label_column]

In [None]:
from sklearn.impute import SimpleImputer

# Training features without the ocean_proximity column.
housing_num = housing_X_train.drop(columns="ocean_proximity")

# Median imputation; the result is wrapped back into a DataFrame that keeps
# the original column names and index.
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(
    imputer.fit_transform(housing_num),
    columns=housing_num.columns,
    index=housing_num.index,
)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Take the categorical column from the training features, not from the full
# table `df`, so the encoders are fit on the same rows as every other
# preprocessing step and never see the test split.
housing_cat = housing_X_train[["ocean_proximity"]]

ordinal_encoder = OrdinalEncoder()

housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
print(housing_cat_encoded[:10])
print(ordinal_encoder.categories_)

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()

housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# A bare expression in the middle of a cell is never displayed, so print the
# first rows explicitly. The encoder returns a sparse matrix by default;
# densify the small slice to make it readable.
print(housing_cat_1hot[:10].toarray())
print(cat_encoder.categories_)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from transformers import CombinedAttributesAdder

# Stand-alone run of the project-local transformer on the raw values of `housing`.
# NOTE(review): at this point `housing` also holds the ratio columns added during
# exploration, the target column and the ocean_proximity text column — confirm that
# CombinedAttributesAdder reads the intended column positions from this array.
# `housing_extra_attribs` is not used by any later cell visible in this notebook.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Numeric preprocessing: median imputation, then the attribute adder with its
# default settings, then standardisation.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder()),
        ("std_scaler", StandardScaler()),
    ]
)
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups: everything in housing_num goes through the numeric pipeline,
# ocean_proximity is one-hot encoded.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

column_steps = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(column_steps)

# `housing` holds more columns than the two groups above; only the listed
# columns are handed to the transformers.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_y_train)

In [None]:
# Quick sanity check on the first five training rows.
some_data = housing.head(5)
some_labels = housing_y_train.head(5)
some_data_prepared = full_pipeline.transform(some_data)

some_predictions = lin_reg.predict(some_data_prepared)
some_score = lin_reg.score(some_data_prepared, some_labels)

print(f"Predictions: {some_predictions}")
print(f"Labels: {list(some_labels)}")
print(f"Score: {some_score:.2f}")

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the data it was fitted on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_y_train, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated runs give the same model.
tree_reg = DecisionTreeRegressor(random_state=random_state)
tree_reg.fit(housing_prepared, housing_y_train)

# RMSE of the tree on the data it was fitted on. Stored under tree_* names so
# the linear model's lin_mse / lin_rmse are not overwritten.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_y_train, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scorer returns negated MSE, so flip
# the sign before taking the square root.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prepared,
    y=housing_y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation."""
    report_lines = (
        f"Scores: {scores}",
        f"Mean: {np.mean(scores)}",
        f"Standard deviation: {np.std(scores)}",
    )
    print(*report_lines, sep="\n")


# Per-fold RMSE of the decision tree, with mean and standard deviation.
display_scores(tree_rmse_scores)

In [None]:
# Same 10-fold evaluation for the linear model.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_y_train,
    cv=10,
    scoring="neg_mean_squared_error",
)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model and its CV scores are repeatable.
forest_reg = RandomForestRegressor(random_state=random_state)
forest_reg.fit(housing_prepared, housing_y_train)

# 10-fold cross-validation; the scorer returns negated MSE, hence the sign flip.
for_scores = cross_val_score(
    forest_reg, housing_prepared, housing_y_train, scoring="neg_mean_squared_error", cv=10
)
for_rmse_scores = np.sqrt(-for_scores)
display_scores(for_rmse_scores)

In [None]:
import joblib

# Persist the three fitted models under <project root>/models.
models_dir = PROJECT_ROOT / "models"
# makedirs with exist_ok avoids the check-then-create race and also creates
# missing parent directories.
os.makedirs(models_dir, exist_ok=True)

joblib.dump(lin_reg, models_dir / "lin_reg.pkl")
joblib.dump(tree_reg, models_dir / "tree_reg.pkl")
joblib.dump(forest_reg, models_dir / "forest_reg.pkl")

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids: the first with the forest's default bootstrap setting, the second
# with bootstrap disabled.
param_grid = [
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
]

# Keep the fitted search object so best_params_ and cv_results_ (including the
# train scores requested below) stay available for inspection.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=random_state),  # seeded for repeatable results
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    refit=True,
)
grid_search.fit(housing_prepared, housing_y_train)

# As before, forest_reg ends up bound to the refitted best estimator.
forest_reg = grid_search.best_estimator_