In [None]:
%load_ext autoreload
%autoreload 2

import os
import tarfile
import urllib
import urllib.request
from pathlib import Path

import matplotlib.pyplot as plt
import polars as pl
from constants import PROJECT_ROOT

# Base URL under which the dataset archive is published.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory used for the downloaded archive and the CSV it contains.
# NOTE(review): assumes PROJECT_ROOT is a pathlib.Path - confirm in constants.py.
HOUSING_PATH = PROJECT_ROOT / "datasets" / "housing"
# Full URL of the compressed housing dataset.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and its
            extracted members; it is created when missing.

    Errors raised by the download or by ``tarfile`` are not caught here and
    propagate to the caller.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: use the "data" filter so
            # members cannot escape ``housing_path`` or carry special files.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Interpreter without extraction-filter support.
            housing_tgz.extractall(path=housing_path)


fetch_housing_data()

In [None]:
def load_data(filename: str, housing_path: Path = HOUSING_PATH) -> pl.DataFrame:
    """Read the CSV file ``filename`` located in ``housing_path``.

    Args:
        filename: Name of the CSV file, relative to ``housing_path``.
        housing_path: Directory that contains the file.

    Returns:
        The file's contents as a Polars DataFrame.
    """
    csv_path = housing_path / filename
    return pl.read_csv(csv_path)


df = load_data(filename="housing.csv")

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Frame dimensions and column schema, one per line.
print(df.shape, df.schema, sep="\n")
# Per-column null counts, shown as the cell's output.
df.null_count()

In [None]:
# Frequency of each ocean_proximity value, most common first.
print(
    df["ocean_proximity"]
    .value_counts()
    .sort("count", descending=True)
)

In [None]:
# Summary statistics for the frame, displayed as the cell's output.
df.describe()

In [None]:
%matplotlib inline

# Convert to pandas to use its DataFrame.hist plotting helper (50 bins).
df.to_pandas().hist(figsize=(20, 12), bins=50)
plt.show()

In [None]:
# Inclusive upper bound -> category label, in ascending bound order.
# The final infinite bound catches everything above 6.0.
mapping = {1.5: 1, 3.0: 2, 4.5: 3, 6.0: 4, float("inf"): 5}


def map_median_income(value) -> int:
    """Return the label of the first bucket whose upper bound is >= ``value``.

    Buckets are scanned in the insertion order of ``mapping``. When no bound
    satisfies ``value <= bound`` (for example ``value`` is NaN) the result is
    ``None``.
    """
    return next(
        (label for bound, label in mapping.items() if value <= bound),
        None,
    )


# Bucket median_income into an integer income_category column; the column is
# used further down to stratify the train/test split.
df = df.with_columns(
    income_category=pl.col("median_income").map_elements(
        map_median_income, return_dtype=pl.Int64
    )
)

In [None]:
# Distribution of the derived income buckets.
df.to_pandas()["income_category"].plot.hist(figsize=(10, 6))
plt.xlabel("Income Category")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is the same on every run.
random_state = 42

# 80% train / remainder test, stratified on the income bucket column.
housing_train, housing_test = train_test_split(
    df,
    stratify=df.get_column("income_category"),
    train_size=0.8,
    random_state=random_state,
)

In [None]:
def income_category_proportions(frame: pl.DataFrame) -> pl.DataFrame:
    """Summarise how the rows of ``frame`` are spread over income categories.

    Args:
        frame: A frame that still contains the ``income_category`` column.

    Returns:
        One row per category with its ``count`` and ``proportion``
        (count divided by the total count), sorted by count descending.
    """
    return (
        frame.get_column("income_category")
        .value_counts()
        .with_columns(proportion=(pl.col("count") / pl.col("count").sum()))
        .sort("count", descending=True)
    )


# Print the full dataset first, then the train and test splits, so the
# category proportions can be compared side by side.
for subset in (df, housing_train, housing_test):
    print(income_category_proportions(subset))

In [None]:
# Remove the helper column that was only needed for stratification.
# Selecting "everything except income_category" (rather than a strict drop)
# keeps this cell re-runnable: a second execution, when the column is already
# gone, is a no-op instead of a column-not-found error.
housing_train = housing_train.select(pl.exclude("income_category"))
housing_test = housing_test.select(pl.exclude("income_category"))

# pandas copy of the training split, used by the plotting cells below.
housing = housing_train.to_pandas()

In [None]:
# Plot latitude against longitude; the low alpha makes overlapping points
# show up as darker regions.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    c="blue",
    alpha=0.1,
    figsize=(10, 6),
)
plt.show()

In [None]:
# Same map, with marker size driven by population (scaled down by 100) and
# marker colour driven by median_house_value on the "jet" colormap.
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=housing["population"] / 100,
    c="median_house_value",
    cmap="jet",
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()
plt.show()

In [None]:
# Pairwise correlations of the remaining columns once the ocean_proximity
# column has been set aside.
corr_matrix = housing.drop(columns=["ocean_proximity"]).corr()

In [None]:
# The five largest correlations with median_house_value, highest first.
corr_matrix.loc[:, "median_house_value"].sort_values(ascending=False).iloc[:5]

In [None]:
from pandas.plotting import scatter_matrix

# Columns to compare pairwise in the scatter-matrix grid.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(20, 12))
plt.show()