In [164]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

In [166]:
def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching and unpacking it if needed.

    The CSV is expected at ``datasets/housing/housing.csv`` (relative to the
    current working directory). When it is missing, the archive
    ``datasets/housing.tgz`` is downloaded (only if it is not already on disk)
    and extracted into ``datasets/`` before the CSV is read.

    Returns:
        pandas.DataFrame: the parsed contents of ``datasets/housing/housing.csv``.

    Raises:
        urllib.error.URLError: if the archive must be downloaded and the request fails.
        tarfile.TarError: if the archive cannot be opened or extracted.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "housing.tgz"
    csv_path = datasets_dir / "housing" / "housing.csv"

    # Key the work on the CSV, not on the tarball: a tarball that was downloaded
    # but never (or only partially) extracted must still be unpacked.
    if not csv_path.is_file():
        datasets_dir.mkdir(parents=True, exist_ok=True)
        # Only hit the network when the archive itself is missing.
        if not tarball_path.is_file():
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # Use the "data" extraction filter where the running Python provides
            # it, so archive members cannot write outside datasets/.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path=datasets_dir, filter="data")
            else:
                housing_tarball.extractall(path=datasets_dir)
    return pd.read_csv(csv_path)

# Load the dataset into a DataFrame and preview its first rows
# (head() shows five rows by default).
housing = load_housing_data()
housing.head()

In [167]:
# Print a per-column summary: non-null counts and dtypes, plus memory usage.
housing.info()

In [168]:
# Count how many rows fall into each distinct ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [169]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing.describe()

In [170]:
import matplotlib.pyplot as plt

# Draw one 50-bin histogram per numeric column on a 12x8 inch figure.
housing.hist(bins=50, figsize=(12, 8))
plt.show()

In [171]:
# Creating a test set

In [172]:
import numpy as np

def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (for example a pandas DataFrame).
        test_ratio: Fraction of the rows, between 0 and 1 inclusive, to put in
            the test part. The test size is ``int(len(data) * test_ratio)``,
            i.e. truncated toward zero.
        random_state: Optional seed. When ``None`` (the default) the global
            ``np.random`` state is used, exactly as before. Otherwise a
            dedicated generator seeded with this value is used, so the same
            seed always yields the same split.

    Returns:
        tuple: ``(train, test)`` selections of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1]. A negative or >1
            ratio would otherwise produce silently wrong slices.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Backward-compatible path: honours any prior np.random.seed(...) call.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


In [173]:
# Random 80/20 split. The permutation comes from NumPy's global random state,
# so the rows chosen differ between runs unless that state is seeded first.
train_set, test_set = shuffle_and_split_data(housing, 0.2)

In [174]:
from zlib import crc32
def is_id_in_test_set(identifier, test_ratio):
    """Decide from a hash of ``identifier`` whether it belongs to the test set.

    The identifier is converted to ``np.int64`` and its CRC32 checksum is
    compared with ``test_ratio * 2**32``; identifiers whose checksum falls
    below that cutoff are assigned to the test set. The same identifier
    therefore always gets the same answer for a given ratio.
    """
    checksum = crc32(np.int64(identifier))
    cutoff = test_ratio * 2**32
    return checksum < cutoff
def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into (train, test) using a hash of each row's identifier.

    Every value in ``data[id_column]`` is passed to ``is_id_in_test_set``;
    rows for which it returns True form the test part, all others the
    training part.
    """
    def belongs_to_test(row_id):
        return is_id_in_test_set(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [175]:
# Use the row index as the identifier for the hash-based split.
housing_with_id = housing.reset_index() # adds an `index` column
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "index")

In [176]:
# Build an identifier from the coordinates instead of the row index and
# split on it.
# NOTE(review): is_id_in_test_set converts the id with np.int64 before hashing,
# so only the integer part of this float id contributes to the hash - confirm
# that this granularity is acceptable.
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "id")

In [177]:
from sklearn.model_selection import train_test_split
# Same 80/20 random split via scikit-learn; random_state=42 makes it repeatable.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


In [178]:
# Bucket median_income into five labelled categories (1-5); the last bucket
# is open-ended because its upper edge is infinity.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [179]:
# Bar chart of the number of rows per income category, ordered by category label.
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
plt.show()

In [180]:
from sklearn.model_selection import StratifiedShuffleSplit
# Generate ten stratified 80/20 splits, stratifying on the income category,
# and collect each [train, test] pair in strat_splits.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
income_strata = housing["income_cat"]
strat_splits = []
for train_index, test_index in splitter.split(housing, income_strata):
    strat_train_set_n, strat_test_set_n = (
        housing.iloc[train_index],
        housing.iloc[test_index],
    )
    strat_splits.append([strat_train_set_n, strat_test_set_n])

In [181]:
# Take the first [train, test] pair from the list of stratified splits.
strat_train_set, strat_test_set = strat_splits[0]

In [182]:
# Single stratified 80/20 split (stratified on income_cat). This rebinds
# strat_train_set / strat_test_set, replacing the pair taken from strat_splits.
strat_train_set, strat_test_set = train_test_split(
 housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [183]:
# Share of the stratified test set that falls into each income category.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [184]:
# income_cat is not used after the stratified split, so remove it from both
# parts. Rebinding to the result of drop() (rather than dropping in place)
# avoids mutating frames that other variables may still reference, and
# errors="ignore" keeps the cell safe to re-run once the column is gone.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [185]:
# Work on a copy so that changes made to `housing` do not modify strat_train_set.
housing = strat_train_set.copy()

In [186]:
# Scatter plot of the rows by longitude (x) and latitude (y).
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True)
plt.show()

In [187]:
# Same plot with semi-transparent markers (alpha=0.2) so that areas where
# many points overlap appear darker.
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()

In [188]:
# Geographic scatter plot where marker size is population / 100 and marker
# colour encodes median_house_value through the "jet" colormap (with colorbar).
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True,
 s=housing["population"] / 100, label="population",
 c="median_house_value", cmap="jet", colorbar=True,
 legend=True, sharex=False, figsize=(10, 7))
plt.show()

In [189]:
# Display the current contents of `housing`.
housing

In [190]:
# Correlation matrix of the numeric columns. ocean_proximity holds text, so it
# is left out of the computation. The result is stored only in corr_matrx:
# `housing` must keep holding the district rows, because later cells plot it
# and derive new columns from it.
corr_matrx = housing.drop("ocean_proximity", axis=1).corr()

In [191]:
# Show the median_house_value column of corr_matrx, largest value first.
corr_matrx["median_house_value"].sort_values(ascending=False)

In [192]:
from pandas.plotting import scatter_matrix
# Grid of pairwise scatter plots for four selected columns of `housing`.
attributes = ["median_house_value", "median_income", "total_rooms",
 "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
plt.show()

In [193]:
# Scatter plot of median_income (x) against median_house_value (y); the low
# alpha makes regions where many points overlap stand out.
housing.plot(kind="scatter", x="median_income", y="median_house_value",
 alpha=0.1, grid=True)
plt.show()

In [194]:
# Add three ratio columns computed from existing columns:
# rooms per household, bedrooms per room, and people per household.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]

In [195]:
# corr_matrix = housing.corr()
# corr_matrix["median_house_value"].sort_values(ascending = False)

In [196]:
# Separate predictors from the target: `housing` becomes the training set
# without the median_house_value column, `housing_labels` a copy of that column.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [197]:
# Preparing the data for ML algorithms

In [198]:
# Predictors (`housing`) and target (`housing_labels`) taken from strat_train_set.
# NOTE(review): these are the same two statements as the previous code cell;
# both derive from strat_train_set, so running them again gives the same
# result - one of the two cells looks redundant.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [199]:
# Display the predictors and the labels together (shown as a tuple).
housing, housing_labels

In [200]:
from sklearn.impute import SimpleImputer

# Imputer that fills missing values with each column's median (learned in fit).
imputer = SimpleImputer(strategy="median")

In [201]:
# Keep only the numeric columns; a median cannot be computed for text columns.
housing_num = housing.select_dtypes(include=[np.number])

In [202]:
# Learn the median of every numeric column (stored in imputer.statistics_).
imputer.fit(housing_num)

In [203]:
# The per-column medians learned by the imputer.
imputer.statistics_

In [204]:
# Medians computed directly with pandas, for comparison with
# imputer.statistics_ (the two are expected to match).
housing_num.median().values

In [205]:
# Replace missing values with the learned medians. With scikit-learn's default
# output settings the result is a plain NumPy array (no column names or index).
X = imputer.transform(housing_num)

In [206]:
# Wrap the imputed values in a DataFrame again, reusing the column names and
# row index of housing_num.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [207]:
# Display the imputed numeric data.
housing_tr

In [208]:
# Handling the non-numerical attribute: select ocean_proximity with double
# brackets so the result is a one-column DataFrame rather than a Series,
# then preview its first eight rows.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(8)

In [209]:
from sklearn.preprocessing import OrdinalEncoder
# Encode each ocean_proximity category as a single number (one code per category).
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [210]:
 # First eight encoded values.
 housing_cat_encoded[:8]

In [211]:
# Categories learned during fit; a category's position is its numeric code.
ordinal_encoder.categories_

In [212]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode ocean_proximity: one binary column per category. With the
# default settings the encoder returns a SciPy sparse matrix, which is why the
# next cell calls .toarray() to view it densely.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [213]:
# Convert the sparse one-hot matrix to a dense NumPy array for display.
housing_cat_1hot.toarray()

In [214]:
# Categories learned by the one-hot encoder, in output-column order.
cat_encoder.categories_

In [215]:
# pd.get_dummies builds its columns only from the values present in the frame
# it is given, so this two-category frame yields two dummy columns.
df_test = pd.DataFrame({"ocean_proximity": ["INLAND", "NEAR BAY"]})
pd.get_dummies(df_test)

In [216]:
# The fitted encoder instead emits one column per category learned during
# fit, regardless of which categories appear in df_test.
cat_encoder.transform(df_test)

In [217]:
# A second small frame with different category values; get_dummies again
# creates columns only for the values present here.
df_test_unknown = pd.DataFrame({"ocean_proximity": ["<2H OCEAN", "ISLAND"]})
pd.get_dummies(df_test_unknown)

In [218]:
# Change the setting on the already-fitted encoder: with
# handle_unknown="ignore", a category not seen during fit is encoded as an
# all-zero row instead of raising an error. The setting stays in effect for
# later transform calls on cat_encoder.
cat_encoder.handle_unknown = "ignore"
cat_encoder.transform(df_test_unknown)

In [219]:
# Names of the input columns the encoder saw during fit.
cat_encoder.feature_names_in_

In [220]:
# Names of the one-hot output columns produced by the encoder.
cat_encoder.get_feature_names_out()


In [221]:
# pg 75

In [222]:
# Display the encoder object (its repr shows the configured parameters).
cat_encoder

In [223]:
# NOTE(review): looks like a leftover scratch cell unrelated to the analysis
# (the message also appears to be missing a trailing "d") - consider deleting.
print("hello worl")