In [None]:
# Common imports
import numpy as np
import pandas as pd
import os

# Loading Data

In [None]:
# Dataset description:
# https://developers.google.com/machine-learning/crash-course/california-housing-data-description

# Location of the CSV, relative to the directory the notebook runs from.
dataset_path = "../datasets/housing/housing.csv"
housing = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows as the cell output.
housing.head(n=5)

# Prepare the data for ML

### Shuffle and split the dataset into training and test sets

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Build the stratification key: ceil(median_income / 1.5), with every category of
# 5 or above merged into a single 5.0 bucket.
income_cat = np.ceil(housing["median_income"] / 1.5)
# Assign the capped Series back to the frame rather than calling
# `.where(..., inplace=True)` on `housing["income_cat"]`: that chained in-place
# update acts on a temporary object under pandas Copy-on-Write and would leave
# the column uncapped.
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so exactly one (train, test) pair of index arrays is produced.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row indices, so select with .iloc; .loc would only be
# correct while `housing` still carries a default 0..n-1 index.
# The helper column is dropped without `inplace=True`, so each set is a fresh
# frame and nothing derived from `housing` is mutated behind the scenes.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

print(len(strat_train_set), "train +", len(strat_test_set), "test")

### Create training features and labels

In [None]:
# Training data: separate the predictors from the target column.
# `drop` returns a new frame, so `strat_train_set` itself is left untouched.
housing = strat_train_set.drop(labels="median_house_value", axis="columns")
# Independent copy of the target values.
housing_labels = strat_train_set.loc[:, "median_house_value"].copy()

### Create pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Fall back to the
# old class (and keep the `Imputer` name bound) on scikit-learn < 0.20.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

from combined_attributes import CombinedAttributesAdder

# Preprocessing for the numerical columns, applied in this order:
#   1. 'imputer'       - fill missing values with the per-column median
#   2. 'attribs_adder' - project-local transformer from combined_attributes.py
#                        (presumably appends derived features - confirm there)
#   3. 'std_scaler'    - standardize every feature with StandardScaler
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

In [None]:
# ColumnTransformer ships with scikit-learn since 0.20 (sklearn.compose), and the
# built-in OneHotEncoder accepts string categories from that release on. Prefer
# the official classes and only fall back to the local `future_encoders`
# backport on older scikit-learn versions, where the first import fails.
try:
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import ColumnTransformer
    from future_encoders import OneHotEncoder

# The single categorical column is one-hot encoded; every other column of
# `housing` goes through the numerical pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing.drop(cat_attribs, axis=1))

# Route each column group to its transformer; the outputs are concatenated
# column-wise in the order listed here.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

### Prepare data

In [None]:
# Prepared training data: fit `full_pipeline` on `housing` and transform it in
# the same call.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Shape of the prepared data returned by the pipeline.
housing_prepared.shape

In [None]:
# Shape of the labels; the row count is expected to match housing_prepared.
housing_labels.shape

In [None]:
# Display the prepared data as the cell output.
housing_prepared