# Housing Prices Predictions

## 1. Data Preparation

In [43]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

In [44]:
# Seed NumPy's legacy global RNG so that NumPy-based randomness in this
# notebook is repeatable. The value is named so it can be tuned in one place.
RANDOM_SEED = 88
np.random.seed(RANDOM_SEED)

In [45]:
# Notebook-wide matplotlib defaults: font sizes for text, axes, legend and
# ticks, plus the default figure size (width, height in inches).
plt.rcParams.update({
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 14,
    'legend.fontsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'figure.titlesize': 16,
    'figure.figsize': [10, 6],
})

Read the data, then build income classes so that the train/test split can be stratified on them.

In [46]:
# Load the raw housing table; the path is relative to the notebook's working directory.
housing = pd.read_csv("data/chap2/housing.csv")

In [57]:
# Columns that contain missing values, with their NaN counts.
# The counts are computed once and filtered through a callable instead of
# evaluating isna().sum() twice.
housing.isna().sum().loc[lambda counts: counts > 0]

total_bedrooms    207
dtype: int64

In [47]:
# Bucket median_income into five labelled classes; the new column is used
# below to stratify the train/test split.
housing["income_class"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [48]:
# 80/20 split, stratified on income_class so both subsets keep roughly the
# same income-class proportions; random_state fixes the shuffle.
housing_train, housing_test = sklearn.model_selection.train_test_split(
    housing,
    test_size=0.2,
    random_state=88,
    stratify=housing["income_class"],
)

In [49]:
# income_class was only needed for stratification, so remove it from both
# subsets. Reassigning (instead of inplace=True) avoids mutating the split
# frames in place, and errors="ignore" keeps the cell re-runnable once the
# column is already gone.
housing_train = housing_train.drop(columns="income_class", errors="ignore")
housing_test = housing_test.drop(columns="income_class", errors="ignore")

In [50]:
# Sanity check: (rows, columns) of the training and test subsets.
housing_train.shape, housing_test.shape

((16512, 10), (4128, 10))

In [52]:
# Split the target off the features. Reassigning (rather than dropping in
# place) together with the column guard keeps this cell safe to re-run: once
# the target has been removed, a second execution is a no-op, not a KeyError.
if "median_house_value" in housing_train.columns:
    # .copy() gives the labels their own data, independent of the feature frame.
    housing_train_labels = housing_train["median_house_value"].copy()
    housing_train = housing_train.drop(columns=["median_house_value"])

In [53]:
# Sanity check: the feature frame has lost the target column, and there is
# one label per training row.
housing_train.shape, housing_train_labels.shape

((16512, 9), (16512,))

### 1.1 Missing Values (Numerical)

We can drop the affected rows, drop the whole column, or impute the missing values (with 0, the median, the mean, etc.).

The benefit of creating an imputer object is that it stores the median of each column, so we can preprocess the test/validation sets with the same values.

In [60]:
# Imputer configured to replace missing values with a per-column median
# (the medians themselves are only learned when the imputer is fitted).
imputer = sklearn.impute.SimpleImputer(strategy="median")
# Trailing expression: display the (still unfitted) estimator.
imputer

In [61]:
# Keep only the numeric feature columns.
housing_train_num = housing_train.select_dtypes(include="number")

In [62]:
# Numeric training columns that contain missing values, with their NaN counts.
# The counts are computed once and filtered through a callable instead of
# evaluating isna().sum() twice.
housing_train_num.isna().sum().loc[lambda counts: counts > 0]

total_bedrooms    162
dtype: int64

We can call `fit_transform` directly, or call `fit` and then `transform`.

In [66]:
# One-step variant: learn the medians and fill the missing values in one call.
# NOTE(review): this rebinds housing_train_num to the transform output (the
# next cell has to re-attach the column names); a later cell re-selects the
# numeric DataFrame under the same name. A distinct name would be clearer.
housing_train_num = imputer.fit_transform(housing_train_num)

In [70]:
# Re-attach the numeric column names to the imputed values and confirm that
# no column has missing entries left.
(
    pd.DataFrame(
        housing_train_num,
        columns=housing_train.select_dtypes(include=[np.number]).columns,
    )
    .isna()
    .sum()
)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
dtype: int64

In [71]:
# Start again from the un-imputed numeric features so that fit and transform
# can be shown as two separate steps.
housing_train_num = housing_train.select_dtypes(include="number")

This is basically "training" the imputer.

In [73]:
# Learn the per-column medians from the numeric training features; the data
# itself is not modified by this step.
imputer.fit(housing_train_num)

In [74]:
# Fill values learned by fit(), one per numeric column.
imputer.statistics_

array([-118.49   ,   34.25   ,   29.     , 2138.     ,  435.     ,
       1167.     ,  410.     ,    3.52955])

In [76]:
# Cross-check: medians computed directly with pandas should match
# imputer.statistics_.
housing_train_num.median().to_numpy()

array([-118.49   ,   34.25   ,   29.     , 2138.     ,  435.     ,
       1167.     ,  410.     ,    3.52955])

In [77]:
# Fill the missing entries using the medians stored in the fitted imputer.
# The result is wrapped back into a labelled DataFrame in the next cell.
X = imputer.transform(housing_train_num)

In [78]:
# Restore the column names and the original row index on the imputed values,
# then display the resulting frame.
X = pd.DataFrame(
    X,
    columns=housing_train_num.columns,
    index=housing_train_num.index,
)
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
20180,-121.33,38.77,3.0,20214.0,3559.0,8361.0,3112.0,4.2259
16716,-121.59,39.78,16.0,2754.0,570.0,1063.0,543.0,1.4048
18333,-121.18,37.99,31.0,2450.0,559.0,1459.0,478.0,2.4674
20161,-122.44,37.75,28.0,4930.0,1381.0,2232.0,1321.0,4.3232
5051,-121.43,37.74,52.0,994.0,258.0,623.0,264.0,1.7250
...,...,...,...,...,...,...,...,...
7195,-118.24,33.89,32.0,1132.0,266.0,1211.0,279.0,2.1838
10861,-121.36,38.64,24.0,6540.0,1008.0,2667.0,1031.0,5.5632
577,-121.27,37.96,43.0,1624.0,448.0,1805.0,440.0,1.4250
3430,-117.40,33.96,51.0,1806.0,322.0,709.0,298.0,3.5750


### 1.2 Categorical Variables

In [80]:
# Column dtypes of the full frame (it still carries the helper income_class column).
housing.dtypes

longitude              float64
latitude               float64
housing_median_age     float64
total_rooms            float64
total_bedrooms         float64
population             float64
households             float64
median_income          float64
median_house_value     float64
ocean_proximity         object
income_class          category
dtype: object

In [81]:
# Keep only the object-dtype (text) feature columns.
housing_train_cat = housing_train.select_dtypes(include="object")

In [83]:
# Number of training rows per category value.
housing_train_cat.value_counts()

ocean_proximity
<1H OCEAN          7317
INLAND             5217
NEAR OCEAN         2147
NEAR BAY           1826
ISLAND                5
Name: count, dtype: int64

In [85]:
# IPython `??` introspection: prints the class signature and source.
# NOTE(review): interactive aid only; consider removing this cell (and its
# long output) from the finished notebook.
sklearn.preprocessing.OrdinalEncoder??

Init signature:
sklearn.preprocessing.OrdinalEncoder(
    *,
    categories='auto',
    dtype=<class 'numpy.float64'>,
    handle_unknown='error',
    unknown_value=None,
    encoded_missing_value=nan,
    min_frequency=None,
    max_categories=None,
)
Source:
class OrdinalEncoder(OneToOneFeatureMixin

In [86]:
# Encoder with default settings; it maps each category to a numeric code.
ordinal_encoder = sklearn.preprocessing.OrdinalEncoder()
# Trailing expression: display the (still unfitted) estimator.
ordinal_encoder

In [87]:
# Learn the set of categories from the training data's categorical column(s).
ordinal_encoder.fit(housing_train_cat)

In [90]:
# Names of the input columns the encoder saw during fit.
ordinal_encoder.feature_names_in_

array(['ocean_proximity'], dtype=object)

Representing categorical values with numbers can cause problems as some models will assume that nearby values are more similar than distant ones. That's fine with ordered categories but it's not the case here.

In [88]:
# Learned categories, one array per input column; a category's position in
# its array is the code it is encoded as.
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [99]:
# Encode the training categories with the fitted encoder.
housing_train_cat_encoded = ordinal_encoder.transform(housing_train_cat)

In [100]:
# Peek at the first ten encoded rows rather than dumping the whole array.
housing_train_cat_encoded[:10]

array([[1.],
       [1.],
       [1.],
       [3.],
       [1.],
       [4.],
       [4.],
       [1.],
       [1.],
       [1.]])