In [2]:
%matplotlib inline

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Polynomial Features Houses Demo

## Housing dataset

Data from [here](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/).

### Prepare data

- read
- set column names
- get to know
    - shape
    - dtypes
- separate predictor and target variables
- normalize
- train test split
- model with linear regression


In [4]:
housing = pd.read_fwf("../datasets/housing/housing.data", header = None)
housing.columns = ["crime_rate", "zoned_land", "industry", "bounds_river", "nox_conc", "rooms", "age", "distance",  "highways", "tax", "pt_ratio", "b_estimator", "pop_status", "price"]
housing.head()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
housing.shape

(506, 14)

In [6]:
housing.dtypes

crime_rate      float64
zoned_land      float64
industry        float64
bounds_river      int64
nox_conc        float64
rooms           float64
age             float64
distance        float64
highways          int64
tax             float64
pt_ratio        float64
b_estimator     float64
pop_status      float64
price           float64
dtype: object

In [7]:
housing_attributes = housing.drop(columns="price")
housing_prices = housing.price

housing_attributes.shape, housing_prices.shape

((506, 13), (506,))

In [8]:
scaler = MinMaxScaler()
housing_attributes_scaled = scaler.fit_transform(housing_attributes)


first_record = pd.DataFrame({
    "before scaling": housing_attributes.iloc[0],
    "after scaling": housing_attributes_scaled[0],
})

first_record

Unnamed: 0,before scaling,after scaling
crime_rate,0.00632,0.0
zoned_land,18.0,0.18
industry,2.31,0.067815
bounds_river,0.0,0.0
nox_conc,0.538,0.314815
rooms,6.575,0.577505
age,65.2,0.641607
distance,4.09,0.405722
highways,1.0,0.0
tax,296.0,0.208015


In [9]:
housing_attributes_train, housing_attributes_test, \
housing_prices_train, housing_prices_test \
= train_test_split(housing_attributes_scaled, housing_prices, train_size=0.8)


for data_set in [
    housing_attributes_train, housing_attributes_test, \
    housing_prices_train, housing_prices_test,
]:
    print(data_set.shape)

(404, 13)
(102, 13)
(404,)
(102,)


### Polynomial Features

In [21]:
polynomial_features = PolynomialFeatures(degree=2, interaction_only=True)
housing_attributes_train_poly = polynomial_features.fit_transform(housing_attributes_train)
housing_attributes_test_poly = polynomial_features.fit_transform(housing_attributes_test)

print(housing_attributes_train_poly.shape, housing_attributes_train.shape)
print(housing_attributes_test_poly.shape, housing_attributes_test.shape)

(404, 92) (404, 13)
(102, 92) (102, 13)


### Linear regression

In [22]:
housing_linreg = LinearRegression()
housing_linreg.fit(housing_attributes_train, housing_prices_train)

housing_poly_linreg = LinearRegression()
housing_poly_linreg.fit(housing_attributes_train_poly, housing_prices_train)

In [23]:
R2_train = housing_linreg.score(housing_attributes_train, housing_prices_train)
R2_poly_train = housing_poly_linreg.score(housing_attributes_train_poly, housing_prices_train)

print(
    "R_squared on training data",
    "-" * 30,
    f"Initial 13 features: {R2_train:0.2f}",
    f"With polynomial features, 92 features in total: {R2_poly_train:0.2f}",
    sep="\n",
)

R_squared on training data
------------------------------
Initial 13 features: 0.68
With polynomial features, 92 features in total: 0.90


In [24]:
R2_test = housing_linreg.score(housing_attributes_test, housing_prices_test)
R2_poly_test = housing_poly_linreg.score(housing_attributes_test_poly, housing_prices_test)

print(
    "R_squared on test data",
    "-" * 30,
    f"Initial 13 features: {R2_test:0.2f}",
    f"With polynomial features, 92 features in total: {R2_poly_test:0.2f}",
    sep="\n",
)

R_squared on test data
------------------------------
Initial 13 features: 0.85
With polynomial features, 92 features in total: 0.90
