In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,LassoLarsIC
from sklearn.datasets import california_housing

In [2]:
data = california_housing.fetch_california_housing()

In [3]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = pd.DataFrame(data['target'])

In [4]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Feature Engineering

In [5]:
poly = PolynomialFeatures(5)
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly,columns = poly.get_feature_names(X.columns))
X_poly.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup Latitude^3 Longitude,AveOccup Latitude^2 Longitude^2,AveOccup Latitude Longitude^3,AveOccup Longitude^4,Latitude^5,Latitude^4 Longitude,Latitude^3 Longitude^2,Latitude^2 Longitude^3,Latitude Longitude^4,Longitude^5
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,-16978250.0,54784900.0,-176778200.0,570422400.0,77991960.0,-251662000.0,812055200.0,-2620314000.0,8455148000.0,-27282810000.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,-13993750.0,45174740.0,-145833500.0,470781100.0,77786290.0,-251110400.0,810636900.0,-2616906000.0,8447919000.0,-27271650000.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,-18574600.0,59988340.0,-193737800.0,625693700.0,77683610.0,-250886300.0,810259900.0,-2616808000.0,8451217000.0,-27293970000.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,-16890270.0,54553120.0,-176198600.0,569096000.0,77683610.0,-250906800.0,810392500.0,-2617450000.0,8453983000.0,-27305140000.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,-14460900.0,46706590.0,-150855500.0,487241400.0,77683610.0,-250906800.0,810392500.0,-2617450000.0,8453983000.0,-27305140000.0


# Train / Validation / Test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=1000, random_state=42)
X_train_v, X_val, y_train_v, y_val = train_test_split(X_train,y_train,test_size=1000, random_state=123)

In [7]:
scaler = StandardScaler()

In [8]:
scaler.fit(X_train_v)

X_train_v = scaler.transform(X_train_v)
X_val = scaler.transform(X_val)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
X_train_v = pd.DataFrame(X_train_v,columns = poly.get_feature_names(X.columns))
X_train_v.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup Latitude^3 Longitude,AveOccup Latitude^2 Longitude^2,AveOccup Latitude Longitude^3,AveOccup Longitude^4,Latitude^5,Latitude^4 Longitude,Latitude^3 Longitude^2,Latitude^2 Longitude^3,Latitude Longitude^4,Longitude^5
0,0.0,-0.031841,-0.606435,0.178642,0.160661,-0.595176,0.041127,-0.545903,-0.237692,-0.186257,...,-0.005482,0.01682,-0.029316,0.043002,-0.590697,0.545365,-0.477044,0.368404,-0.179051,-0.20365
1,0.0,0.710348,-1.955232,2.009597,0.717012,-0.975745,0.02693,-0.995294,1.124305,0.398111,...,0.027197,-0.018086,0.007829,0.003626,-0.936847,0.960049,-0.986692,1.018847,-1.059197,1.104234
2,0.0,-0.442032,1.297749,-0.468423,-0.443074,-0.389582,-0.041752,1.083141,-1.375184,-0.431269,...,-0.008149,0.000576,0.008104,-0.017937,1.032272,-1.075963,1.126667,-1.18947,1.273577,-1.391023
3,0.0,0.035014,-1.717209,-0.03664,-0.13673,1.251674,0.005834,1.438909,-0.901229,-0.141064,...,-0.069161,0.054684,-0.038891,0.021787,1.475339,-1.437963,1.38522,-1.305472,1.16946,-0.88541
4,0.0,-0.714486,-0.447753,-0.130547,-0.150741,-0.386957,-0.041662,1.101866,-0.362417,-0.563336,...,-0.005086,-0.006701,0.019669,-0.033834,1.054723,-1.015211,0.954652,-0.857364,0.685706,-0.329515


# Fit and evaluate model

In [10]:
lin_reg = LinearRegression()
lin_reg.fit(X_train_v,y_train_v)
lin_reg.score(X_train_v,y_train_v)

0.8233232564899363

In [11]:
lin_reg.score(X_val,y_val)

-53.45944834320309

In [12]:
lin_reg.coef_[0][:4]

array([   66905.66524961,  -842004.72360017,  1799035.51366957,
       -2250306.08356813])

## Ridge

In [13]:
ridge = Ridge(.01)
ridge.fit(X_train_v,y_train_v)
ridge.score(X_train_v,y_train_v)

0.7571279702495323

In [14]:
ridge.score(X_val,y_val)

0.6843848679561307

In [15]:
ridge.coef_[0][:4]

array([ 0.        , -0.93219811, -1.423182  ,  5.75769697])

## Lasso

In [16]:
lasso = Lasso(.01)
lasso.fit(X_train_v,y_train_v)
lasso.score(X_train_v,y_train_v)

  positive)


0.6417759491067909

In [17]:
lasso.score(X_val,y_val)

0.635870879051025

In [18]:
lasso.coef_[:4]

array([ 0.        ,  0.04625235, -0.        , -0.        ])

## Elastic net

In [19]:
elastic = ElasticNet(.01)
elastic.fit(X_train_v,y_train_v)
elastic.score(X_train_v,y_train_v)

  positive)


0.6570321035496832

In [20]:
elastic.score(X_val,y_val)

0.6479229739305834

In [21]:
elastic.coef_[:4]

array([ 0.        ,  0.07733202, -0.        , -0.        ])

## Lasso with BIC

In [22]:
lassobic = LassoLarsIC('bic')
lassobic.fit(X_train_v,y_train_v)
lassobic.score(X_train_v,y_train_v)

  y = column_or_1d(y, warn=True)


0.6966487189392462

In [23]:
lassobic.score(X_val,y_val)

0.6982960192777602

In [24]:
lassobic.coef_[:4]

array([ 0.        ,  0.        ,  0.        , -0.04466391])

In [25]:
lassobic.alpha_

4.030554607546478e-06

## Lasso with AIC

In [26]:
lassoaic = LassoLarsIC('aic')
lassoaic.fit(X_train_v,y_train_v)
lassoaic.score(X_train_v,y_train_v)

  y = column_or_1d(y, warn=True)


0.709521853970287

In [27]:
lassoaic.score(X_val,y_val)

0.7113568994121754

In [28]:
lassoaic.coef_[:4]

array([ 0.        , -0.27480587, -0.50402825,  0.        ])

In [29]:
lassoaic.alpha_

1.8045574403967986e-06

# Final check on the Test dataset

In [30]:
lassoaic.fit(X_train,y_train)
lassoaic.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.7112870526060826