In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,LassoLarsIC
from sklearn.datasets import california_housing

In [2]:
data = california_housing.fetch_california_housing()

In [3]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = pd.DataFrame(data['target'])

In [14]:
X.shape

(20640, 8)

In [4]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Feature Engineering

In [5]:
poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly,columns = poly.get_feature_names(X.columns))
X_poly.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^3,AveOccup^2 Latitude,AveOccup^2 Longitude,AveOccup Latitude^2,AveOccup Latitude Longitude,AveOccup Longitude^2,Latitude^3,Latitude^2 Longitude,Latitude Longitude^2,Longitude^3
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,16.689986,247.389136,-798.267531,3666.952356,-11832.407244,38180.441856,54353.799872,-175387.142512,565933.749452,-1826137.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,9.391819,168.531236,-544.054085,3024.204235,-9762.76391,31516.24419,54267.751656,-175187.654712,565542.397224,-1825689.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,22.005195,297.223199,-959.909216,4014.580565,-12965.451209,41873.097907,54224.761625,-175123.7744,565578.07616,-1826586.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,16.541323,245.723138,-793.650028,3650.24363,-11789.756507,38079.200342,54224.761625,-175138.100625,565670.615625,-1827034.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,10.381164,180.120545,-581.763186,3125.218967,-10094.003137,32602.163369,54224.761625,-175138.100625,565670.615625,-1827034.0


In [19]:
X_poly.MedInc.mean(),X_poly.MedInc.std()

(3.8706710029069766, 1.8998217179452688)

# Train / Validation / Test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=1000, random_state=42)
X_train_v, X_val, y_train_v, y_val = train_test_split(X_train,y_train,test_size=1000, random_state=123)

In [10]:
scaler = StandardScaler()

In [12]:
X_train_v.shape

(18640, 165)

In [15]:
scaler.fit(X_train_v)

X_train_v = scaler.transform(X_train_v)
X_val = scaler.transform(X_val)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
X_train_v = pd.DataFrame(X_train_v,columns = poly.get_feature_names(X.columns))
X_train_v.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^3,AveOccup^2 Latitude,AveOccup^2 Longitude,AveOccup Latitude^2,AveOccup Latitude Longitude,AveOccup Longitude^2,Latitude^3,Latitude^2 Longitude,Latitude Longitude^2,Longitude^3
0,0.0,-0.031841,-0.606435,0.178642,0.160661,-0.595176,0.041127,-0.545903,-0.237692,-0.186257,...,-0.008607,-0.009915,0.009856,0.016224,-0.028653,0.042264,-0.570919,0.479383,-0.293239,-0.22072
1,0.0,0.710348,-1.955232,2.009597,0.717012,-0.975745,0.02693,-0.995294,1.124305,0.398111,...,-0.008608,-0.010022,0.009968,-0.008952,-0.002306,0.014845,-0.967408,0.999812,-1.046976,1.114333
2,0.0,-0.442032,1.297749,-0.468423,-0.443074,-0.389582,-0.041752,1.083141,-1.375184,-0.431269,...,-0.008609,-0.010288,0.010312,-0.009461,0.018921,-0.029578,1.061375,-1.121935,1.214756,-1.383256
3,0.0,0.035014,-1.717209,-0.03664,-0.13673,1.251674,0.005834,1.438909,-0.901229,-0.141064,...,-0.008608,-0.010019,0.010063,0.048508,-0.031992,0.014103,1.460523,-1.394254,1.265869,-0.893547
4,0.0,-0.714486,-0.447753,-0.130547,-0.150741,-0.386957,-0.041662,1.101866,-0.362417,-0.563336,...,-0.008609,-0.010287,0.010321,-0.008876,0.022565,-0.037514,1.08199,-0.999793,0.830729,-0.346061


# Fit and evaluate model

In [20]:
lin_reg = LinearRegression()
lin_reg.fit(X_train_v,y_train_v)
lin_reg.score(X_train_v,y_train_v)

0.7400074016715636

In [21]:
lin_reg.score(X_val,y_val)

0.7208757118008782

In [22]:
lin_reg.coef_[0][:4]

array([ 1.27452942e-08,  1.10402586e+02,  2.39397374e+02, -1.34144271e+02])

## Ridge

In [23]:
ridge = Ridge(.01)
ridge.fit(X_train_v,y_train_v)
ridge.score(X_train_v,y_train_v)

0.7219628882718296

In [24]:
ridge.score(X_val,y_val)

0.7179066633866409

In [25]:
ridge.coef_[0][:4]

array([ 0.        , -5.87107175, -3.85038808,  4.88116481])

## Lasso

In [26]:
lasso = Lasso(.01)
lasso.fit(X_train_v,y_train_v)
lasso.score(X_train_v,y_train_v)

  positive)


0.6253833099873541

In [27]:
lasso.score(X_val,y_val)

0.6017477680187624

In [30]:
lasso.coef_[:4]

array([ 0.        ,  0.20558595, -0.        , -0.        ])

## Elastic net

In [32]:
elastic = ElasticNet(.01,l1_ratio=.1)
elastic.fit(X_train_v,y_train_v)
elastic.score(X_train_v,y_train_v)

  positive)


0.6596695576264529

In [33]:
elastic.score(X_val,y_val)

0.6413883415552051

In [36]:
elastic.coef_[:4]

array([ 0.        ,  0.13024514, -0.01656491, -0.03794804])

## Lasso with BIC

In [None]:
lassobic = LassoLarsIC('bic')
lassobic.fit(X_train_v,y_train_v)
lassobic.score(X_train_v,y_train_v)

In [None]:
lassobic.score(X_val,y_val)

In [None]:
lassobic.coef_[:4]

In [None]:
lassobic.alpha_

## Lasso with AIC

In [None]:
lassoaic = LassoLarsIC('aic')
lassoaic.fit(X_train_v,y_train_v)
lassoaic.score(X_train_v,y_train_v)

In [None]:
lassoaic.score(X_val,y_val)

In [None]:
lassoaic.coef_[:4]

In [None]:
lassoaic.alpha_

# Final check on the Test dataset

In [None]:
lin_reg.fit(X_train,y_train)
lin_reg.score(X_test,y_test)