In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,LassoLarsIC
from sklearn.datasets import california_housing

In [2]:
data = california_housing.fetch_california_housing()

In [3]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
y = pd.DataFrame(data['target'])

In [4]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Feature Engineering

In [5]:
poly = PolynomialFeatures(4)
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly,columns = poly.get_feature_names(X.columns))
X_poly.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^2 Longitude^2,AveOccup Latitude^3,AveOccup Latitude^2 Longitude,AveOccup Latitude Longitude^2,AveOccup Longitude^3,Latitude^4,Latitude^3 Longitude,Latitude^2 Longitude^2,Latitude Longitude^3,Longitude^4
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,97572.240298,138904.155228,-448211.58642,1446275.0,-4666795.0,2058922.0,-6643665.0,21437570.0,-69174080.0,223208800.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,66494.290247,114496.372343,-369618.24162,1193205.0,-3851915.0,2054577.0,-6632605.0,21411440.0,-69120590.0,223135700.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,117339.30261,151951.874384,-490742.328262,1584897.0,-5118567.0,2052407.0,-6628435.0,21407130.0,-69136260.0,223281800.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,97023.715941,138161.721401,-446242.283784,1441298.0,-4655182.0,2052407.0,-6628977.0,21410630.0,-69153230.0,223354900.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,71120.549434,118289.537908,-382058.018738,1233992.0,-3985614.0,2052407.0,-6628977.0,21410630.0,-69153230.0,223354900.0


# Train / Validation / Test split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_poly,y,test_size=1000, random_state=42)
X_train_v, X_val, y_train_v, y_val = train_test_split(X_train,y_train,test_size=1000, random_state=123)

In [7]:
scaler = StandardScaler()

In [8]:
scaler.fit(X_train_v)

X_train_v = scaler.transform(X_train_v)
X_val = scaler.transform(X_val)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
X_train_v = pd.DataFrame(X_train_v,columns = poly.get_feature_names(X.columns))
X_train_v.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^2 Longitude^2,AveOccup Latitude^3,AveOccup Latitude^2 Longitude,AveOccup Latitude Longitude^2,AveOccup Longitude^3,Latitude^4,Latitude^3 Longitude,Latitude^2 Longitude^2,Latitude Longitude^3,Longitude^4
0,0.0,-0.031841,-0.606435,0.178642,0.160661,-0.595176,0.041127,-0.545903,-0.237692,-0.186257,...,-0.00983,0.005252,-0.016561,0.029028,-0.042681,-0.581496,0.5198,-0.417376,0.226303,0.212196
1,0.0,0.710348,-1.955232,2.009597,0.717012,-0.975745,0.02693,-0.995294,1.124305,0.398111,...,-0.00996,-0.023165,0.013604,-0.002856,-0.009131,-0.952464,0.978705,-1.011467,1.055372,-1.109298
2,0.0,-0.442032,1.297749,-0.468423,-0.443074,-0.389582,-0.041752,1.083141,-1.375184,-0.431269,...,-0.010268,0.003544,0.004389,-0.013453,0.023692,1.047769,-1.096871,1.15955,-1.248752,1.387178
3,0.0,0.035014,-1.717209,-0.03664,-0.13673,1.251674,0.005834,1.438909,-0.901229,-0.141064,...,-0.010024,0.066459,-0.051657,0.035507,-0.018016,1.468881,-1.420977,1.346491,-1.211717,0.889535
4,0.0,-0.714486,-0.447753,-0.130547,-0.150741,-0.386957,-0.041662,1.101866,-0.362417,-0.563336,...,-0.010287,0.004366,0.007742,-0.021066,0.035617,1.069312,-1.014503,0.922587,-0.748965,0.33781


# Fit and evaluate model

In [10]:
lin_reg = LinearRegression()
lin_reg.fit(X_train_v,y_train_v)
lin_reg.score(X_train_v,y_train_v)

0.7860248960326462

In [11]:
lin_reg.score(X_val,y_val)

0.08289367918819879

In [12]:
lin_reg.coef_[0][:4]

array([-2.77837740e-02, -2.85041581e+01, -1.74602175e+04,  3.06457446e+03])

## Ridge

In [13]:
ridge = Ridge(.01)
ridge.fit(X_train_v,y_train_v)
ridge.score(X_train_v,y_train_v)

0.7432457622992911

In [14]:
ridge.score(X_val,y_val)

0.7222128760745654

In [15]:
ridge.coef_[0][:4]

array([ 0.        , -2.64438438, -2.82102713,  6.01174362])

## Lasso

In [16]:
lasso = Lasso(.01)
lasso.fit(X_train_v,y_train_v)
lasso.score(X_train_v,y_train_v)

  positive)


0.6391517029387219

In [17]:
lasso.score(X_val,y_val)

0.6288515164354378

In [18]:
lasso.coef_[:4]

array([ 0.        ,  0.06957557, -0.        , -0.        ])

## Elastic net

In [19]:
elastic = ElasticNet(.01)
elastic.fit(X_train_v,y_train_v)
elastic.score(X_train_v,y_train_v)

  positive)


0.653078070028835

In [20]:
elastic.score(X_val,y_val)

0.6405229534456697

In [21]:
elastic.coef_[:4]

array([ 0.        ,  0.07880727, -0.        , -0.        ])

## Lasso with BIC

In [22]:
lassobic = LassoLarsIC('bic')
lassobic.fit(X_train_v,y_train_v)
lassobic.score(X_train_v,y_train_v)

  y = column_or_1d(y, warn=True)


0.7072892146808394

In [23]:
lassobic.score(X_val,y_val)

0.7083725599709958

In [24]:
lassobic.coef_[:4]

array([ 0.        , -1.04842472, -0.87447034,  0.        ])

In [25]:
lassobic.alpha_

1.3490839925834908e-06

## Lasso with AIC

In [26]:
lassoaic = LassoLarsIC('aic')
lassoaic.fit(X_train_v,y_train_v)
lassoaic.score(X_train_v,y_train_v)

  y = column_or_1d(y, warn=True)


0.722420095325462

In [27]:
lassoaic.score(X_val,y_val)

0.7172793533768542

In [28]:
lassoaic.coef_[:4]

array([ 0.        , -2.10251945, -1.5630619 ,  0.59039225])

In [29]:
lassoaic.alpha_

3.034595585936979e-07

# Final check on the Test dataset

In [30]:
lassoaic.fit(X_train,y_train)
lassoaic.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.7197468684407464