In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

#### Regression Model on Housing

In [16]:
# Load the California housing data as pandas objects (features in X, target in y)
X, y = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)

In [17]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
# Display the target Series (pandas abbreviates long Series in the output)
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [19]:
# What features should we use?
# Start with a single predictor column; the double-bracket style selection keeps
# the result a DataFrame, which is the 2-D input the estimators expect.
simple_features = ['MedInc']
X_simple = X[simple_features]

In [20]:
# Create train/test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X_simple,
    y,
    random_state=42,
)

In [37]:
# Baseline: cross-validate a plain linear regression on the single-feature
# training set and report the mean of the per-fold scores (default scorer).
lr_simple = LinearRegression()
simple_cv_scores = cross_val_score(lr_simple, X_train, y_train)
simple_cv_scores.mean()

0.474088737030003

In [24]:
# Create polynomial features.
# By hand this would be: X_simple['med^2'] = X_simple['MedInc'] ** 2
poly_features = PolynomialFeatures(
    degree=2,
    include_bias=False,
)

# Fit the expansion on the training rows only, then apply the already-fitted
# transformer to the test rows.
X_train_quad = poly_features.fit_transform(X_train)
X_test_quad = poly_features.transform(X_test)

In [27]:
# Cross-validate the same linear model on the quadratic features.
# NOTE: this cell scores with negative mean squared error, whereas the other
# cross_val_score cells in this notebook pass no `scoring` and therefore report
# the estimator's default score (R^2 for LinearRegression). The resulting
# numbers are on different scales and are not directly comparable.
quad_cv_scores = cross_val_score(
    lr_simple,
    X_train_quad,
    y_train,
    scoring='neg_mean_squared_error',
)
quad_cv_scores.mean()

-0.6956027209777113

In [29]:
# consider other features
# Reuse the row labels of the existing split instead of calling train_test_split
# a second time: selecting by the index of y_train / y_test guarantees that
# X_train_all / X_test_all line up row-for-row with the targets, rather than
# relying on two separate calls shuffling identically because they share a seed.
X_train_all = X.loc[y_train.index]
X_test_all = X.loc[y_test.index]

In [13]:
# Build the polynomial feature transformer for the full feature set.
# degree=2 is the library default, spelled out here so the expansion is explicit.
poly_features = PolynomialFeatures(
    degree=2,
    include_bias=False,
)

In [30]:
from sklearn.pipeline import Pipeline

In [32]:
# Build the polynomial-regression pipeline.
# Each step gets its own estimator instance: Pipeline does not clone its steps,
# so passing the shared `poly_features` / `lr_simple` objects would mean that
# fitting the pipeline refits those objects, and refitting them elsewhere
# changes the pipeline. The parameters match the standalone objects.
poly_pipe = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),  # first create polynomial features
    ('model', LinearRegression()),                     # then fit the model on those features
])

In [None]:
# fit the pipeline


In [40]:
# Cross-validate the full pipeline; show the per-fold scores rather than the mean.
# NOTE(review): in the recorded run the folds disagree wildly (one fold is far
# below zero), so a mean would be dominated by that fold. Presumably extreme
# feature values are amplified by the squared/interaction terms -- TODO confirm.
poly_cv_scores = cross_val_score(poly_pipe, X_train_all, y_train)
poly_cv_scores

array([   0.32809676,    0.6796999 , -250.93275431,    0.67882742,
         -1.10820833])

In [35]:
# which is best?? -- as context, check how many columns the polynomial
# expansion produces from the full feature set.
# Note: fit_transform refits `poly_features` on X_train_all.
X_train_all_poly = poly_features.fit_transform(X_train_all)
X_train_all_poly.shape

(15480, 44)

In [36]:
# Shape of the training features before any polynomial expansion, for comparison
X_train_all.shape

(15480, 8)