In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,KFold,cross_validate
from sklearn.preprocessing import PolynomialFeatures,StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.datasets import california_housing

In [2]:
# Fetch the California housing dataset; the result is indexed below by
# 'data', 'feature_names' and 'target'.
# NOTE(review): newer scikit-learn releases expose this loader directly as
# sklearn.datasets.fetch_california_housing — confirm the submodule import
# still exists in the installed version.
data = california_housing.fetch_california_housing()

In [3]:
# Wrap the raw arrays in pandas objects: the features keep their published
# column names, the target becomes a single-column frame (default label 0).
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = pd.DataFrame(data=data['target'])

In [4]:
# Preview the first rows of the raw feature table.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# Feature Engineering

In [5]:
# Degree-2 expansion: bias term, the original features, their squares and
# all pairwise products.
poly = PolynomialFeatures(degree=2)

In [6]:
# Fit the expansion on the raw features; the trailing ';' suppresses the
# estimator repr in the cell output.
poly.fit(X);

In [7]:
# transform returns a NumPy array rather than a DataFrame, so the column
# labels are lost at this point.
X_poly = poly.transform(X)

In [8]:
# Demonstration: the transformed data is a NumPy array, which has no .head().
# The error is caught and printed so the notebook still completes under
# Restart Kernel -> Run All instead of halting on this cell.
try:
    X_poly.head()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [9]:
# Display the array itself; NumPy abbreviates the repr of large arrays with '...'.
X_poly

array([[ 1.00000000e+00,  8.32520000e+00,  4.10000000e+01, ...,
         1.43489440e+03, -4.63007240e+03,  1.49401729e+04],
       [ 1.00000000e+00,  8.30140000e+00,  2.10000000e+01, ...,
         1.43337960e+03, -4.62724920e+03,  1.49377284e+04],
       [ 1.00000000e+00,  7.25740000e+00,  5.20000000e+01, ...,
         1.43262250e+03, -4.62678400e+03,  1.49426176e+04],
       ...,
       [ 1.00000000e+00,  1.70000000e+00,  1.70000000e+01, ...,
         1.55472490e+03, -4.77970460e+03,  1.46942884e+04],
       [ 1.00000000e+00,  1.86720000e+00,  1.80000000e+01, ...,
         1.55472490e+03, -4.78364760e+03,  1.47185424e+04],
       [ 1.00000000e+00,  2.38860000e+00,  1.60000000e+01, ...,
         1.54999690e+03, -4.77321880e+03,  1.46991376e+04]])

### Bringing back to pandas

In [10]:
# Re-wrap the NumPy array in a DataFrame; the columns carry default integer
# labels until the feature names are restored.
X_poly = pd.DataFrame(data=X_poly)
X_poly.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,246016.0,1389.920904,18773.6,-60631.04,7.85266,106.065537,-342.548249,1432.6225,-4626.784,14942.6176
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,311364.0,1421.753425,21120.3,-68215.5,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625


### Recovering the column names

In [11]:
# Ask the fitted transformer for the names of the generated terms, using the
# original column names as the base labels.
# NOTE(review): later scikit-learn releases replace get_feature_names with
# get_feature_names_out — confirm against the installed version before upgrading.
cols = poly.get_feature_names(X.columns)
cols[:5]

['1', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms']

In [12]:
# Label the expanded columns so each one is identified by its term
# (e.g. 'MedInc^2', 'Population AveOccup').
X_poly.columns = cols
X_poly.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,246016.0,1389.920904,18773.6,-60631.04,7.85266,106.065537,-342.548249,1432.6225,-4626.784,14942.6176
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,311364.0,1421.753425,21120.3,-68215.5,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625


In [13]:
# Same expansion at degree 3, transformed and labelled in a single step.
poly3 = PolynomialFeatures(degree=3)
X_poly3 = pd.DataFrame(
    data=poly3.fit_transform(X),
    columns=poly3.get_feature_names(X.columns),
)
X_poly3.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,AveOccup^3,AveOccup^2 Latitude,AveOccup^2 Longitude,AveOccup Latitude^2,AveOccup Latitude Longitude,AveOccup Longitude^2,Latitude^3,Latitude^2 Longitude,Latitude Longitude^2,Longitude^3
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,69.308955,...,16.689986,247.389136,-798.267531,3666.952356,-11832.407244,38180.441856,54353.799872,-175387.142512,565933.749452,-1826137.0
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,68.913242,...,9.391819,168.531236,-544.054085,3024.204235,-9762.76391,31516.24419,54267.751656,-175187.654712,565542.397224,-1825689.0
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,52.669855,...,22.005195,297.223199,-959.909216,4014.580565,-12965.451209,41873.097907,54224.761625,-175123.7744,565578.07616,-1826586.0
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,...,16.541323,245.723138,-793.650028,3650.24363,-11789.756507,38079.200342,54224.761625,-175138.100625,565670.615625,-1827034.0
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,...,10.381164,180.120545,-581.763186,3125.218967,-10094.003137,32602.163369,54224.761625,-175138.100625,565670.615625,-1827034.0


# Train / Validation / Test split

In [14]:
# Two-stage split: hold out 1000 rows as the final test set, then take another
# 1000 rows of the remainder as the validation set. Both stages are seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=1000, random_state=42
)
X_train_v, X_val, y_train_v, y_val = train_test_split(
    X_train, y_train, test_size=1000, random_state=123
)

In [15]:
# Unfitted scaler; it is fitted further down on the training portion only.
scaler = StandardScaler()

In [16]:
# Peek at the training portion before scaling; the index shows the shuffled
# original row labels.
X_train_v.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
17477,1.0,3.8056,21.0,5.881517,1.175355,746.0,3.535545,34.47,-120.05,14.482591,...,556516.0,2637.516588,25714.62,-89557.3,12.500079,121.870237,-424.44218,1188.1809,-4138.1235,14412.0025
12220,1.0,5.2066,4.0,10.5,1.445652,311.0,3.380435,33.51,-117.32,27.108684,...,96721.0,1051.315217,10421.61,-36486.52,11.427339,113.27837,-396.592609,1122.9201,-3931.3932,13763.9824
1764,1.0,3.0313,45.0,4.24933,0.882038,981.0,2.630027,37.95,-122.33,9.18878,...,962361.0,2580.0563,37228.95,-120005.73,6.917041,99.809517,-321.73118,1440.2025,-4642.4235,14964.6289
12869,1.0,3.9318,7.0,5.338479,1.030871,2857.0,3.149945,38.71,-121.38,15.459051,...,8162449.0,8999.392503,110594.47,-346782.66,9.922153,121.934366,-382.340309,1498.4641,-4698.6198,14733.1044
20086,1.0,2.517,23.0,5.101604,1.024064,984.0,2.631016,37.99,-120.3,6.335289,...,968256.0,2588.919786,37382.16,-118375.2,6.922245,99.952299,-316.51123,1443.2401,-4570.197,14472.09


In [17]:
# Learn the scaling statistics from the training portion only, then push the
# validation and test portions through that same fitted scaler.
# NOTE: this cell rebinds its own inputs, so run it once per kernel session —
# a second execution would scale data that is already scaled.
X_train_v = scaler.fit_transform(X_train_v)
X_val, X_test = (scaler.transform(part) for part in (X_val, X_test))

In [18]:
# Demonstration: the scaler also returns a NumPy array, so .head() is gone
# again. The error is caught and printed so the notebook still completes under
# Restart Kernel -> Run All instead of halting on this cell.
try:
    X_train_v.head()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [19]:
# Display the scaled training array; the constant bias column '1' becomes 0
# after centering.
X_train_v

array([[ 0.        , -0.03184148, -0.60643492, ..., -0.55903237,
         0.3919493 ,  0.22921982],
       [ 0.        ,  0.71034785, -1.95523152, ..., -0.98168359,
         1.03006446, -1.11933583],
       [ 0.        , -0.44203219,  1.29774851, ...,  1.07314556,
        -1.16467534,  1.37925738],
       ...,
       [ 0.        ,  0.23669226,  1.85313534, ...,  0.98976849,
        -1.08415775,  1.30801756],
       [ 0.        , -0.89455048, -0.84445785, ...,  0.34566525,
        -0.32905312,  0.18926053],
       [ 0.        ,  0.78260668, -0.92379882, ..., -0.89898136,
         0.89572557, -0.81088389]])

In [20]:
# Restore the column labels on the scaled training array. The row index
# restarts at 0 because the array carries no index of its own.
X_train_v = pd.DataFrame(
    data=X_train_v,
    columns=poly.get_feature_names(X.columns),
)
X_train_v.head()

Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
0,0.0,-0.031841,-0.606435,0.178642,0.160661,-0.595176,0.041127,-0.545903,-0.237692,-0.186257,...,-0.201979,-0.035212,-0.618676,0.593722,-0.009883,0.028188,-0.041747,-0.559032,0.391949,0.22922
1,0.0,0.710348,-1.955232,2.009597,0.717012,-0.975745,0.02693,-0.995294,1.124305,0.398111,...,-0.235333,-0.056218,-0.999334,0.983844,-0.009974,0.007664,-0.020776,-0.981684,1.030064,-1.119336
2,0.0,-0.442032,1.297749,-0.468423,-0.443074,-0.389582,-0.041752,1.083141,-1.375184,-0.431269,...,-0.172538,-0.035973,-0.332074,0.369896,-0.010357,-0.02451,0.035597,1.073146,-1.164675,1.379257
3,0.0,0.035014,-1.717209,-0.03664,-0.13673,1.251674,0.005834,1.438909,-0.901229,-0.141064,...,0.349765,0.04904,1.494065,-1.297135,-0.010102,0.028341,-0.010043,1.450468,-1.338137,0.897446
4,0.0,-0.714486,-0.447753,-0.130547,-0.150741,-0.386957,-0.041662,1.101866,-0.362417,-0.563336,...,-0.172111,-0.035855,-0.32826,0.381882,-0.010357,-0.024169,0.039528,1.092818,-0.941734,0.354264


# Fit and evaluate model

In [21]:
# Linear regression model created with scikit-learn's default settings.
lin_reg = LinearRegression()

In [22]:
# Fit on the scaled training portion only; ';' hides the estimator repr.
lin_reg.fit(X_train_v,y_train_v);

In [23]:
# Score on the data the model was fitted on (LinearRegression.score reports R^2).
lin_reg.score(X_train_v,y_train_v)

0.6829105379055355

In [24]:
# Score on the held-out validation portion, for comparison with the training score.
lin_reg.score(X_val,y_val)

0.678956128839695

# Final check on the Test dataset

In [None]:
# Final evaluation: refit on the full training set (train + validation rows)
# and score once on the untouched test set.
# The scaler is not refitted here: X_test was already transformed with the
# statistics learned from X_train_v, so X_train goes through the same scaler.
# The scaled data is bound to a new name so the cell is safe to re-run;
# overwriting X_train would scale already-scaled data on a second execution
# and silently distort the test score.
X_train_scaled = scaler.transform(X_train)
lin_reg.fit(X_train_scaled, y_train)
lin_reg.score(X_test, y_test)