## Linear Regression Practical Implementation

In [1]:
from sklearn.datasets import fetch_california_housing

In [2]:
dataset=fetch_california_housing()

In [3]:
dataset

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
dataset.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [5]:
dataset.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [6]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [7]:
import pandas as pd

In [8]:
df=pd.DataFrame(dataset.data,columns=dataset.feature_names)

In [9]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [10]:
df['Price']=dataset['target']

In [11]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [12]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Price         0
dtype: int64

In [13]:
## Divide the dataset into independent and dependent feature
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [14]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [15]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Price, Length: 20640, dtype: float64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=36)

In [18]:
x_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
6556,9.5320,50.0,7.114428,0.925373,620.0,3.084577,34.19,-118.11
8399,2.8438,32.0,3.851986,1.032491,1053.0,3.801444,33.95,-118.37
17929,3.9250,35.0,5.056680,0.939271,556.0,2.251012,37.35,-121.97
19277,2.1864,29.0,4.031627,1.040663,1880.0,2.831325,38.43,-122.73
11910,3.8640,24.0,5.444714,1.094775,2398.0,2.913730,33.96,-117.41
...,...,...,...,...,...,...,...,...
5769,2.8342,35.0,3.923077,1.067155,2401.0,2.931624,34.16,-118.30
986,10.3203,43.0,5.428571,0.952381,83.0,1.976190,37.72,-121.85
7329,2.9844,36.0,3.582857,1.011429,834.0,4.765714,33.98,-118.17
19368,3.9336,18.0,6.188406,1.207729,1128.0,2.724638,38.38,-122.93


In [19]:
x_test

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
13210,4.0096,5.0,5.061294,1.017026,2028.0,2.301930,34.04,-117.69
12777,1.8611,42.0,4.751381,1.055249,1069.0,2.953039,38.64,-121.42
5288,4.1750,27.0,4.574844,1.073805,1725.0,1.793139,34.05,-118.47
5632,2.4500,41.0,3.536585,1.032520,1384.0,3.750678,33.75,-118.28
17603,4.1406,44.0,5.289817,1.033943,908.0,2.370757,37.29,-121.88
...,...,...,...,...,...,...,...,...
17366,3.6815,43.0,6.516129,1.109677,692.0,2.232258,34.95,-120.43
6862,3.3906,27.0,5.074324,1.044402,3496.0,3.374517,34.06,-118.14
5252,11.1978,32.0,7.270344,1.031646,2768.0,2.502712,34.10,-118.47
4039,8.7172,32.0,7.693431,1.127737,769.0,2.806569,34.16,-118.48


In [20]:
y_train

6556     4.833
8399     1.817
17929    2.871
19277    1.432
11910    1.234
         ...  
5769     2.568
986      4.000
7329     1.636
19368    2.625
610      1.883
Name: Price, Length: 13828, dtype: float64

In [21]:
y_test

13210    1.82600
12777    0.60500
5288     5.00001
5632     1.86800
17603    2.74100
          ...   
17366    1.81800
6862     2.37200
5252     5.00001
4039     5.00001
6261     1.48300
Name: Price, Length: 6812, dtype: float64

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
scaler=StandardScaler()

In [24]:
scaler.fit(x_train)

StandardScaler()

In [25]:
x_train=scaler.fit_transform(x_train)

In [26]:
x_train

array([[ 2.94615603,  1.69257444,  0.65840852, ...,  0.00631568,
        -0.67676958,  0.7291751 ],
       [-0.53712836,  0.26178584, -0.61551141, ...,  0.10882511,
        -0.78878315,  0.59982719],
       [ 0.02597188,  0.50025061, -0.14510172, ..., -0.11288116,
         0.79807576, -1.19114398],
       ...,
       [-0.46390242,  0.57973886, -0.72060079, ...,  0.24671234,
        -0.77478146,  0.69932558],
       [ 0.03045085, -0.85104973,  0.29681501, ..., -0.04515438,
         1.27880067, -1.66873629],
       [ 0.48355594, -0.69207322, -0.02275645, ..., -0.12518059,
         0.95676166, -1.25084302]])

In [27]:
x_train.mean()

-2.527467516285792e-16

In [28]:
x_train.std()

0.9999999999999999

In [29]:
x_test=scaler.transform(x_test)

In [30]:
x_test

array([[ 0.07003245, -1.88439705, -0.14330011, ..., -0.10560014,
        -0.74677806,  0.93812174],
       [-1.04892878,  1.0566684 , -0.26431499, ..., -0.01249385,
         1.40014871, -0.91752338],
       [ 0.15617449, -0.13565543, -0.33324929, ..., -0.17835535,
        -0.74211083,  0.55007799],
       ...,
       [ 3.81372206,  0.26178584,  0.71929056, ..., -0.0768889 ,
        -0.71877467,  0.55007799],
       [ 2.52179968,  0.26178584,  0.88449779, ..., -0.03343845,
        -0.69077128,  0.54510307],
       [-0.37291683,  0.50025061, -0.01804122, ...,  0.18848079,
        -0.74677806,  0.8037989 ]])

In [31]:
x_test.shape

(6812, 8)

In [32]:
x_test.mean()

-0.000816482494622889

In [33]:
x_test.std()

1.1722364316177707

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:
regression=LinearRegression()

In [36]:
regression

LinearRegression()

In [37]:
regression.fit(x_train,y_train)

LinearRegression()

In [38]:
regression.coef_

array([ 0.83700024,  0.12271899, -0.26347102,  0.30713139, -0.0081633 ,
       -0.02764702, -0.90609856, -0.87576409])

In [39]:
regression.intercept_

2.0708259184263804

In [40]:
x_test

array([[ 0.07003245, -1.88439705, -0.14330011, ..., -0.10560014,
        -0.74677806,  0.93812174],
       [-1.04892878,  1.0566684 , -0.26431499, ..., -0.01249385,
         1.40014871, -0.91752338],
       [ 0.15617449, -0.13565543, -0.33324929, ..., -0.17835535,
        -0.74211083,  0.55007799],
       ...,
       [ 3.81372206,  0.26178584,  0.71929056, ..., -0.0768889 ,
        -0.71877467,  0.55007799],
       [ 2.52179968,  0.26178584,  0.88449779, ..., -0.03343845,
        -0.69077128,  0.54510307],
       [-0.37291683,  0.50025061, -0.01804122, ...,  0.18848079,
        -0.74677806,  0.8037989 ]])

In [41]:
y_pred=regression.predict(x_test)

In [42]:
y_pred

array([1.74054339, 0.90455512, 2.45240228, ..., 5.22766823, 4.15452378,
       1.73613111])

In [43]:
x.shape

(20640, 8)

In [44]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [45]:
mse=mean_squared_error(y_test,y_pred)
print(mse)

0.5335029155157139


In [46]:
mae=mean_absolute_error(y_test,y_pred)
print(mae)

0.540898948179417


In [47]:
import numpy as np
print(np.sqrt(mse))

0.730412839095613


In [48]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred)

In [49]:
score

0.5875394343499214

In [59]:
x.shape[1]

8

In [61]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [69]:
## Adjusted R-squared
1-(1-score)*(len(y)-1)/(len(y)-x.shape[1]-1)

0.5873794961731389

In [70]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [71]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: Price, Length: 20640, dtype: float64

In [75]:
x_test.shape

(6812, 8)

In [76]:
y_test.shape

(6812,)

In [78]:
x_train.shape

(13828, 8)

In [79]:
y_train.shape

(13828,)

In [107]:
from sklearn.linear_model import Ridge
ridge=Ridge(alpha=20)
ridge.fit(x_train,y_train)

Ridge(alpha=20)

In [108]:
y_pred=ridge.predict(x_test)

In [109]:
y_pred.shape

(6812,)

In [110]:
y_test.shape

(6812,)

In [111]:
y_pred.shape

(6812,)

In [112]:
mse=mean_squared_error(y_test,y_pred)
print(mse)
mae=mean_absolute_error(y_test,y_pred)
print(mae)

0.5335706984910803
0.5408065397594831


In [113]:
from sklearn.linear_model import Lasso
lasso=Lasso()
lasso.fit(x_train,y_train)

Lasso()

In [117]:
y_pred=lasso.predict(x_test)

In [118]:
mse=mean_squared_error(y_test,y_pred)
print(mse)
mae=mean_absolute_error(y_test,y_pred)
print(mae)

1.2935112672240048
0.9000629779192262


In [120]:
from sklearn.linear_model import ElasticNet
elasticnet=ElasticNet()
elasticnet.fit(x_train,y_train)

ElasticNet()

In [122]:
y_pred=elasticnet.predict(x_test)

In [123]:
mse=mean_squared_error(y_test,y_pred)
print(mse)
mae=mean_absolute_error(y_test,y_pred)
print(mae)

1.0289937580534227
0.8018589810371534
