In [64]:
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [65]:
from sklearn.datasets import fetch_california_housing

In [66]:
dataset = fetch_california_housing()

In [67]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [68]:
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [69]:
df['price'] = pd.DataFrame(dataset.target,columns=['price'])


In [70]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [71]:
# Lets divide dataset into dependent and independent datasets.
X = df.iloc[:,:-1]
y= df.iloc[:,-1]

In [72]:
# Now we re  gonna spliT our dataset into Train and test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [73]:
#  afterward Split we need to scaling our data before training model.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [74]:
#  Now our pre work has done. We are ready to train model.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)

In [75]:
reg.coef_

array([ 8.46603472e-01,  1.20333548e-01, -2.98800785e-01,  3.47243173e-01,
       -8.81413334e-05, -4.17242067e-02, -8.94420371e-01, -8.70401317e-01])

In [76]:
reg.intercept_

2.0666362048018536

In [77]:
#  Lets predict our part of the dataset with respect to trained model
y_pred = reg.predict(X_test)

In [78]:
# Now We are gonna check the error and accuracy of prediction
from sklearn.metrics import mean_absolute_error,mean_squared_error
mse = mean_squared_error(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
rmse = np.sqrt(mse)

In [79]:

mse,mae,rmse

(0.5369686543372459, 0.5295710106684453, 0.7327814505957735)

In [80]:
# accuracy with r2 score 
from sklearn.metrics import r2_score
score = r2_score(y_pred,y_test)
score

0.33957429603867395

In [81]:
#display adjusted R-squared
1 - (1-score)*(len(y)-1)/(len(y)-X.shape[1]-1)

0.3393182054162276

Its giving 33-34% accuracy. _  
Not much impressive. But being a Simple linear regression, What else we can expect.

In [82]:
# Now we re gonna appying new algorithm named Ridge and Lasso and will train our model
#  First we re gonna use Ridge Which is also0 known as L2 regularization.
from sklearn.linear_model import Ridge
model = Ridge()
model.fit(X_train,y_train)

In [83]:
#  Lets preedict the X_test
y_pred = model.predict(X_test)

In [84]:
#  To chceck the error and  accuracy
mse = mean_squared_error(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
rmse = np.sqrt(mse)
mse, mae, rmse

(0.5369457054801822, 0.5295668709657081, 0.7327657916962159)

In [85]:
score = r2_score(y_pred,y_test)
adjust_score = 1 - (1-score)*(len(y)-1)/(len(y)-X.shape[1]-1)
print(score,adjust_score)

0.3392950524855305 0.3390388535819332


In [86]:
#  Now its Lassso turn
from sklearn.linear_model import Lasso
model = Lasso(10.0)
model.fit(X_train,y_train)

In [87]:
y_pred = model.predict(X_test)

In [88]:
mse = mean_squared_error(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
rmse = np.sqrt(mse)
mse, mae, rmse

(1.3326257277946882, 0.9126511897647483, 1.15439409553007)

In [89]:
score = r2_score(y_pred,y_test)
adjust_score = 1 - (1-score)*(len(y)-1)/(len(y)-X.shape[1]-1)
print(score,adjust_score)

-6.75721521487407e+30 -6.759835433075756e+30


In [90]:
#  Elastic net is a combination of ridge and lasso reg
from sklearn.linear_model import ElasticNet
elasticnet=ElasticNet(alpha=20.0)
elasticnet.fit(X_train,y_train)

In [91]:
y_pred = elasticnet.predict(X_test)

In [92]:
mse = mean_squared_error(y_pred,y_test)
mae = mean_absolute_error(y_pred,y_test)
rmse = np.sqrt(mse)
mse, mae, rmse

(1.3326257277946882, 0.9126511897647483, 1.15439409553007)

In [93]:
score = r2_score(y_pred,y_test)
adjust_score = 1 - (1-score)*(len(y)-1)/(len(y)-X.shape[1]-1)
print(score,adjust_score)

-6.75721521487407e+30 -6.759835433075756e+30
