In [33]:
#Loading relevant sklearn modules
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

In [34]:
# Load the California Housing dataset through scikit-learn's fetch helper.
# The returned object is used below for its feature matrix ('data'),
# its target vector ('target') and its text description (DESCR).
dataset = fetch_california_housing()

In [35]:
# Understanding the data (attributes): print the description shipped with
# the dataset, which lists the predictive attributes and defines the target.
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [36]:
# Feature matrix (X) and regression target (Y), read off the loaded dataset
# object via attribute access, matching how DESCR is accessed.
X, Y = dataset.data, dataset.target

In [37]:
# Quick look at the raw feature values (NumPy abbreviates large arrays),
# mainly to compare the magnitudes of the different columns.
print(X)

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


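The printed feature values show that the features are not on the same scale (for example, the first column holds single-digit values while the last column is around -122). We therefore use `StandardScaler` to bring all features onto a uniform scale.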

In [38]:
#Bringing all Features on uniform scale uing StandardScaler
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [39]:
# Train and Test Data Split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,random_state=42)

In [40]:
# Multi-linear regression using all 8 predictive attributes.
# fit() returns the fitted estimator, so creation and training are chained.
model = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out test rows.
y_pred = model.predict(x_test)

In [41]:
# Evaluation metrics on the test split: mean squared error and the
# coefficient of determination (R^2), printed as "R^2 MSE".
meanSquared_error = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_s = r2_score(y_true=y_test, y_pred=y_pred)
print(r2_s, meanSquared_error)

0.5757877060324511 0.555891598695244
