In [3]:
# importing the required libraries and the data
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [4]:
california_data = fetch_california_housing()

In [5]:
california_data.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block group\n        - HouseAge      median house age in block group\n        - AveRooms      average number of rooms per household\n        - AveBedrms     average number of bedrooms per household\n        - Population    block group population\n        - AveOccup      average number of household members\n        - Latitude      block group latitude\n        - Longitude     block group longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000

In [6]:
california_data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [7]:
california_data.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [9]:
# creating a dataframe of the data
df_california = pd.DataFrame(california_data.data)
df_california.columns = california_data.feature_names

In [10]:
# appending the price to the dataframe
df_california['Price'] = california_data.target

In [11]:
# viewing the fist five rows
df_california.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [14]:
# assign features to the x axis, and target tothe y axis
X_features = california_data.data
Y_target = california_data.target

In [15]:
# import scikit learn linear regression model
from sklearn.linear_model import LinearRegression
linReg = LinearRegression()

In [16]:
linReg.fit(X_features, Y_target)

LinearRegression()

In [18]:
# print the intercept
print("The intercept is : %.2f" %linReg.intercept_)

The intercept is : -36.94


In [21]:
# print the coefficient
print("The coefficient is : %.2f" %len(linReg.coef_))

The coefficient is : 8.00


In [27]:
# importing train test split split the dataframe into train and test
from sklearn.model_selection import train_test_split as tts
X_train, X_test, Y_train, Y_test = tts(X_features, Y_target)

In [28]:
# print the shape of the train and test data
print(X_train.shape, X_test.shape,Y_train.shape, Y_test.shape)

(15480, 8) (5160, 8) (15480,) (5160,)


In [29]:
# fit the model.
linReg.fit(X_train, Y_train)

LinearRegression()

In [33]:
# calculating mean squared error
print("Mean Squared Error(MSE) %.2f" %np.mean((linReg.predict(X_test) - Y_test) ** 2))

Mean Squared Error(MSE) 0.50


In [34]:
# variance score
print("The variance score : %.2f" %linReg.score(X_test, Y_test))

The variance score : 0.62
