In [1]:
 from sklearn.datasets import fetch_california_housing

In [2]:
import pandas as pd
import numpy as np

In [3]:
boston = fetch_california_housing()

In [4]:
boston.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [5]:
print(boston.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [6]:
dataset = pd.DataFrame(boston.data,columns = boston.feature_names)

In [7]:
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [8]:
dataset['MedHouseVal'] = boston.target

In [9]:
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [11]:
#checking the missing values
dataset.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [12]:
dataset.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [13]:
y = dataset["MedHouseVal"]
X = dataset.drop("MedHouseVal", axis=1)

In [14]:
#Train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3 ,random_state = 0)

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [16]:
xtrain_scaled = sc.fit_transform(x_train)
xtest_scaled = sc.transform(x_test)

In [17]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

In [18]:
regression.fit(xtrain_scaled,y_train)

In [19]:
regression.predict(xtest_scaled)

array([2.2702672 , 2.79059912, 1.90984782, ..., 3.56895295, 0.98163812,
       2.72552317])

In [20]:
from sklearn.metrics import r2_score
print(r2_score(y_test,regression.predict(xtest_scaled)))

0.5926087785518778


In [21]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, regression.predict(xtest_scaled))
print("Mean Squared Error:", mse)


Mean Squared Error: 0.5431489670037237


In [22]:
print(regression.predict(sc.fit_transform([[4.526,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23]])))

[2.06824625]


In [23]:
sc.transform(boston.data[0].reshape(1,-1))

array([[ 3.79920000e+00,  0.00000000e+00, -1.58730158e-08,
        -4.76190476e-07,  0.00000000e+00, -4.44444445e-07,
         0.00000000e+00,  0.00000000e+00]])

In [24]:
regression.predict(sc.transform(boston.data[0].reshape(1,-1)))

array([5.27815544])

In [25]:
#Pickling The Model file for Deployment
import pickle
pickle.dump(regression,open('regmodel.pkl','wb')) #It will create a pickle file

In [26]:
pickled_model = pickle.load(open('regmodel.pkl','rb'))