In [8]:
import pandas as pd
import numpy as np
# from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing

In [6]:
# from sklearn.datasets import load_boston

In [59]:
# Fetch the California Housing dataset; this notebook uses it in place of the
# load_boston() loader that is commented out in the import cell.
housing = fetch_california_housing()

In [60]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [61]:
type(housing)

sklearn.utils.Bunch

In [62]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [63]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [64]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names.
dataset = pd.DataFrame(data=housing.data, columns=housing.feature_names)

In [65]:
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [66]:
# Attach the regression target as a 'Price' column (median house value in units
# of $100,000, per the dataset description). This mutates `dataset` in place.
dataset['Price'] = housing.target

In [67]:
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [68]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Price       20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [69]:
dataset.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [70]:
dataset.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Price         0
dtype: int64

In [71]:
# Separate predictors from the target by position: every column except the
# last is a feature, and the last column ('Price') is the target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [72]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [73]:
from sklearn.model_selection import train_test_split

In [74]:
X.shape

(20640, 8)

In [75]:
y.shape

(20640,)

In [76]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so Restart & Run All reproduces the same split (and
# therefore the same fitted model); without it every run selects different rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [77]:
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
11636,4.6250,25.0,5.952849,1.045187,1668.0,3.277014,33.83,-118.03
14210,5.0827,18.0,5.503731,0.996269,1149.0,4.287313,32.68,-117.07
7925,4.8448,31.0,5.254973,1.045208,1806.0,3.265823,33.84,-118.08
20149,4.0750,28.0,5.815832,1.034733,3793.0,3.063813,34.38,-119.12
5224,1.1384,25.0,3.472727,0.909091,801.0,3.640909,33.95,-118.25
...,...,...,...,...,...,...,...,...
822,3.3611,23.0,5.326923,0.978022,841.0,2.310440,37.63,-122.06
8729,3.5073,42.0,4.251351,1.040541,1859.0,2.512162,33.85,-118.32
636,3.3431,10.0,4.595876,1.239175,877.0,1.808247,37.72,-122.16
9930,5.3351,14.0,7.714597,1.087146,1577.0,3.435730,38.29,-122.33


In [78]:
X_test

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
1430,4.6375,28.0,6.281780,1.129237,1591.0,3.370763,38.00,-122.02
17601,5.6437,52.0,5.545775,1.000000,629.0,2.214789,37.30,-121.90
9650,1.9330,15.0,5.431373,1.129412,775.0,3.039216,37.05,-120.82
7970,4.2083,40.0,5.164134,1.051672,1188.0,3.610942,33.88,-118.20
19247,3.7266,16.0,4.702564,1.002564,994.0,2.548718,38.47,-122.73
...,...,...,...,...,...,...,...,...
4578,2.3636,32.0,3.633947,1.096115,1684.0,3.443763,34.07,-118.28
19304,5.0323,30.0,6.295265,1.041783,958.0,2.668524,38.35,-122.76
2587,0.7684,31.0,2.569231,1.030769,780.0,6.000000,40.87,-124.07
9285,4.1705,17.0,4.666990,1.030583,4684.0,2.273786,38.09,-122.56


In [79]:
from sklearn.preprocessing import StandardScaler
# One scaler instance, fitted on the training split below and reused unchanged
# to transform the test split.
scaler = StandardScaler()

In [80]:
# scaler.fit(X_train)

In [81]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so no test-set information leaks in.
# Note: X_train / X_test are rebound to the scaled outputs.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [82]:
from sklearn.linear_model import LinearRegression

In [83]:
# Fit a linear regression on the scaled training features.
# fit() returns the estimator itself, so construction and fitting are chained.
reg = LinearRegression().fit(X_train, y_train)
reg  # show the fitted estimator as the cell output

LinearRegression()

In [84]:
# Predict the target for the scaled hold-out features.
reg_pred = reg.predict(X_test)

In [85]:
X_test.shape

(6192, 8)

In [86]:
from google.colab import drive

In [87]:
# Mount Google Drive at /content/drive so the model can be saved there
# (uses the google.colab `drive` helper imported above).
drive.mount("/content/drive")

Mounted at /content/drive


In [98]:
import pickle

In [94]:
# Directory on the mounted Drive where the pickled model is written.
# The trailing slash matters: the path is later built by plain string concatenation.
data_dir = "/content/drive/MyDrive/MLOps/"

In [96]:
%cd $data_dir

/content/drive/MyDrive/MLOps


In [97]:
%ls

In [99]:
# Serialize the fitted regressor to Drive. The context manager guarantees the
# file is flushed and closed even if pickling raises.
# NOTE(review): only the model is saved here; the fitted `scaler` is also needed
# to preprocess inputs at inference time — confirm it is persisted elsewhere.
with open(data_dir + 'regmodel.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [100]:
# Reopen the saved model file for reading, using the same data_dir-based path
# as the dump so this cell does not depend on the working directory set by `%cd`.
# Note: this binds an open binary file handle, not the unpickled model.
# NOTE(review): pass the handle to pickle.load(...) and close it afterwards.
pickle_model = open(data_dir + 'regmodel.pkl', 'rb')

In [101]:
pickle_model

<_io.BufferedReader name='regmodel.pkl'>