# Model Serialization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Read the car-price CSV into a DataFrame and preview its first five rows.
auto_df = pd.read_csv(filepath_or_buffer='datasets/CarPrice_Assignment.csv')
auto_df.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
# (rows, columns) of the frame as loaded from the CSV.
auto_df.shape

(205, 26)

In [5]:
# List the column labels to decide which ones to keep as features.
auto_df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [6]:
# Columns left out of the feature set.
to_drop = ['car_ID', 'symboling', 'CarName']

# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
auto_df = auto_df.drop(columns=to_drop, errors='ignore')
auto_df.shape

(205, 23)

In [7]:
# One-hot encode the categorical columns. The dtype is pinned to int so the
# indicator columns are 0/1 regardless of the installed pandas version
# (the default dummy dtype differs between pandas releases).
auto_df = pd.get_dummies(auto_df, dtype=int)

In [8]:
# Preview the frame after one-hot encoding.
auto_df.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,cylindernumber_twelve,cylindernumber_two,fuelsystem_1bbl,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,...,0,0,0,0,0,0,0,1,0,0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,...,0,0,0,0,0,0,0,1,0,0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# (rows, columns) after encoding; the column count grows with the dummy columns.
auto_df.shape

(205, 52)

In [10]:
# Separate the target column ('price') from the remaining feature columns.
Y = auto_df['price']
X = auto_df.drop(columns=['price'])

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Shapes of the four split parts, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((164, 51), (41, 51), (164,), (41,))

In [17]:
# Create the regressor, then fit it on the training split
# (the trailing semicolon suppresses the estimator repr in the cell output).
model = LinearRegression()
model.fit(X_train, y_train);

In [18]:
# R^2 on the training split, computed the same way as the test score below.
train_score = r2_score(y_train, model.predict(X_train))

In [19]:
# Display the training-set score.
train_score

0.9475833672384809

In [20]:
# Predict prices for the held-out rows and display the resulting array.
y_pred = model.predict(X=X_test)
y_pred

array([28709.89220759, 20249.66545848, 10764.25575611, 12564.33071732,
       27600.92000588,  5588.70673955,  8514.88132127,  7944.58879593,
        9033.48353584,  9074.91177687, 17205.00543465,  7289.99091627,
       17457.66627041,  8900.38026064, 44756.93554223,  6512.50307696,
       -3075.66277509, 13643.53861386, 10833.39806902, 10250.04099784,
       11217.62295019, 16999.33615922,  4580.88493716,  2458.43849777,
        7397.41464086, 29015.76210272, 11763.2384524 , 16823.64937786,
        4785.6395683 , 17831.0608826 , 28082.47105263,  6411.43680105,
        6501.90001745, 20128.22426275,  7177.1067574 , 25436.42856786,
       12595.31574484, 12735.60839992,  6696.22022225, 13971.20609035,
        6926.93626975])

In [21]:
# R^2 of the fitted model on the held-out rows; kept as the reference value
# that each reloaded model is compared against later.
test_score = r2_score(y_true=y_test, y_pred=y_pred)
test_score

0.8731241343685101

## Serializing the model with JSON

In [22]:
import json

In [23]:
# Fitted coefficients: one of the two values saved to JSON below.
model.coef_

array([ 9.03542803e+01, -8.02077112e+01,  5.70968625e+02,  1.12622371e+02,
        3.75664003e+00,  1.26049554e+02, -2.17000465e+03, -3.89254253e+03,
       -2.13693394e+02,  2.23608077e+00,  2.00253121e+00, -2.49473001e+02,
        2.03842158e+02,  1.64428221e+03, -1.64428221e+03, -9.32635999e+02,
        9.32635999e+02,  1.09176489e+02, -1.09176489e+02,  2.27206551e+03,
       -1.54932427e+03, -3.05758028e+02,  5.97573731e+02, -1.01455694e+03,
       -1.72988353e+02, -4.27491055e+02,  6.00479408e+02, -5.27335801e+03,
        5.27335801e+03,  9.66518174e+02, -5.25651002e+03, -8.90793476e+02,
        3.18765860e+03,  1.62054882e+03, -3.41650113e+03,  3.78907904e+03,
        6.49663783e+03, -1.00383089e+03, -2.04198822e+03, -1.02205357e+03,
        0.00000000e+00, -6.21784418e+03,  3.78907904e+03,  4.75108613e+02,
        6.98042155e+02, -6.02779882e+01,  1.64428221e+03, -2.04059400e+03,
        8.12321516e+02, -1.64280326e+03,  1.13920756e+02])

In [25]:
# Fitted intercept: the other value saved to JSON below.
model.intercept_

-33819.481162749325

In [26]:
# Collect the fitted parameters as plain Python values so they can be written
# as JSON. .tolist() converts NumPy scalars to built-in floats, which the json
# module can serialise for any float dtype (list(...) would keep NumPy scalars).
model_param = {
    'coef': model.coef_.tolist(),
    'intercept': model.intercept_.tolist(),
}

In [27]:
# Render the parameter dict as indented JSON text and display it.
json_text = json.dumps(obj=model_param, indent=4)
json_text

'{\n    "coef": [\n        90.35428029095607,\n        -80.2077111661918,\n        570.9686248117977,\n        112.62237145861354,\n        3.7566400325807656,\n        126.04955448628361,\n        -2170.004654113559,\n        -3892.54253139227,\n        -213.6933943701245,\n        2.236080767668489,\n        2.0025312148488976,\n        -249.47300096520763,\n        203.84215802125394,\n        1644.2822128316777,\n        -1644.2822128316566,\n        -932.6359993997052,\n        932.635999399682,\n        109.17648902685872,\n        -109.17648902685796,\n        2272.065511425809,\n        -1549.324273729785,\n        -305.7580280893151,\n        597.5737310035627,\n        -1014.556940610304,\n        -172.98835295014754,\n        -427.49105539016387,\n        600.479408340259,\n        -5273.35801099285,\n        5273.358010992847,\n        966.5181742244429,\n        -5256.510022000037,\n        -890.793476249673,\n        3187.658601830149,\n        1620.5488152938437,\n      

In [28]:
# Persist the JSON text to disk.
with open('models/regressor_param.txt', mode='w') as param_file:
    param_file.write(json_text)

In [29]:
# Read the saved file back and parse it; despite its name, json_txt holds the
# parsed dict (keys 'coef' and 'intercept'), not raw text.
with open('models/regressor_param.txt', mode='r') as param_file:
    json_txt = json.loads(param_file.read())

In [30]:
# Unfitted estimator; its parameters are filled in from the JSON file in a later cell.
json_model = LinearRegression()

In [33]:
# Restore the fitted parameters from the parsed JSON.
# NOTE(review): only coef_ and intercept_ are stored in the file, so any other
# fitted attributes of the original model are not restored here.
json_model.coef_ = np.asarray(json_txt['coef'], dtype=float)
# The saved intercept is a single number; restore it as a scalar float rather
# than wrapping it in a 0-d array.
json_model.intercept_ = float(json_txt['intercept'])

In [34]:
# Score the JSON-restored model on the held-out rows.
y_pred = json_model.predict(X=X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8731241343685101

In [35]:
# Reference score of the original model, shown for comparison with the value above.
test_score

0.8731241343685101

## Pickling models

In [36]:
import pickle

In [37]:
# Write the fitted model to disk. The context manager guarantees the file is
# flushed and closed, which a bare open(...) passed inline does not.
with open('models/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [38]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# unpickle files from a trusted source (this one is written by this notebook).
# The context manager closes the file handle once loading is done.
with open('models/model.pkl', 'rb') as model_file:
    pickle_model = pickle.load(model_file)

In [40]:
# Score the unpickled model on the held-out rows.
y_pred = pickle_model.predict(X=X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8731241343685101

## Saving with joblib

joblib is a better fit for serializing scikit-learn models, as it works more efficiently with Python objects that contain large NumPy arrays.

In [41]:
import joblib

In [42]:
# Destination of the joblib-serialised model; reused by the load cell.
filename = 'models/model.joblib'

# Persist the fitted model; the cell output is the list of files written.
joblib.dump(value=model, filename=filename)

['models/model.joblib']

In [43]:
# Reload the model from the path it was dumped to.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
joblib_model = joblib.load(filename)

In [44]:
# Score the joblib-restored model on the held-out rows.
y_pred = joblib_model.predict(X=X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8731241343685101