In [6]:
import time

import pickle
from joblib import dump, load

import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [4]:
# Data file: https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv
# Dataset description: https://archive.ics.uci.edu/ml/datasets/Adult
# adult-all is a combination of adult.data and adult.test.
# The file is read with header=None, so the column labels are supplied here.
names = [
    'age', 'workclass', 'fnlwgt',
    'education', 'education_num',
    'marital_status', 'occupation', 'relationship',
    'race', 'sex',
    'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country',
    'income',
]
df = pd.read_csv('data.csv', names=names, header=None)

In [5]:
# Preview the first rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# Class balance of the raw income label.
df['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [11]:
# Binary label: 1 where income is '>50K', 0 for every other row.
df['target'] = (df['income'] == '>50K').astype('int64')

In [12]:
# Distribution of the binary target (1 corresponds to income '>50K').
df['target'].value_counts()

0    37155
1    11687
Name: target, dtype: int64

In [13]:
# Columns used as model inputs; their order is also the column order of X.
features = [
    'age', 'fnlwgt', 'education_num',
    'capital_gain', 'capital_loss', 'hours_per_week',
]

y = df['target']
X = df[features]

In [14]:
# Hold out 33% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [25]:
# NOTE(review): 2000 trees of depth 20 with learning_rate=1.0 looks deliberately
# oversized, presumably to get a large model for the load-time comparison —
# confirm before reusing these settings for real predictions.
clf = GradientBoostingClassifier(
    random_state=0,
    max_depth=20,
    learning_rate=1.0,
    n_estimators=2000,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

In [15]:
# Predicted class labels for the held-out rows.
clf.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [17]:
# Take one held-out row as a plain {feature name: value} mapping.
input_data = dict(X_test.iloc[11])

# Probability of class 1 for that single row; the values are laid out in the
# same order as `features`, i.e. the column order the model was trained on.
clf.predict_proba([[input_data[name] for name in features]])[0, 1]

1.329539649746285e-32

In [22]:
# Rough single-row inference cost: 1000 repeated predict_proba calls on the same input.
%time x = [clf.predict_proba([[input_data[feature] for feature in features]])[0][1] for _ in range(1000)]

CPU times: user 2.57 s, sys: 28.2 ms, total: 2.6 s
Wall time: 2.67 s


In [12]:
# Display the feature values of the single row used for the one-off prediction.
input_data

{'age': 34,
 'fnlwgt': 261799,
 'education_num': 11,
 'capital_gain': 0,
 'capital_loss': 0,
 'hours_per_week': 45}

In [29]:
# Serialize the fitted classifier with the standard-library pickle format.
with open('model_v3.pkl', mode="wb") as model_file:
    pickle.dump(clf, file=model_file)

In [30]:
# Save the same classifier with joblib as well, alongside the pickle file.
dump(clf, 'model_v3.joblib') 

['model_v3.joblib']

In [2]:
# Time a single joblib load of the saved model file.
%time x = [load('model_v3.joblib') for _ in range(1)]

CPU times: user 3.91 s, sys: 3.1 s, total: 7 s
Wall time: 8.14 s


In [10]:
import time
import pickle
from joblib import load

def load_pickled_model(filepath):
    """Read a pickled object from ``filepath`` and return it.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    with open(filepath, "rb") as f_in:
        # Previously the unpickled object was discarded and the function
        # always returned None; hand it back to the caller.
        return pickle.loads(f_in.read())

def timed_load(model_version_num, source, num_iterations):
    """Load a saved model repeatedly and print total and average load time.

    Args:
        model_version_num: Version number used to build the file name
            (``model_v<N>.pkl`` or ``model_v<N>.joblib``).
        source: ``'pickle'`` to load with ``load_pickled_model`` or
            ``'joblib'`` to load with joblib's ``load``.
        num_iterations: How many times to load the file; must be at least 1.

    Raises:
        ValueError: If ``num_iterations`` is less than 1 or ``source`` is
            neither ``'pickle'`` nor ``'joblib'``.
    """
    # Reject a non-positive count up front: the average below divides by it,
    # which used to fail with ZeroDivisionError only after the timing ran.
    if num_iterations < 1:
        raise ValueError('num_iterations must be at least 1')

    if source == 'pickle':
        load_func = load_pickled_model
        filepath = f'model_v{model_version_num}.pkl'
    elif source == 'joblib':
        load_func = load
        filepath = f'model_v{model_version_num}.joblib'
    else:
        raise ValueError('source must be pickle or joblib')

    # perf_counter is monotonic and high resolution, so the measurement is not
    # disturbed by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    for _ in range(num_iterations):
        load_func(filepath)
    t = (time.perf_counter() - start) * 1000  # elapsed milliseconds
    print(
        f'Model v{model_version_num} from {source}\n'
        f'Total load time for {num_iterations} loads: {int(t)} ms, avg_load_time: {int(t/num_iterations)} ms'
    )

# Model v1: 10 loads per serialization format.
# NOTE(review): assumes the model_v1 files already exist on disk — the visible
# cells only write model_v3.*; confirm where v1 was saved.
for source in ('pickle', 'joblib'):
    timed_load(1, source, 10)

Model v1 from pickle
Total load time for 10 loads: 2898 ms, avg_load_time: 289 ms
Model v1 from joblib
Total load time for 10 loads: 7246 ms, avg_load_time: 724 ms


In [11]:
# Model v2: 10 loads per serialization format.
for source in ('pickle', 'joblib'):
    timed_load(2, source, 10)

Model v2 from pickle
Total load time for 10 loads: 13103 ms, avg_load_time: 1310 ms
Model v2 from joblib
Total load time for 10 loads: 24164 ms, avg_load_time: 2416 ms


In [12]:
# Model v2 again with 5 loads per format, to compare against the 10-load averages.
for source in ('pickle', 'joblib'):
    timed_load(2, source, 5)

Model v2 from pickle
Total load time for 5 loads: 5357 ms, avg_load_time: 1071 ms
Model v2 from joblib
Total load time for 5 loads: 13056 ms, avg_load_time: 2611 ms
