In [1]:
import pickle
from joblib import dump, load

import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Data source: https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv
# Dataset description: https://archive.ics.uci.edu/ml/datasets/Adult
# adult-all merges the UCI train and test files (adult.data and adult.test).
# data.csv is presumably a local copy of adult-all.csv -- confirm before re-running.
# The file is read with header=None, so the column names are supplied explicitly.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
df = pd.read_csv(
    'data.csv',
    names=names,
    header=None,
)

In [3]:
# Preview the first rows to check that the supplied column names line up with the data.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Count the rows per raw income label to see how balanced the two classes are.
df['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [10]:
# Binary label: 1 where income is '>50K', 0 for every other row.
# Cast explicitly to int64 so the column dtype matches a plain integer column.
df['target'] = (df['income'] == '>50K').astype('int64')

In [11]:
# Sanity check: the 0/1 counts should mirror the counts of the raw income labels.
df['target'].value_counts()

0    37155
1    11687
Name: target, dtype: int64

In [12]:
# Only the numeric columns are used as model inputs; the categorical
# columns are left out, so no encoding step is needed.
features = [
    'age', 'fnlwgt', 'education_num',
    'capital_gain', 'capital_loss', 'hours_per_week',
]

# Feature matrix and binary label vector for the classifier.
y = df['target']
X = df[features]

In [13]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [14]:
# Build and fit the gradient-boosting model in one expression (fit() returns the estimator).
# NOTE(review): learning_rate=1.0 with 500 depth-20 trees is an aggressive setup --
# presumably chosen to get a large artifact for the load-time comparison further down;
# confirm before reusing these settings for real predictions.
clf = GradientBoostingClassifier(
    random_state=0,
    max_depth=20,
    learning_rate=1.0,
    n_estimators=500,
).fit(X_train, y_train)

In [15]:
# Hard class predictions for the held-out split, as a quick check that the fitted model runs.
clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Take one held-out row as a {feature name: value} mapping, the same shape a
# caller would use to describe a single example.
sample_row = X_test.iloc[11]
input_data = dict(sample_row)

# Order the values as in `features`, then score the single-row batch.
# [0][1] selects the probability of the second class for that row.
feature_vector = [input_data[name] for name in features]
clf.predict_proba([feature_vector])[0][1]

0.025242329977330145

In [17]:
# Show the feature mapping of the sample row that was scored.
input_data

{'age': 34,
 'fnlwgt': 261799,
 'education_num': 11,
 'capital_gain': 0,
 'capital_loss': 0,
 'hours_per_week': 45}

In [18]:
# Serialise the fitted classifier to model.pkl using the standard pickle module.
with open('model.pkl', "wb") as pickle_file:
    pickle.dump(clf, pickle_file)

In [19]:
# Also persist the same fitted model with joblib so both formats can be load-timed.
dump(clf, 'model.joblib') 

['model.joblib']

In [22]:
# Time ten back-to-back loads of the joblib file for comparison with the pickle loader.
%time x = [load('model.joblib') for _ in range(10)]

CPU times: user 4.88 s, sys: 2.33 s, total: 7.22 s
Wall time: 7.5 s


In [23]:
def load_pickled_model():
    """Load and return the object stored in ``model.pkl``.

    The file is looked up relative to the current working directory.

    Returns:
        The object that was unpickled from ``model.pkl``.

    Raises:
        FileNotFoundError: If ``model.pkl`` does not exist.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were written by a trusted source (here: this notebook).
    with open('model.pkl', "rb") as f_in:
        # Return the loaded model -- previously it was assigned to a local and
        # dropped, so every call returned None.
        return pickle.load(f_in)

# Time ten back-to-back calls to load_pickled_model for comparison with the joblib timing.
%time x = [load_pickled_model() for _ in range(10)]

CPU times: user 1.03 s, sys: 2.24 s, total: 3.27 s
Wall time: 3.36 s
