## ch 1-8 ML Fit
- train / dev / test split
- ml fit

In [1]:
import pandas as pd
import numpy as np

# Read the training CSV into `trn`; the cells below slice it into train and dev parts.
trn = pd.read_csv('data/train_feng.baseline.csv')

In [2]:
# Number of rows held out as the dev set: 20% of `trn`, rounded down.
# NOTE(review): "latest" presumes the CSV rows are in chronological order — confirm.
dev_split = int(len(trn) * 0.2)

In [3]:
# Dev split: the trailing `dev_split` rows of `trn`, with every column kept.
# NOTE: a `dev_split` of 0 would make `-0:` select every row instead of none.
x_dev = trn.iloc[-dev_split:]

In [4]:
# (rows, columns) of the dev split, shown as the cell output.
x_dev.shape

(355699, 22)

In [5]:
# Train split: every row of `trn` before the trailing `dev_split` rows, all columns.
# NOTE: a `dev_split` of 0 would make `:-0` select no rows at all.
x_trn = trn.iloc[:-dev_split]

In [6]:
# (rows, columns) of the train split, shown as the cell output.
x_trn.shape

(1422797, 22)

In [7]:
# prepare for ml fit

In [8]:
# Separate the label column from the training features.
# NOTE: `x_trn` is rebound without the label column, so running this cell a
# second time fails on the `col` lookup.
col = 'target'
y_trn = x_trn.loc[:, col]
x_trn = x_trn.drop(labels=[col], axis='columns')

In [9]:
# Separate the label column (`col`) from the dev features.
# NOTE: `x_dev` is rebound without the label column, so running this cell a
# second time fails on the `col` lookup.
y_dev = x_dev.loc[:, col]
x_dev = x_dev.drop(labels=[col], axis='columns')

In [10]:
# ml fit

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [29]:
# Decision tree limited to depth 5, with a fixed seed.
dt_model = DecisionTreeClassifier(random_state=777, max_depth=5)

# Fit on the training split; the fitted estimator's repr is the cell output.
dt_model.fit(X=x_trn, y=y_trn)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=777,
            splitter='best')

In [13]:
from sklearn.metrics import log_loss

In [30]:
# trn: log loss of the decision tree on the rows it was fitted on
log_loss(
    y_trn,
    dt_model.predict_proba(x_trn),
)

1.7828470926622679

In [31]:
# dev: log loss of the decision tree on the held-out rows
# The fitted class list is passed explicitly so the probability columns stay
# aligned with the labels even if the dev split lacks a class seen in training
# (without `labels`, log_loss infers the label set from `y_dev` alone).
log_loss(y_dev, dt_model.predict_proba(x_dev), labels=dt_model.classes_)

1.8371992157081662

In [53]:
import pickle

In [59]:
def generate_model_name(model, params, trn_loss, dev_loss):
    """Build the file-name stem used when saving a fitted model.

    The arguments are combined as
    ``<model>-<params>-trn<trn_loss>-dev<dev_loss>``, each rendered with
    default ``str.format`` formatting.
    """
    template = '{model}-{params}-trn{trn}-dev{dev}'
    return template.format(model=model, params=params, trn=trn_loss, dev=dev_loss)

In [60]:
# save model
# The two numbers are the trn / dev log losses from the cells above, typed in
# by hand; update them if the model is refitted.
model_name = generate_model_name('dt', 'depth5', 1.7828, 1.8371)
# The context manager closes the file handle even if pickling raises.
with open('model/{}.pkl'.format(model_name), 'wb') as model_file:
    pickle.dump(dt_model, model_file)

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# initialize model
# n_estimators is pinned to 10, the value this run actually used (see the repr
# in the cell output); the library default differs between scikit-learn
# releases, so leaving it implicit would change the model on upgrade.
rf_model = RandomForestClassifier(n_estimators=10, max_depth=10, n_jobs=-1, random_state=777)

# fit
rf_model.fit(x_trn, y_trn)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=777, verbose=0, warm_start=False)

In [22]:
# trn: log loss of the random forest on the rows it was fitted on
log_loss(
    y_trn,
    rf_model.predict_proba(x_trn),
)

1.7042356631809443

In [23]:
# dev: log loss of the random forest on the held-out rows
# The fitted class list is passed explicitly so the probability columns stay
# aligned with the labels even if the dev split lacks a class seen in training
# (without `labels`, log_loss infers the label set from `y_dev` alone).
log_loss(y_dev, rf_model.predict_proba(x_dev), labels=rf_model.classes_)

1.7341571942843692

In [63]:
# save model
# The two numbers are the trn / dev log losses from the cells above, typed in
# by hand; update them if the model is refitted.
model_name = generate_model_name('rf', 'depth10', 1.7042, 1.7341)
# The context manager closes the file handle even if pickling raises.
with open('model/{}.pkl'.format(model_name), 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

In [25]:
# initialize model
# n_estimators is pinned to 10, the value this run actually used (see the repr
# in the cell output); the library default differs between scikit-learn
# releases, so leaving it implicit would change the model on upgrade.
et_model = ExtraTreesClassifier(n_estimators=10, max_depth=10, n_jobs=-1, random_state=777)

# fit
et_model.fit(x_trn, y_trn)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=777, verbose=0, warm_start=False)

In [26]:
# trn: log loss of the extra-trees model on the rows it was fitted on
log_loss(
    y_trn,
    et_model.predict_proba(x_trn),
)

1.7914208465427808

In [27]:
# dev: log loss of the extra-trees model on the held-out rows
# The fitted class list is passed explicitly so the probability columns stay
# aligned with the labels even if the dev split lacks a class seen in training
# (without `labels`, log_loss infers the label set from `y_dev` alone).
# NOTE(review): the recorded dev loss (1.7706) is lower than the trn loss
# (1.7914) for this model — worth a sanity check.
log_loss(y_dev, et_model.predict_proba(x_dev), labels=et_model.classes_)

1.770633021045777

In [64]:
# save model
# The two numbers are the trn / dev log losses from the cells above, typed in
# by hand; update them if the model is refitted.
model_name = generate_model_name('et', 'depth10', 1.7914, 1.7706)
# The context manager closes the file handle even if pickling raises.
with open('model/{}.pkl'.format(model_name), 'wb') as model_file:
    pickle.dump(et_model, model_file)