# Validating models in the real world

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
TARGET = "is_listened"

### Prepare the dataset

In [3]:
data = pd.read_csv("train.csv")
data = data.sample(frac=0.5)

In [4]:
data["ts_listen"] = data["ts_listen"].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

In [5]:
production_data = data[data["ts_listen"] >= "2016-11-25 00:00:00"]
data = data[data["ts_listen"] < "2016-11-25 00:00:00"]
data = data[data["ts_listen"] > "2016-09-01 00:00:00"]

In [6]:
print(data.shape)
print(production_data.shape)

(3205723, 15)
(573011, 15)


In [7]:
print(production_data["ts_listen"].min())
print(production_data["ts_listen"].max())

2016-11-25 00:00:00
2016-12-01 23:44:05


### Random split

In [8]:
features = ['genre_id', 
            'media_id', 
            'album_id', 
            'context_type',
            'release_date', 
            'platform_name', 
            'platform_family', 
            'media_duration',
            'listen_type', 
            'user_gender', 
            'user_id', 
            'artist_id', 
            'user_age']

param_dict = {"max_depth": [4, 8, 12]}

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data[features], 
                                                    data[TARGET], 
                                                    test_size=0.3, 
                                                    random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_test, 
                                                y_test,
                                                test_size=0.5, 
                                                random_state=42)

In [10]:
model_params = {"max_depth": 8,
                "scale_pos_weight": y_train.mean(),
                "num_leaves": 10, 
                "colsample_bytree": 0.8,
                "reg_lambda": 1,
                "reg_alpha": 1, 
                "min_child_weight": 0.9,
                "min_split_gain": 2, 
                "subsample": 0.8,
                "learning_rate": 0.02,
                "n_estimators": 50,
                "is_unbalanced": False, 
                "nthread": -1,
                "silent": -1,
                "verose": -1}

model = LGBMClassifier(**model_params)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
          eval_metric= 'auc', verbose=10)

y_pred_val = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
test_auc = roc_auc_score(y_test, y_pred)
val_auc = roc_auc_score(y_val, y_pred_val)

[10]	training's binary_logloss: 0.60859	training's auc: 0.65725	valid_1's binary_logloss: 0.608847	valid_1's auc: 0.657452
[20]	training's binary_logloss: 0.601873	training's auc: 0.657291	valid_1's binary_logloss: 0.602119	valid_1's auc: 0.657569
[30]	training's binary_logloss: 0.597901	training's auc: 0.659331	valid_1's binary_logloss: 0.598136	valid_1's auc: 0.659788
[40]	training's binary_logloss: 0.595489	training's auc: 0.660118	valid_1's binary_logloss: 0.595703	valid_1's auc: 0.660683
[50]	training's binary_logloss: 0.593948	training's auc: 0.660452	valid_1's binary_logloss: 0.594147	valid_1's auc: 0.661044


In [11]:
print("AUC on validation: {:.4f}".format(val_auc))
print("AUC on test: {:.4f}".format(test_auc))

AUC on validation: 0.6610
AUC on test: 0.6605


In [12]:
y_pred_prod = model.predict_proba(production_data[features], num_iteration=model.best_iteration_)[:, 1]
prod_auc = roc_auc_score(production_data[TARGET], y_pred_prod)

print("AUC on prod: {:.4f}".format(prod_auc))

AUC on prod: 0.6567


### K-fold

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data[features], 
                                                    data[TARGET], 
                                                    test_size=0.3, 
                                                    random_state=42)


param_dict = {"max_depth": [4, 6, 8, 10, 12]}

grid = GridSearchCV(LGBMClassifier(**model_params), param_dict, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        importance_type='split', is_unbalanced=False, learning_rate=0.02,
        max_depth=8, min_child_samples=20, min_child_weight=0.9,
        min_split_gain=2, n_estimators=50, n_jobs=-1, nthread=-1,
       ...69, silent=-1,
        subsample=0.8, subsample_for_bin=200000, subsample_freq=0,
        verose=-1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [4, 6, 8, 10, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
model = grid.best_estimator_
y_pred_val = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
test_auc = roc_auc_score(y_test, y_pred)
val_auc = roc_auc_score(y_val, y_pred_val)

print("AUC on validation: {:.4f}".format(val_auc))
print("AUC on test: {:.4f}".format(test_auc))

y_pred_prod = model.predict_proba(production_data[features], num_iteration=model.best_iteration_)[:, 1]
prod_auc = roc_auc_score(production_data[TARGET], y_pred_prod)

print("AUC on prod: {:.4f}".format(prod_auc))

AUC on validation: 0.6610
AUC on test: 0.6608
AUC on prod: 0.6567


### Nested K-fold

In [15]:
grid = GridSearchCV(LGBMClassifier(**model_params), param_dict, cv=5)
nested_scores = cross_val_score(grid, data[features], data[TARGET], cv=5)

In [16]:
nested_scores

array([0.69521403, 0.69530917, 0.69460107, 0.69491409, 0.69525879])

### Time split

In [17]:
print(data["ts_listen"].min())
print(data["ts_listen"].max())

2016-09-01 06:03:29
2016-11-24 23:59:58


In [18]:
val_split = "2016-11-08 00:00:00"
test_split = "2016-11-18 00:00:00"

train_data = (data["ts_listen"] <= val_split) 
X_train = data[train_data][features]
y_train = data[train_data][TARGET]

val_data = (data["ts_listen"] > val_split) & (data["ts_listen"] <= test_split)
X_val = data[val_data][features]
y_val = data[val_data][TARGET]

test_data = (data["ts_listen"] > test_split) 
X_test = data[test_data][features]
y_test = data[test_data][TARGET]

In [19]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(production_data.shape)

(1104297, 13)
(1317201, 13)
(784225, 13)
(573011, 15)


In [20]:
model = LGBMClassifier(**model_params)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
          eval_metric= 'auc', verbose=10)

y_pred_val = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
test_auc = roc_auc_score(y_test, y_pred)
val_auc = roc_auc_score(y_val, y_pred_val)

[10]	training's binary_logloss: 0.6088	training's auc: 0.65648	valid_1's binary_logloss: 0.605954	valid_1's auc: 0.655444
[20]	training's binary_logloss: 0.602392	training's auc: 0.656608	valid_1's binary_logloss: 0.599685	valid_1's auc: 0.655544
[30]	training's binary_logloss: 0.598557	training's auc: 0.65679	valid_1's binary_logloss: 0.596018	valid_1's auc: 0.655271
[40]	training's binary_logloss: 0.596237	training's auc: 0.657889	valid_1's binary_logloss: 0.593856	valid_1's auc: 0.656544
[50]	training's binary_logloss: 0.594741	training's auc: 0.658462	valid_1's binary_logloss: 0.592512	valid_1's auc: 0.657173


In [21]:
print("AUC on validation: {:.4f}".format(val_auc))
print("AUC on test: {:.4f}".format(test_auc))

AUC on validation: 0.6572
AUC on test: 0.6633


### Bootstrap time split

In [22]:
n_rounds = 5

val_aucs = []
test_aucs = []
prod_aucs = []
for i in range(n_rounds):
    sample_data = data.sample(frac=0.8)
    
    X_train = sample_data[train_data][features]
    y_train = sample_data[train_data][TARGET]

    X_val = sample_data[val_data][features]
    y_val = sample_data[val_data][TARGET]

    X_test = sample_data[test_data][features]
    y_test = sample_data[test_data][TARGET]
    
    model = LGBMClassifier(**model_params)

    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
          eval_metric= 'auc', verbose=-1)

    y_pred_val = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
    y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
    test_aucs.append(roc_auc_score(y_test, y_pred))
    val_aucs.append(roc_auc_score(y_val, y_pred_val))
    y_pred_prod = model.predict_proba(production_data[features], num_iteration=model.best_iteration_)[:, 1]
    prod_aucs.append(roc_auc_score(production_data[TARGET], y_pred_prod))

    

print("AUC on validation: {:.4f}".format(np.mean(val_aucs)))
print("AUC on test: {:.4f}".format(np.mean(test_auc)))
print("AUC on prod: {:.4f}".format(np.mean(prod_auc)))

AUC on validation: 0.6574
AUC on test: 0.6633
AUC on prod: 0.6567
