In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from target_encoding import TargetEncoderClassifier, TargetEncoder
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score
import json
from functools import partial

from dsb2019.data.validation import InstallationFold
from dsb2019.data import DATA_DIR

In [3]:
keep_cols = ['event_id', 'game_session', 'installation_id', 'event_count', 'event_code', 'title', 'game_time', 'type', 'world']

train = pd.read_csv(DATA_DIR / 'raw/train.csv', usecols=keep_cols)
test = pd.read_csv(DATA_DIR / 'raw/test.csv', usecols=keep_cols)
train_labels = pd.read_csv(DATA_DIR / 'raw/train_labels.csv')
submission = pd.read_csv(DATA_DIR / 'raw/sample_submission.csv')

In [4]:
def group_and_reduce(df):
    # group1 and group2 are intermediary "game session" groups,
    # which are reduced to one record by game session. group1 takes
    # the max value of game_time (final game time in a session) and 
    # of event_count (total number of events happened in the session).
    # group2 takes the total number of event_code of each type
    group1 = df.drop(columns=['event_id', 'event_code']).groupby(
        ['game_session', 'installation_id', 'title', 'type', 'world']
    ).max().reset_index()

    group2 = pd.get_dummies(
        df[['installation_id', 'event_code']], 
        columns=['event_code']
    ).groupby(['installation_id']).sum()

    # group3, group4 and group5 are grouped by installation_id 
    # and reduced using summation and other summary stats
    group3 = pd.get_dummies(
        group1.drop(columns=['game_session', 'event_count', 'game_time']),
        columns=['title', 'type', 'world']
    ).groupby(['installation_id']).sum()

    group4 = group1[
        ['installation_id', 'event_count', 'game_time']
    ].groupby(
        ['installation_id']
    ).agg([np.sum, np.mean, np.std])

    return group2.join(group3).join(group4).reset_index()

In [5]:
train = group_and_reduce(train)
test = group_and_reduce(test)

print(train.shape)
train.head()



(17000, 101)


Unnamed: 0,installation_id,event_code_2000,event_code_2010,event_code_2020,event_code_2025,event_code_2030,event_code_2035,event_code_2040,event_code_2050,event_code_2060,...,world_CRYSTALCAVES,world_MAGMAPEAK,world_NONE,world_TREETOPCITY,"(event_count, sum)","(event_count, mean)","(event_count, std)","(game_time, sum)","(game_time, mean)","(game_time, std)"
0,0001e90f,10.0,0.0,61.0,0.0,60.0,0.0,15.0,15.0,1.0,...,0.0,7.0,1.0,2.0,1357,135.7,230.198972,1172787,117278.7,182750.896625
1,000447c4,5.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,...,0.0,4.0,1.0,0.0,181,36.2,54.1821,277707,55541.4,80311.743287
2,0006a69f,80.0,4.0,112.0,12.0,97.0,8.0,21.0,18.0,7.0,...,0.0,35.0,4.0,41.0,3801,47.5125,58.236531,5575453,69693.1625,177206.811293
3,0006c192,50.0,0.0,52.0,2.0,45.0,2.0,6.0,5.0,1.0,...,13.0,15.0,4.0,18.0,2224,44.48,76.898619,2063664,41273.28,82496.162713
4,0009a5a9,7.0,0.0,10.0,0.0,9.0,0.0,0.0,0.0,1.0,...,0.0,6.0,1.0,0.0,412,58.857143,77.962445,1854998,264999.714286,627146.960108


In [6]:
labels = train_labels[['installation_id', 'accuracy_group']]
train = train.merge(labels, how='left', on='installation_id').dropna()

In [7]:
def make_features(train, test, alpha=10, max_unique=50):
    test = test.drop("accuracy_group", axis=1)
    len_uniques = []
    train_labeled = train.fillna(-999)
    test_labeled = test.fillna(-999)

    x_train, x_val, y_train, y_val = train_test_split(
        train.drop(['installation_id', 'accuracy_group'], axis=1),
        train['accuracy_group'],
        test_size=0.15,
        random_state=2019,
    )
    
    for c in train.columns.drop(['installation_id', 'accuracy_group']):
        le = LabelEncoder()
        le.fit(pd.concat([train_labeled[c], test_labeled[c]])) 
        train_labeled[c] = le.transform(train_labeled[c])
        test_labeled[c] = le.transform(test_labeled[c])
        len_uniques.append(len(le.classes_))

    x_train_labeled_ix, x_val_labeled_ix = train_test_split(
        np.arange(len(train_labeled)),
        test_size=0.15,
        random_state=2019,
    )
    x_train_labeled = train_labeled.drop(['installation_id', 'accuracy_group'], axis=1).iloc[x_train_labeled_ix]
    x_val_labeled = train_labeled.drop(['installation_id', 'accuracy_group'], axis=1).iloc[x_val_labeled_ix]
    
    cv = InstallationFold(train_labeled.installation_id.values[x_train_labeled_ix])

    enc = TargetEncoder(alpha=alpha, max_unique=max_unique, split=[cv])
    x_train_encoded = enc.transform_train(x_train_labeled, y=y_train)
    x_val_encoded = enc.transform_test(x_val_labeled)
    x_test_encoded = enc.transform_test(test.drop(['installation_id'], axis=1))

    x_train_encoded = pd.DataFrame(x_train_encoded)
    x_val_encoded = pd.DataFrame(x_val_encoded)
    x_test_encoded = pd.DataFrame(x_test_encoded)

    x_train_all = pd.concat([x_train.reset_index(drop=True), x_train_encoded], axis=1)
    x_val_all = pd.concat([x_val.reset_index(drop=True), x_val_encoded], axis=1)
    x_test_all = pd.concat([test.drop(['installation_id'], axis=1), x_test_encoded], axis=1)

    return x_train_all, x_val_all, x_test_all, y_train, y_val


def train_baseline(x_train_all, y_train, x_val_all, y_val):
    train_set = lgb.Dataset(x_train_all, y_train)
    val_set = lgb.Dataset(x_val_all, y_val)

    params = {
        'learning_rate': 0.01,
        'bagging_fraction': 0.9,
        'feature_fraction': 0.9,
        'num_leaves': 14,
        'lambda_l1': 0.1,
        'lambda_l2': 1,
        'metric': 'multiclass',
        'objective': 'multiclass',
        'num_classes': 4,
        'random_state': 2019
    }

    return lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=300, valid_sets=[train_set, val_set], verbose_eval=100)

In [8]:
cv = InstallationFold()

quad_kappa = partial(cohen_kappa_score, weights="quadratic")

In [9]:
def fit_fold(df, train_ix, test_ix):
    train = df.iloc[train_ix].reset_index().copy()
    test = df.iloc[test_ix].reset_index().copy()
    x_train_all, x_val_all, x_test_all, y_train, y_val = make_features(train, test)
    
    baseline = train_baseline(x_train_all, y_train, x_val_all, y_val)
    test_pred = baseline.predict(x_test_all).argmax(axis=1)
    test_true = test.accuracy_group.values
    return test_true, test_pred
    
def cross_validate(train, labels):
    predicts = []
    for ix_train, ix_test in cv.split(train, labels, train.installation_id.values):
        predicts.append(fit_fold(train, ix_train, ix_test))
    return predicts

In [10]:
predicts=cross_validate(train, labels)

Training until validation scores don't improve for 300 rounds
[100]	training's multi_logloss: 1.11235	valid_1's multi_logloss: 1.10856
[200]	training's multi_logloss: 1.06362	valid_1's multi_logloss: 1.0694
[300]	training's multi_logloss: 1.03288	valid_1's multi_logloss: 1.0498
[400]	training's multi_logloss: 1.01165	valid_1's multi_logloss: 1.04048
[500]	training's multi_logloss: 0.994967	valid_1's multi_logloss: 1.03497
[600]	training's multi_logloss: 0.981358	valid_1's multi_logloss: 1.03196
[700]	training's multi_logloss: 0.970015	valid_1's multi_logloss: 1.03006
[800]	training's multi_logloss: 0.959897	valid_1's multi_logloss: 1.02865
[900]	training's multi_logloss: 0.950783	valid_1's multi_logloss: 1.02795
[1000]	training's multi_logloss: 0.942395	valid_1's multi_logloss: 1.02806
[1100]	training's multi_logloss: 0.934777	valid_1's multi_logloss: 1.02855
[1200]	training's multi_logloss: 0.927858	valid_1's multi_logloss: 1.02925
Early stopping, best iteration is:
[925]	training's m

In [27]:
np.mean([quad_kappa(true, pred) for pred, true in predicts]), [quad_kappa(true, pred) for pred, true in predicts]

(0.3071532214638171,
 [0.2375982917000612,
  0.2817551273901412,
  0.2884058957825951,
  0.37506980052225725,
  0.327665424307657,
  0.31855081202573343,
  0.3268631029814445,
  0.29580476447634674,
  0.27366115354439946,
  0.3461578419075352])