In [1]:
# Source of the `target_encoding` package imported in the next cell:
# https://github.com/KirillTushin/target_encoding


In [2]:
import os
os.chdir('../input/target-encoding')
from target_encoding import TargetEncoderClassifier, TargetEncoder
os.chdir('/kaggle/working')

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Load Data

In [3]:
%%time
# Only load those columns in order to save space
keep_cols = ['event_id', 'game_session', 'installation_id', 'event_count', 'event_code', 'title', 'game_time', 'type', 'world']

train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv', usecols=keep_cols)
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv', usecols=keep_cols)
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')

CPU times: user 42.5 s, sys: 5.64 s, total: 48.1 s
Wall time: 48.6 s


# Group and Reduce

In [4]:
def group_and_reduce(df):
    """Collapse event-level rows into one feature row per installation_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level frame. The columns 'event_id', 'event_code',
        'game_session', 'installation_id', 'title', 'type', 'world',
        'event_count' and 'game_time' are read.

    Returns
    -------
    pandas.DataFrame
        One row per installation_id, with these columns:

        * 'installation_id'
        * 'event_code_<code>': number of event rows carrying that code
        * 'title_<v>', 'type_<v>', 'world_<v>': number of game sessions
          with that value
        * ('event_count', stat) and ('game_time', stat) for stat in
          'sum', 'mean', 'std', labelled with 2-tuples, computed over the
          per-session maxima.

    Notes
    -----
    The dummy columns are derived from the values present in ``df``, so two
    different input frames can yield different column sets.
    """
    # group1: one record per game session. Taking the max keeps the final
    # game_time and the total event_count reached within the session.
    session_keys = ['game_session', 'installation_id', 'title', 'type', 'world']
    group1 = (
        df.drop(columns=['event_id', 'event_code'])
        .groupby(session_keys)
        .max()
        .reset_index()
    )

    # group2: per installation, how many events of each event_code occurred.
    group2 = pd.get_dummies(
        df[['installation_id', 'event_code']],
        columns=['event_code']
    ).groupby(['installation_id']).sum()

    # group3: per installation, how many sessions fall into each
    # title / type / world category.
    group3 = pd.get_dummies(
        group1.drop(columns=['game_session', 'event_count', 'game_time']),
        columns=['title', 'type', 'world']
    ).groupby(['installation_id']).sum()

    # group4: per installation, summary statistics of the session-level
    # event_count and game_time. String aggregation names are used instead of
    # NumPy callables; 'std' is the pandas sample standard deviation, which
    # is NaN for an installation with a single session.
    group4 = group1[
        ['installation_id', 'event_count', 'game_time']
    ].groupby(
        ['installation_id']
    ).agg(['sum', 'mean', 'std'])

    # agg() with a list yields two-level columns. Flatten them to plain tuple
    # labels so that all three frames have single-level columns before joining.
    group4.columns = group4.columns.to_flat_index()

    return group2.join(group3).join(group4).reset_index()

In [5]:
%%time
# Replace the raw event-level frames with one feature row per installation_id.
# `train` is rebound before `test` is processed, so the raw training events are
# no longer referenced by this name while the test frame is being reduced.
# Because the names are overwritten, this cell cannot simply be re-run:
# group_and_reduce expects the raw columns (e.g. 'event_id'), which the reduced
# frames no longer contain.
train = group_and_reduce(train)
test = group_and_reduce(test)

# Quick sanity check of the reduced training frame.
print(train.shape)
train.head()



(17000, 101)
CPU times: user 19.6 s, sys: 16.1 s, total: 35.7 s
Wall time: 34.5 s


Unnamed: 0,installation_id,event_code_2000,event_code_2010,event_code_2020,event_code_2025,event_code_2030,event_code_2035,event_code_2040,event_code_2050,event_code_2060,...,world_CRYSTALCAVES,world_MAGMAPEAK,world_NONE,world_TREETOPCITY,"(event_count, sum)","(event_count, mean)","(event_count, std)","(game_time, sum)","(game_time, mean)","(game_time, std)"
0,0001e90f,10.0,0.0,61.0,0.0,60.0,0.0,15.0,15.0,1.0,...,0.0,7.0,1.0,2.0,1357,135.7,230.198972,1172787,117278.7,182750.896625
1,000447c4,5.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,...,0.0,4.0,1.0,0.0,181,36.2,54.1821,277707,55541.4,80311.743287
2,0006a69f,80.0,4.0,112.0,12.0,97.0,8.0,21.0,18.0,7.0,...,0.0,35.0,4.0,41.0,3801,47.5125,58.236531,5575453,69693.1625,177206.811293
3,0006c192,50.0,0.0,52.0,2.0,45.0,2.0,6.0,5.0,1.0,...,13.0,15.0,4.0,18.0,2224,44.48,76.898619,2063664,41273.28,82496.162713
4,0009a5a9,7.0,0.0,10.0,0.0,9.0,0.0,0.0,0.0,1.0,...,0.0,6.0,1.0,0.0,412,58.857143,77.962445,1854998,264999.714286,627146.960108


# Training model

In [6]:
# Attach the target column to the per-installation features.
# The left merge leaves NaN in 'accuracy_group' for installations that have no
# row in train_labels; dropna() then removes those rows. Called without a
# subset, it also removes any row with a NaN in a feature column.
# NOTE(review): if train_labels holds several rows per installation_id, the
# merge repeats that installation's features once per label row -- confirm
# this is intended before splitting rows at random.
labels = train_labels.loc[:, ['installation_id', 'accuracy_group']]
train = pd.merge(train, labels, how='left', on='installation_id').dropna()

In [7]:
# Hold out 15% of the rows for validation. The seed is fixed so that the
# label-encoded copy of the data can be split identically later on.
train_features = train.drop(columns=['installation_id', 'accuracy_group'])
train_target = train['accuracy_group']

x_train, x_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.15,
    random_state=2019,
)

In [8]:
# Build label-encoded copies of train and test. Missing values are first
# replaced by the sentinel -999 so that they receive their own label.
train_labeled = train.fillna(-999)
test_labeled = test.fillna(-999)

feature_cols = train.columns.drop(['installation_id', 'accuracy_group'])

# Number of distinct labels seen per feature column, in column order.
len_uniques = []
for col in feature_cols:
    # Fit on train and test together so every value that occurs in either
    # frame has a label.
    encoder = LabelEncoder().fit(
        pd.concat([train_labeled[col], test_labeled[col]])
    )
    train_labeled[col] = encoder.transform(train_labeled[col])
    test_labeled[col] = encoder.transform(test_labeled[col])
    len_uniques.append(len(encoder.classes_))

# Same test_size and random_state as the earlier split of `train`, applied to
# a frame derived from `train`.
x_train_labeled, x_val_labeled = train_test_split(
    train_labeled.drop(columns=['installation_id', 'accuracy_group']),
    test_size=0.15,
    random_state=2019,
)

In [9]:
# Settings passed to TargetEncoder (as `alpha` and `max_unique`) in the
# target-encoding cell.
ALPHA, MAX_UNIQUE = 10, 50

# Seeded, shuffled 5-fold stratified splitter handed to the encoder via `split`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)




In [10]:
# Notes on TargetEncoder's `split` argument, as described by the
# target_encoding project:
#
# split: list of int or cross-validator class.
#   * split == []        : features are encoded without cross-validation.
#                          In this case the encoded features overfit the target.
#   * len(split) == 1,   : features are encoded using cross-validation on
#     e.g. [5]             5 folds. This avoids overfitting on the test set,
#                          but validation scores will still be optimistic.
#   * len(split) == 2,   : the data is first separated into 5 folds, and
#     e.g. [5, 3]          afterwards features are encoded using
#                          cross-validation on 3 folds. This is the best way
#                          to avoid overfitting, but less data is used for
#                          each encoding.
#
# A single cross-validator is passed here, i.e. the one-element form.

enc = TargetEncoder(alpha=ALPHA, max_unique=MAX_UNIQUE, split=[cv])

# The encoder is fitted on the label-encoded training features ...
x_train_encoded = enc.transform_train(x_train_labeled, y=y_train)
x_val_encoded = enc.transform_test(x_val_labeled)
# ... so the test set must be passed in its label-encoded form as well
# (`test_labeled`, not the raw `test` frame); otherwise its values do not
# correspond to the categories seen during fitting.
x_test_encoded = enc.transform_test(test_labeled.drop(['installation_id'], axis=1))

# Wrap the encoder outputs in DataFrames so they can be concatenated with the
# original features.
x_train_encoded = pd.DataFrame(x_train_encoded)
x_val_encoded = pd.DataFrame(x_val_encoded)
x_test_encoded = pd.DataFrame(x_test_encoded)

In [11]:
# Place the target-encoded columns next to the original features.
# The train/validation frames keep the shuffled index from the split, so it is
# reset to line up positionally with the freshly built encoded frames.
x_train_all = pd.concat(
    [x_train.reset_index(drop=True), x_train_encoded],
    axis='columns',
)
x_val_all = pd.concat(
    [x_val.reset_index(drop=True), x_val_encoded],
    axis='columns',
)
x_test_all = pd.concat(
    [test.drop(columns=['installation_id']), x_test_encoded],
    axis='columns',
)

In [12]:
# Wrap the combined (original + target-encoded) features for LightGBM.
train_set = lgb.Dataset(x_train_all, y_train)
val_set = lgb.Dataset(x_val_all, y_val)

# Four-class multiclass model evaluated with multiclass log loss.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero; no bagging_freq is set here -- confirm
# whether bagging was meant to be active.
params = {
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.9,
    'num_leaves': 14,
    'lambda_l1': 0.1,
    'lambda_l2': 1,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'random_state': 2019
}

# Train for up to 10000 rounds, stopping once the validation loss has not
# improved for 300 rounds; progress is logged every 100 rounds.
# NOTE(review): newer LightGBM releases configure early stopping and logging
# through callbacks instead of these keyword arguments -- check against the
# installed version.
model = lgb.train(
    params,
    train_set,
    num_boost_round=10000,
    valid_sets=[train_set, val_set],
    early_stopping_rounds=300,
    verbose_eval=100,
)

Training until validation scores don't improve for 300 rounds
[100]	training's multi_logloss: 1.11112	valid_1's multi_logloss: 1.10567
[200]	training's multi_logloss: 1.06181	valid_1's multi_logloss: 1.06993
[300]	training's multi_logloss: 1.03084	valid_1's multi_logloss: 1.05175
[400]	training's multi_logloss: 1.00849	valid_1's multi_logloss: 1.04202
[500]	training's multi_logloss: 0.990293	valid_1's multi_logloss: 1.03714
[600]	training's multi_logloss: 0.974725	valid_1's multi_logloss: 1.03493
[700]	training's multi_logloss: 0.960712	valid_1's multi_logloss: 1.03386
[800]	training's multi_logloss: 0.947742	valid_1's multi_logloss: 1.0333
[900]	training's multi_logloss: 0.935601	valid_1's multi_logloss: 1.03289
[1000]	training's multi_logloss: 0.924091	valid_1's multi_logloss: 1.03284
[1100]	training's multi_logloss: 0.913208	valid_1's multi_logloss: 1.03277
[1200]	training's multi_logloss: 0.90281	valid_1's multi_logloss: 1.03312
[1300]	training's multi_logloss: 0.892913	valid_1's m

In [13]:
# Turn the per-class scores into a hard class prediction (index of the
# highest score in each row) and report per-class metrics on the hold-out set.
val_scores = model.predict(x_val_all)
val_pred = val_scores.argmax(axis=1)
print(classification_report(y_val, val_pred))

              precision    recall  f1-score   support

         0.0       0.57      0.51      0.54       636
         1.0       0.41      0.03      0.06       349
         2.0       0.91      0.03      0.06       307
         3.0       0.60      0.90      0.72      1361

    accuracy                           0.59      2653
   macro avg       0.62      0.37      0.35      2653
weighted avg       0.60      0.59      0.51      2653



In [14]:
# Predict a class for every test installation and write the submission file.
test_scores = model.predict(x_test_all)
y_pred = test_scores.argmax(axis=1)

# The prediction is stored on `test` itself, next to the installation ids.
test['accuracy_group'] = y_pred
submission_cols = ['installation_id', 'accuracy_group']
test[submission_cols].to_csv('submission.csv', index=False)