In [1]:
%load_ext autoreload
%autoreload 2

In [29]:
import os
from target_encoding import TargetEncoderClassifier, TargetEncoder
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score
import json
from tqdm import tqdm
tqdm.pandas()
from functools import partial

from dsb2019.data.validation import InstallationFold, cross_validate, quad_kappa
from dsb2019.data import DATA_DIR

In [15]:
keep_cols = ['event_id', 'game_session', 'installation_id', 'event_count', 'event_code', 'title', 'game_time', 'type', 'world', 'timestamp']

train = pd.read_csv(DATA_DIR / 'interim/train.csv', usecols=keep_cols)
test = pd.read_csv(DATA_DIR / 'raw/test.csv', usecols=keep_cols)
train_labels = pd.read_csv(DATA_DIR / 'raw/train_labels.csv')
submission = pd.read_csv(DATA_DIR / 'raw/sample_submission.csv')

In [16]:
def group_and_reduce(df):
    # group1 and group2 are intermediary "game session" groups,
    # which are reduced to one record by game session. group1 takes
    # the max value of game_time (final game time in a session) and 
    # of event_count (total number of events happened in the session).
    # group2 takes the total number of event_code of each type
    group1 = df.drop(columns=['event_id', 'event_code']).groupby(
        ['game_session', 'installation_id', 'title', 'type', 'world']
    ).max().reset_index()

    group2 = pd.get_dummies(
        df[['installation_id', 'event_code']], 
        columns=['event_code']
    ).groupby(['installation_id']).sum()

    # group3, group4 and group5 are grouped by installation_id 
    # and reduced using summation and other summary stats
    group3 = pd.get_dummies(
        group1.drop(columns=['game_session', 'event_count', 'game_time']),
        columns=['title', 'type', 'world']
    ).groupby(['installation_id']).sum()

    group4 = group1[
        ['installation_id', 'event_count', 'game_time']
    ].groupby(
        ['installation_id']
    ).agg([np.sum, np.mean, np.std])

    return group2.join(group3).join(group4).reset_index()

In [17]:
def group_reduce_wrapper(df):
    result = group_and_reduce(df)
    return result.iloc[0].to_dict()


def process_installations(train_labels, train, process_log):
    result = []
    train=train.sort_values("timestamp")
    installations = train.groupby("installation_id")
    for i, game_session, title, installation_id, accuracy_group in tqdm(train_labels[["game_session", "title", "installation_id", "accuracy_group"]].itertuples(), 
                                                              total=len(train_labels)):
        player_log = installations.get_group(installation_id).reset_index()
        log_length = player_log[(player_log.game_session==game_session) & (player_log.title==title)].index[0]
        player_log = player_log.iloc[:(log_length + 1)]
        player_log["accuracy_group"] = accuracy_group
        player_log["target_game_session"] = game_session
        features = process_log(player_log)
        features["installation_id"] = installation_id
        features["accuracy_group"] = accuracy_group
        result.append(features)
    return pd.DataFrame(data=result)

In [22]:
train = process_installations(train_labels, train, group_reduce_wrapper)
test = group_and_reduce(test)

print(train.shape)
train.head()

100%|██████████| 17690/17690 [23:40<00:00, 12.46it/s] 


(17690, 103)


Unnamed: 0,installation_id,event_code_2000,event_code_2010,event_code_2020,event_code_2025,event_code_2030,event_code_2035,event_code_2040,event_code_2050,event_code_2060,...,title_Pan Balance,title_Cart Balancer (Assessment),title_Chest Sorter (Assessment),title_Egg Dropper (Activity),title_Happy Camel,"title_Heavy, Heavier, Heaviest",title_Honey Cake,event_code_4050,title_Leaf Leader,event_code_4080
0,0006a69f,27.0,1.0,27.0,5.0,22.0,1.0,6.0,6.0,1.0,...,,,,,,,,,,
1,0006a69f,26.0,1.0,26.0,5.0,22.0,1.0,6.0,6.0,1.0,...,,,,,,,,,,
2,0006a69f,19.0,,20.0,4.0,18.0,,6.0,6.0,,...,,,,,,,,,,
3,0006a69f,48.0,2.0,52.0,9.0,43.0,5.0,10.0,9.0,2.0,...,,,,,,,,,,
4,0006a69f,57.0,3.0,64.0,10.0,53.0,6.0,10.0,9.0,3.0,...,,,,,,,,,,


In [24]:
#labels = train_labels[['installation_id', 'accuracy_group']]
#train = train.merge(labels, how='left', on='installation_id').dropna()

In [25]:
def make_features(train, test, alpha=10, max_unique=50):
    test = test.drop("accuracy_group", axis=1)
    len_uniques = []
    train_labeled = train.fillna(-999)
    test_labeled = test.fillna(-999)

    x_train, x_val, y_train, y_val = train_test_split(
        train.drop(['installation_id', 'accuracy_group'], axis=1),
        train['accuracy_group'],
        test_size=0.15,
        random_state=2019,
    )
    
    for c in train.columns.drop(['installation_id', 'accuracy_group']):
        le = LabelEncoder()
        le.fit(pd.concat([train_labeled[c], test_labeled[c]])) 
        train_labeled[c] = le.transform(train_labeled[c])
        test_labeled[c] = le.transform(test_labeled[c])
        len_uniques.append(len(le.classes_))

    x_train_labeled_ix, x_val_labeled_ix = train_test_split(
        np.arange(len(train_labeled)),
        test_size=0.15,
        random_state=2019,
    )
    x_train_labeled = train_labeled.drop(['installation_id', 'accuracy_group'], axis=1).iloc[x_train_labeled_ix]
    x_val_labeled = train_labeled.drop(['installation_id', 'accuracy_group'], axis=1).iloc[x_val_labeled_ix]
    
    cv = InstallationFold(train_labeled.installation_id.values[x_train_labeled_ix])

    enc = TargetEncoder(alpha=alpha, max_unique=max_unique, split=[cv])
    x_train_encoded = enc.transform_train(x_train_labeled, y=y_train)
    x_val_encoded = enc.transform_test(x_val_labeled)
    x_test_encoded = enc.transform_test(test.drop(['installation_id'], axis=1))

    x_train_encoded = pd.DataFrame(x_train_encoded)
    x_val_encoded = pd.DataFrame(x_val_encoded)
    x_test_encoded = pd.DataFrame(x_test_encoded)

    x_train_all = pd.concat([x_train.reset_index(drop=True), x_train_encoded], axis=1)
    x_val_all = pd.concat([x_val.reset_index(drop=True), x_val_encoded], axis=1)
    x_test_all = pd.concat([test.drop(['installation_id'], axis=1), x_test_encoded], axis=1)

    return x_train_all, x_val_all, x_test_all, y_train, y_val


def train_baseline(x_train_all,x_val_all,y_train,y_val):
    train_set = lgb.Dataset(x_train_all, y_train)
    val_set = lgb.Dataset(x_val_all, y_val)

    params = {
        'learning_rate': 0.01,
        'bagging_fraction': 0.9,
        'feature_fraction': 0.9,
        'num_leaves': 14,
        'lambda_l1': 0.1,
        'lambda_l2': 1,
        'metric': 'multiclass',
        'objective': 'multiclass',
        'num_classes': 4,
        'random_state': 2019
    }

    return lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=300, valid_sets=[train_set, val_set], verbose_eval=100)

In [27]:
# def fit_fold(df, train_ix, test_ix):
#     train = df.iloc[train_ix].reset_index().copy()
#     test = df.iloc[test_ix].reset_index().copy()
#     x_train_all, x_val_all, x_test_all, y_train, y_val = make_features(train, test)
    
#     baseline = train_baseline(x_train_all, y_train, x_val_all, y_val)
#     test_pred = baseline.predict(x_test_all).argmax(axis=1)
#     test_true = test.accuracy_group.values
#     return test_true, test_pred


# def cross_validate(train, labels):
#     predicts = []
#     for ix_train, ix_test in cv.split(train, labels, train.installation_id.values):
#         predicts.append(fit_fold(train, ix_train, ix_test))
#     return predicts


def make_features_wrapper(train, test):
    x_train_all, x_val_all, x_test_all, y_train, y_val = make_features(train, test)
    return (x_train_all,x_val_all,y_train,y_val), (x_test_all,test.accuracy_group.values)

def make_predictions(model,x_test_all,y_test):
    pred=model.predict(x_test_all).argmax(axis=1)
    return pred,y_test

predicts=cross_validate(train, train.accuracy_group.values, make_features_wrapper,train_baseline,make_predictions)

Training until validation scores don't improve for 300 rounds
[100]	training's multi_logloss: 1.14178	valid_1's multi_logloss: 1.15511
[200]	training's multi_logloss: 1.10206	valid_1's multi_logloss: 1.13437
[300]	training's multi_logloss: 1.07304	valid_1's multi_logloss: 1.12389
[400]	training's multi_logloss: 1.04845	valid_1's multi_logloss: 1.11767
[500]	training's multi_logloss: 1.02734	valid_1's multi_logloss: 1.11435
[600]	training's multi_logloss: 1.00843	valid_1's multi_logloss: 1.11159
[700]	training's multi_logloss: 0.990944	valid_1's multi_logloss: 1.10999
[800]	training's multi_logloss: 0.974612	valid_1's multi_logloss: 1.10872
[900]	training's multi_logloss: 0.958925	valid_1's multi_logloss: 1.1078
[1000]	training's multi_logloss: 0.943894	valid_1's multi_logloss: 1.10726
[1100]	training's multi_logloss: 0.929746	valid_1's multi_logloss: 1.10672
[1200]	training's multi_logloss: 0.915851	valid_1's multi_logloss: 1.10603
[1300]	training's multi_logloss: 0.902424	valid_1's mu

In [30]:
np.mean([quad_kappa(true, pred) for pred, true in predicts]), [quad_kappa(true, pred) for pred, true in predicts]

(0.27631502524174467,
 [0.29748064769725047,
  0.2628900684015911,
  0.21348999589221673,
  0.37431954892073294,
  0.29026936480202903,
  0.22334322193984346,
  0.2688461802566625,
  0.3103661998964561,
  0.22402577966068105,
  0.2981192449499831])