In [7]:
import os
import gc


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold


from tqdm import tqdm

import lightgbm as lgb

In [8]:
path= '/mnt/7917b992-0701-4012-95c1-8b06e49e8f9b/2019DataScienceBowl/data/'


In [9]:
tqdm.pandas()

  from pandas import Panel


In [10]:
# Only load those columns in order to save space
keep_cols = ['event_id', 'game_session', 'installation_id', 'event_count', 'event_code', 'title', 'game_time', 'type', 'world']

In [11]:
%%time
train = pd.read_csv(path+'train.csv', usecols=keep_cols)

CPU times: user 17.4 s, sys: 1.06 s, total: 18.4 s
Wall time: 17.8 s


In [12]:
%%time
test = pd.read_csv(path+'test.csv', usecols=keep_cols)

CPU times: user 2.07 s, sys: 67.8 ms, total: 2.14 s
Wall time: 1.82 s


In [13]:
train_labels = pd.read_csv(path+'train_labels.csv')

In [14]:
submission = pd.read_csv(path+'sample_submission.csv')

In [15]:
# Copy the Assessment feature values
test_assess = test[test.type == 'Assessment'].copy()


In [16]:
# Copy the values from submission files as test labels
test_labels = submission.copy()

In [17]:

test_labels['title'] = test_labels.installation_id.progress_apply(
    lambda install_id: test_assess[test_assess.installation_id == install_id].iloc[-1].title
)

100%|██████████| 1000/1000 [00:05<00:00, 189.25it/s]


In [18]:
def group_and_reduce(df, df_labels):
    """
    Author: https://www.kaggle.com/xhlulu/
    Source: https://www.kaggle.com/xhlulu/ds-bowl-2019-simple-lgbm-using-aggregated-data
    """
    
    # First only filter the useful part of the df
    df = df[df.installation_id.isin(df_labels.installation_id.unique())]
    
    # group1 is am intermediary "game session" group,
    # which are reduced to one record by game session. group1 takes
    # the max value of game_time (final game time in a session) and 
    # of event_count (total number of events happened in the session).
    group1 = df.drop(columns=['event_id', 'event_code']).groupby(
        ['game_session', 'installation_id', 'title', 'type', 'world']
    ).max().reset_index()

    # group3, group4, group5 are grouped by installation_id 
    # and reduced using summation and other summary stats
    group3 = (
        pd.get_dummies(
            group1.drop(columns=['game_session', 'event_count', 'game_time']),
            columns=['title', 'type', 'world'])
        .groupby(['installation_id'])
        .sum()
    )

    group4 = (
        group1[['installation_id', 'event_count', 'game_time']]
        .groupby(['installation_id'])
        .agg([np.sum, np.mean, np.std, np.min, np.max])
    )
    
    # Additional stats on group1
    world_time_stats = compute_game_time_stats(group1, 'world')
    type_time_stats = compute_game_time_stats(group1, 'type')
    
    return (
        group3.join(group4)
        .join(world_time_stats)
        .join(type_time_stats)
        .fillna(0)
    )

In [19]:
def compute_game_time_stats(group1, col):
    return group1[
        ['installation_id', col, 'event_count', 'game_time']
    ].groupby(['installation_id', col]).agg(
        [np.mean, np.sum, np.std]
    ).reset_index().pivot(
        columns=col,
        index='installation_id'
    )

In [20]:
%%time
train_small = group_and_reduce(train, train_labels)
test_small = group_and_reduce(test, test_labels)

print(train_small.shape)
train_small.head()



(3614, 110)
CPU times: user 6.11 s, sys: 493 ms, total: 6.6 s
Wall time: 4.37 s


Unnamed: 0_level_0,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),...,"(game_time, mean, Clip)","(game_time, mean, Game)","(game_time, sum, Activity)","(game_time, sum, Assessment)","(game_time, sum, Clip)","(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)"
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,2.0,2.0,4.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,...,0.0,106966.45,3199695.0,236429.0,0.0,2139329.0,350054.566401,28330.303185,0.0,58189.254197
0006c192,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,2.0,...,0.0,88345.5,1210530.0,323061.0,0.0,530073.0,127422.7825,98940.202632,0.0,62500.291205
00129856,0.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1021179.0,39742.0,0.0,0.0,130499.803239,28043.854942,0.0,0.0
001d0ed0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,158426.166667,92282.0,201941.0,0.0,950557.0,24694.997226,17737.374861,0.0,123969.846618
00225f67,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,129984.75,294517.0,35637.0,0.0,519939.0,49028.831364,12301.536672,0.0,65432.543128


In [21]:
titles = train_labels.title.unique()
title2mode = {}

for title in titles:
    mode = train_labels[train_labels.title == title].accuracy_group.value_counts().index[0]
    title2mode[title] = mode

train_labels['title_mode'] = train_labels.title.apply(lambda title: title2mode[title])
test_labels['title_mode'] = test_labels.title.apply(lambda title: title2mode[title])

In [22]:
final_train = pd.get_dummies(
    (
        train_labels.set_index('installation_id')
        .drop(columns=['num_correct', 'num_incorrect', 'accuracy', 'game_session'])
        .join(train_small)
    ), 
    columns=['title']
)

# Experimental: only take the last record of each installation
final_train = final_train.reset_index().groupby('installation_id').apply(lambda x: x.iloc[-1])
final_train = final_train.drop(columns='installation_id')

print(final_train.shape)
final_train.head()

(3614, 117)


Unnamed: 0_level_0,accuracy_group,title_mode,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),...,"(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)",title_Bird Measurer (Assessment),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),title_Chest Sorter (Assessment),title_Mushroom Sorter (Assessment)
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,3,0,2.0,2.0,4.0,0.0,2.0,2.0,2.0,2.0,...,2139329.0,350054.566401,28330.303185,0.0,58189.254197,1,0,0,0,0
0006c192,0,3,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,...,530073.0,127422.7825,98940.202632,0.0,62500.291205,0,0,0,0,1
00129856,3,0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,...,0.0,130499.803239,28043.854942,0.0,0.0,1,0,0,0,0
001d0ed0,3,3,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,...,950557.0,24694.997226,17737.374861,0.0,123969.846618,0,0,0,0,1
00225f67,0,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,519939.0,49028.831364,12301.536672,0.0,65432.543128,1,0,0,0,0


In [23]:
final_test = pd.get_dummies(test_labels.set_index('installation_id').join(test_small), columns=['title'])

print(final_test.shape)
final_test.head()

(1000, 117)


Unnamed: 0_level_0,accuracy_group,title_mode,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),...,"(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)",title_Bird Measurer (Assessment),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),title_Chest Sorter (Assessment),title_Mushroom Sorter (Assessment)
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00abaee7,3,3,2.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,...,2285229.0,36886.664956,21240.073493,0.0,1038605.0,0,0,1,0,0
01242218,3,3,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,...,1420909.0,98521.245018,32761.743006,0.0,37797.81,0,1,0,0,0
017c5718,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6389.416875,0.0,0.0,0.0,0,0,0,0,1
01a44906,3,3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,77204.0,43064.217188,0.0,0.0,0.0,0,0,0,0,1
01bc6cb6,3,3,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,...,984880.0,0.0,0.0,0.0,178042.6,0,1,0,0,0


In [24]:
def cv_train(X, y, cv, **kwargs):
    """
    Author: https://www.kaggle.com/xhlulu/
    Source: https://www.kaggle.com/xhlulu/ds-bowl-2019-simple-lgbm-using-aggregated-data
    """
    models = []
    
    kf = KFold(n_splits=cv, random_state=2019)
    
    for train, test in kf.split(X):
        x_train, x_val, y_train, y_val = X[train], X[test], y[train], y[test]
        
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        
        model = lgb.train(train_set=train_set, valid_sets=[train_set, val_set], **kwargs)
        models.append(model)
        
        if kwargs.get("verbose_eval"):
            print("\n" + "="*50 + "\n")
    
    return models

def cv_predict(models, X):
    """
    Author: https://www.kaggle.com/xhlulu/
    Source: https://www.kaggle.com/xhlulu/ds-bowl-2019-simple-lgbm-using-aggregated-data
    """
    return np.mean([model.predict(X) for model in models], axis=0)

In [25]:
from sklearn.model_selection import KFold
small_labels = train_labels[['installation_id', 'accuracy_group']].set_index('installation_id')
train_joined = train_small.join(small_labels).dropna()
kf = KFold(n_splits=10, random_state=2019)
X = train_joined.drop(columns='accuracy_group').values
y = train_joined['accuracy_group'].values.astype(np.int32)
y_pred = np.zeros((len(test_small), 4))
for train, test in kf.split(X):
    x_train, x_val, y_train, y_val = X[train], X[test], y[train], y[test]
    train_set = lgb.Dataset(x_train, y_train)
    val_set = lgb.Dataset(x_val, y_val)

    params = {
        'learning_rate': 0.01,
        'bagging_fraction': 0.9,
        'feature_fraction': 0.9,
        'num_leaves': 14,
        'lambda_l1': 0.1,
        'lambda_l2': 1,
        'metric': 'multiclass',
        'objective': 'multiclass',
        'num_classes': 4,
        'random_state': 2019
    }

    model = lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=300, valid_sets=[train_set, val_set], verbose_eval=100)
    y_pred += model.predict(test_small)

Training until validation scores don't improve for 300 rounds.
[100]	training's multi_logloss: 1.14811	valid_1's multi_logloss: 1.16313
[200]	training's multi_logloss: 1.11441	valid_1's multi_logloss: 1.1394
[300]	training's multi_logloss: 1.09276	valid_1's multi_logloss: 1.12691
[400]	training's multi_logloss: 1.07648	valid_1's multi_logloss: 1.12319
[500]	training's multi_logloss: 1.06319	valid_1's multi_logloss: 1.12021
[600]	training's multi_logloss: 1.05175	valid_1's multi_logloss: 1.1189
[700]	training's multi_logloss: 1.04148	valid_1's multi_logloss: 1.11813
[800]	training's multi_logloss: 1.03229	valid_1's multi_logloss: 1.1181
[900]	training's multi_logloss: 1.02383	valid_1's multi_logloss: 1.11797
[1000]	training's multi_logloss: 1.01594	valid_1's multi_logloss: 1.11735
[1100]	training's multi_logloss: 1.00847	valid_1's multi_logloss: 1.11697
[1200]	training's multi_logloss: 1.00148	valid_1's multi_logloss: 1.11716
[1300]	training's multi_logloss: 0.994828	valid_1's multi_log

In [27]:
X_test = final_test.drop(columns=['accuracy_group'])
test_pred = cv_predict(models=models, X=X_test).argmax(axis=1)

final_test['accuracy_group'] = test_pred
final_test[['accuracy_group']].to_csv('submission.csv')

NameError: name 'models' is not defined