In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
import scipy as sp
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from tqdm import tqdm

In [2]:
# Register tqdm's pandas integration so that `.progress_apply(...)`
# (used further down) is available and reports progress while it runs.
tqdm.pandas()

  from pandas import Panel


In [3]:
%%time
# Read only these columns from the two event logs to save memory.
keep_cols = [
    'event_id',
    'game_session',
    'installation_id',
    'event_count',
    'event_code',
    'title',
    'game_time',
    'type',
    'world',
]

# Event logs: same column subset for both splits, train first, then test.
train, test = (
    pd.read_csv(csv_path, usecols=keep_cols)
    for csv_path in ('./source_data/train.csv', './source_data/test.csv')
)

# Labels and the sample submission are read with all of their columns.
train_labels, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './source_data/train_labels.csv',
        './source_data/sample_submission.csv',
    )
)

CPU times: user 15.9 s, sys: 988 ms, total: 16.9 s
Wall time: 15.3 s


In [5]:
# Rows of the test event log whose type is 'Assessment'.
test_assess = test.loc[test['type'] == 'Assessment'].copy()


def _last_assessment_title(install_id):
    """Return the title on the last assessment row of `install_id`.

    Raises IndexError when the installation has no assessment rows.
    """
    rows = test_assess.loc[test_assess['installation_id'] == install_id]
    return rows['title'].iloc[-1]


# Start from the sample submission (one row per installation_id) and attach
# the title found on that installation's last assessment row.
test_labels = submission.copy()
test_labels['title'] = test_labels['installation_id'].progress_apply(
    _last_assessment_title
)

  0%|          | 0/1000 [00:00<?, ?it/s]

  2%|▏         | 17/1000 [00:00<00:06, 162.82it/s]

  4%|▍         | 41/1000 [00:00<00:05, 179.31it/s]

  6%|▋         | 65/1000 [00:00<00:04, 192.91it/s]

  9%|▉         | 89/1000 [00:00<00:04, 203.53it/s]

 11%|█▏        | 114/1000 [00:00<00:04, 214.74it/s]

 14%|█▍        | 139/1000 [00:00<00:03, 223.28it/s]

 16%|█▋        | 163/1000 [00:00<00:03, 227.49it/s]

 19%|█▉        | 188/1000 [00:00<00:03, 231.28it/s]

 21%|██▏       | 213/1000 [00:00<00:03, 234.06it/s]

 24%|██▍       | 238/1000 [00:01<00:03, 237.11it/s]

 26%|██▋       | 263/1000 [00:01<00:03, 239.68it/s]

 29%|██▉       | 288/1000 [00:01<00:02, 242.65it/s]

 31%|███▏      | 313/1000 [00:01<00:02, 244.54it/s]

 34%|███▍      | 339/1000 [00:01<00:02, 246.38it/s]

 36%|███▋      | 365/1000 [00:01<00:02, 247.89it/s]

 39%|███▉      | 391/1000 [00:01<00:02, 248.74it/s]

 42%|████▏     | 416/1000 [00:01<00:02, 248.49it/s]

 44%|████▍     | 442/1000 [00:01<00:02, 249.55it/s]

 47%|████▋     | 468/1000 [00:01<00:02, 250.29it/s]

 49%|████▉     | 494/1000 [00:02<00:02, 251.29it/s]

 52%|█████▏    | 520/1000 [00:02<00:01, 249.41it/s]

 55%|█████▍    | 545/1000 [00:02<00:01, 247.76it/s]

 57%|█████▋    | 570/1000 [00:02<00:01, 247.81it/s]

 60%|█████▉    | 595/1000 [00:02<00:01, 247.38it/s]

 62%|██████▏   | 620/1000 [00:02<00:01, 245.70it/s]

 64%|██████▍   | 645/1000 [00:02<00:01, 245.19it/s]

 67%|██████▋   | 670/1000 [00:02<00:01, 243.34it/s]

 70%|██████▉   | 695/1000 [00:02<00:01, 243.85it/s]

 72%|███████▏  | 720/1000 [00:02<00:01, 245.24it/s]

 74%|███████▍  | 745/1000 [00:03<00:01, 246.31it/s]

 77%|███████▋  | 770/1000 [00:03<00:00, 247.26it/s]

 80%|███████▉  | 796/1000 [00:03<00:00, 248.39it/s]

 82%|████████▏ | 821/1000 [00:03<00:00, 246.24it/s]

 85%|████████▍ | 846/1000 [00:03<00:00, 244.55it/s]

 87%|████████▋ | 871/1000 [00:03<00:00, 244.86it/s]

 90%|████████▉ | 896/1000 [00:03<00:00, 246.17it/s]

 92%|█████████▏| 921/1000 [00:03<00:00, 243.86it/s]

 95%|█████████▍| 946/1000 [00:03<00:00, 244.14it/s]

 97%|█████████▋| 971/1000 [00:03<00:00, 244.25it/s]

100%|█████████▉| 996/1000 [00:04<00:00, 244.66it/s]

100%|██████████| 1000/1000 [00:04<00:00, 242.88it/s]




In [6]:
def compute_group1_stats(group1, col):
    """
    Summarise `event_count` and `game_time` per (installation_id, `col` value)
    and spread the `col` values out into columns.

    Parameters
    ----------
    group1 : pandas.DataFrame
        Must contain the columns 'installation_id', `col`, 'event_count'
        and 'game_time'.
    col : str
        Name of the column of `group1` whose distinct values become the
        innermost column level of the result (e.g. 'world' or 'type').

    Returns
    -------
    pandas.DataFrame
        Indexed by installation_id, with three-level columns
        (source column, statistic, `col` value), where statistic is one of
        'mean', 'sum', 'std'. Combinations that never occur in `group1`
        are NaN.
    """
    # String aggregator names rather than np.mean / np.sum / np.std: pandas
    # deprecated passing NumPy callables to groupby.agg and will eventually
    # call them directly, which would switch `std` from the sample (ddof=1)
    # to the population (ddof=0) estimate. The strings pin today's behaviour.
    per_pair = (
        group1[['installation_id', col, 'event_count', 'game_time']]
        .groupby(['installation_id', col])
        .agg(['mean', 'sum', 'std'])
    )

    # The groupby already leaves (installation_id, col) as the row index, so
    # moving `col` to the columns is a plain unstack -- the same reshaping the
    # previous reset_index().pivot(...) round trip performed.
    return per_pair.unstack(col)


In [7]:
def group_and_reduce(df, df_labels):
    """
    Collapse an event log into one feature row per installation_id.

    Author: https://www.kaggle.com/xhlulu/
    Source: https://www.kaggle.com/xhlulu/ds-bowl-2019-simple-lgbm-using-aggregated-data

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'event_id', 'event_code', 'game_session',
        'installation_id', 'title', 'type', 'world', 'event_count' and
        'game_time'.
    df_labels : pandas.DataFrame
        Only its 'installation_id' column is read; events of installations
        that do not appear there are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by installation_id, missing values filled with 0. Columns are:
        - 'title_*', 'type_*', 'world_*': number of game sessions per value;
        - ('event_count' | 'game_time', stat) with stat in
          'sum', 'mean', 'std', 'min', 'max' over all sessions;
        - ('event_count' | 'game_time', stat, value) with stat in
          'mean', 'sum', 'std', per world value and per type value.
        The statistic columns carry tuple labels in a flat column index.
    """
    # Keep only the events of installations that appear in the labels.
    df = df[df.installation_id.isin(df_labels.installation_id.unique())]

    # One record per game session: the max of game_time and of event_count
    # within the session. (The earlier per-session event_code dummy table
    # was never used in the result, so it is no longer computed.)
    sessions = df.drop(columns=['event_id', 'event_code']).groupby(
        ['game_session', 'installation_id', 'title', 'type', 'world']
    ).max().reset_index()

    # Per installation: how many sessions fall under each title/type/world.
    session_counts = pd.get_dummies(
        sessions.drop(columns=['game_session', 'event_count', 'game_time']),
        columns=['title', 'type', 'world']
    ).groupby(['installation_id']).sum()

    # Per installation: summary statistics over all of its sessions.
    # String aggregator names avoid pandas' deprecated handling of NumPy
    # callables in groupby.agg and give stable 'min'/'max' labels.
    overall_stats = sessions[
        ['installation_id', 'event_count', 'game_time']
    ].groupby(
        ['installation_id']
    ).agg(['sum', 'mean', 'std', 'min', 'max'])

    # Additional statistics broken down by world and by type.
    world_time_stats = compute_group1_stats(sessions, 'world')
    type_time_stats = compute_group1_stats(sessions, 'type')

    # The pieces have 1-, 2- and 3-level column indexes. Recent pandas
    # refuses to join frames whose column indexes differ in depth, so each
    # MultiIndex is flattened to tuple labels first; these are the labels
    # older pandas produced implicitly for the same join.
    features = session_counts
    for stats in (overall_stats, world_time_stats, type_time_stats):
        stats.columns = stats.columns.to_flat_index()
        features = features.join(stats)

    return features.fillna(0)


In [8]:
%%time
# Reduce both event logs to one feature row per installation_id, using the
# same routine so train and test are built the same way.
train_small = group_and_reduce(train, train_labels)
test_small = group_and_reduce(test, test_labels)

# Quick look at the result: dimensions, then the first rows (rich display).
print(train_small.shape)
train_small.head()



(3614, 110)
CPU times: user 15.2 s, sys: 2.71 s, total: 17.9 s
Wall time: 8.77 s


Unnamed: 0_level_0,title_12 Monkeys,title_Air Show,title_All Star Sorting,title_Balancing Act,title_Bird Measurer (Assessment),title_Bottle Filler (Activity),title_Bubble Bath,title_Bug Measurer (Activity),title_Cart Balancer (Assessment),title_Cauldron Filler (Assessment),...,"(game_time, mean, Clip)","(game_time, mean, Game)","(game_time, sum, Activity)","(game_time, sum, Assessment)","(game_time, sum, Clip)","(game_time, sum, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)"
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,2.0,2.0,4.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,...,0.0,106966.45,3199695.0,236429.0,0.0,2139329.0,350054.566401,28330.303185,0.0,58189.254197
0006c192,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,2.0,...,0.0,88345.5,1210530.0,323061.0,0.0,530073.0,127422.7825,98940.202632,0.0,62500.291205
00129856,0.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1021179.0,39742.0,0.0,0.0,130499.803239,28043.854942,0.0,0.0
001d0ed0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,158426.166667,92282.0,201941.0,0.0,950557.0,24694.997226,17737.374861,0.0,123969.846618
00225f67,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,129984.75,294517.0,35637.0,0.0,519939.0,49028.831364,12301.536672,0.0,65432.543128


In [13]:
# Same shape/head check as above, but printed as plain text: print() shows
# the frame's text repr instead of the rich table display.
print(train_small.shape)
print(train_small.head())

(3614, 110)
                 title_12 Monkeys  title_Air Show  title_All Star Sorting  \
installation_id                                                             
0006a69f                      2.0             2.0                     4.0   
0006c192                      1.0             0.0                     0.0   
00129856                      0.0             0.0                     0.0   
001d0ed0                      0.0             0.0                     0.0   
00225f67                      1.0             1.0                     1.0   

                 title_Balancing Act  title_Bird Measurer (Assessment)  \
installation_id                                                          
0006a69f                         0.0                               2.0   
0006c192                         2.0                               1.0   
00129856                         1.0                               1.0   
001d0ed0                         2.0                               1.0   
0022

In [14]:
# Map every title found in train_labels to its most frequent accuracy_group
# (the first entry of value_counts for that title's rows).
titles = train_labels['title'].unique()
title2mode = {
    name: (
        train_labels.loc[train_labels['title'] == name, 'accuracy_group']
        .value_counts()
        .index[0]
    )
    for name in titles
}

# Attach that per-title mode to both label frames. A title missing from
# title2mode raises KeyError.
train_labels['title_mode'] = train_labels['title'].apply(lambda name: title2mode[name])
test_labels['title_mode'] = test_labels['title'].apply(lambda name: title2mode[name])