In [23]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from collections import Counter

In [24]:
class Config:
    """Relative locations of the input CSV files used by this notebook."""
    # Training data; read into `train_df`.
    TRAIN_PATH = '../inputs/train.csv'
    # Test data path (not read in the cells shown here).
    TEST_PATH = '../inputs/test.csv'
    # Labels for the training data; read into `train_labels`.
    TRAIN_LABELS = '../inputs/train_labels.csv'
    # Sample submission path (not read in the cells shown here).
    SAMPLE_SUBMISSION = '../inputs/sample_submission.csv'

In [25]:
# Read the training rows and their labels from the paths declared in Config.
train_df, train_labels = map(
    pd.read_csv,
    (Config.TRAIN_PATH, Config.TRAIN_LABELS),
)

In [26]:
# Label ids look like '<session>_q<question>' (e.g. '20090312431273200_q1').
# The question number is whatever follows the last 'q'; it is stored as int8.
train_labels['question'] = (
    train_labels['session_id']
    .str.rsplit('q', n=1)
    .str[-1]
    .astype('int8')
)
# The session part is everything before the first underscore (still a string here).
train_labels['session'] = (
    train_labels['session_id']
    .str.split('_', n=1)
    .str[0]
)
train_labels.head()

Unnamed: 0,session_id,correct,question,session
0,20090312431273200_q1,1,1,20090312431273200
1,20090312433251036_q1,0,1,20090312433251036
2,20090314121766812_q1,1,1,20090314121766812
3,20090314363702160_q1,1,1,20090314363702160
4,20090314441803444_q1,1,1,20090314441803444


In [27]:
# 'session' was extracted from the id string as text; store it as a 64-bit integer.
train_labels['session'] = train_labels['session'].astype('int64')

In [28]:
import seaborn as sns
import matplotlib.pyplot as plt

In [35]:
# Columns summarised by their number of distinct values per group.
CATS = [
    'event_name',
    'fqid',
    'room_fqid',
    'text',
]

# Columns summarised by mean and standard deviation per group.
NUMS = [
    'elapsed_time',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

# Values of `event_name` that each get their own indicator column.
EVENTS = [
    'navigate_click',
    'person_click',
    'cutscene_click',
    'object_click',
    'map_hover',
    'notification_click',
    'map_click',
    'observation_click',
    'checkpoint',
]

# Values of the second dot-separated component of `room_fqid` that each get
# their own indicator column.
ROOMS = [
    'historicalsociety',
    'kohlcenter',
    'capitol_0',
    'humanecology',
    'drycleaner',
    'library',
    'capitol_1',
    'wildlife',
    'flaghouse',
    'capitol_2',
]

In [36]:
def groupby_apply_elapsed_time(g):
    """Summarise the `elapsed_time` column of one group of rows.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single group; must contain an `elapsed_time` column.
        Rows are used in the order given.

    Returns
    -------
    pandas.Series
        elapsed_time_diff_mean : mean of consecutive differences.
        elapsed_time_diff_std  : population std (numpy default, ddof=0) of
                                 consecutive differences.
        elapsed_time_positive  : number of strictly positive differences.
        elapsed_time_negative  : number of remaining differences
                                 (zero or negative).
        start-end              : last value minus first value.
    """
    times = g['elapsed_time'].values
    steps = np.diff(times)
    forward = len(steps[steps > 0])
    return pd.Series({
        'elapsed_time_diff_mean': np.mean(steps),
        'elapsed_time_diff_std': np.std(steps),
        'elapsed_time_positive': forward,
        # Everything that is not strictly positive, so zero gaps count here too.
        'elapsed_time_negative': len(steps) - forward,
        'start-end': times[-1] - times[0],
    })

In [37]:
def feature_engineer(train):
    """Aggregate event rows into one feature row per (session_id, level_group).

    The input frame is only read; indicator columns are built in a separate
    scratch frame, so the caller's DataFrame keeps its original columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain `session_id`, `level_group`, `event_name`, `room_fqid`
        and every column named in CATS and NUMS.

    Returns
    -------
    pandas.DataFrame
        Indexed by `session_id`, with `level_group` as the first column,
        followed by (in this order):
          * the columns produced by `groupby_apply_elapsed_time`,
          * `<col>_sum`     for EVENTS + ROOMS + ['elapsed_time'],
          * `<col>_nunique` for CATS,
          * `<col>_mean`    for NUMS + EVENTS + ROOMS,
          * `<col>_std`     for NUMS + EVENTS + ROOMS.
        Missing aggregate values are replaced with -1.
    """
    keys = ['session_id', 'level_group']

    # Room name is the second dot-separated component of room_fqid.
    main_room = train['room_fqid'].str.split('.').str[1]

    # Boolean indicators go into their own frame (together with a copy of the
    # group keys) instead of being written into `train`.
    indicator_data = {k: train[k] for k in keys}
    for c in EVENTS:
        indicator_data[c] = train['event_name'] == c
    for c in ROOMS:
        indicator_data[c] = main_room == c
    indicators = pd.DataFrame(indicator_data)
    indicator_names = set(EVENTS) | set(ROOMS)

    # Create each GroupBy once and reuse it for every aggregation so the group
    # keys are not re-factorised on every loop iteration.
    by_raw = train.groupby(keys)
    by_indicator = indicators.groupby(keys)

    def grouped_column(name):
        # Indicator names take precedence, as they did when they were written
        # into the input frame.
        if name in indicator_names:
            return by_indicator[name]
        return by_raw[name]

    # Only `elapsed_time` is needed by the per-group helper; selecting it keeps
    # the group keys out of the frames handed to `apply`.
    dfs = [by_raw[['elapsed_time']].apply(groupby_apply_elapsed_time)]

    aggregation_plan = [
        ('sum', EVENTS + ROOMS + ['elapsed_time']),
        ('nunique', CATS),
        ('mean', NUMS + EVENTS + ROOMS),
        ('std', NUMS + EVENTS + ROOMS),
    ]
    for stat, columns in aggregation_plan:
        for c in columns:
            tmp = grouped_column(c).agg(stat)
            tmp.name = c + '_' + stat
            dfs.append(tmp)

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [38]:
%%time
# Build the aggregated feature table, then report its size and preview it.
df = feature_engineer(train_df)
print(df.shape)
df.head()

(35337, 82)
CPU times: total: 3min 6s
Wall time: 6min 39s


Unnamed: 0_level_0,level_group,elapsed_time_diff_mean,elapsed_time_diff_std,elapsed_time_positive,elapsed_time_negative,start-end,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,...,historicalsociety_std,kohlcenter_std,capitol_0_std,humanecology_std,drycleaner_std,library_std,capitol_1_std,wildlife_std,flaghouse_std,capitol_2_std
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,1188.170732,2447.579017,161.0,3.0,194860.0,81,22,28,11,...,0.410055,0.401218,0.10976,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312431273200,13-22,1040.446301,1985.279111,411.0,8.0,435947.0,170,123,60,20,...,0.497842,0.0,0.0,0.0,0.0,0.293894,0.152637,0.432144,0.265619,0.068924
20090312431273200,5-12,941.525424,714.119241,291.0,4.0,277750.0,103,104,12,28,...,0.500744,0.0,0.171991,0.356344,0.315267,0.362904,0.205262,0.0,0.0,0.0
20090312433251036,0-4,1693.855072,3504.81959,136.0,2.0,233752.0,49,18,36,15,...,0.431407,0.422493,0.119517,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20090312433251036,13-22,2036.150463,10339.121375,1267.0,29.0,2638851.0,637,145,65,83,...,0.491914,0.271684,0.0,0.236403,0.208456,0.30549,0.067884,0.32631,0.194434,0.343418


In [None]:
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import f1_score