In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from collections import Counter
from sklearn.model_selection import  GroupKFold
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Seed NumPy's global random state so NumPy-driven randomness repeats across runs.
# The XGBoost models receive their own seed through xgb_params further down.
np.random.seed(42)

In [3]:
class Config:
    """Relative paths of the CSV inputs used by this notebook."""
    # Event log loaded into train_df.
    TRAIN_PATH = '../inputs/train.csv'
    # Not read in the cells shown here.
    TEST_PATH = '../inputs/test.csv'
    # Label table loaded into train_labels.
    TRAIN_LABELS = '../inputs/train_labels.csv'
    # Not read in the cells shown here.
    SAMPLE_SUBMISSION = '../inputs/sample_submission.csv'

In [4]:
def q2l(x):
    """Map a question number to the label of the level group it belongs to.

    Questions up to 3 map to '0-4', questions 4 through 13 to '5-12',
    and anything larger to '13-22'.
    """
    for upper_bound, label in ((3, '0-4'), (13, '5-12')):
        if x <= upper_bound:
            return label
    return '13-22'

In [5]:
# event_name values that groupby_apply counts (overall and per main room) and
# computes elapsed_time_diff statistics for.
# The checkpoint event is named 'checkpoint' (see event_name_feature), not
# 'checkpoint_click'; the old spelling never matched any row, so every
# checkpoint-derived feature was constant.
event_features =  ['checkpoint','cutscene_click', 'person_click', 'navigate_click',
                   'observation_click', 'notification_click', 'object_click',
                   'object_hover', 'map_hover', 'map_click', 'notebook_click']
# Main-room names (the second dot-separated part of room_fqid, as derived in
# feature_engineering) that groupby_apply counts events for.
room_features = [
    'historicalsociety', 'kohlcenter', 'capitol_0', 'humanecology', 'drycleaner',
    'library', 'capitol_1', 'wildlife', 'flaghouse', 'capitol_2',
]
# fqid values for which groupby_apply emits per-value elapsed_time_diff statistics.
fqid_lists = ['worker', 'archivist', 'gramps', 'wells', 'toentry', 'confrontation', 'crane_ranger', 'groupconvo', 'flag_girl', 'tomap', 'tostacks', 'tobasement', 'archivist_glasses', 'boss', 'journals', 'seescratches', 'groupconvo_flag', 'cs', 'teddy', 'expert', 'businesscards', 'ch3start', 'tunic.historicalsociety', 'tofrontdesk', 'savedteddy', 'plaque', 'glasses', 'tunic.drycleaner', 'reader_flag', 'tunic.library', 'tracks', 'tunic.capitol_2', 'trigger_scarf', 'reader', 'directory', 'tunic.capitol_1', 'journals.pic_0.next', 'unlockdoor', 'tunic', 'what_happened', 'tunic.kohlcenter', 'tunic.humanecology', 'colorbook', 'logbook', 'businesscards.card_0.next', 'journals.hub.topics', 'logbook.page.bingo', 'journals.pic_1.next', 'journals_flag', 'reader.paper0.next', 'tracks.hub.deer', 'reader_flag.paper0.next', 'trigger_coffee', 'wellsbadge', 'journals.pic_2.next', 'tomicrofiche', 'journals_flag.pic_0.bingo', 'plaque.face.date', 'notebook', 'tocloset_dirty', 'businesscards.card_bingo.bingo', 'businesscards.card_1.next', 'tunic.wildlife', 'tunic.hub.slip', 'tocage', 'journals.pic_2.bingo', 'tocollectionflag', 'tocollection', 'chap4_finale_c', 'chap2_finale_c', 'lockeddoor', 'journals_flag.hub.topics', 'tunic.capitol_0', 'reader_flag.paper2.bingo', 'photo', 'tunic.flaghouse', 'reader.paper1.next', 'directory.closeup.archivist', 'intro', 'businesscards.card_bingo.next', 'reader.paper2.bingo', 'retirement_letter', 'remove_cup', 'journals_flag.pic_0.next', 'magnify', 'coffee', 'key', 'togrampa', 'reader_flag.paper1.next', 'janitor', 'tohallway', 'chap1_finale', 'report', 'outtolunch', 'journals_flag.hub.topics_old', 'journals_flag.pic_1.next', 'reader.paper2.next', 'chap1_finale_c', 'reader_flag.paper2.next', 'door_block_talk', 'journals_flag.pic_1.bingo', 'journals_flag.pic_2.next', 'journals_flag.pic_2.bingo', 'block_magnify', 'reader.paper0.prev', 'block', 'reader_flag.paper0.prev', 'block_0', 'door_block_clean', 'reader.paper2.prev', 'reader.paper1.prev', 'doorblock', 
'tocloset', 'reader_flag.paper2.prev', 'reader_flag.paper1.prev', 'block_tomap2', 'journals_flag.pic_0_old.next', 'journals_flag.pic_1_old.next', 'block_tocollection', 'block_nelson', 'journals_flag.pic_2_old.next', 'block_tomap1', 'block_badge', 'need_glasses', 'block_badge_2', 'fox', 'block_1']

# Values of the `name` column; only referenced by a disabled experiment in groupby_apply.
name_features = [
    'basic',
    'undefined',
    'close',
    'open',
    'prev',
    'next',
]
# Event names; not referenced by the active feature code in this part of the notebook.
event_name_feature = [
    'cutscene_click', 'person_click', 'navigate_click', 'observation_click',
    'notification_click', 'object_click', 'object_hover', 'map_hover',
    'map_click', 'checkpoint', 'notebook_click',
]
# text_fqid values for which groupby_apply emits per-value elapsed_time_diff statistics.
text_lists = ['tunic.historicalsociety.cage.confrontation', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.entry.groupconvo', 'tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.drycleaner.frontdesk.worker.hub', 'tunic.historicalsociety.closet_dirty.gramps.news', 'tunic.humanecology.frontdesk.worker.intro', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'tunic.historicalsociety.basement.seescratches', 'tunic.historicalsociety.collection.cs', 'tunic.flaghouse.entry.flag_girl.hello', 'tunic.historicalsociety.collection.gramps.found', 'tunic.historicalsociety.basement.ch3start', 'tunic.historicalsociety.entry.groupconvo_flag', 'tunic.library.frontdesk.worker.hello', 'tunic.library.frontdesk.worker.wells', 'tunic.historicalsociety.collection_flag.gramps.flag', 'tunic.historicalsociety.basement.savedteddy', 'tunic.library.frontdesk.worker.nelson', 'tunic.wildlife.center.expert.removed_cup', 'tunic.library.frontdesk.worker.flag', 'tunic.historicalsociety.frontdesk.archivist.hello', 'tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'tunic.historicalsociety.entry.boss.flag', 'tunic.flaghouse.entry.flag_girl.symbol', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.drycleaner.frontdesk.worker.done', 'tunic.historicalsociety.closet_dirty.what_happened', 'tunic.wildlife.center.wells.animals', 'tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.historicalsociety.cage.teddy.trapped', 'tunic.historicalsociety.cage.unlockdoor', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.entry.wells.flag', 'tunic.humanecology.frontdesk.worker.badger', 'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'tunic.historicalsociety.closet.intro', 'tunic.historicalsociety.closet.retirement_letter.hub', 
'tunic.historicalsociety.entry.directory.closeup.archivist', 'tunic.historicalsociety.collection.tunic.slip', 'tunic.kohlcenter.halloffame.plaque.face.date', 'tunic.historicalsociety.closet_dirty.trigger_coffee', 'tunic.drycleaner.frontdesk.logbook.page.bingo', 'tunic.library.microfiche.reader.paper2.bingo', 'tunic.kohlcenter.halloffame.togrampa', 'tunic.capitol_2.hall.boss.haveyougotit', 'tunic.wildlife.center.wells.nodeer_recap', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.historicalsociety.closet_dirty.gramps.helpclean', 'tunic.wildlife.center.expert.recap', 'tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'tunic.historicalsociety.cage.lockeddoor', 'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'tunic.historicalsociety.collection.gramps.lost', 'tunic.historicalsociety.closet.notebook', 'tunic.historicalsociety.frontdesk.magnify', 'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'tunic.wildlife.center.remove_cup', 'tunic.library.frontdesk.wellsbadge.hub', 'tunic.wildlife.center.tracks.hub.deer', 'tunic.historicalsociety.frontdesk.key', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.flaghouse.entry.colorbook', 'tunic.wildlife.center.coffee', 'tunic.capitol_1.hall.boss.haveyougotit', 'tunic.historicalsociety.basement.janitor', 'tunic.historicalsociety.collection_flag.gramps.recap', 'tunic.wildlife.center.wells.animals2', 'tunic.flaghouse.entry.flag_girl.symbol_recap', 'tunic.historicalsociety.closet_dirty.photo', 'tunic.historicalsociety.stacks.outtolunch', 'tunic.library.frontdesk.worker.wells_recap', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'tunic.capitol_0.hall.boss.talktogramps', 'tunic.historicalsociety.closet.photo', 'tunic.historicalsociety.collection.tunic', 'tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'tunic.historicalsociety.closet_dirty.gramps.archivist', 
'tunic.historicalsociety.closet_dirty.door_block_talk', 'tunic.historicalsociety.entry.boss.flag_recap', 'tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'tunic.historicalsociety.entry.wells.talktogramps', 'tunic.historicalsociety.frontdesk.block_magnify', 'tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'tunic.historicalsociety.closet_dirty.gramps.nothing', 'tunic.historicalsociety.closet_dirty.door_block_clean', 'tunic.capitol_1.hall.boss.writeitup', 'tunic.library.frontdesk.worker.nelson_recap', 'tunic.library.frontdesk.worker.hello_short', 'tunic.historicalsociety.stacks.block', 'tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'tunic.historicalsociety.entry.boss.talktogramps', 'tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'tunic.historicalsociety.entry.wells.flag_recap', 'tunic.drycleaner.frontdesk.worker.done2', 'tunic.library.frontdesk.worker.flag_recap', 'tunic.humanecology.frontdesk.block_0', 'tunic.library.frontdesk.worker.preflag', 'tunic.historicalsociety.basement.gramps.seeyalater', 'tunic.flaghouse.entry.flag_girl.hello_recap', 'tunic.historicalsociety.closet.doorblock', 'tunic.drycleaner.frontdesk.worker.takealook', 'tunic.historicalsociety.basement.gramps.whatdo', 'tunic.library.frontdesk.worker.droppedbadge', 'tunic.historicalsociety.entry.block_tomap2', 'tunic.library.frontdesk.block_nelson', 'tunic.library.microfiche.block_0', 'tunic.historicalsociety.entry.block_tocollection', 'tunic.historicalsociety.entry.block_tomap1', 'tunic.historicalsociety.collection.gramps.look_0', 'tunic.library.frontdesk.block_badge', 'tunic.historicalsociety.cage.need_glasses', 'tunic.library.frontdesk.block_badge_2', 'tunic.kohlcenter.halloffame.block_0', 'tunic.capitol_0.hall.chap1_finale_c', 'tunic.capitol_1.hall.chap2_finale_c', 'tunic.capitol_2.hall.chap4_finale_c', 'tunic.wildlife.center.fox.concern', 'tunic.drycleaner.frontdesk.block_0', 'tunic.historicalsociety.entry.gramps.hub', 
'tunic.humanecology.frontdesk.block_1', 'tunic.drycleaner.frontdesk.block_1']
# The three lists below are only referenced by disabled experiments in groupby_apply.
# room_lists: room_fqid values.
room_lists = [
    'tunic.historicalsociety.entry',
    'tunic.wildlife.center',
    'tunic.historicalsociety.cage',
    'tunic.library.frontdesk',
    'tunic.historicalsociety.frontdesk',
    'tunic.historicalsociety.stacks',
    'tunic.historicalsociety.closet_dirty',
    'tunic.humanecology.frontdesk',
    'tunic.historicalsociety.basement',
    'tunic.kohlcenter.halloffame',
    'tunic.library.microfiche',
    'tunic.drycleaner.frontdesk',
    'tunic.historicalsociety.collection',
    'tunic.historicalsociety.closet',
    'tunic.flaghouse.entry',
    'tunic.historicalsociety.collection_flag',
    'tunic.capitol_1.hall',
    'tunic.capitol_0.hall',
    'tunic.capitol_2.hall',
]

# Level numbers 1..22 and the level-group labels produced by q2l.
LEVELS = list(range(1, 23))
level_groups = ["0-4", "5-12", "13-22"]

# Quantile levels stored for every elapsed_time_diff distribution, keyed by the
# digit used in the feature name (`..._q1` is the 0.1 quantile, and so on).
# The median (0.5) is intentionally absent, as in the original feature set.
_DIFF_QUANTILES = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4, 6: 0.6, 7: 0.7, 8: 0.8, 9: 0.9}


def _add_diff_quantiles(res, prefix, values):
    """Store the configured quantiles of `values` in `res` as `{prefix}_q<digit>`."""
    # One vectorised call instead of eight separate passes over the same data.
    quantile_values = np.quantile(values, list(_DIFF_QUANTILES.values()))
    for digit, value in zip(_DIFF_QUANTILES, quantile_values):
        res[f'{prefix}_q{digit}'] = value


def _add_masked_diff_stats(res, diffs, labels, names):
    """For each name, summarise the `diffs` entries whose `labels` entry equals that name."""
    for name in names:
        selected = diffs[labels == name]
        if len(selected) == 0:
            # No matching event in this group: every statistic falls back to 0.
            selected = [0]
        prefix = f'elapsed_time_diff_{name}'
        _add_diff_quantiles(res, prefix, selected)
        res[f'{prefix}_mean'] = np.mean(selected)
        res[f'{prefix}_max'] = np.max(selected)
        res[f'{prefix}_min'] = np.min(selected)
        res[f'{prefix}_std'] = np.std(selected)
        res[f'{prefix}_sum'] = np.sum(selected)


def groupby_apply(g):
    """Compute the feature vector for one group of event rows.

    Intended for `DataFrame.groupby(...).apply`. Reads the columns
    `elapsed_time`, `level`, `text_fqid`, `event_name`, `room_event`,
    `main_room`, `elapsed_time_diff` and `fqid` of `g`, plus the module-level
    lists `event_features`, `room_features`, `fqid_lists` and `text_lists`.

    Parameters
    ----------
    g : pandas.DataFrame
        The rows of a single group.

    Returns
    -------
    pandas.Series
        One entry per feature. The insertion order of the entries defines the
        column order of the resulting feature table, so it is kept stable:
        durations, null count, event/room counts, overall elapsed_time_diff
        statistics, then per-event, per-fqid and per-text_fqid statistics.

    Notes
    -----
    Kurtosis/skew statistics and per-room_fqid, per-level, per-level_group and
    per-name variants were previously experimented with here and are disabled.
    """
    res = {}

    # elapsed_time is divided by 1000 before taking spans
    # (presumably milliseconds -> seconds; confirm against the data dictionary).
    elapsed_scaled = g['elapsed_time'].values / 1000
    level = g['level'].values
    res['duration'] = elapsed_scaled.max() - elapsed_scaled.min()
    for i in range(0, 23):
        level_times = elapsed_scaled[level == i]
        # Levels that do not occur in this group get a zero duration.
        res[f'duration_level_{i}'] = (
            level_times.max() - level_times.min() if len(level_times) > 0 else 0
        )

    res['text_fqid_null'] = pd.isnull(g['text_fqid']).sum()

    # Occurrence counts: per event, per (event, main room) pair, per main room.
    event_sequence = g['event_name'].values
    event_counts = Counter(event_sequence)
    room_event_counts = Counter(g['room_event'].values)
    for event in event_features:
        res[f'{event}_sum'] = event_counts.get(event, 0)
        for room in room_features:
            res[f'{event}_{room}_sum'] = room_event_counts.get(f'{event}_{room}', 0)
    room_counts = Counter(g['main_room'].values)
    for room in room_features:
        res[f'{room}_sum'] = room_counts.get(room, 0)

    # Statistics of elapsed_time_diff over the whole group.
    diffs = g['elapsed_time_diff'].values
    _add_diff_quantiles(res, 'elapsed_time_diff', diffs)
    res['elapsed_time_diff_mean'] = np.mean(diffs)
    res['elapsed_time_diff_std'] = np.std(diffs)
    res['elapsed_time_diff_max'] = np.max(diffs)
    res['elapsed_time_diff_min'] = np.min(diffs)
    res['elapsed_time_diff_positive'] = len(diffs[diffs > 0])

    # The same statistics restricted to each event name, fqid and text_fqid.
    _add_masked_diff_stats(res, diffs, event_sequence, event_features)
    _add_masked_diff_stats(res, diffs, g['fqid'].values, fqid_lists)
    _add_masked_diff_stats(res, diffs, g['text_fqid'].values, text_lists)

    return pd.Series(res)

In [6]:
def feature_engineering(df, meta):
    """Build per-(session, level_group) features and attach them to the label rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log. Two helper columns, `main_room` and `room_event`, are added
        to it in place.
    meta : pandas.DataFrame
        Label table with `session`, `level_group` and `question` columns.

    Returns
    -------
    pandas.DataFrame
        `meta` left-joined with the output of `groupby_apply`, with `question`
        and `level_group` cast to category and boolean indicator columns
        `q1` .. `q18` appended.
    """
    join_keys = ['session', 'level_group']

    # main_room is the second dot-separated component of room_fqid.
    room_parts = df['room_fqid'].str.split('.')
    df['main_room'] = room_parts.str[1]
    df['room_event'] = df['event_name'] + '_' + df['main_room']

    session_features = df.groupby(join_keys).apply(groupby_apply).reset_index()

    X = meta.merge(session_features, how='left', on=join_keys)
    for categorical in ('question', 'level_group'):
        X[categorical] = X[categorical].astype('category')
    # One indicator column per question number.
    for question_no in range(1, 19):
        X[f'q{question_no}'] = X['question'] == question_no
    return X

In [7]:
# Scratch check of scipy's kurtosis on a tiny sample (evaluates to -1.5);
# the kurtosis/skew features in groupby_apply are currently disabled.
stats.kurtosis([1,2,3])

-1.5

In [8]:
# Event log: every column except fullscreen, hq and music; the id column is renamed to `session`.
train_df = pd.read_csv(
    Config.TRAIN_PATH,
    usecols=lambda column: column not in {'fullscreen', 'hq', 'music'},
)
train_df.rename(columns={'session_id': 'session'}, inplace=True)

# Labels: derive the question number (text after the last 'q'), the numeric
# session (text before the first '_') and the level group of each question.
train_labels = pd.read_csv(Config.TRAIN_LABELS)
label_ids = train_labels['session_id']
train_labels['question'] = label_ids.str.split('q').str[-1].astype('int')
train_labels['session'] = label_ids.str.split('_').str[0].astype('int64')
train_labels['level_group'] = train_labels['question'].apply(q2l)

In [9]:
# Change in elapsed_time from the previous row of the same (session, level).
# The first row of each group has no predecessor and is set to 0.
# fillna is applied to the result before assignment: the former
# `train_df[col].fillna(0, inplace=True)` acted on a column selection, which
# under pandas Copy-on-Write leaves train_df unchanged and the NaNs in place.
train_df['elapsed_time_diff'] = (
    train_df.groupby(['session', 'level'])['elapsed_time']
    .diff()
    .fillna(0)
)

In [10]:
def time_feature(train):
    """Add date/time columns sliced out of the leading digits of `session_id`.

    Successive two-character slices of `str(session_id)` become `year`,
    `month` (slice value plus one), `day`, `hour`, `minute` and `second`,
    each stored as uint8. The frame is modified in place and also returned.
    """
    # (column name, start offset of the two-character slice, value added to it)
    field_specs = (
        ("year", 0, 0),
        ("month", 2, 1),
        ("day", 4, 0),
        ("hour", 6, 0),
        ("minute", 8, 0),
        ("second", 10, 0),
    )
    for column, start, shift in field_specs:
        parsed = train["session_id"].apply(
            lambda sid, start=start, shift=shift: int(str(sid)[start:start + 2]) + shift
        )
        train[column] = parsed.astype(np.uint8)
    return train

In [None]:
%%time
# Build the modelling table: one row per label row of train_labels, carrying the
# features of its (session, level_group), then append the session_id time columns.
X = feature_engineering(train_df, train_labels)
X = time_feature(X)

  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_event)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_fqid)
  res[f'elapsed_time_diff_{col}_skew'] = stats.skew(elapsed_time_diff_text)
  res[f'ela

In [None]:
# Everything after the first five columns is used as a model input.
# The merge in feature_engineering puts the train_labels columns first; this
# assumes that table has exactly five columns at that point (presumably
# session_id, correct, question, session, level_group) -- verify if the label
# file changes.
FEATURES = X.columns[5:]
len(FEATURES)

In [None]:
# FEATURES = feats.sort_values('value',ascending=False)['feat'][:400].values

In [None]:
n_splits=5
gkf = GroupKFold(n_splits=n_splits)
# Out-of-fold predicted probability of the positive class, one per row of X.
oof = np.zeros(X.shape[0])
# Fitted classifier per fold index.
models = {}

# Hyper-parameters are identical for every fold, so they are defined once.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.015,
    'max_depth': 5,
    'n_estimators': 3000,
    'early_stopping_rounds': 50,
    'subsample':0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'gpu_hist',
    'seed':42,
    'use_label_encoder' : False}

# COMPUTE CV SCORE WITH 5 GROUP K FOLD
# Grouping by session keeps all rows of a session in the same fold.
for i, (train_index, valid_index) in enumerate(gkf.split(X, groups=X['session'])):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # Cast once and reuse the same float32 frames for fitting, evaluation and
    # prediction, so training and inference see identically typed inputs.
    X_train = X.iloc[train_index][FEATURES].astype('float32')
    X_valid = X.iloc[valid_index][FEATURES].astype('float32')
    y_train = X.iloc[train_index]['correct'].values
    y_valid = X.iloc[valid_index]['correct'].values

    # TRAIN MODEL
    clf = XGBClassifier(**xgb_params)
    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            verbose=100)
    # Number of boosting rounds kept by early stopping. `best_ntree_limit`
    # was removed in XGBoost 2.0; `best_iteration + 1` is the same value.
    print(f'({clf.best_iteration + 1}), ',end='')

    # SAVE MODEL, PREDICT VALID OOF
    models[i] = clf
    oof[valid_index] = clf.predict_proba(X_valid)[:,1]

    print()

In [None]:

def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of the 50 largest feature importances.

    Parameters
    ----------
    importance : sequence of numbers
        Importance value per feature.
    names : sequence of str
        Feature names, aligned with `importance`.
    model_type : str
        Text placed in front of 'FEATURE IMPORTANCE' in the chart title.
    """
    # Pair names with importances, then keep the 50 highest-ranked rows.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })
    top_features = fi_df.sort_values(by=['feature_importance'], ascending=False).head(50)

    plt.figure(figsize=(20,10))
    sns.barplot(x=top_features['feature_importance'], y=top_features['feature_names'])
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
# Add up each feature's importance over all fold models, then chart the totals.
feat_imp = {}
for fold_model in models.values():
    fold_pairs = zip(fold_model.feature_names_in_, fold_model.feature_importances_)
    for feature_name, fold_importance in fold_pairs:
        if feature_name in feat_imp:
            feat_imp[feature_name] += fold_importance
        else:
            feat_imp[feature_name] = fold_importance
plot_feature_importance(list(feat_imp.values()),list(feat_imp.keys()),'')

In [None]:
# Tabular view of the summed importances: one row per feature
# (`value` = importance summed over the fold models, `feat` = feature name).
feats = pd.DataFrame({"value":list(feat_imp.values()),
                      "feat":list(feat_imp.keys())})

In [None]:
# Names of the 400 features with the largest summed importance, most important first.
feats.sort_values('value',ascending=False)['feat'][:400].values

In [None]:
# Sweep decision thresholds over the out-of-fold probabilities and keep the
# one with the highest macro F1 (the first one wins on ties).
scores, thresholds = [], []
best_score, best_threshold = 0, 0

for threshold in np.arange(0.4,0.9,0.01):
    print(f'{threshold:.02f}, ',end='')
    preds = (oof>threshold).astype('int')
    m = f1_score(X['correct'], preds, average='macro')
    thresholds.append(threshold)
    scores.append(m)
    if m>best_score:
        best_score, best_threshold = m, threshold

In [None]:
import matplotlib.pyplot as plt

# PLOT THRESHOLD VS. F1_SCORE
# Line of macro F1 per threshold, with the best point drawn as a large marker.
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(thresholds,scores,'-o',color='blue')
ax.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1)
ax.set_xlabel('Threshold',size=14)
ax.set_ylabel('Validation F1 Score',size=14)
ax.set_title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.4f} at Best Threshold = {best_threshold:.3}',size=18)
plt.show()