In [None]:
# Mount Google Drive into the Colab filesystem; later cells read the
# competition archive from, and write model artefacts to, paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the competition archive into the current working directory.
# NOTE(review): later cells read "/content/train.csv" and "./train_labels.csv",
# so this presumably has to run with /content as the working directory - confirm.
!unzip -q '/content/drive/MyDrive/predict student games/data/predict-student-performance-from-game-play.zip'

In [None]:
import gc
import os
import pickle
import warnings
from collections import defaultdict
from itertools import combinations

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
# !pip uninstall catboost
# !pip install -q catboost==1.1.1
# from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import GroupKFold, KFold, train_test_split,StratifiedKFold
from xgboost import XGBClassifier



In [None]:
# Categorical event columns; feature_engineer counts the distinct non-null
# values of each one per session.
CATS = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# `event_name` values that receive their own count / elapsed-time aggregates.
event_name_feature = [
    'cutscene_click',
    'person_click',
    'navigate_click',
    'observation_click',
    'notification_click',
    'object_click',
    'object_hover',
    'map_hover',
    'map_click',
    'checkpoint',
    'notebook_click',
]
# `text_fqid` values that receive their own count / elapsed-time aggregates
# (and word x text_fqid cross features) in feature_engineer.
text_lists = ['tunic.historicalsociety.cage.confrontation', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.entry.groupconvo', 'tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.drycleaner.frontdesk.worker.hub', 'tunic.historicalsociety.closet_dirty.gramps.news', 'tunic.humanecology.frontdesk.worker.intro', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'tunic.historicalsociety.basement.seescratches', 'tunic.historicalsociety.collection.cs', 'tunic.flaghouse.entry.flag_girl.hello', 'tunic.historicalsociety.collection.gramps.found', 'tunic.historicalsociety.basement.ch3start', 'tunic.historicalsociety.entry.groupconvo_flag', 'tunic.library.frontdesk.worker.hello', 'tunic.library.frontdesk.worker.wells', 'tunic.historicalsociety.collection_flag.gramps.flag', 'tunic.historicalsociety.basement.savedteddy', 'tunic.library.frontdesk.worker.nelson', 'tunic.wildlife.center.expert.removed_cup', 'tunic.library.frontdesk.worker.flag', 'tunic.historicalsociety.frontdesk.archivist.hello', 'tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'tunic.historicalsociety.entry.boss.flag', 'tunic.flaghouse.entry.flag_girl.symbol', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.drycleaner.frontdesk.worker.done', 'tunic.historicalsociety.closet_dirty.what_happened', 'tunic.wildlife.center.wells.animals', 'tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.historicalsociety.cage.teddy.trapped', 'tunic.historicalsociety.cage.unlockdoor', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.entry.wells.flag', 'tunic.humanecology.frontdesk.worker.badger', 'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'tunic.historicalsociety.closet.intro', 'tunic.historicalsociety.closet.retirement_letter.hub', 
'tunic.historicalsociety.entry.directory.closeup.archivist', 'tunic.historicalsociety.collection.tunic.slip', 'tunic.kohlcenter.halloffame.plaque.face.date', 'tunic.historicalsociety.closet_dirty.trigger_coffee', 'tunic.drycleaner.frontdesk.logbook.page.bingo', 'tunic.library.microfiche.reader.paper2.bingo', 'tunic.kohlcenter.halloffame.togrampa', 'tunic.capitol_2.hall.boss.haveyougotit', 'tunic.wildlife.center.wells.nodeer_recap', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.historicalsociety.closet_dirty.gramps.helpclean', 'tunic.wildlife.center.expert.recap', 'tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'tunic.historicalsociety.cage.lockeddoor', 'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'tunic.historicalsociety.collection.gramps.lost', 'tunic.historicalsociety.closet.notebook', 'tunic.historicalsociety.frontdesk.magnify', 'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'tunic.wildlife.center.remove_cup', 'tunic.library.frontdesk.wellsbadge.hub', 'tunic.wildlife.center.tracks.hub.deer', 'tunic.historicalsociety.frontdesk.key', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.flaghouse.entry.colorbook', 'tunic.wildlife.center.coffee', 'tunic.capitol_1.hall.boss.haveyougotit', 'tunic.historicalsociety.basement.janitor', 'tunic.historicalsociety.collection_flag.gramps.recap', 'tunic.wildlife.center.wells.animals2', 'tunic.flaghouse.entry.flag_girl.symbol_recap', 'tunic.historicalsociety.closet_dirty.photo', 'tunic.historicalsociety.stacks.outtolunch', 'tunic.library.frontdesk.worker.wells_recap', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'tunic.capitol_0.hall.boss.talktogramps', 'tunic.historicalsociety.closet.photo', 'tunic.historicalsociety.collection.tunic', 'tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'tunic.historicalsociety.closet_dirty.gramps.archivist', 
'tunic.historicalsociety.closet_dirty.door_block_talk', 'tunic.historicalsociety.entry.boss.flag_recap', 'tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'tunic.historicalsociety.entry.wells.talktogramps', 'tunic.historicalsociety.frontdesk.block_magnify', 'tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'tunic.historicalsociety.closet_dirty.gramps.nothing', 'tunic.historicalsociety.closet_dirty.door_block_clean', 'tunic.capitol_1.hall.boss.writeitup', 'tunic.library.frontdesk.worker.nelson_recap', 'tunic.library.frontdesk.worker.hello_short', 'tunic.historicalsociety.stacks.block', 'tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'tunic.historicalsociety.entry.boss.talktogramps', 'tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'tunic.historicalsociety.entry.wells.flag_recap', 'tunic.drycleaner.frontdesk.worker.done2', 'tunic.library.frontdesk.worker.flag_recap', 'tunic.humanecology.frontdesk.block_0', 'tunic.library.frontdesk.worker.preflag', 'tunic.historicalsociety.basement.gramps.seeyalater', 'tunic.flaghouse.entry.flag_girl.hello_recap', 'tunic.historicalsociety.closet.doorblock', 'tunic.drycleaner.frontdesk.worker.takealook', 'tunic.historicalsociety.basement.gramps.whatdo', 'tunic.library.frontdesk.worker.droppedbadge', 'tunic.historicalsociety.entry.block_tomap2', 'tunic.library.frontdesk.block_nelson', 'tunic.library.microfiche.block_0', 'tunic.historicalsociety.entry.block_tocollection', 'tunic.historicalsociety.entry.block_tomap1', 'tunic.historicalsociety.collection.gramps.look_0', 'tunic.library.frontdesk.block_badge', 'tunic.historicalsociety.cage.need_glasses', 'tunic.library.frontdesk.block_badge_2', 'tunic.kohlcenter.halloffame.block_0', 'tunic.capitol_0.hall.chap1_finale_c', 'tunic.capitol_1.hall.chap2_finale_c', 'tunic.capitol_2.hall.chap4_finale_c', 'tunic.wildlife.center.fox.concern', 'tunic.drycleaner.frontdesk.block_0', 'tunic.historicalsociety.entry.gramps.hub', 
'tunic.humanecology.frontdesk.block_1', 'tunic.drycleaner.frontdesk.block_1']
# `room_fqid` values that receive their own count / elapsed-time aggregates
# (and event x name x room and word x room cross features) in feature_engineer.
room_lists = ['tunic.historicalsociety.entry', 'tunic.wildlife.center', 'tunic.historicalsociety.cage', 'tunic.library.frontdesk', 'tunic.historicalsociety.frontdesk', 'tunic.historicalsociety.stacks', 'tunic.historicalsociety.closet_dirty', 'tunic.humanecology.frontdesk', 'tunic.historicalsociety.basement', 'tunic.kohlcenter.halloffame', 'tunic.library.microfiche', 'tunic.drycleaner.frontdesk', 'tunic.historicalsociety.collection', 'tunic.historicalsociety.closet', 'tunic.flaghouse.entry', 'tunic.historicalsociety.collection_flag', 'tunic.capitol_1.hall', 'tunic.capitol_0.hall', 'tunic.capitol_2.hall']
# `fqid` values that receive their own count / elapsed-time aggregates in
# feature_engineer.
fqid_lists = ['worker', 'archivist', 'gramps', 'wells', 'toentry', 'confrontation', 'crane_ranger', 'groupconvo', 'flag_girl', 'tomap', 'tostacks', 'tobasement', 'archivist_glasses', 'boss', 'journals', 'seescratches', 'groupconvo_flag', 'cs', 'teddy', 'expert', 'businesscards', 'ch3start', 'tunic.historicalsociety', 'tofrontdesk', 'savedteddy', 'plaque', 'glasses', 'tunic.drycleaner', 'reader_flag', 'tunic.library', 'tracks', 'tunic.capitol_2', 'trigger_scarf', 'reader', 'directory', 'tunic.capitol_1', 'journals.pic_0.next', 'unlockdoor', 'tunic', 'what_happened', 'tunic.kohlcenter', 'tunic.humanecology', 'colorbook', 'logbook', 'businesscards.card_0.next', 'journals.hub.topics', 'logbook.page.bingo', 'journals.pic_1.next', 'journals_flag', 'reader.paper0.next', 'tracks.hub.deer', 'reader_flag.paper0.next', 'trigger_coffee', 'wellsbadge', 'journals.pic_2.next', 'tomicrofiche', 'journals_flag.pic_0.bingo', 'plaque.face.date', 'notebook', 'tocloset_dirty', 'businesscards.card_bingo.bingo', 'businesscards.card_1.next', 'tunic.wildlife', 'tunic.hub.slip', 'tocage', 'journals.pic_2.bingo', 'tocollectionflag', 'tocollection', 'chap4_finale_c', 'chap2_finale_c', 'lockeddoor', 'journals_flag.hub.topics', 'tunic.capitol_0', 'reader_flag.paper2.bingo', 'photo', 'tunic.flaghouse', 'reader.paper1.next', 'directory.closeup.archivist', 'intro', 'businesscards.card_bingo.next', 'reader.paper2.bingo', 'retirement_letter', 'remove_cup', 'journals_flag.pic_0.next', 'magnify', 'coffee', 'key', 'togrampa', 'reader_flag.paper1.next', 'janitor', 'tohallway', 'chap1_finale', 'report', 'outtolunch', 'journals_flag.hub.topics_old', 'journals_flag.pic_1.next', 'reader.paper2.next', 'chap1_finale_c', 'reader_flag.paper2.next', 'door_block_talk', 'journals_flag.pic_1.bingo', 'journals_flag.pic_2.next', 'journals_flag.pic_2.bingo', 'block_magnify', 'reader.paper0.prev', 'block', 'reader_flag.paper0.prev', 'block_0', 'door_block_clean', 'reader.paper2.prev', 'reader.paper1.prev', 'doorblock', 
'tocloset', 'reader_flag.paper2.prev', 'reader_flag.paper1.prev', 'block_tomap2', 'journals_flag.pic_0_old.next', 'journals_flag.pic_1_old.next', 'block_tocollection', 'block_nelson', 'journals_flag.pic_2_old.next', 'block_tomap1', 'block_badge', 'need_glasses', 'block_badge_2', 'fox', 'block_1']
# Substrings searched for in the `text` column (via `str.contains`, so the
# match is case-sensitive and also hits inside longer words).
DIALOGS = [
    'that', 'this', 'it', 'you',
    'find', 'found', 'Found',
    'notebook', 'Wells', 'wells',
    'help', 'need',
    'Oh', 'Ooh', 'Jo',
    'flag',
    'can', 'and', 'is', 'the', 'to',
]

# `name` values that receive their own count / elapsed-time aggregates.
name_feature = ['basic', 'undefined', 'close', 'open', 'prev', 'next']

# Levels with per-level aggregates: 1 through 22 inclusive.
# NOTE(review): level 0 is not in this list although a "0-4" level group
# exists - confirm that is intentional.
LEVELS = list(range(1, 23))
level_groups = ["0-4", "5-12", "13-22"]

# Numeric columns summarised with mean / std / min / max / median per session.
NUMS = [
    'page',
    'room_coor_x', 'room_coor_y',
    'screen_coor_x', 'screen_coor_y',
    'hover_duration',
    'elapsed_time_diff',
]

In [None]:
# Row-level preprocessing expressions, applied with `with_columns` before the
# per-session aggregation. Every difference is windowed over
# (session_id, level), so `shift(1)` never reaches across a level boundary.
columns = [
    pl.col("page").cast(pl.Float32),
    # Time since the previous event in the same window: the first event of a
    # window gets 0, and the gap is clipped to the range [0, 1e9].
    (pl.col("elapsed_time") - pl.col("elapsed_time").shift(1))
    .fill_null(0)
    .clip(0, 1e9)
    .over(["session_id", "level"])
    .alias("elapsed_time_diff"),
    # Absolute step between consecutive screen coordinates.
    # NOTE(review): no alias is given, so these presumably overwrite the raw
    # screen_coor_x / screen_coor_y columns - confirm that is intended.
    *[
        (pl.col(axis) - pl.col(axis).shift(1)).abs().over(["session_id", "level"])
        for axis in ("screen_coor_x", "screen_coor_y")
    ],
    # Give missing identifiers an explicit placeholder value.
    pl.col("fqid").fill_null("fqid_None"),
    pl.col("text_fqid").fill_null("text_fqid_None"),
]

In [None]:
def _time_stats_by_value(column, values, count_label, feature_suffix):
    """Build count and elapsed-time aggregates for each listed value of `column`.

    For every value `v` the result holds, in this order across the whole list:
    the row count (`{v}_{count_label}{feature_suffix}`) followed by the std,
    mean, sum, median and max of `elapsed_time_diff` over rows where
    `column == v` (`{v}_ET_{stat}_{feature_suffix}`).
    """
    exprs = [
        pl.col(column).filter(pl.col(column) == v).count().alias(f"{v}_{count_label}{feature_suffix}")
        for v in values
    ]
    for stat in ("std", "mean", "sum", "median", "max"):
        exprs.extend(
            getattr(pl.col("elapsed_time_diff").filter(pl.col(column) == v), stat)()
            .alias(f"{v}_ET_{stat}_{feature_suffix}")
            for v in values
        )
    return exprs


def _span_between(column, condition, alias):
    """Return max - min of `column` over rows matching `condition` (0 if none match)."""
    return (
        pl.col(column)
        .filter(condition)
        .apply(lambda s: s.max() - s.min() if s.len() > 0 else 0)
        .alias(alias)
    )


def feature_engineer(x, grp, use_extra, feature_suffix):
    """Aggregate an event-level polars frame into one feature row per session.

    Parameters
    ----------
    x : polars DataFrame of events; must already contain `elapsed_time_diff`
        (see the `columns` preprocessing expressions).
    grp : level-group label; only '5-12' and '13-22' trigger extra features.
    use_extra : when true, add the task-span features for `grp`.
    feature_suffix : text appended to most feature names. The `word_*` and
        `*_time_sum` features do not include it.

    Returns
    -------
    pandas DataFrame sorted by `session_id`, one row per session.
    """
    et = pl.col("elapsed_time_diff")
    text = pl.col("text")

    aggs = [
        pl.col("index").count().alias(f"session_number_{feature_suffix}"),

        # Dialogue keywords: number of matching rows, then time statistics.
        *[pl.col("index").filter(text.str.contains(w)).count().alias(f"word_{w}") for w in DIALOGS],
        *[
            getattr(et.filter(text.str.contains(w)), stat)().alias(f"word_{stat}_{w}")
            for stat in ("mean", "std", "max", "sum", "median")
            for w in DIALOGS
        ],

        *[pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}") for c in CATS],

        *[
            getattr(pl.col(c), stat)().alias(f"{c}_{stat}_{feature_suffix}")
            for stat in ("mean", "std", "min", "max", "median")
            for c in NUMS
        ],

        *_time_stats_by_value("fqid", fqid_lists, "fqid_counts", feature_suffix),
        *_time_stats_by_value("text_fqid", text_lists, "text_fqid_counts", feature_suffix),
        *_time_stats_by_value("room_fqid", room_lists, "room_fqid_counts", feature_suffix),
        *_time_stats_by_value("event_name", event_name_feature, "event_name_counts", feature_suffix),
        *_time_stats_by_value("name", name_feature, "name_counts", feature_suffix),
        *_time_stats_by_value("level", LEVELS, "LEVEL_count", feature_suffix),
        *_time_stats_by_value("level_group", level_groups, "LEVEL_group_count", feature_suffix),

        # Cross features: total time for each combination.
        *[
            et.filter((pl.col("event_name") == c) & (pl.col("name") == d) & (pl.col("room_fqid") == e))
            .sum().alias(f"{c}_{d}_{e}_time_sum")
            for c in event_name_feature for d in name_feature for e in room_lists
        ],
        *[
            et.filter((pl.col("event_name") == c) & (pl.col("name") == d) & (pl.col("level") == e))
            .sum().alias(f"{c}_{d}_{e}_time_sum")
            for c in event_name_feature for d in name_feature for e in LEVELS
        ],
        *[
            et.filter(text.str.contains(d) & (pl.col("room_fqid") == e)).sum().alias(f"{d}_{e}_time_sum")
            for d in DIALOGS for e in room_lists
        ],
        *[
            et.filter(text.str.contains(d) & (pl.col("text_fqid") == e)).sum().alias(f"{d}_{e}_time_sum")
            for d in DIALOGS for e in text_lists
        ],
    ]

    df = x.groupby(['session_id'], maintain_order=True).agg(aggs).sort("session_id")

    # Task-span features: elapsed time and index distance between the first
    # and last row matching a task's start/finish condition.
    nav_click = pl.col("event_name") == 'navigate_click'
    fqid = pl.col("fqid")
    spans = []
    if use_extra and grp == '5-12':
        spans = [
            ((text == "Here's the log book.") | (fqid == 'logbook.page.bingo'), "logbook_bingo"),
            ((nav_click & (fqid == 'reader')) | (fqid == "reader.paper2.bingo"), "reader_bingo"),
            ((nav_click & (fqid == 'journals')) | (fqid == "journals.pic_2.bingo"), "journals_bingo"),
        ]
    elif use_extra and grp == '13-22':
        spans = [
            # The index-count variant used to compare `fqid` with this full
            # text_fqid path, which can never match; both features of the pair
            # now share the condition already used for the duration.
            ((nav_click & (fqid == 'reader_flag'))
             | (pl.col("text_fqid") == "tunic.library.microfiche.reader_flag.paper2.bingo"), "reader_flag"),
            ((nav_click & (fqid == 'journals_flag')) | (fqid == "journals_flag.pic_0.bingo"),
             "journalsFlag_bingo"),
        ]

    if spans:
        extra_aggs = []
        for condition, prefix in spans:
            extra_aggs.append(_span_between("elapsed_time", condition, f"{prefix}_duration"))
            extra_aggs.append(_span_between("index", condition, f"{prefix}_indexCount"))
        tmp = x.groupby(["session_id"], maintain_order=True).agg(extra_aggs).sort("session_id")
        df = df.join(tmp, on="session_id", how='left')

    return df.to_pandas()

In [None]:
def time_feature(train):
    """Add date/time columns decoded from the digits of `session_id`.

    The id is read as text and sliced two characters at a time into
    year, month, day, hour, minute and second, each stored as uint8.
    The month slice has 1 added to it. `train` is modified in place and
    also returned.
    """
    # (column name, slice start, slice stop, offset added to the parsed value)
    layout = (
        ("year", 0, 2, 0),
        ("month", 2, 4, 1),
        ("day", 4, 6, 0),
        ("hour", 6, 8, 0),
        ("minute", 8, 10, 0),
        ("second", 10, 12, 0),
    )
    session_ids = train["session_id"]
    for column, start, stop, offset in layout:
        def decode(session_id, start=start, stop=stop, offset=offset):
            return int(str(session_id)[start:stop]) + offset

        train[column] = session_ids.apply(decode).astype(np.uint8)
    return train

In [None]:
%%time
# we prepare the dataset for the training by level :
df = pl.read_csv("/content/train.csv").drop(["fullscreen", "hq", "music"])
df1 = df.filter(pl.col("level_group")=='0-4').sort(by=['session_id','index']).with_columns(columns)
df2 = df.filter((pl.col("level_group")=='0-4') | (pl.col("level_group")=='5-12')).sort(by=['session_id','elapsed_time']).with_columns(columns)
df3 = df.filter((pl.col("level_group")=='0-4') | (pl.col("level_group")=='5-12') | (pl.col("level_group")=='13-22')).sort(by=['session_id','elapsed_time']).with_columns(columns)
df1.shape,df2.shape,df3.shape


df1 = feature_engineer(df1, grp='0-4', use_extra=True, feature_suffix='')
print('df1 done',df1.shape)
df2 = feature_engineer(df2, grp='5-12', use_extra=True, feature_suffix='')
print('df2 done',df2.shape)
df3 = feature_engineer(df3, grp='13-22', use_extra=True, feature_suffix='')
print('df3 done',df3.shape)

df1 done (23562, 7803)
df2 done (23562, 7809)
df3 done (23562, 7807)
CPU times: user 2h 30min 48s, sys: 2min 25s, total: 2h 33min 14s
Wall time: 20min 25s


In [None]:
# some cleaning...
def find_droppable_columns(frame, null_threshold=0.9, id_column='session_id'):
    """Find the columns of `frame` that carry (almost) no information.

    Returns a tuple `(null_share, mostly_null, constant)`:
    - null_share: per-column fraction of null rows, sorted descending;
    - mostly_null: columns whose null fraction exceeds `null_threshold`;
    - constant: columns with exactly one distinct non-null value, in column
      order and never including `id_column`.
    """
    # Divide by this frame's own length (the earlier code always used len(df1)).
    null_share = frame.isnull().sum().sort_values(ascending=False) / len(frame)
    mostly_null = list(null_share[null_share > null_threshold].index)
    # One vectorised pass instead of a Python loop over thousands of columns.
    distinct_counts = frame.nunique()
    constant = [c for c in distinct_counts[distinct_counts == 1].index if c != id_column]
    return null_share, mostly_null, constant


null1, drop1, constant1 = find_droppable_columns(df1)
null2, drop2, constant2 = find_droppable_columns(df2)
null3, drop3, constant3 = find_droppable_columns(df3)
# Reported before the constant columns are appended, as before.
print(len(drop1), len(drop2), len(drop3))

drop1 += constant1
print("*********df1 DONE*********")
drop2 += constant2
print("*********df2 DONE*********")
drop3 += constant3
print("*********df3 DONE*********")



6744 5860 4885
*********df1 DONE*********
*********df2 DONE*********
*********df3 DONE*********


In [None]:
# Add the session_id-derived date/time columns to every feature table.
df1, df2, df3 = (time_feature(frame) for frame in (df1, df2, df3))

In [None]:
# Index every table by session so the remaining columns are all features.
df1 = df1.set_index('session_id')
df2 = df2.set_index('session_id')
df3 = df3.set_index('session_id')


def _feature_columns(frame, dropped):
    """Return the columns of `frame`, in order, that are not in `dropped` and not 'level_group'."""
    # A set gives constant-time membership tests; the earlier expression
    # rebuilt and scanned a list of thousands of names for every column.
    excluded = set(dropped) | {'level_group'}
    return [c for c in frame.columns if c not in excluded]


FEATURES1 = _feature_columns(df1, drop1)
FEATURES2 = _feature_columns(df2, drop2)
FEATURES3 = _feature_columns(df3, drop3)
df1 , df2 , df3 = df1[FEATURES1],df2[FEATURES2],df3[FEATURES3]
print('We will train with', len(FEATURES1), len(FEATURES2), len(FEATURES3) ,'features')
ALL_USERS = df1.index.unique()
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 836 1836 2923 features
We will train with 23562 users info


In [None]:
# With previous training notebook (Kfold with 20 folds as performed in others notebooks) :
# One tree count (1000) per entry, 20 entries.
# NOTE(review): the only reference to this list in the cells visible here is
# a commented-out `cat_params['n_estimators']` line - it may be a leftover.
estimators_lgb = [1000] * 20

In [None]:
warnings.filterwarnings("ignore")

# Load the labels and split each label id on '_': the text before the first
# '_' becomes the integer `session`, and the text after the last '_', minus
# its first character, becomes the integer question number `q`.
targets = pd.read_csv('./train_labels.csv')
targets['session'] = [int(label_id.split('_')[0]) for label_id in targets['session_id']]
targets['q'] = [int(label_id.split('_')[-1][1:]) for label_id in targets['session_id']]

In [None]:
def f1_metric(train_data,preds,thresh=0.61):
    """Macro F1 as a `(name, value, is_higher_better)` tuple.

    `train_data` holds the true labels and `preds` the predicted
    probabilities. Thresholding is delegated to `f1_score_metric`, so both
    helpers always share one rule and one default threshold.
    """
    return 'f1', f1_score_metric(train_data, preds, thresh), True


def f1_score_metric(labels,preds,thresh=0.61):
    """Macro F1 of `labels` against `preds` binarised at `thresh`.

    A prediction counts as positive only when it is strictly greater than
    `thresh`.
    """
    # Vectorised comparison in float64 replaces a per-element Python loop.
    hard_preds = (np.asarray(preds, dtype=float) > thresh).astype(int)
    return f1_score(labels, hard_preds, average='macro')

In [None]:
!mkdir /content/drive/MyDrive/predict student games/weight_ensemble/weight_ensemble_v1

mkdir: cannot create directory ‘/content/drive/MyDrive/predict’: File exists
mkdir: cannot create directory ‘games/weight_ensemble/weight_ensemble_v1’: No such file or directory


In [None]:
# Record which feature list belongs to each question: questions 1-3 use the
# "0-4" table, 4-13 the "5-12" table and 14-18 the "13-22" table. This is
# built without rebinding the notebook-level `df` / `FEATURES` names.
important_features = {
    q: FEATURES1 if q <= 3 else FEATURES2 if q <= 13 else FEATURES3
    for q in range(1, 19)
}

joblib.dump(important_features,'/content/drive/MyDrive/predict student games/weight_ensemble/weight_ensemble_v1/important_features.pkl')

['/content/drive/MyDrive/predict student games/weight_ensemble/weight_ensemble_v1/important_features.pkl']

In [None]:
# Cross-validated XGBoost training on the full feature lists: one model per
# (question, fold). Each fold model is pickled to Drive, and out-of-fold
# predictions are collected per question in `oof_dict`.
n_splits = 5
gkf = StratifiedKFold(n_splits=n_splits,random_state=19990829,shuffle=True)

# Create the output folder up front so the first joblib.dump cannot fail on a
# missing directory.
model_dir = '/content/drive/MyDrive/predict student games/weight/xgb_weight_new_v4'
os.makedirs(model_dir, exist_ok=True)

# The hyper-parameters are the same for every question, so build them once.
xgb_params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist', #Change4
    'eval_metric':'logloss',
    'learning_rate': 0.02,
    'alpha': 8,
    'max_depth': 4,
    'n_estimators': 9999,
    'early_stopping_rounds': 90,
    'subsample':0.8,
    'colsample_bytree': 0.5,
    'seed': 42
}

oof_dict = {}
val_score = {}

for q in range(1, 19):
    # Questions 1-3 use the "0-4" table, 4-13 the "5-12" table and 14-18 the
    # "13-22" table.
    if q <= 3:
        df, FEATURES = df1, FEATURES1
    elif q <= 13:
        df, FEATURES = df2, FEATURES2
    else:
        df, FEATURES = df3, FEATURES3

    df = df.reset_index()
    # Float placeholder, so writing probabilities later needs no dtype change.
    df['pred'] = -1.0
    # The inner merge keeps only sessions that have a label for this question
    # and yields a fresh 0..n-1 index, so the positional fold indices are
    # valid `.loc` labels.
    df = df.merge(targets.loc[targets.q == q , ['session','correct']] , left_on=['session_id'],right_on=['session'])
    sample_label_rate = len(df.loc[df['correct']==1]) / len(df)

    print(f'Train model for question {q} Data length {len(df)} Sample Label rate {sample_label_rate}')
    val_score[q] = []
    for fold, (train_idx, val_idx) in enumerate(gkf.split(df,df['correct'])):
        train_x = df.loc[train_idx, FEATURES].astype('float32')
        train_y = df.loc[train_idx,'correct']
        val_x = df.loc[val_idx, FEATURES].astype('float32')
        val_y = df.loc[val_idx,'correct']

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold that gets scored, so the fold scores may be optimistic.
        clf = XGBClassifier(**xgb_params)
        clf.fit(train_x, train_y,
                eval_set=[(val_x, val_y)],
                verbose=0)
        joblib.dump(clf,f'{model_dir}/xgb_question{q}_fold{fold}.pkl')

        df.loc[val_idx,'pred'] = clf.predict_proba(val_x)[:,1]
        fold_score = f1_score_metric(val_y, df.loc[val_idx,'pred'])
        print(f'question {q} fold {fold}',fold_score)
        val_score[q].append(fold_score)

    # The last entry of val_score[q] is the score over all out-of-fold rows.
    overall_score = f1_score_metric(df.loc[:,'correct'],df.loc[:,'pred'])
    print('average' , overall_score)
    val_score[q].append(overall_score)
    oof_dict[q] = df
    print('*'*100)

In [None]:
# Stack the out-of-fold predictions of all 18 questions with a single concat.
# `.assign` works on a copy, so the frames stored in `oof_dict` are untouched.
valid_oof = pd.concat(
  [oof_dict[q][['session_id','pred']].assign(q=q) for q in range(1,19)],
  axis=0,
)
valid_oof = targets.merge(valid_oof,left_on=['session','q'],right_on=['session_id','q'])

# Grid-search one global decision threshold on the pooled predictions; the
# first threshold that reaches the best macro F1 is kept.
best_score = 0
best_threshold = 0

for thresh in np.arange(0.4,0.81,0.01):
  score = f1_score_metric(valid_oof['correct'],valid_oof['pred'],thresh)
  if score > best_score:
    best_score = score
    best_threshold = thresh
print('best_score',best_score,'best_threshold',best_threshold)

In [None]:
# Rank each question's features by fold-averaged XGBoost importance and save
# the ranking. Keys are 0-based (question q is stored under q-1), which is how
# the ranking is read back via `xgb_importances_features[q-1]`.
#
# The fold models are reloaded from the pickles written by the training loop,
# and the feature names come from the models themselves, so this cell does not
# depend on in-memory variables left over from other cells.
# NOTE(review): a later training run that writes to the same folder will
# overwrite these pickles - run this cell right after the run to be ranked.
fold_model_dir = '/content/drive/MyDrive/predict student games/weight/xgb_weight_new_v4'

features_importances_dict = {}
for i in range(18):
  models = [
    joblib.load(f'{fold_model_dir}/xgb_question{i+1}_fold{fold}.pkl')
    for fold in range(n_splits)
  ]
  feature_names = models[0].get_booster().feature_names
  feature_importances = np.zeros(len(feature_names))
  for model in models:
    # Average over the folds actually loaded rather than a hard-coded 5.
    feature_importances = feature_importances + model.feature_importances_ / len(models)
  features_importances_dict[i] = np.array(feature_names)[feature_importances.argsort()[::-1]]
joblib.dump(features_importances_dict,'/content/drive/MyDrive/predict student games/weight/xgb_weight_new_v1/important_features_xgb.pkl')

In [None]:
from xgboost import XGBClassifier

In [None]:
# Train one XGBoost classifier per (question, fold) on that question's
# top-ranked features, save every model to Drive, and collect out-of-fold
# predictions and F1 scores.
pred_lgb = np.zeros((df1.shape[0], 18))  # NOTE(review): never written in this cell; kept in case a later cell reads it
n_splits = 5
gkf = StratifiedKFold(n_splits=n_splits,random_state=19990829,shuffle=True)

# Per-question feature rankings, keyed by q-1.
# NOTE(review): joblib.load unpickles the file; only load artifacts you produced yourself.
xgb_importances_features = joblib.load('/content/drive/MyDrive/predict student games/weight/xgb_weight_new_v1/important_features_xgb.pkl')

# Hyper-parameters are identical for every question and fold, so build them once.
xgb_params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist', #Change4
    'eval_metric':'logloss',
    'learning_rate': 0.02,
    'alpha': 8,
    'max_depth': 4,
    'n_estimators': 9999,
    'early_stopping_rounds': 90,
    'subsample':0.8,
    'colsample_bytree': 0.5,
    'seed': 42
}

oof_dict = {}   # q -> frame holding the out-of-fold 'pred' column
val_score = {}  # q -> per-fold scores followed by the overall OOF score


for q in range(1, 19):
    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    # Questions 1-3 use df1 (top 200 features), 4-13 use df2 and 14-18 use
    # df3 (top 500 features each).
    if q <= 3:
        df = df1
        FEATURES = xgb_importances_features[q-1][:200]
    elif q <= 13:
        df = df2
        FEATURES = xgb_importances_features[q-1][:500]
    else:
        df = df3
        FEATURES = xgb_importances_features[q-1][:500]

    df = df.reset_index()
    # Float placeholder: the column later receives probabilities, and writing
    # floats into an int column is an incompatible-dtype assignment in pandas.
    df['pred'] = -1.0
    # Attach this question's label; the merge also yields a fresh 0..n-1 index,
    # which the positional fold indices below rely on when used with .loc.
    df = df.merge(targets.loc[targets.q == q , ['session','correct']] , left_on=['session_id'],right_on=['session'])
    sample_label_rate = len(df.loc[df['correct']==1]) / len(df)

    # TRAIN DATA
    print(f'Train model for question {q} Data length {len(df)} Sample Label rate {sample_label_rate}')
    val_score[q] = []
    for fold, (train_idx, val_idx) in enumerate(gkf.split(df,df['correct'])):
        df_train = df.loc[train_idx]
        train_y = df_train['correct']

        df_val = df.loc[val_idx]
        val_y = df_val['correct']
        # Cast the validation matrix once; it serves both early stopping and prediction.
        val_X = df_val[FEATURES].astype('float32')

        clf =  XGBClassifier(**xgb_params)

        clf.fit(df_train[FEATURES].astype('float32'), train_y,
                eval_set=[(val_X, val_y)],
                verbose=0)
        model_path = f'/content/drive/MyDrive/predict student games/weight_ensemble/weight_ensemble_v1/xgb_question{q}_fold{fold}.pkl'
        joblib.dump(clf,model_path)
        # Predict with the reloaded artifact so the saved file is what gets scored.
        clf = joblib.load(model_path)
        df.loc[val_idx,'pred'] = clf.predict_proba(val_X)[:,1]
        # Score once and reuse the value for both the log line and the record.
        fold_score = f1_score_metric(df.loc[val_idx,'correct'],df.loc[val_idx,'pred'])
        print(f'question {q} fold {fold}',fold_score)
        val_score[q].append(fold_score)
    # Score over all out-of-fold rows, appended after the per-fold scores.
    overall_score = f1_score_metric(df.loc[:,'correct'],df.loc[:,'pred'])
    print('average' , overall_score)
    val_score[q].append(overall_score)
    oof_dict[q] = df
    print('*'*100)

Train model for question 1 Data length 23562 Sample Label rate 0.7274849333672863
question 1 fold 0 0.6679218757067861
question 1 fold 1 0.6845672725148724
question 1 fold 2 0.6770856548763012
question 1 fold 3 0.6751107542151373
question 1 fold 4 0.675382594640836
average 0.6760045998271489
****************************************************************************************************
Train model for question 2 Data length 23562 Sample Label rate 0.9788218317630082
question 2 fold 0 0.5286825437979185
question 2 fold 1 0.4943133047210301
question 2 fold 2 0.5219942186568965
question 2 fold 3 0.5387166953541486
question 2 fold 4 0.5312758843140442
average 0.5233394824670865
****************************************************************************************************
Train model for question 3 Data length 23562 Sample Label rate 0.9340039045921399
question 3 fold 0 0.5291430267687431
question 3 fold 1 0.5186455624308732
question 3 fold 2 0.5120863491137285
question 3 fold 3 

In [None]:
# Stack the out-of-fold predictions of all 18 questions into one long frame
# and attach the ground-truth rows from `targets`.
oof_parts = []
for q in range(1,19):
  tmp = oof_dict[q]
  tmp['q'] = q  # written onto the frame stored in oof_dict (same side effect as before)
  tmp = tmp[['session_id','pred','q']]
  oof_parts.append(tmp)
# Concatenate once: growing a frame with pd.concat inside the loop re-copies
# every earlier row on each iteration, and seeding it with an empty DataFrame
# relies on deprecated empty-frame dtype handling for the merge key.
valid_oof = pd.concat(oof_parts,axis=0)
valid_oof = targets.merge(valid_oof,left_on=['session','q'],right_on=['session_id','q'])

# Scan candidate thresholds and keep the first one with the highest score.
# NOTE(review): the third argument of f1_score_metric is presumably the
# probability cut-off -- the helper is defined outside this cell.
best_score = 0
best_threshold = 0

for thresh in np.arange(0.4,0.81,0.01):
  score = f1_score_metric(valid_oof['correct'],valid_oof['pred'],thresh)
  if score > best_score:
    best_score = score
    best_threshold = thresh
print('best_score',best_score,'best_threshold',best_threshold)

# Report the score at the chosen threshold for each block of questions
# (1-3, 4-13, 14-18); every mask is built once and reused for both columns.
group_masks = [
  ('score_group1', valid_oof.q <= 3),
  ('score_group2', (valid_oof.q > 3) & (valid_oof.q <= 13)),
  ('score_group3', (valid_oof.q > 13) & (valid_oof.q <= 18)),
]
for label, mask in group_masks:
  print(label,f1_score_metric(valid_oof.loc[mask , 'correct'],valid_oof.loc[mask ,'pred'],best_threshold))

best_score 0.7017594218494563 best_threshold 0.6200000000000002
score_group1 0.6986048427682099
