# Cat Boost Gameplay - Chapu

[Notebook](https://www.kaggle.com/code/vadimkamaev/catboost-mix) Based on the CatBoost Mix notebook by [Vadim Kamaev](https://www.kaggle.com/vadimkamaev) <br>
https://www.kaggle.com/code/vadimkamaev/catboost-mix

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import gc
import sys
import datetime
import warnings

import polars as pl
from catboost import CatBoostClassifier, Pool

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Walk the read-only input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# settings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")



/kaggle/input/predict-student-performance-from-game-play/sample_submission.csv
/kaggle/input/predict-student-performance-from-game-play/train_labels.csv
/kaggle/input/predict-student-performance-from-game-play/train.csv
/kaggle/input/predict-student-performance-from-game-play/test.csv
/kaggle/input/predict-student-performance-from-game-play/jo_wilder_310/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder_310/__init__.py
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/__init__.py


# Load Train Data and Labels

In [2]:
# Column -> dtype mapping handed to pd.read_csv so the event log is parsed
# directly into compact numeric / categorical types.
dtypes = dict(
    session_id='int64',
    index=np.int16,
    elapsed_time=np.int32,
    event_name='category',
    name='category',
    level=np.int8,
    page=np.float16,
    room_coor_x=np.float16,
    room_coor_y=np.float16,
    screen_coor_x=np.float16,
    screen_coor_y=np.float16,
    hover_duration=np.float32,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen=np.int8,
    hq=np.int8,
    music=np.int8,
    level_group='category',
)

# Subset of columns actually read from train.csv (the screen coordinates and
# the fullscreen / hq / music flags are not listed here).
use_col = [
    'session_id',
    'index',
    'elapsed_time',
    'event_name',
    'name',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'hover_duration',
    'text',
    'fqid',
    'room_fqid',
    'text_fqid',
    'level_group',
]

In [3]:
%%time
# Read training dataset
# Only the columns listed in use_col are loaded, each parsed with the dtype given in dtypes.
df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes, usecols=use_col)

CPU times: user 1min 35s, sys: 7.3 s, total: 1min 42s
Wall time: 2min 19s


In [4]:
# One frame per level group, in game order; the full frame is dropped afterwards
# because only the three per-group frames are used from here on.
df0_4, df5_12, df13_22 = (
    df[df['level_group'] == group_label] for group_label in ('0-4', '5-12', '13-22')
)
del df

In [5]:
# Labels come as '<session>_q<question>' ids; split them into an integer
# session id and an integer question number.
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
id_parts = targets['session_id'].str.split('_')
targets['session'] = id_parts.str[0].astype('int64')
targets['q'] = id_parts.str[-1].str[1:].astype('int64')
print( targets.shape )
targets.head(8)

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1
5,20090314035813970_q1,1,20090314035813970,1
6,20090314121766812_q1,1,20090314121766812,1
7,20090314221187252_q1,0,20090314221187252,1


# Feature Engineering

In [6]:
def delt_time_def(df):
    """Add per-event time deltas to an event log.

    The rows are ordered by ``session_id`` and then ``elapsed_time``, and two
    columns are added:

    * ``d_time``    - difference between an event's ``elapsed_time`` and the
      previous event of the *same session*; 0 for the first event of a session.
    * ``delt_time`` - ``d_time`` clipped to the range [0, 103000].

    The difference is taken inside each session. A plain ``diff`` over the
    whole sorted frame would give the first event of every session a delta
    against the last event of the preceding session, which differs from what a
    frame holding a single session produces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_id`` and ``elapsed_time`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the two extra columns. The input frame is
        left untouched, so passing a filtered slice of another frame is safe.
    """
    # sort_values without inplace returns a fresh frame, so the column
    # assignments below never write into a slice of the caller's data.
    df = df.sort_values(by=['session_id', 'elapsed_time'])
    d_time = df.groupby('session_id', sort=False)['elapsed_time'].diff(1).fillna(0)
    df['d_time'] = d_time
    df['delt_time'] = d_time.clip(0, 103000)
    return df

In [7]:
def time_feature(train):
    """Derive calendar-style features from the digits of the session id.

    The frame's index is read as the session id; its decimal string is cut
    into consecutive two-character fields which are stored as ``uint8``
    columns ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second``.
    The ``month`` field has 1 added to it. The frame is modified in place
    (a temporary ``session_id`` column is created and dropped again) and also
    returned.
    """
    # (output column, slice start, slice stop, value added to the parsed number)
    field_layout = (
        ("year", 0, 2, 0),
        ("month", 2, 4, 1),
        ("day", 4, 6, 0),
        ("hour", 6, 8, 0),
        ("minute", 8, 10, 0),
        ("second", 10, 12, 0),
    )

    train["session_id"] = train.index
    id_text = train["session_id"].apply(str)
    for column, start, stop, offset in field_layout:
        parsed = id_text.apply(lambda s, a=start, b=stop, o=offset: int(s[a:b]) + o)
        train[column] = parsed.astype(np.uint8)
    train.drop(columns=["session_id"], inplace=True)

    return train

In [8]:
def feature_engineer(train):
    """Collapse an event log into one row of features per session.

    ``train`` must already carry the ``d_time`` / ``delt_time`` columns.
    The result is indexed by session id and holds, in this order:

    * per event type: event count (``l_ev_name_*``) and summed ``delt_time``
      (``t_ev_name_*``);
    * four quantiles of ``d_time`` (0.3, 0.5, 0.65, 0.8);
    * for rows whose ``name`` is ``'basic'``: last ``elapsed_time``
      (``finish``), row count (``len``), distinct-value counts of the
      categorical columns, and mean / standard deviation of the numeric ones;
    * the session-id derived columns added by ``time_feature``.

    Missing values are replaced with -1 before returning.
    """
    categorical_cols = ['event_name', 'fqid', 'room_fqid', 'text_fqid', 'page']
    numeric_cols = ['delt_time', 'hover_duration']
    event_types = ['checkpoint', 'observation_click', 'cutscene_click', 'notification_click',
                   'person_click', 'object_click', 'map_click', 'object_hover']
    # (quantile level, output column name) in output column order
    quantile_specs = [(0.3, 'qvant1_0_3'), (0.5, 'qvant3_0_5'),
                      (0.65, 'qvant4_0_65'), (0.8, 'qvant2_0_8')]

    features = pd.DataFrame(index=train['session_id'].unique(), columns=[])

    # Count and total time per event type.
    for event in event_types:
        event_by_session = train[train['event_name'] == event].groupby(['session_id'])
        features['l_ev_name_' + event] = event_by_session['index'].count()
        features['t_ev_name_' + event] = event_by_session['delt_time'].sum()

    # ADD QUANTILES
    d_time_by_session = train.groupby(['session_id'])['d_time']
    for level, column in quantile_specs:
        features = features.join(d_time_by_session.quantile(q=level).rename(column))

    # Everything below only looks at rows whose name is 'basic'.
    basic_by_session = train[train['name'] == 'basic'].groupby(['session_id'])

    features['finish'] = basic_by_session['elapsed_time'].last(1)
    features['len'] = basic_by_session['index'].count()

    for col in categorical_cols:
        features = features.join(basic_by_session[col].agg('nunique').rename(col + '_nunique'))
    for col in numeric_cols:
        features = features.join(basic_by_session[col].agg('mean'))
    for col in numeric_cols:
        features = features.join(basic_by_session[col].agg('std').rename(col + '_std'))

    features = time_feature(features)
    return features.fillna(-1)

In [9]:
def feature_quest(new_train, train, q):
    """Extend the per-session features with columns specific to question ``q``.

    Four hand-picked lookup tables (dialogue texts, ``fqid`` values,
    ``text_fqid`` values and (room, level) pairs) list, per question, which
    events get their own feature. For every listed item the summed
    ``delt_time`` per session is added, and for ``text_fqid`` values and
    (room, level) pairs also the number of matching events.

    Parameters
    ----------
    new_train : pandas.DataFrame
        Per-session features, indexed by session id. It is copied, not modified.
    train : pandas.DataFrame
        Event log with ``session_id``, ``index``, ``delt_time``, ``text``,
        ``fqid``, ``text_fqid``, ``room_fqid`` and ``level`` columns.
    q : int
        Question number, 1 to 18. Any other value raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``new_train`` with the extra columns. A session without any
        matching event gets NaN in that column.

    Notes
    -----
    The trailing numbers in the comments of the lookup tables are the original
    author's annotations (presumably validation scores noted when each feature
    was added - unverified).
    """

    train_q = new_train.copy()

    # Every entry must be followed by a comma: two adjacent string literals are
    # joined by Python into a single string, which would match no event text.
    texts = {
         1: ["Yes! This cool old slip from 1916.", # 0.665288
             "Go ahead, take a peek at the shirt!", # 0.664379
             "I'll be at the Capitol. Let me know if you find anything!", # 0.6597581972521553
             "We need to talk about that missing paperwork.", # 0.6605181072919606
             "The slip is from 1916 but the team didn't start until 1974!"], # 0.661944558929926
         2: ["It's already all done!", # 0.521736
             "Gramps is the best historian ever!"], # 0.5199033644180857
         3: ["I suppose historians are boring, too?", # 0.516384
             "Why don't you head to the Basketball Center and rustle up some clues?", # 0.514711
             "We need to talk about that missing paperwork."], # 0.5124478306383897

         4: ['I need to find the owner of this slip.',
             'She led marches and helped women get the right to vote!', # 0.6540628230882878
             "Here's a call number to find more info in the Stacks.", # 0.6557324045957984
             "What was Wells doing here?"],# 0.6591721086153398

         5: ["Your gramps is awesome! Always full of stories.",
             "Here's a call number to find more info in the Stacks.", # 0.6246653297986882
             "Where did you get that coffee?"], # 0.6275345934517609

         6: ["Oh, that's from Bean Town.", # 0.6264366752789874
             "Wells? I knew it!"], # 0.6267955513678926

         7: ["Try not to panic, Jo.",
             "I've got a stack of business cards from my favorite cleaners.",# 0.604381166742392
             "Check out our microfiche. It's right through that door.", # 0.606660216726699
             "I'm afraid my papers have gone missing in this mess.", # 0.6082894243930865
             "Nope. But Youmans and other suffragists worked hard to change that."], # 0.6088501394641797

         8: ["What should I do first?",
             "Thanks to them, Wisconsin was the first state to approve votes for women!"], # 0.5669326944672926

         9: [ "Can you help me? I need to find the owner of this slip.",
             'Looks like a dry cleaning receipt.',# 0.6087513244843515
             "I knew I could count on you, Jo!", # 0.6110724997907914
             "Nope, that's from Bean Town. I only drink Holdgers!"], # 0.611671469921066

         10:["I love these photos of me and Teddy.",
             'Your gramps is awesome! Always full of stories.',# 0.5770680950588705
             "Nope. But Youmans and other suffragists worked hard to change that.", # 0.579478190790966
             "Right outside the door.", # 0.580619048797789
             "Do you have any info on Theodora Youmans?"], # 0.5813027148090604

         11:["I ran into Wells there this morning",
             'Your gramps is awesome! Always full of stories.',# 0.6013201782869839
             "Wait a sec. Women couldn't vote?!", # 0.6031050558022679
             "I've got a stack of business cards from my favorite cleaners.",# 0.6040257491159207
             "An old shirt? Try the university."],  # 0.6042512493386453
         12:[],
         13:[],
         14:[],
         15:[],
         16:[],
         17:[],
         18:[]
        }

    # Columns are numbered from 1 in list order: text1, text2, ...
    for i, text in enumerate(texts[q], start=1):
        train_q['text' + str(i)] = train[train['text'] == text].groupby(['session_id'])['delt_time'].sum()

    fqids = {
         1: ['directory'], # 0.656051
         2: ['notebook','chap1_finale_c'], # 0.512110
         3: ['tostacks','doorblock'], #0.509907
         4: ['journals.pic_1.next', 'businesscards.card_1.next', 'block'], # 0.6528585079781952
         5: ['janitor', 'journals.pic_2.next'], # 0.6236052722289602
         6: ['businesscards', 'journals.pic_0.next','tobasement', 'logbook.page.bingo', 'tohallway'],  #! 0.622522757907791
         7: ['journals.pic_1.next','reader.paper2.bingo','businesscards.card_bingo.next',
             'logbook.page.bingo', 'tunic.kohlcenter'], #! 0.5960241518448325
         8: ['reader.paper2.bingo'], # 0.5660111512321901
         9: ['journals.pic_1.next','businesscards.card_bingo.bingo', 'reader'],  #! 0.6088924247950622
         10:['tunic.kohlcenter','magnify','block','journals.pic_1.next', 'journals'], #! 0.5735354507175185
         11:['tostacks','block_magnify','block','businesscards.card_bingo.next'], # 0.5983030710907746
         12:['businesscards.card_1.next','tofrontdesk'], # 0.507413
         13:['tocloset_dirty','reader.paper1.next'], #0.468713
         14:['tracks'], #0.623715
         15:['groupconvo_flag'], # 0.581831
         16:['savedteddy'], # 0.471664
         17:['journals_flag.pic_0.next'], # 0.533380
         18:['chap4_finale_c'],# 0.497796
        }

    for fqid in fqids[q]:
        train_q['t_fqid_' + fqid] = train[train['fqid'] == fqid].groupby(['session_id'])['delt_time'].sum()

    text_fqids = {
        1:[],
        2:['tunic.historicalsociety.collection.gramps.found'],# 0.508716
        3:[],
        4: ['tunic.humanecology.frontdesk.worker.intro',
            'tunic.library.frontdesk.worker.wells', # 0.6666325627660743
            'tunic.library.frontdesk.worker.hello'], # 0.6678694174620372
        5: ['tunic.humanecology.frontdesk.worker.intro',
            'tunic.historicalsociety.closet_dirty.gramps.helpclean',
            'tunic.historicalsociety.closet_dirty.gramps.news'],     # 0.6225926406619734
        6: ['tunic.humanecology.frontdesk.worker.intro',
            'tunic.historicalsociety.frontdesk.archivist.foundtheodora',
            'tunic.historicalsociety.closet_dirty.trigger_coffee', # 0.6298310348680769
            'tunic.historicalsociety.closet_dirty.gramps.archivist'], # 0.6320710506038789
        7: ['tunic.historicalsociety.closet_dirty.door_block_talk',
            'tunic.drycleaner.frontdesk.worker.hub',
            'tunic.historicalsociety.closet_dirty.trigger_coffee'], #
        8: ['tunic.humanecology.frontdesk.worker.intro',
            'tunic.historicalsociety.frontdesk.magnify', # 0.565820361974706
            'tunic.historicalsociety.closet_dirty.trigger_coffee'], # 0.5666316505932836
        9: ['tunic.historicalsociety.frontdesk.archivist.hello',
            'tunic.library.frontdesk.worker.wells', # 0.6123618449755199
            'tunic.historicalsociety.frontdesk.archivist.foundtheodora'], # 0.6165404455938354
        10: ['tunic.library.frontdesk.worker.wells',
            'tunic.historicalsociety.frontdesk.archivist.have_glass_recap',
             'tunic.historicalsociety.closet_dirty.gramps.news'], # 0.5829876555092278
        11: ['tunic.historicalsociety.frontdesk.archivist.newspaper_recap',
             'tunic.historicalsociety.closet_dirty.gramps.archivist'], # 0.5990726954508437
        12:[],
        13:['tunic.drycleaner.frontdesk.logbook.page.bingo'],
        14: ['tunic.flaghouse.entry.flag_girl.symbol_recap', # 0.618391
             'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap'],
        15:['tunic.flaghouse.entry.colorbook'], # 0.578712
        16:['tunic.library.frontdesk.worker.nelson'], # 0.469557
        17:['tunic.historicalsociety.entry.wells.flag'], # 0.531631
        18:['tunic.flaghouse.entry.flag_girl.symbol_recap'], # 0.496091
    }

    for text_fqid in text_fqids[q]:
        # One filtered group-by serves both the time sum and the event count.
        matching = train[train['text_fqid'] == text_fqid].groupby(['session_id'])
        train_q['t_text_fqid_' + text_fqid] = matching['delt_time'].sum()
        train_q['l_text_fqid_' + text_fqid] = matching['index'].count()

    # Each entry is [room_fqid, level]; an event must match both to be counted.
    room_lvls = {
         1: [['tunic.capitol_0.hall',4],['tunic.historicalsociety.collection',3],
            ['tunic.historicalsociety.entry',1],['tunic.historicalsociety.collection', 2]],
         2: [],
         3: [['tunic.capitol_0.hall',4]],
         4: [['tunic.historicalsociety.frontdesk',12],
             ['tunic.historicalsociety.stacks',7]],
         5: [['tunic.historicalsociety.stacks',12]],
         6: [['tunic.drycleaner.frontdesk',8],
             ['tunic.library.microfiche',9]],
         7: [['tunic.library.frontdesk',10]],
         8: [['tunic.kohlcenter.halloffame', 11],
             ['tunic.kohlcenter.halloffame',6]],
         9: [['tunic.capitol_1.hall', 12],
             ['tunic.historicalsociety.collection',12]],
         10:[['tunic.humanecology.frontdesk',7]],
         11:[['tunic.drycleaner.frontdesk',9],
             ['tunic.historicalsociety.collection',6]],
         12:[['tunic.historicalsociety.stacks',6],# 0.504048
             ['tunic.historicalsociety.frontdesk', 7],
             ['tunic.historicalsociety.closet_dirty',11], #0.5018942967072544
             ['tunic.historicalsociety.frontdesk', 12]], # 0.500436
         13:[['tunic.library.microfiche', 9], #0.463537
             ['tunic.historicalsociety.stacks', 11],
             ['tunic.library.frontdesk', 10], # 0.4596453764144546
             ['tunic.historicalsociety.entry', 5]], # 0.452105
         14:[['tunic.historicalsociety.closet_dirty',17],#0.617733
             ['tunic.historicalsociety.entry',15]], # 0.614938
         15:[['tunic.historicalsociety.entry',15],#0.569833
             ['tunic.library.frontdesk',20]], # 0.561463
         16:[['tunic.library.frontdesk', 20],# 0.467553
             ['tunic.wildlife.center',19]], # 0.464778
         17:[['tunic.wildlife.center', 19],#0.529125
             ['tunic.historicalsociety.stacks', 21]], # 0.527719
         18:[['tunic.wildlife.center', 22]], # 0.495289
        }

    for room, level in room_lvls[q]:
        nam = room + str(level)
        in_room_at_level = train[(train['room_fqid'] == room) & (train['level'] == level)].groupby(['session_id'])
        train_q['t_' + nam] = in_room_at_level['delt_time'].sum()
        train_q['l_' + nam] = in_room_at_level['index'].count()

    return train_q

In [10]:
def create_model(train, old_train, quests, models):
    """Fit one CatBoost classifier per question and store it in ``models``.

    Parameters
    ----------
    train : pandas.DataFrame
        Per-session features, indexed by session id.
    old_train : pandas.DataFrame
        Event log the question-specific features are computed from.
    quests : iterable of int
        Question numbers to train for.
    models : dict
        Registry updated in place; each model is stored under the question
        number converted to a string.

    Returns
    -------
    dict
        The same ``models`` object that was passed in.

    Notes
    -----
    Labels are read from the notebook-level ``targets`` frame (columns ``q``,
    ``session`` and ``correct``). A session present in ``train`` but absent
    from ``targets`` for a question raises ``KeyError`` in the ``.loc`` lookup.
    """
    ALL_USERS = train.index.unique()
    print('We will train with', len(ALL_USERS) ,'users info')

    print('### quest', end='')

    # ITERATE THRU QUESTIONS
    for q in quests:
        print('# ', q, end='')

        train_q = feature_quest(train, old_train, q)

        # Labels are looked up by session id so they line up row-for-row with the features.
        train_users = train_q.index.values
        train_y = targets.loc[targets.q==q].set_index('session').loc[train_users]

        # TRAIN MODEL
        model = CatBoostClassifier(
            n_estimators = 300,
            learning_rate= 0.045,
            depth = 6
        )

        model.fit(train_q.astype('float32'), train_y['correct'], verbose=False)

        # SAVE MODEL
        models[f'{q}'] = model
    print('***')

    return models

In [11]:
# Registry of fitted classifiers; create_model stores each one under its question number as a string.
models = {}
# Probability cut-off used at inference: a predicted probability above it is submitted as 1, otherwise 0.
best_threshold = 0.625

In [12]:
# Drop sessions that contain fewer than 5 distinct levels in this level group.
kol_lvl = df0_4.groupby(['session_id'])['level'].nunique().lt(5)
list_session = kol_lvl.index[kol_lvl.values]
df0_4 = df0_4.loc[~df0_4['session_id'].isin(list_session)]

# Build features and fit the models for the questions of level group 0-4.
quests_0_4 = [1, 2, 3]
df0_4 = delt_time_def(df0_4)
train = feature_engineer(df0_4)
models = create_model(train, df0_4, quests_0_4, models)
del df0_4

We will train with 23562 users info
### quest#  1#  2#  3***


In [13]:
# Drop sessions that contain fewer than 8 distinct levels in this level group.
kol_lvl = df5_12.groupby(['session_id'])['level'].nunique().lt(8)
list_session = kol_lvl.index[kol_lvl.values]
df5_12 = df5_12.loc[~df5_12['session_id'].isin(list_session)]

# Build features and fit the models for the questions of level group 5-12.
quests_5_12 = [4, 5, 6, 7, 8, 9, 10, 11, 12]
df5_12 = delt_time_def(df5_12)
train = feature_engineer(df5_12)
models = create_model(train, df5_12, quests_5_12, models)
del df5_12

We will train with 23561 users info
### quest#  4#  5#  6#  7#  8#  9#  10#  11#  12***


In [14]:
# Drop sessions that contain fewer than 10 distinct levels in this level group.
kol_lvl = df13_22.groupby(['session_id'])['level'].nunique().lt(10)
list_session = kol_lvl.index[kol_lvl.values]
df13_22 = df13_22.loc[~df13_22['session_id'].isin(list_session)]

# Build features and fit the models for the questions of level group 13-22.
quests_13_22 = [13, 14, 15, 16, 17, 18]
df13_22 = delt_time_def(df13_22)
train = feature_engineer(df13_22)
models = create_model(train, df13_22, quests_13_22, models)

We will train with 22986 users info
### quest#  13#  14#  15#  16#  17#  18***


In [15]:
# Keys of the model registry (question numbers as strings), reused by the save and reload cells.
l_models = list(models.keys())
print(l_models)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']


In [16]:
#Saving a Model
for q in l_models:
    models[q].save_model(f'cat_model_{q}.bin')

In [17]:
#Model Reading
dir = '/kaggle/input/catbust/'
for q in l_models:
    #models[q] = CatBoostClassifier().load_model(dir+f'cat_model_{q}.bin')
    models[q] = CatBoostClassifier().load_model(f'cat_model_{q}.bin')

# Infer Test Data

In [18]:
import jo_wilder_310 as jo_wilder

In [19]:
# Best-effort reset of the competition API's "already called" markers before
# creating the environment (presumably so this cell can be run more than once
# in the same kernel - TODO confirm against the jo_wilder package).
try:
    jo_wilder.make_env.__called__ = False
    env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
except Exception:
    # Expected on a fresh kernel, where `env` does not exist yet (NameError).
    # Catching Exception rather than using a bare except keeps
    # KeyboardInterrupt and SystemExit working.
    pass

env = jo_wilder.make_env()
iter_test = env.iter_test()

In [20]:
# Questions answered after each level group.
list_q = {'0-4':quests_0_4, '5-12':quests_5_12, '13-22':quests_13_22}

# Each iteration receives a batch of events plus the submission rows to fill in.
for (test, sam_sub) in iter_test:

    # Question number parsed from ids of the form '<session>_q<number>'.
    sam_sub['question'] = [int(label.split('_')[1][1:]) for label in sam_sub['session_id']]
    grp = test.level_group.values[0]

    # Fallback answers, overwritten below for every question that has a model:
    # 1 everywhere except the questions listed here.
    sam_sub['correct'] = 1
    sam_sub.loc[sam_sub.question.isin([5, 8, 10, 13, 15]), 'correct'] = 0

    # Copy the filtered batch so the columns added by delt_time_def never
    # target a view of `test`.
    old_train = delt_time_def(test[test.level_group == grp].copy())
    train = feature_engineer(old_train)

    for q in list_q[grp]:
        train_q = feature_quest(train, old_train, q)
        clf = models[f'{q}']
        p = clf.predict_proba(train_q.astype('float32'))[:,1]
        mask = sam_sub.question == q
        # Only the first row's probability is used; presumably a batch holds a
        # single session - TODO confirm against the competition API.
        x = int(p[0]>best_threshold)
        sam_sub.loc[mask,'correct'] = x

    sam_sub = sam_sub[['session_id', 'correct']]
    env.predict(sam_sub)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


# Submission

In [21]:
# Load submission.csv from the working directory (presumably written by the competition API during inference - TODO confirm) and show its shape.
df = pd.read_csv('submission.csv')
print( df.shape )

(54, 2)


In [22]:
# Preview the first ten submission rows.
df.head(10)

Unnamed: 0,session_id,correct
0,20090109393214576_q1,1
1,20090109393214576_q2,1
2,20090109393214576_q3,1
3,20090109393214576_q4,1
4,20090109393214576_q5,0
5,20090109393214576_q6,1
6,20090109393214576_q7,1
7,20090109393214576_q8,0
8,20090109393214576_q9,1
9,20090109393214576_q10,0
