In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm



In [2]:
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (26296946, 20)


In [3]:
dataset_df.head(5)

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [4]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

In [5]:
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]) )
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )

In [6]:
CATAGORICAL = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

NUMS = ['page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
        'hover_duration', 'elapsed_time_diff']
DIALOGS = ['that', 'this', 'it', 'you','find','found','Found','notebook','Wells','wells','help','need', 'Oh','Ooh','Jo', 'flag', 'can','and','is','the','to']

NAMEFEATURE = ['basic', 'undefined', 'close', 'open', 'prev', 'next']
EVENTNAME = ['cutscene_click', 'person_click', 'navigate_click',
       'observation_click', 'notification_click', 'object_click',
       'object_hover', 'map_hover', 'map_click', 'checkpoint',
       'notebook_click']

SUBROOMLIST = {
     'tunic.historicalsociety.entry',
     'tunic.historicalsociety.stacks',
     'tunic.historicalsociety.basement',
     'tunic.kohlcenter.halloffame',
     'tunic.historicalsociety.collection',
     'tunic.historicalsociety.closet',
     'tunic.capitol_0.hall',
     'tunic.historicalsociety.entry',
     'tunic.library.frontdesk',
     'tunic.historicalsociety.frontdesk',
     'tunic.historicalsociety.stacks',
     'tunic.historicalsociety.closet_dirty',
     'tunic.humanecology.frontdesk',
     'tunic.historicalsociety.basement',
     'tunic.kohlcenter.halloffame',
     'tunic.library.microfiche',
     'tunic.drycleaner.frontdesk',
     'tunic.historicalsociety.collection',
     'tunic.capitol_1.hall',
     'tunic.historicalsociety.entry',
     'tunic.wildlife.center',
     'tunic.historicalsociety.cage',
     'tunic.library.frontdesk',
     'tunic.historicalsociety.frontdesk',
     'tunic.historicalsociety.stacks',
     'tunic.historicalsociety.closet_dirty',
     'tunic.humanecology.frontdesk',
     'tunic.historicalsociety.basement',
     'tunic.kohlcenter.halloffame',
     'tunic.library.microfiche',
     'tunic.drycleaner.frontdesk',
     'tunic.historicalsociety.collection',
     'tunic.flaghouse.entry',
     'tunic.historicalsociety.collection_flag',
     'tunic.capitol_2.hall'
    }


SUBTEXTLIST = {
     'tunic.historicalsociety.entry.groupconvo',
     'tunic.historicalsociety.collection.cs',
     'tunic.historicalsociety.collection.gramps.found',
     'tunic.historicalsociety.closet.gramps.intro_0_cs_0',
     'tunic.historicalsociety.closet.teddy.intro_0_cs_0',
     'tunic.historicalsociety.closet.intro',
     'tunic.historicalsociety.closet.retirement_letter.hub',
     'tunic.historicalsociety.collection.tunic.slip',
     'tunic.kohlcenter.halloffame.plaque.face.date',
     'tunic.kohlcenter.halloffame.togrampa',
     'tunic.historicalsociety.collection.gramps.lost',
     'tunic.historicalsociety.closet.notebook',
     'tunic.historicalsociety.basement.janitor',
     'tunic.historicalsociety.stacks.outtolunch',
     'tunic.historicalsociety.closet.photo',
     'tunic.historicalsociety.collection.tunic',
     'tunic.historicalsociety.closet.teddy.intro_0_cs_5',
     'tunic.historicalsociety.entry.wells.talktogramps',
     'tunic.historicalsociety.entry.boss.talktogramps',
     'tunic.historicalsociety.closet.doorblock',
     'tunic.historicalsociety.entry.block_tomap2',
     'tunic.historicalsociety.entry.block_tocollection',
     'tunic.historicalsociety.entry.block_tomap1',
     'tunic.historicalsociety.collection.gramps.look_0',
     'tunic.kohlcenter.halloffame.block_0',
     'tunic.capitol_0.hall.chap1_finale_c',
     'tunic.historicalsociety.entry.gramps.hub',
     'tunic.historicalsociety.frontdesk.archivist.newspaper',
     'tunic.historicalsociety.frontdesk.archivist.have_glass',
     'tunic.drycleaner.frontdesk.worker.hub',
     'tunic.historicalsociety.closet_dirty.gramps.news',
     'tunic.humanecology.frontdesk.worker.intro',
     'tunic.library.frontdesk.worker.hello',
     'tunic.library.frontdesk.worker.wells',
     'tunic.historicalsociety.frontdesk.archivist.hello',
     'tunic.historicalsociety.closet_dirty.trigger_scarf',
     'tunic.drycleaner.frontdesk.worker.done',
     'tunic.historicalsociety.closet_dirty.what_happened',
     'tunic.historicalsociety.stacks.journals.pic_2.bingo',
     'tunic.humanecology.frontdesk.worker.badger',
     'tunic.historicalsociety.closet_dirty.trigger_coffee',
     'tunic.drycleaner.frontdesk.logbook.page.bingo',
     'tunic.library.microfiche.reader.paper2.bingo',
     'tunic.historicalsociety.closet_dirty.gramps.helpclean',
     'tunic.historicalsociety.frontdesk.archivist.have_glass_recap',
     'tunic.historicalsociety.frontdesk.magnify',
     'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo',
     'tunic.library.frontdesk.wellsbadge.hub',
     'tunic.capitol_1.hall.boss.haveyougotit',
     'tunic.historicalsociety.basement.janitor',
     'tunic.historicalsociety.closet_dirty.photo',
     'tunic.historicalsociety.stacks.outtolunch',
     'tunic.library.frontdesk.worker.wells_recap',
     'tunic.capitol_0.hall.boss.talktogramps',
     'tunic.historicalsociety.closet_dirty.gramps.archivist',
     'tunic.historicalsociety.closet_dirty.door_block_talk',
     'tunic.historicalsociety.frontdesk.archivist.need_glass_0',
     'tunic.historicalsociety.frontdesk.block_magnify',
     'tunic.historicalsociety.frontdesk.archivist.foundtheodora',
     'tunic.historicalsociety.closet_dirty.gramps.nothing',
     'tunic.historicalsociety.closet_dirty.door_block_clean',
     'tunic.library.frontdesk.worker.hello_short',
     'tunic.historicalsociety.stacks.block',
     'tunic.historicalsociety.frontdesk.archivist.need_glass_1',
     'tunic.historicalsociety.frontdesk.archivist.newspaper_recap',
     'tunic.drycleaner.frontdesk.worker.done2',
     'tunic.humanecology.frontdesk.block_0',
     'tunic.library.frontdesk.worker.preflag',
     'tunic.drycleaner.frontdesk.worker.takealook',
     'tunic.library.frontdesk.worker.droppedbadge',
     'tunic.library.microfiche.block_0',
     'tunic.library.frontdesk.block_badge',
     'tunic.library.frontdesk.block_badge_2',
     'tunic.capitol_1.hall.chap2_finale_c',
     'tunic.drycleaner.frontdesk.block_0',
     'tunic.humanecology.frontdesk.block_1',
     'tunic.drycleaner.frontdesk.block_1',
     'tunic.historicalsociety.cage.confrontation',
     'tunic.wildlife.center.crane_ranger.crane',
     'tunic.wildlife.center.wells.nodeer',
     'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation',
     'tunic.historicalsociety.basement.seescratches',
     'tunic.flaghouse.entry.flag_girl.hello',
     'tunic.historicalsociety.basement.ch3start',
     'tunic.historicalsociety.entry.groupconvo_flag',
     'tunic.historicalsociety.collection_flag.gramps.flag',
     'tunic.historicalsociety.basement.savedteddy',
     'tunic.library.frontdesk.worker.nelson',
     'tunic.wildlife.center.expert.removed_cup',
     'tunic.library.frontdesk.worker.flag',
     'tunic.historicalsociety.entry.boss.flag',
     'tunic.flaghouse.entry.flag_girl.symbol',
     'tunic.wildlife.center.wells.animals',
     'tunic.historicalsociety.cage.glasses.afterteddy',
     'tunic.historicalsociety.cage.teddy.trapped',
     'tunic.historicalsociety.cage.unlockdoor',
     'tunic.historicalsociety.stacks.journals.pic_2.bingo',
     'tunic.historicalsociety.entry.wells.flag',
     'tunic.humanecology.frontdesk.worker.badger',
     'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo',
     'tunic.historicalsociety.entry.directory.closeup.archivist',
     'tunic.capitol_2.hall.boss.haveyougotit',
     'tunic.wildlife.center.wells.nodeer_recap',
     'tunic.historicalsociety.cage.glasses.beforeteddy',
     'tunic.wildlife.center.expert.recap',
     'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo',
     'tunic.historicalsociety.cage.lockeddoor',
     'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo',
     'tunic.wildlife.center.remove_cup',
     'tunic.wildlife.center.tracks.hub.deer',
     'tunic.historicalsociety.frontdesk.key',
     'tunic.library.microfiche.reader_flag.paper2.bingo',
     'tunic.flaghouse.entry.colorbook',
     'tunic.wildlife.center.coffee',
     'tunic.historicalsociety.collection_flag.gramps.recap',
     'tunic.wildlife.center.wells.animals2',
     'tunic.flaghouse.entry.flag_girl.symbol_recap',
     'tunic.historicalsociety.closet_dirty.photo',
     'tunic.historicalsociety.stacks.outtolunch',
     'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap',
     'tunic.historicalsociety.entry.boss.flag_recap',
     'tunic.capitol_1.hall.boss.writeitup',
     'tunic.library.frontdesk.worker.nelson_recap',
     'tunic.historicalsociety.entry.wells.flag_recap',
     'tunic.drycleaner.frontdesk.worker.done2',
     'tunic.library.frontdesk.worker.flag_recap',
     'tunic.library.frontdesk.worker.preflag',
     'tunic.historicalsociety.basement.gramps.seeyalater',
     'tunic.flaghouse.entry.flag_girl.hello_recap',
     'tunic.historicalsociety.basement.gramps.whatdo',
     'tunic.library.frontdesk.block_nelson',
     'tunic.historicalsociety.cage.need_glasses',
     'tunic.capitol_2.hall.chap4_finale_c',
     'tunic.wildlife.center.fox.concern'
}

SUB_LEVELS = {'0-4': [1, 2, 3, 4],
              '5-12': [5, 6, 7, 8, 9, 10, 11, 12],
              '13-22': [13, 14, 15, 16, 17, 18, 19, 20, 21, 22]}
level_groups = ["0-4", "5-12", "13-22"]

In [7]:
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [8]:
def feature_engineer(dataset_df):
    dfs = []
    
    dataset_group = dataset_df.groupby(['session_id','level_group'])
    
    for c in CATAGORICAL:
        tmp = dataset_group[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_group[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_group[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in CATAGORICAL:
        tmp = dataset_group[c].agg('count')
        tmp.name = tmp.name + '_count'
        dfs.append(tmp)
    
    dataset_df['event_name'].fillna(-1)
    for c in EVENTNAME:
        tmp = dataset_df[dataset_df.event_name == c].groupby(['session_id','level_group'])['event_name'].agg('count')
        tmp.name = c + '_count'
        dfs.append(tmp)
    
    dataset_df['room_fqid'].fillna(-1)
    for c in SUBROOMLIST:
        tmp = dataset_df[dataset_df.room_fqid == c].groupby(['session_id','level_group'])['room_fqid'].agg('count')
        tmp.name = c + '_count'
        dfs.append(tmp)
    
    for c in SUBTEXTLIST:
        tmp = dataset_df[dataset_df.room_fqid == c].groupby(['session_id','level_group'])['text_fqid'].agg('count')
        tmp.name = c + '_count'
        dfs.append(tmp)
        
    dataset_df = pd.concat(dfs,axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    
    dataset_df['year'] = dataset_df["session_id"].apply(lambda x: int(str(x)[:2])).astype(np.uint8)
    dataset_df['month'] = dataset_df["session_id"].apply(lambda x: int(str(x)[2:4])+1).astype(np.uint8)
    dataset_df['day'] = dataset_df["session_id"].apply(lambda x: int(str(x)[4:6])).astype(np.uint8)
    dataset_df['hour'] = dataset_df["session_id"].apply(lambda x: int(str(x)[6:8])).astype(np.uint8)
    dataset_df['minute'] = dataset_df["session_id"].apply(lambda x: int(str(x)[8:10])).astype(np.uint8)
    dataset_df['second'] = dataset_df["session_id"].apply(lambda x: int(str(x)[10:12])).astype(np.uint8)
    
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [9]:
dataset_df = feature_engineer(dataset_df)
print("Full prepared dataset shape is {}".format(dataset_df.shape))

Full prepared dataset shape is (70686, 189)


In [10]:
def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


In [11]:
train_x.head(10)

Unnamed: 0_level_0,level_group,event_name_nunique,name_nunique,fqid_nunique,room_fqid_nunique,text_fqid_nunique,elapsed_time,level,page,room_coor_x,...,tunic.wildlife.center.wells.nodeer_recap_count,tunic.historicalsociety.cage.need_glasses_count,tunic.library.frontdesk.block_badge_count,tunic.library.frontdesk.block_nelson_count,year,month,day,hour,minute,second
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,10,3,30,7,17,85793.56,1.945455,-1.0,7.701275,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,12
20090312431273200,13-22,10,3,49,12,35,1040601.0,17.402381,-1.0,-130.347168,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,12
20090312431273200,5-12,10,3,39,11,24,357205.2,8.054054,-1.0,14.306062,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,12
20090312433251036,0-4,11,4,22,6,11,97633.42,1.870504,0.0,-84.045959,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,32
20090312433251036,13-22,11,6,73,16,43,2498852.0,17.762529,5.1,-30.762283,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,32
20090312433251036,5-12,11,4,45,11,22,554904.0,8.840506,2.142857,50.284168,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,43,32
20090312455206810,0-4,9,3,22,6,12,202398.0,1.604027,-1.0,-209.830322,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,45,52
20090312455206810,13-22,11,4,47,12,30,958643.3,17.706044,5.230769,-58.65551,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,45,52
20090312455206810,5-12,11,4,41,11,19,507395.3,8.342857,2.333333,80.422424,...,-1.0,-1.0,-1.0,-1.0,20,10,3,12,45,52
20090313091715820,0-4,11,4,24,6,14,62087.4,1.789773,0.0,-111.96273,...,-1.0,-1.0,-1.0,-1.0,20,10,3,13,9,17


In [12]:
print(train_x.dtypes.to_string())

level_group                                                                       object
event_name_nunique                                                                 int64
name_nunique                                                                       int64
fqid_nunique                                                                       int64
room_fqid_nunique                                                                  int64
text_fqid_nunique                                                                  int64
elapsed_time                                                                     float64
level                                                                            float64
page                                                                             float64
room_coor_x                                                                      float32
room_coor_y                                                                      float32
screen_coor_x        

In [13]:
xgbparam = {
    1 : {'max_depth' : 5, 'subsample' : 0.75, 'colsample_bytree' : 0.9, 'n_estimators' : 600, 'learning_rate' : 0.02},
    2 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.75, 'n_estimators' : 200, 'learning_rate' : 0.01},
    3 : {'max_depth' : 6, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 800, 'learning_rate' : 0.03},
    4 : {'max_depth' : 6, 'subsample' : 0.6, 'colsample_bytree' : 0.75, 'n_estimators' : 200, 'learning_rate' : 0.03},
    5 : {'max_depth' : 8, 'subsample' : 0.5, 'colsample_bytree' : 0.75, 'n_estimators' : 800, 'learning_rate' : 0.01},
    6 : {'max_depth' : 8, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 400, 'learning_rate' : 0.02},
    7 : {'max_depth' : 4, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 400, 'learning_rate' : 0.02},
    8 : {'max_depth' : 6, 'subsample' : 0.7, 'colsample_bytree' : 0.9, 'n_estimators' : 400, 'learning_rate' : 0.02},
    9 :  {'max_depth' : 6, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.02},
    10 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.02},
    11 : {'max_depth' : 8, 'subsample' : 0.9, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.02},
    12 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 400, 'learning_rate' : 0.02},
    13 : {'max_depth' : 8, 'subsample' : 0.9, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.04},
    14 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 600, 'learning_rate' : 0.02},
    15 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 600, 'learning_rate' : 0.02},
    16 : {'max_depth' : 5, 'subsample' : 0.75, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.02},
    17 : {'max_depth' : 5, 'subsample' : 0.9, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.04},
    18 : {'max_depth' : 5, 'subsample' : 0.6, 'colsample_bytree' : 0.9, 'n_estimators' : 200, 'learning_rate' : 0.02}
}

In [14]:
models = {}
evaluation = {}
le = LabelEncoder()
# xgboostModel = XGBClassifier(n_estimators=100, learning_rate= 0.3)
for q_no in range(1,19):
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]
    
    train_labels = le.fit_transform(train_labels["correct"])
    valid_labels = le.fit_transform(valid_labels["correct"])
    
    train_df = train_df.drop(columns=['level_group'])
    valid_df = valid_df.drop(columns=['level_group'])
    
    depths = [5, 6, 8]
    samples = [0.6, 0.75, 0.9]
    colsamples = [0.75, 0.9]
    ests = [200, 400, 600, 800]
    rates = [0.01, 0.02, 0.03, 0.04]
    
    '''progress = tqdm(total=288)
    best_score = 0
    for d in depths:
        for s in samples:
            for c in colsamples:
                for e in ests:
                    for r in rates:
                        progress.update(1)
                        xgboostModel = xgb.XGBClassifier(max_depth=d, subsample=s, colsample_bytree=c, n_estimators=e , learning_rate=r)
                        xgboostModel.fit(train_df, train_labels)

                        models[q_no] = xgboostModel
                        train_score = xgboostModel.score(train_df, train_labels)
                        valid_score = xgboostModel.score(valid_df, valid_labels)
                        if valid_score > best_score:
                            best_score = valid_score
                            print('train: ',train_score)
                            print('valid: ',valid_score)
                            print(f"current params == depth: {d} ,samples: {s} ,cosamples: {c} ,ests: {e} ,rates:{r}")'''
                        
    #for saved param
    xgboostModel = xgb.XGBClassifier()
    xgboostModel.set_params(**xgbparam[q_no])
    xgboostModel.fit(train_df, train_labels)

    models[q_no] = xgboostModel
    train_score = xgboostModel.score(train_df, train_labels)
    valid_score = xgboostModel.score(valid_df, valid_labels)
    print('train: ',train_score)
    print('valid: ',valid_score)
    evaluation[q_no] = valid_score
# print("\nAverage accuracy", sum(evaluation.values()))

### q_no 1 grp 0-4
train:  0.7911295028914
valid:  0.7405049862083598
### q_no 2 grp 0-4
train:  0.9796275664491485
valid:  0.9755994058985784
### q_no 3 grp 0-4
train:  0.9809538967584487
valid:  0.9348610227031615
### q_no 4 grp 5-12
train:  0.8503368878985622
valid:  0.8022490982389137
### q_no 5 grp 5-12
train:  0.901002705713831
valid:  0.645448758752387
### q_no 6 grp 5-12
train:  0.8932038834951457
valid:  0.7916401442817739
### q_no 7 grp 5-12
train:  0.7637540453074434
valid:  0.747719074899215
### q_no 8 grp 5-12
train:  0.7455037402514723
valid:  0.6388712072989603
### q_no 9 grp 5-12
train:  0.7821635100005305
valid:  0.7702100572883513
### q_no 10 grp 5-12
train:  0.6683113162501989
valid:  0.6206238064926798
### q_no 11 grp 5-12
train:  0.8059844023555627
valid:  0.6632718014003819
### q_no 12 grp 5-12
train:  0.8696482572019736
valid:  0.8709951198811797
### q_no 13 grp 5-12
train:  0.877765398694891
valid:  0.720560152768937
### q_no 14 grp 13-22
train:  0.7889012679717

In [15]:
for name, value in evaluation.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation.values())/18)

question 1: accuracy 0.7405
question 2: accuracy 0.9756
question 3: accuracy 0.9349
question 4: accuracy 0.8022
question 5: accuracy 0.6454
question 6: accuracy 0.7916
question 7: accuracy 0.7477
question 8: accuracy 0.6389
question 9: accuracy 0.7702
question 10: accuracy 0.6206
question 11: accuracy 0.6633
question 12: accuracy 0.8710
question 13: accuracy 0.7206
question 14: accuracy 0.7426
question 15: accuracy 0.6365
question 16: accuracy 0.7486
question 17: accuracy 0.7042
question 18: accuracy 0.9516

Average accuracy 0.7614517764104016


In [16]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
thresholds = {}
th_evaluation = {}
for q_no in range(1,19):
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values
    valid_df = valid_df.drop(columns=['level_group'])

    # Select the labels for the related q_no.
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]
    valid_labels = le.fit_transform(valid_labels["correct"])
    
    pred = models[q_no].predict_proba(valid_df)
    pred = pred[:, -1]
    
    best_score = 0
    best_threshold = 0
    for threshold in np.arange(0.4,0.6,0.01):
        valid_pred = (pred > threshold).astype(int)
        score = accuracy_score(valid_labels, valid_pred)
        if score > best_score: 
            best_score = score
            best_threshold = threshold
            th_evaluation[q_no] = accuracy_score(valid_labels, valid_pred)
    thresholds[q_no] = best_threshold
    print(best_threshold)

### q_no 1 grp 0-4
0.5200000000000001
### q_no 2 grp 0-4
0.4
### q_no 3 grp 0-4
0.5200000000000001
### q_no 4 grp 5-12
0.4800000000000001
### q_no 5 grp 5-12
0.5000000000000001
### q_no 6 grp 5-12
0.43000000000000005
### q_no 7 grp 5-12
0.45000000000000007
### q_no 8 grp 5-12
0.4800000000000001
### q_no 9 grp 5-12
0.5200000000000001
### q_no 10 grp 5-12
0.45000000000000007
### q_no 11 grp 5-12
0.4700000000000001
### q_no 12 grp 5-12
0.4900000000000001
### q_no 13 grp 5-12
0.5700000000000002
### q_no 14 grp 13-22
0.4800000000000001
### q_no 15 grp 13-22
0.4800000000000001
### q_no 16 grp 13-22
0.4
### q_no 17 grp 13-22
0.4800000000000001
### q_no 18 grp 13-22
0.4


In [17]:
for name, value in th_evaluation.items():
  print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(th_evaluation.values())/18)

question 1: accuracy 0.7411
question 2: accuracy 0.9756
question 3: accuracy 0.9351
question 4: accuracy 0.8042
question 5: accuracy 0.6454
question 6: accuracy 0.7929
question 7: accuracy 0.7492
question 8: accuracy 0.6421
question 9: accuracy 0.7734
question 10: accuracy 0.6227
question 11: accuracy 0.6694
question 12: accuracy 0.8710
question 13: accuracy 0.7237
question 14: accuracy 0.7428
question 15: accuracy 0.6416
question 16: accuracy 0.7494
question 17: accuracy 0.7046
question 18: accuracy 0.9516

Average accuracy 0.7631138458636868


In [18]:
import jo_wilder_310
env = jo_wilder_310.make_env()
iter_test = env.iter_test()

In [19]:
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test = test.sort_values(by = 'index')
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        xgbm = models[t]
        test_ds = test_df.drop(columns=['level_group'])
        pred = xgbm.predict_proba(test_ds)[:, -1]
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask,'correct'] = (pred > thresholds[t]).astype(int)
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [20]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,0
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,1
20090109393214576_q9,1
