In [1]:
import pandas as pd, numpy as np, gc
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [2]:
# READ USER ID ONLY
tmp = pd.read_csv("/kaggle/input/predict-student-performance-from-game-play/train.csv",usecols=[0])
tmp = tmp.groupby('session_id').session_id.agg('count')

# COMPUTE READS AND SKIPS
PIECES = 10
CHUNK = int( np.ceil(len(tmp)/PIECES) )

reads = []
skips = [0]
for k in range(PIECES):
    a = k*CHUNK
    b = (k+1)*CHUNK
    if b>len(tmp): b=len(tmp)
    r = tmp.iloc[a:b].sum()
    reads.append(r)
    skips.append(skips[-1]+r)
    
print(f'To avoid memory error, we will read train in {PIECES} pieces of sizes:')
print(reads)

To avoid memory error, we will read train in 10 pieces of sizes:
[2684191, 2631991, 2638304, 2657670, 2644229, 2629801, 2596616, 2602258, 2619995, 2591891]


In [3]:
train = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', nrows=reads[0])
print('Train size of first piece:', train.shape )
train.head()

Train size of first piece: (2684191, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [4]:
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]) )
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )
print( targets.shape )
targets.head()

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [5]:
CATS = ['event_name', 'fqid', 'room_fqid', 'text']
NUMS = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

# https://www.kaggle.com/code/kimtaehun/lightgbm-baseline-with-aggregated-log-data
EVENTS = ['navigate_click','person_click','cutscene_click','object_click',
          'map_hover','notification_click','map_click','observation_click',
          'checkpoint']

In [6]:
def feature_engineer(train):
    
    dfs = []
    for c in CATS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id','level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in EVENTS: 
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id','level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS,axis=1)
        
    df = pd.concat(dfs,axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [7]:
%%time

# PROCESS TRAIN DATA IN PIECES
all_pieces = []
print(f'Processing train as {PIECES} pieces to avoid memory error... ')
for k in range(PIECES):
    print(k,', ',end='')
    SKIPS = 0
    if k>0: SKIPS = range(1,skips[k]+1)
    train = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv',
                        nrows=reads[k], skiprows=SKIPS)
    df = feature_engineer(train)
    all_pieces.append(df)
    
# CONCATENATE ALL PIECES
print('\n')
del train; gc.collect()
df = pd.concat(all_pieces, axis=0)
print('Shape of all train data after feature engineering:', df.shape )
df.head()

Processing train as 10 pieces to avoid memory error... 
0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 

Shape of all train data after feature engineering: (70686, 31)
CPU times: user 3min 53s, sys: 8.75 s, total: 4min 2s
Wall time: 4min 3s


Unnamed: 0_level_0,level_group,event_name_nunique,fqid_nunique,room_fqid_nunique,text_nunique,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,map_hover_sum,notification_click_sum,map_click_sum,observation_click_sum,checkpoint_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,10,30,7,56,85793.56,1.945455,-1.0,7.701275,-71.413749,...,81.0,22.0,28.0,11.0,4.0,8,2.0,4,1,14155937
20090312431273200,13-22,10,49,12,168,1040601.0,17.402381,-1.0,-130.34717,-162.00431,...,170.0,123.0,60.0,20.0,14.0,10,6.0,3,1,437052322
20090312431273200,5-12,10,39,11,124,357205.2,8.054054,-1.0,14.306062,-57.269322,...,103.0,104.0,12.0,28.0,9.0,9,8.0,1,1,105732736
20090312433251036,0-4,11,22,6,49,97633.42,1.870504,0.0,-84.04596,-53.671082,...,49.0,18.0,36.0,15.0,3.0,5,3.0,2,1,13571045
20090312433251036,13-22,11,73,16,183,2498852.0,17.762529,5.1,-30.762282,-142.861892,...,637.0,145.0,65.0,83.0,186.0,14,45.0,5,1,3241011333


In [8]:
FEATURES = [c for c in df.columns if c != 'level_group']
print('We will train with', len(FEATURES) ,'features')
ALL_USERS = df.index.unique()
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 30 features
We will train with 23562 users info


In [52]:
param_bounds = {'max_depth': (4, 8),
                'subsample': (0.6, 0.9),
                'colsample_bytree': (0.7, 1.0),
                'min_child_weight': (5, 7),
                'gamma': (8, 11),
                'reg_alpha': (7, 9),
                'reg_lambda': (1.1, 1.5),
                'scale_pos_weight': (1.4, 1.6)}

fixed_params = {'objective': 'binary:logistic',
                'learning_rate': 0.02}

In [53]:
def xgb_f1_score(y, t, threshold=0.5):
    #t = t.get_label()
    y_bin = [1. if y_cont > threshold else 0. for y_cont in y] # binarizing your output
    return 'f1',f1_score(t,y_bin, average='micro')

In [54]:
def eval_function(max_depth, subsample, colsample_bytree, min_child_weight, reg_alpha, gamma, reg_lambda, scale_pos_weight):
    params = {'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'min_child_weight': min_child_weight,
              'gamma': gamma,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'scale_pos_weight': scale_pos_weight}
    
    params.update(fixed_params)
    
    print('하이퍼파라미터 : ', params)
    
    xgb_model = xgb.train(params=params,
                          dtrain=bayes_dtrain,
                          num_boost_round=2000,
                          evals=[(bayes_dvalid, 'bayes_dvalid')],
                          maximize=True,
                          #feval=xgb_f1_score,
                          early_stopping_rounds=200,
                          # verbose_eval=False
                          )
    #xgb_model.fit(train_x[FEATURES].astype('float32'), train_y,
    #            eval_set=[ (X_valid[FEATURES].astype('float32'), y_valid)], verbose=0)
    best_iter = xgb_model.best_iteration
    preds = xgb_model.predict(bayes_dvalid, iteration_range=(0, best_iter))
    preds_binary = np.where(preds > 0.5, 1 , 0)
    # 이진 분류 문제일 경우 average='micro' 추가
    val_f1_score = f1_score(y_valid, preds_binary, average='micro')
    print(f'F1 점수 : {val_f1_score:.4f}')
    
    return val_f1_score

In [56]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
import xgboost as xgb


level_max_params = []

for t in range(1,19):
        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        elif t<=22: grp = '13-22'
        
        train_x = df[df['level_group'] == grp][FEATURES]
        train_users = train_x.index.values
        train_y = targets.loc[targets['q'] == t].set_index('session').loc[train_users]['correct']

        X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.3)


        # 베이지안 최적화용 데이터 셋
        bayes_dtrain = xgb.DMatrix(X_train, y_train)
        bayes_dvalid = xgb.DMatrix(X_valid, y_valid)
        
        optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds)

        optimizer.maximize(init_points=3, n_iter=10)
        
        max_params = optimizer.max['params']
        level_max_params.append(max_params)
        

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------
하이퍼파라미터 :  {'max_depth': 8, 'subsample': 0.7789944118621259, 'colsample_bytree': 0.8812192656909265, 'min_child_weight': 6.183273400100628, 'gamma': 10.881401381557676, 'reg_alpha': 8.91618095892541, 'reg_lambda': 1.2991199089973016, 'scale_pos_weight': 1.4255817129665649, 'objective': 'binary:logistic', 'learning_rate': 0.02}
[0]	bayes_dvalid-logloss:0.68730
[1]	bayes_dvalid-logloss:0.68187
[2]	bayes_dvalid-logloss:0.67653
[3]	bayes_dvalid-logloss:0.67166
[4]	bayes_dvalid-logloss:0.66685
[5]	bayes_dvalid-logloss:0.66223
[6]	bayes_dvalid-logloss:0.65786
[7]	bayes_dvalid-logloss:0.65358
[8]	bayes_dvalid-logloss:0.64943
[9]	bayes_dvalid-logloss:0.64547
[10]	bayes_dvalid-logloss:0.64169
[11]	bayes_dvalid-logloss:0.63805
[12]	bayes_dvalid-logloss:0.6

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[84]	bayes_dvalid-logloss:0.40017
[85]	bayes_dvalid-logloss:0.39981
[86]	bayes_dvalid-logloss:0.39946
[87]	bayes_dvalid-logloss:0.39912
[88]	bayes_dvalid-logloss:0.39883
[89]	bayes_dvalid-logloss:0.39855
[90]	bayes_dvalid-logloss:0.39827
[91]	bayes_dvalid-logloss:0.39802
[92]	bayes_dvalid-logloss:0.39776
[93]	bayes_dvalid-logloss:0.39753
[94]	bayes_dvalid-logloss:0.39733
[95]	bayes_dvalid-logloss:0.39713
[96]	bayes_dvalid-logloss:0.39692
[97]	bayes_dvalid-logloss:0.39676
[98]	bayes_dvalid-logloss:0.39656
[99]	bayes_dvalid-logloss:0.39642
[100]	bayes_dvalid-logloss:0.39625
[101]	bayes_dvalid-logloss:0.39610
[102]	bayes_dvalid-logloss:0.39597
[103]	bayes_dvalid-logloss:0.39585
[104]	bayes_dvalid-logloss:0.39575
[105]	bayes_dvalid-logloss:0.39564
[106]	bayes_dvalid-logloss:0.39556
[107]	bayes_dvalid-logloss:0.39549
[108]	bayes_dvalid-logloss:0.39541
[109]	bayes_dvalid-logloss:0.39534
[110]	bayes_dvalid-logloss:0.39530
[111]	bayes_dvalid-logloss:0.39524
[112]	bayes_dvalid-logloss:0.39517
[

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[10]	bayes_dvalid-logloss:0.65892
[11]	bayes_dvalid-logloss:0.65665
[12]	bayes_dvalid-logloss:0.65447
[13]	bayes_dvalid-logloss:0.65237
[14]	bayes_dvalid-logloss:0.65038
[15]	bayes_dvalid-logloss:0.64852
[16]	bayes_dvalid-logloss:0.64671
[17]	bayes_dvalid-logloss:0.64498
[18]	bayes_dvalid-logloss:0.64333
[19]	bayes_dvalid-logloss:0.64179
[20]	bayes_dvalid-logloss:0.64029
[21]	bayes_dvalid-logloss:0.63890
[22]	bayes_dvalid-logloss:0.63757
[23]	bayes_dvalid-logloss:0.63633
[24]	bayes_dvalid-logloss:0.63513
[25]	bayes_dvalid-logloss:0.63400
[26]	bayes_dvalid-logloss:0.63297
[27]	bayes_dvalid-logloss:0.63199
[28]	bayes_dvalid-logloss:0.63101
[29]	bayes_dvalid-logloss:0.63008
[30]	bayes_dvalid-logloss:0.62927
[31]	bayes_dvalid-logloss:0.62848
[32]	bayes_dvalid-logloss:0.62772
[33]	bayes_dvalid-logloss:0.62698
[34]	bayes_dvalid-logloss:0.62629
[35]	bayes_dvalid-logloss:0.62562
[36]	bayes_dvalid-logloss:0.62501
[37]	bayes_dvalid-logloss:0.62447
[38]	bayes_dvalid-logloss:0.62394
[39]	bayes_dva

In [None]:
level_max_params