In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import gc
import math
import csv
from pathlib import Path
import tensorflow as tf
from collections import OrderedDict

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

pd.set_option('display.max_rows', 20)

/kaggle/input/predict-student-performance-from-game-play/sample_submission.csv
/kaggle/input/predict-student-performance-from-game-play/train_labels.csv
/kaggle/input/predict-student-performance-from-game-play/train.csv
/kaggle/input/predict-student-performance-from-game-play/test.csv
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/predict-student-performance-from-game-play/jo_wilder/__init__.py


In [2]:
CATS = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
NUMS = ['elapsed_time','level','room_coor_x', 'room_coor_y', 
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

def feature_engineer(train):
    dfs = []

    for c in CATS:
        tmp = train.groupby('session_id', 'level_group', maintain_order=True).agg(pl.n_unique(c))
        tmp.columns = [tmp.columns[0], tmp.columns[1], str(tmp.columns[2]) + '_nunique']
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby('session_id', 'level_group', maintain_order=True).agg(pl.mean(c))
        tmp.columns = [tmp.columns[0], tmp.columns[1], str(tmp.columns[2]) + '_mean']
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby('session_id', 'level_group', maintain_order=True).agg(pl.std(c))
        tmp.columns = [tmp.columns[0], tmp.columns[1], str(tmp.columns[2]) + '_std']
        dfs.append(tmp)
    
    specials = dfs[0].select(['session_id', 'level_group'])

    for i in range(0, len(dfs)):
        dfs[i] = dfs[i].drop('session_id', 'level_group')
        
    df = pl.concat(dfs, how='horizontal')
    df = pl.concat([specials, df], how='horizontal')
    df = df.fill_nan(-1)
    df = df.fill_null(-1)
    
    return df

In [3]:
Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
Numeric_Float_types = [pl.Float32,pl.Float64]

def reduce_memory_usage_pl(df):
    
    start_mem = df.estimated_size("mb")
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    # pl.Uint8,pl.UInt16,pl.UInt32,pl.UInt64
    
    for col in df.columns:
        col_type = df[col].dtype
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type in Numeric_Int_types:
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df = df.with_columns(df[col].cast(pl.Int8))
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df = df.with_columns(df[col].cast(pl.Int16))
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df = df.with_columns(df[col].cast(pl.Int32))
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df = df.with_columns(df[col].cast(pl.Int64))
        elif col_type in Numeric_Float_types:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df = df.with_columns(df[col].cast(pl.Float32))
            else:
                pass
        elif col_type == pl.Utf8:
            df = df.with_columns(df[col].cast(pl.Categorical))
        else:
            pass
    mem_usg = df.estimated_size("mb")
    print("Memory usage became: ",mem_usg," MB")
    
    return df

In [4]:
dtypes = {"session_id": pl.Int64,
          "elapsed_time": pl.Int64,
          "event_name": pl.Categorical,
          "name": pl.Categorical,
          "level": pl.Int8,
          "page": pl.Categorical,
          "room_coor_x": pl.Float32,
          "room_coor_y": pl.Float32,
          "screen_coor_x": pl.Float32,
          "screen_coor_y": pl.Float32,
          "hover_duration": pl.Float32,
          "text": pl.Categorical,
          "fqid": pl.Categorical,
          "room_fqid": pl.Categorical,
          "text_fqid": pl.Categorical,
          "fullscreen": pl.Categorical,
          "hq": pl.Categorical,
          "music": pl.Categorical,
          "level_group": pl.Categorical
          }

In [5]:
#read data

print('starting step: reading data')

train_data = pl.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', low_memory=True, dtypes=dtypes)
train_labels = pl.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv', low_memory=True)

starting step: reading data


In [6]:
train_data = reduce_memory_usage_pl(train_data)
train_labels = reduce_memory_usage_pl(train_labels)

Memory usage of dataframe is 2260.26 MB
Memory usage became:  2009.4763889312744  MB
Memory usage of dataframe is 14.76 MB
Memory usage became:  13.549703598022461  MB


In [7]:
#"flatten" the training labels

print('starting step: flattening')

pl_attributes = [*['session_id'], *('inp'+str(i) for i in range(1, 19))]#, *['sum']]
processed_labels = pd.DataFrame({item: [] for item in pl_attributes})

LABEL_CASES = int(len(train_labels)/18)
for i in range(LABEL_CASES):
     processed_labels.loc[len(processed_labels)] = [*[train_labels[i, ['session_id']].item().split('_')[0]], 
                                                  *(train_labels[i+LABEL_CASES*j, ['correct']].item() for j in range(18))]

starting step: flattening


In [8]:
print(processed_labels)

              session_id  inp1  inp2  inp3  inp4  inp5  inp6  inp7  inp8  \
0      20090312431273200   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   
1      20090312433251036   0.0   1.0   1.0   1.0   0.0   1.0   1.0   0.0   
2      20090312455206810   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   
3      20090313091715820   0.0   1.0   1.0   1.0   1.0   0.0   1.0   1.0   
4      20090313571836404   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   
...                  ...   ...   ...   ...   ...   ...   ...   ...   ...   
23557  22100215342220508   1.0   1.0   1.0   1.0   1.0   1.0   1.0   0.0   
23558  22100215460321130   0.0   1.0   1.0   1.0   0.0   1.0   1.0   0.0   
23559  22100217104993650   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   
23560  22100219442786200   0.0   1.0   1.0   1.0   1.0   1.0   1.0   0.0   
23561  22100221145014656   0.0   1.0   0.0   1.0   0.0   0.0   0.0   0.0   

       inp9  inp10  inp11  inp12  inp13  inp14  inp15  inp16  inp17  inp18  
0       1.

In [9]:
print('starting step: making train/test data (x)')

x_train = feature_engineer(train_data)

starting step: making train/test data (x)


In [10]:
print('starting step: making train/test data (y)')
y_train = []
pl_dropped = processed_labels.drop(['session_id'], axis=1)
for i in range(1, 19):
    applist = []
    for index in list(pl_dropped['inp'+str(i)].index.values):
        applist.append(pl_dropped.loc[index, ('inp'+str(i))])
    ins = pd.DataFrame({'column':applist})
    y_train.append(ins)

starting step: making train/test data (y)


In [11]:
x_train.drop('session_id')

level_group,event_name_nunique,name_nunique,fqid_nunique,room_fqid_nunique,text_fqid_nunique,elapsed_time_mean,level_mean,room_coor_x_mean,room_coor_y_mean,screen_coor_x_mean,screen_coor_y_mean,hover_duration_mean,elapsed_time_std,level_std,room_coor_x_std,room_coor_y_std,screen_coor_x_std,screen_coor_y_std,hover_duration_std
cat,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32
"""0-4""",10,3,31,7,18,85793.557576,1.945455,7.70128,-71.413756,448.410256,383.044872,2389.5,49246.539458,1.230975,399.296051,129.292404,214.871002,104.082741,3227.37085
"""5-12""",10,3,40,11,25,357205.189189,8.054054,14.30606,-57.269332,451.950943,378.784906,969.333333,80175.676658,2.096919,357.227692,137.409485,203.268555,120.255447,1316.408325
"""13-22""",10,3,50,12,36,1.0406e6,17.402381,-130.347178,-162.004385,442.489796,379.30102,899.925926,126666.129584,2.358652,622.06134,230.370865,240.280212,99.067856,1305.088257
"""0-4""",11,4,23,6,12,97633.417266,1.870504,-84.045974,-53.671075,358.223077,370.723077,1378.75,67372.714092,1.232616,445.980042,156.186249,252.554718,121.062927,2114.876465
"""5-12""",11,4,46,11,23,554904.037975,8.840506,50.284162,-53.836173,470.819277,375.771084,824.096774,159319.587112,2.134412,377.074066,160.557098,232.280899,132.195572,1836.236206
"""13-22""",11,6,74,16,44,2.4989e6,17.762529,-30.762316,-142.862189,462.85249,387.930077,720.384921,777382.529186,1.825923,529.575684,234.279587,259.288849,133.345688,1990.705566
"""0-4""",9,3,23,6,13,202398.020134,1.604027,-209.830253,-31.12503,373.365517,481.268966,3145.666667,121848.824074,1.223745,478.652374,161.438385,459.750366,177.665588,4575.148438
"""5-12""",11,4,42,11,20,507395.257143,8.342857,80.422413,-51.599345,714.834862,510.330275,783.038462,76156.386207,2.181517,363.422546,180.862289,346.987549,228.421677,825.02948
"""13-22""",11,4,48,12,31,958643.271978,17.706044,-58.655533,-142.360854,639.0,521.446746,1055.32,125153.779971,2.462695,642.936279,243.042709,395.599976,181.509094,1403.679565
"""0-4""",11,4,25,6,15,62087.397727,1.789773,-111.962716,-76.698108,431.910714,441.47619,1917.142857,40315.603807,1.221759,422.766418,161.696457,290.932892,150.691742,1949.073608


In [12]:
x_train = x_train.to_pandas()

In [13]:
print('starting step: training')
models = []

for i in range(0, 18):
    print('starting step: training #' + str(i))
    
    if i < 4: 
        grp = '0-4'
    elif i < 14: 
        grp = '5-12'
    else: 
        grp = '13-22'
        
    subframe = x_train.loc[x_train.level_group == grp]
    subframe = subframe.reset_index()
    subframe = subframe.set_index('session_id')
    subframe = subframe.drop(['index'], axis=1)
    
    subframe.drop(['level_group'], axis=1, inplace=True)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(subframe.shape[1],)))
    model.add(tf.keras.layers.Dense(units=64, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(units=64, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(units=64, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(units=1, activation=tf.nn.sigmoid))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(subframe, y_train[i], epochs=3)

    accuracy, loss = model.evaluate(subframe, y_train[i])
    print(accuracy)
    print(loss)
    
    models.append(model)

starting step: training
starting step: training #0
Epoch 1/3
Epoch 2/3
Epoch 3/3
13842.71875
0.2726424038410187
starting step: training #1
Epoch 1/3
Epoch 2/3
Epoch 3/3
1089.5966796875
0.975638747215271
starting step: training #2
Epoch 1/3
Epoch 2/3
Epoch 3/3
1436.5858154296875
0.9334097504615784
starting step: training #3
Epoch 1/3
Epoch 2/3
Epoch 3/3
753.4924926757812
0.7547746300697327
starting step: training #4
Epoch 1/3
Epoch 2/3
Epoch 3/3
5.605336666107178
0.5482556819915771
starting step: training #5
Epoch 1/3
Epoch 2/3
Epoch 3/3
2441.07373046875
0.7686953544616699
starting step: training #6
Epoch 1/3
Epoch 2/3
Epoch 3/3
202.92901611328125
0.7355062961578369
starting step: training #7
Epoch 1/3
Epoch 2/3
Epoch 3/3
12128.013671875
0.6168830990791321
starting step: training #8
Epoch 1/3
Epoch 2/3
Epoch 3/3
344.0921936035156
0.2641116976737976
starting step: training #9
Epoch 1/3
Epoch 2/3
Epoch 3/3
874.5117797851562
0.49363380670547485
starting step: training #10
Epoch 1/3
Epoch 2

In [14]:
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [15]:
def determine(val):
    if val > 0.55:
        val = 1
    else:
        val = 0
        
    return val

In [16]:
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, samplesub) in iter_test:
    if len(test) > 0:
        try:
            group = test.level_group.values[0]

            df = feature_engineer(pl.from_pandas(test))
            df.drop('session_id')

            df = df.to_pandas()
            df = df.reset_index()
            df = df.set_index('session_id')
            df = df.drop(['index'], axis=1)
            df = df.fillna(-1)

            df.drop(['level_group'], axis=1, inplace=True)

            bot, top = limits[group]

            for t in range(bot, top):
                currentmodel = models[t-1]
                if prediciton.length > 0:
                    prediction = currentmodel.predict(df)[0]
                    mask = samplesub.session_id.str.contains(f'q{t}')
                    samplesub.loc[mask,'correct'] = determine(prediction.item())
                else:
                    mask = samplesub.session_id.str.contains(f'q{t}')
                    samplesub.loc[mask,'correct'] = 1
        except:
            pass
    
    print(samplesub)  
    env.predict(samplesub)
        

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
               session_id  correct
0    20090109393214576_q1        0
1    20090312143683264_q1        0
2    20090312331414616_q1        0
3    20090109393214576_q2        0
4    20090312143683264_q2        0
..                    ...      ...
49  20090312143683264_q17        0
50  20090312331414616_q17        0
51  20090109393214576_q18        0
52  20090312143683264_q18        0
53  20090312331414616_q18        0

[54 rows x 2 columns]


In [17]:
print("starting step: reading output")

submission_df = pd.read_csv('submission.csv')
print(submission_df.shape)
print(submission_df)

starting step: reading output
(54, 2)
               session_id  correct
0    20090109393214576_q1        0
1    20090312143683264_q1        0
2    20090312331414616_q1        0
3    20090109393214576_q2        0
4    20090312143683264_q2        0
..                    ...      ...
49  20090312143683264_q17        0
50  20090312331414616_q17        0
51  20090109393214576_q18        0
52  20090312143683264_q18        0
53  20090312331414616_q18        0

[54 rows x 2 columns]


### 