In [1]:
## This Python 3 environment comes with many helpful analytics libraries installed
## It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
## For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Input data files are available in the read-only "../input/" directory
## For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

## You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
## You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

A Self-Attentive model for Knowledge Tracing (SAKT; Pandey & Karypis, 2019)
https://arxiv.org/pdf/1907.06837.pdf

In [2]:
import ast
import copy
import gc
import os
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [3]:
%%time
dtype = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', usecols=[1,2,3,4,7], dtype=dtype)
train_df.head()

CPU times: user 1min 9s, sys: 4.31 s, total: 1min 14s
Wall time: 1min 58s


Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly
0,0,115,5692,0,1
1,56943,115,5716,0,1
2,118363,115,128,0,1
3,131167,115,7860,0,1
4,137965,115,7922,0,1


In [4]:
## content_type_id: (int8)
## 0 if the event was a question being posed to the user,
## 1 if the event was the user watching a lecture.
## Only take the rows with content_type_id = 0 (compare with the integer 0, the column is int8, not bool)
train_df = train_df[train_df.content_type_id == 0]

## Arrange by timestamp.
## mergesort is stable: rows sharing the same timestamp keep their original file order,
## which the default (quicksort) does not guarantee.
train_df = train_df.sort_values('timestamp', ascending=True, kind='mergesort').reset_index(drop=True)

In [5]:
## Every distinct content_id left in train_df counts as one "skill" (question)
skills = train_df["content_id"].unique()
n_skill = skills.shape[0]
print("number of skills", n_skill)

number of skills 13523


In [6]:
## Collapse each user's rows into one (content_id array, answered_correctly array) tuple;
## the result is indexed by user_id and rows keep the order they have in train_df
group = (
    train_df[['user_id', 'content_id', 'answered_correctly']]
    .groupby('user_id')
    .apply(lambda user_rows: (user_rows['content_id'].values,
                              user_rows['answered_correctly'].values))
)

In [7]:
class SAKTDataset(Dataset):
    """Per-user training sequences for the SAKT model.

    Each item holds one user's most recent ``max_seq`` interactions, left-padded with
    zeros, as three arrays of length ``max_seq - 1``: the interaction input ``x``,
    the id of the next question (``target_id``) and whether it was answered correctly
    (``label``).

    Args:
        group: object exposing ``.index`` (user ids) where ``group[user_id]`` yields a
            ``(question_ids, answered_correctly)`` pair of arrays for that user.
        n_skill: number of distinct questions/exercises (E in the paper); used as the
            offset that marks a correctly answered question.
        max_seq: the maximum sequence length the model can handle.

    NOTE(review): 0 is used as the padding value; if 0 is also a valid question id it
    cannot be told apart from padding here -- confirm against the data.
    """

    ## users that answered fewer questions than this are left out of the dataset
    MIN_INTERACTIONS = 10

    def __init__(self, group, n_skill, max_seq=100):
        super(SAKTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = group

        ## keep only users with enough history
        self.user_ids = [
            user_id for user_id in group.index
            if len(group[user_id][0]) >= self.MIN_INTERACTIONS
        ]

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        ## q_ = questions the user answered, qa_ = correctness of each answer
        q_, qa_ = self.samples[user_id]

        ## keep at most the max_seq most recent interactions
        q_ = np.asarray(q_)[-self.max_seq:]
        qa_ = np.asarray(qa_)[-self.max_seq:]
        seq_len = len(q_)

        ## right-align the history inside zero-padded buffers of length max_seq
        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len > 0:  ## q[-0:] would address the whole buffer, so guard the empty case
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_

        ## Position t predicts the answer to question t+1 from interactions up to t,
        ## so the first question is never a target: nothing precedes it.
        target_id = q[1:]
        label = qa[1:]

        ## From paper:
        ## The interaction tuple xt = (et, rt) is presented to the model as a number
        ## yt = et + rt × E, where E is the total number of questions/exercises.
        x = q[:-1] + (qa[:-1] == 1) * self.n_skill

        return x, target_id, label

In [8]:
## Split users 80/20 into train and val.
## A fixed random_state makes the split identical on every run.
train, val = train_test_split(group, test_size=0.2, random_state=42)

train_dataset = SAKTDataset(train, n_skill)
train_dataloader = DataLoader(train_dataset, batch_size=2048, shuffle=True, num_workers=8)
## the dataset keeps its own reference to the data; this only drops the notebook-level name
del train

val_dataset = SAKTDataset(val, n_skill)
## no shuffling for validation: batch order is irrelevant when only evaluating
val_dataloader = DataLoader(val_dataset, batch_size=2048, shuffle=False, num_workers=8)
del val

# Define model

In [9]:
class FFN(nn.Module):
    """Feed-forward block applied to the attention output.

    Computes F = ReLU(S W1 + b1) W2 + b2, where S is the multi-head attention output,
    and then applies dropout (p = 0.2) to F. Input and output both have
    ``state_size`` features.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        ## two same-width linear maps with a ReLU in between
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)

        ## regularisation: zeroes elements with probability 0.2 while training
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a (seq_length, seq_length) boolean tensor that is True strictly above the diagonal.

    Entry [i, j] is True when j > i, i.e. when position j lies in the future of
    position i, so that only previous questions are used to predict the current answer.
    """
    positions = torch.arange(seq_length)
    ## row index i along dim 0, column index j along dim 1
    return positions.unsqueeze(0) > positions.unsqueeze(1)


class SAKTModel(nn.Module):
    """Self-attentive knowledge tracing network with one attention block.

    Queries come from the embedded target questions; keys and values come from the
    embedded past interactions plus a learned position embedding. The attention
    output passes through a residual + LayerNorm, the feed-forward block, a second
    residual + LayerNorm (the same LayerNorm module is used both times), and a final
    linear layer that yields one logit per position.

    Args:
        n_skill: number of distinct questions/exercises (E).
        max_seq: maximum sequence length; inputs have ``max_seq - 1`` positions.
        embed_dim: dimension of every embedding vector.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128):
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        ## An interaction (et, rt) is encoded as yt = et + rt × E, so the interaction
        ## vocabulary has 2E values (+1 spare row), the exercise vocabulary E (+1).
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)
        ## one learned vector per input position, to encode the order of the sequence
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)

        ## multi-head attention with 8 heads
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        ## registered but not applied in forward()
        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return ``(logits, attention_weights)`` for a batch.

        ``x`` and ``question_ids`` are integer tensors laid out as [bs, s_len];
        ``logits`` comes back as [bs, s_len].
        """
        device = x.device
        seq_len = x.size(1)

        ## the i-th position vector is added to the i-th interaction embedding
        positions = torch.arange(seq_len, device=device).unsqueeze(0)
        interactions = self.embedding(x) + self.pos_embedding(positions)
        exercises = self.e_embedding(question_ids)

        ## [bs, s_len, embed] => [s_len, bs, embed]
        keys_values = interactions.permute(1, 0, 2)
        queries = exercises.permute(1, 0, 2)

        ## query = exercise embedding, key/value = interaction embedding, future positions masked
        causal_mask = future_mask(seq_len).to(device)
        att_output, att_weight = self.multi_att(queries, keys_values, keys_values, attn_mask=causal_mask)

        ## residual + normalisation, then back to [bs, s_len, embed]
        att_output = self.layer_normal(att_output + queries).permute(1, 0, 2)

        ## feed-forward, residual + normalisation, one logit per position
        hidden = self.layer_normal(self.ffn(att_output) + att_output)
        logits = self.pred(hidden)

        return logits.squeeze(-1), att_weight

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Build the network on the selected device, then hand its parameters to the optimizer
model = SAKTModel(n_skill, embed_dim=128).to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

## Binary cross-entropy on raw logits; the model output is not passed through a sigmoid first
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

In [11]:
def train_epoch(model, train_iterator, optim, criterion, device="cpu"):
    """Run one optimisation pass over ``train_iterator``.

    Each batch is an indexable of (x, target_id, label). Positions whose target_id
    is 0 are excluded from the loss and from the metrics.

    Returns:
        (loss, acc, auc): mean of the per-batch losses, accuracy at a 0.5 probability
        threshold, and ROC AUC computed from the raw model outputs.
    """
    model.train()

    batch_losses = []
    correct_count = 0
    seen_count = 0
    all_labels = []
    all_outputs = []

    progress = tqdm(train_iterator)
    for batch in progress:
        x = batch[0].to(device).long()
        target_id = batch[1].to(device).long()
        label = batch[2].to(device).float()
        ## positions with target_id == 0 are ignored
        keep = target_id != 0

        ## gradients accumulate across backward passes, so clear them first
        optim.zero_grad()
        output, _ = model(x, target_id)

        output = output.masked_select(keep)
        label = label.masked_select(keep)

        loss = criterion(output, label)
        loss.backward()
        optim.step()
        loss_value = loss.item()
        batch_losses.append(loss_value)

        ## sigmoid(output) >= 0.5 means "predicted correct"
        predicted = (torch.sigmoid(output) >= 0.5).long()
        correct_count += (predicted == label).sum().item()
        seen_count += len(label)

        all_labels.extend(label.view(-1).detach().cpu().numpy())
        all_outputs.extend(output.view(-1).detach().cpu().numpy())

        progress.set_description('loss - {:.4f}'.format(loss_value))

    acc = correct_count / seen_count
    auc = roc_auc_score(all_labels, all_outputs)
    loss = np.average(batch_losses)

    return loss, acc, auc

In [12]:
#test_sigmoid = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
#test_sigmoid = torch.FloatTensor(test_sigmoid)
#print(torch.sigmoid(test_sigmoid))

In [13]:
def val_epoch(model, val_iterator, criterion, device="cpu"):
    """Evaluate ``model`` on ``val_iterator`` without updating any weights.

    Each batch is an indexable of (x, target_id, label). Positions whose target_id
    is 0 are excluded from the loss and from the metrics.

    Returns:
        (loss, acc, auc): mean of the per-batch losses, accuracy at a 0.5 probability
        threshold, and ROC AUC computed from the raw model outputs.
    """
    model.eval()

    batch_losses = []
    correct_count = 0
    seen_count = 0
    all_labels = []
    all_outputs = []

    progress = tqdm(val_iterator)
    for batch in progress:
        x = batch[0].to(device).long()
        target_id = batch[1].to(device).long()
        label = batch[2].to(device).float()
        keep = target_id != 0

        ## forward pass only; no gradient bookkeeping needed
        with torch.no_grad():
            output, _ = model(x, target_id)

        output = output.masked_select(keep)
        label = label.masked_select(keep)

        loss = criterion(output, label)
        loss_value = loss.item()
        batch_losses.append(loss_value)

        predicted = (torch.sigmoid(output) >= 0.5).long()
        correct_count += (predicted == label).sum().item()
        seen_count += len(label)

        all_labels.extend(label.view(-1).detach().cpu().numpy())
        all_outputs.extend(output.view(-1).detach().cpu().numpy())

        progress.set_description('loss - {:.4f}'.format(loss_value))

    acc = correct_count / seen_count
    auc = roc_auc_score(all_labels, all_outputs)
    loss = np.average(batch_losses)

    return loss, acc, auc

In [14]:
%%time
epochs = 20

over_fit = 0
last_auc = 0
for epoch in range(epochs):
    train_loss, train_acc, train_auc = train_epoch(model, train_dataloader, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, train_loss, train_acc, train_auc))
    
    val_loss, avl_acc, val_auc = val_epoch(model, val_dataloader, criterion, device)
    print("epoch - {} val_loss - {:.2f} acc - {:.3f} auc - {:.3f}".format(epoch, val_loss, avl_acc, val_auc))
    
    if val_auc > last_auc:
        last_auc = val_auc
        over_fit = 0
    else:
        over_fit += 1
        
    
    if over_fit >= 2:
        print("early stop epoch ", epoch)
        break

## epoch - 16 val_loss - 0.57 acc - 0.702 auc - 0.759
## early stop epoch  16
## CPU times: user 16min 22s, sys: 59.1 s, total: 17min 21s
## Wall time: 20min 10s

loss - 0.5948: 100%|██████████| 153/153 [00:30<00:00,  5.01it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 0 train_loss - 0.63 acc - 0.653 auc - 0.674


loss - 0.5931: 100%|██████████| 39/39 [00:07<00:00,  5.11it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 0 val_loss - 0.59 acc - 0.685 auc - 0.731


loss - 0.5801: 100%|██████████| 153/153 [00:29<00:00,  5.23it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 1 train_loss - 0.58 acc - 0.691 auc - 0.742


loss - 0.5816: 100%|██████████| 39/39 [00:05<00:00,  6.84it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 1 val_loss - 0.58 acc - 0.696 auc - 0.749


loss - 0.5715: 100%|██████████| 153/153 [00:29<00:00,  5.25it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 2 train_loss - 0.58 acc - 0.697 auc - 0.752


loss - 0.5746: 100%|██████████| 39/39 [00:05<00:00,  6.66it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 2 val_loss - 0.57 acc - 0.698 auc - 0.754


loss - 0.5741: 100%|██████████| 153/153 [00:29<00:00,  5.24it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 3 train_loss - 0.57 acc - 0.699 auc - 0.755


loss - 0.5771: 100%|██████████| 39/39 [00:05<00:00,  6.79it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 3 val_loss - 0.57 acc - 0.700 auc - 0.756


loss - 0.5675: 100%|██████████| 153/153 [00:30<00:00,  5.09it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 4 train_loss - 0.57 acc - 0.700 auc - 0.757


loss - 0.5820: 100%|██████████| 39/39 [00:06<00:00,  6.48it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 4 val_loss - 0.57 acc - 0.700 auc - 0.757


loss - 0.5701: 100%|██████████| 153/153 [00:30<00:00,  5.02it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 5 train_loss - 0.57 acc - 0.701 auc - 0.759


loss - 0.5790: 100%|██████████| 39/39 [00:06<00:00,  6.29it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 5 val_loss - 0.57 acc - 0.701 auc - 0.758


loss - 0.5698: 100%|██████████| 153/153 [00:30<00:00,  5.06it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 6 train_loss - 0.57 acc - 0.702 auc - 0.760


loss - 0.5686: 100%|██████████| 39/39 [00:06<00:00,  6.11it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 6 val_loss - 0.57 acc - 0.701 auc - 0.758


loss - 0.5723: 100%|██████████| 153/153 [00:30<00:00,  4.97it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 7 train_loss - 0.57 acc - 0.702 auc - 0.761


loss - 0.5780: 100%|██████████| 39/39 [00:06<00:00,  6.13it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 7 val_loss - 0.57 acc - 0.701 auc - 0.759


loss - 0.5681: 100%|██████████| 153/153 [00:29<00:00,  5.20it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 8 train_loss - 0.57 acc - 0.703 auc - 0.761


loss - 0.5717: 100%|██████████| 39/39 [00:06<00:00,  5.58it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 8 val_loss - 0.57 acc - 0.702 auc - 0.759


loss - 0.5639: 100%|██████████| 153/153 [00:29<00:00,  5.11it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 9 train_loss - 0.57 acc - 0.703 auc - 0.762


loss - 0.5636: 100%|██████████| 39/39 [00:06<00:00,  6.22it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 9 val_loss - 0.57 acc - 0.702 auc - 0.759


loss - 0.5676: 100%|██████████| 153/153 [00:29<00:00,  5.21it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 10 train_loss - 0.57 acc - 0.704 auc - 0.762


loss - 0.5725: 100%|██████████| 39/39 [00:06<00:00,  6.09it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 10 val_loss - 0.57 acc - 0.702 auc - 0.759


loss - 0.5647: 100%|██████████| 153/153 [00:29<00:00,  5.16it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 11 train_loss - 0.57 acc - 0.704 auc - 0.763


loss - 0.5723: 100%|██████████| 39/39 [00:06<00:00,  5.87it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 11 val_loss - 0.57 acc - 0.702 auc - 0.759


loss - 0.5616: 100%|██████████| 153/153 [00:30<00:00,  5.00it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 12 train_loss - 0.57 acc - 0.704 auc - 0.763


loss - 0.5740: 100%|██████████| 39/39 [00:06<00:00,  5.99it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

epoch - 12 val_loss - 0.57 acc - 0.702 auc - 0.759


loss - 0.5697: 100%|██████████| 153/153 [00:30<00:00,  5.03it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 13 train_loss - 0.57 acc - 0.705 auc - 0.764


loss - 0.5729: 100%|██████████| 39/39 [00:06<00:00,  6.30it/s]


epoch - 13 val_loss - 0.57 acc - 0.701 auc - 0.759
early stop epoch  13
CPU times: user 12min 58s, sys: 49.9 s, total: 13min 48s
Wall time: 15min 49s


In [15]:
class TestDataset(Dataset):
    """Builds one model input per row of ``test_df`` from the stored user histories.

    Args:
        samples: object exposing ``.index`` (user ids) where ``samples[user_id]`` yields a
            ``(question_ids, answered_correctly)`` pair of arrays for that user.
        test_df: frame with at least ``user_id`` and ``content_id`` columns; one
            prediction is produced per row.
        skills: collection of known question ids; only its length is used.
        max_seq: maximum sequence length the model can handle.

    Each item is ``(x, questions)``, both of length ``max_seq - 1``: ``x`` encodes the
    user's most recent interactions and ``questions`` is the matching sequence of
    "next" question ids, ending with the row's own ``content_id``.
    """

    def __init__(self, samples, test_df, skills, max_seq=100):
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]

        ## zero-padded buffers; a user without stored history keeps all zeros
        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_ = self.samples[user_id]
            ## keep at most the max_seq most recent interactions
            q_ = np.asarray(q_)[-self.max_seq:]
            qa_ = np.asarray(qa_)[-self.max_seq:]
            seq_len = len(q_)

            ## Always copy INTO the buffers instead of rebinding q/qa to the stored slices:
            ## the result then has the buffers' dtype whatever dtype the history is stored
            ## in, so adding n_skill below cannot wrap around in a narrow integer type.
            if seq_len > 0:  ## q[-0:] would address the whole buffer, so guard the empty case
                q[-seq_len:] = q_
                qa[-seq_len:] = qa_

        ## interaction encoding: question id, shifted by n_skill when answered correctly
        x = q[1:] + (qa[1:] == 1) * self.n_skill

        ## questions[t] is the question that follows interaction x[t]; the last one is the row to predict
        questions = np.append(q[2:], [target_id])

        return x, questions

In [16]:
## Load example_test.csv so its columns can be inspected in the cells below
example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')

In [17]:
## Preview the first rows (at most 50) of the example test file
example_test.head(50)

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,
5,5,0,18020362258,1364159702,12023,0,4424,18000.0,True,,
6,6,0,2325432079,1521618396,574,0,1367,18000.0,True,,
7,7,0,39456940781,1317245193,12043,0,5314,17000.0,True,,
8,8,0,3460555189,1700555100,7910,0,532,21000.0,True,,
9,9,0,2214770464,998511398,7908,0,393,21000.0,True,,


In [18]:
## Parse the stored string for row 18 and print the result.
## NOTE(review): eval executes whatever code the string contains; ast.literal_eval
## would parse a list literal like this one without that risk.
print(eval(example_test['prior_group_answers_correct'].iloc[18]))

[0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1]


In [19]:
## Same cell value printed without parsing, for comparison with the eval'd output
print(example_test['prior_group_answers_correct'].iloc[18])

[0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1]


In [20]:
## Competition API module; used below to obtain the test batches and submit predictions
import riiideducation

## env.predict(...) is called once per batch in the inference loop
env = riiideducation.make_env()
## iterated in the inference loop as (test_df, sample_prediction_df) pairs
iter_test = env.iter_test()

In [21]:
import psutil

model.eval()

prev_test_df = None
MAX_SEQ = 100

for (test_df, sample_prediction_df) in tqdm(iter_test):
    ## Fold the answers of the previous batch into the stored user histories.
    ## Skipped for the first batch and whenever memory usage is at or above 90%.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        print(psutil.virtual_memory().percent)
        ## prior_group_answers_correct (string) provides all the answered_correctly values of the
        ## previous group as the string representation of a list, in the first row of the group.
        ## All other rows in each group are null. Some rows may be null, or empty lists.
        ## The list has the same length as the previous test_df.
        ## ast.literal_eval parses the list literal without executing arbitrary code (unlike eval).
        prev_test_df['answered_correctly'] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_questions = prev_test_df[prev_test_df.content_type_id == 0]
        prev_group = prev_questions[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))

        for prev_user_id in prev_group.index:
            ## questions and answer correctness of this user in the previous batch
            prev_group_content, prev_group_ac = prev_group[prev_user_id]

            ## Known user: append to the stored history. New user: start a history.
            if prev_user_id in group.index:
                stored_content, stored_ac = group[prev_user_id]
                prev_group_content = np.append(stored_content, prev_group_content)
                prev_group_ac = np.append(stored_ac, prev_group_ac)

            ## Keep at most MAX_SEQ most recent interactions (the maximum the model will handle)
            group[prev_user_id] = (prev_group_content[-MAX_SEQ:], prev_group_ac[-MAX_SEQ:])

    prev_test_df = test_df.copy()

    ## Questions only. .copy() so that adding the prediction column below writes to an
    ## independent frame rather than to a filtered slice of the frame from the API.
    test_df = test_df[test_df.content_type_id == 0].copy()

    ## pass MAX_SEQ explicitly so the dataset cannot drift from the trimming above
    test_dataset = TestDataset(group, test_df, skills, max_seq=MAX_SEQ)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    outs = []

    for item in tqdm(test_dataloader):
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()

        with torch.no_grad():
            output, att_weight = model(x, target_id)

        ## probability for the last position only: that position holds the row being predicted
        output = torch.sigmoid(output)[:, -1]
        outs.extend(output.view(-1).data.cpu().numpy())

    test_df['answered_correctly'] = outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 34.27it/s]

100%|██████████| 1/1 [00:00<00:00, 76.53it/s]
2it [00:00, 12.67it/s]
100%|██████████| 1/1 [00:00<00:00, 67.15it/s]

100%|██████████| 1/1 [00:00<00:00, 71.05it/s]


43.1
43.2
43.2


4it [00:00,  6.10it/s]
