# Transformer model to predict students' success

The implemented model is based on Shin et al.: SAINT+: Integrating Temporal Features for EdNet Correctness Prediction ([arXiv](https://arxiv.org/abs/2010.12042)).

Features for the encoder:
- question id
- part (category)
- relative position

Features for the decoder:
- lagged response
- time between questions (in days)
- relative position

Model specification:
- Model dimension (size of embeddings): 128
- Encoder layers: 2
- Decoder layers: 2
- Heads: 4
- Feed forward: 512
- Length of a sequence (window size): 100
- Batch size: 128 users (up to 4 sequences)
- Adam optimizer with 0.001 learning rate and a scheduler
<br>

### Vaswani et al. (2017):  [Attention is all you need](https://arxiv.org/abs/1706.03762) <br>

<img style="float: left;" src="transformer_model.png" width="500">

### Importing the necessary modules

In [None]:
import numpy as np
import pandas as pd
import random
import json
import gc
import math
from tqdm import tqdm
from collections import defaultdict
import dill

from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn

from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence

# Pick the compute device once; the rest of the notebook moves the model and
# tensors to `device`.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print("GPU is available" if use_gpu else "GPU not available, CPU used")

### Loading the data

In [None]:
# Column -> dtype mapping for train.csv. Compact dtypes keep the frame small;
# the same mapping also defines which columns are read.
# NOTE(review): integer dtypes cannot represent missing values while parsing --
# confirm that none of the integer-typed columns contain NaNs in train.csv.
dtypes = {
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "int8",
}

# Read exactly the columns that have a declared dtype.
data = pd.read_csv('train.csv', dtype=dtypes, usecols=list(dtypes))

print(data.shape)

In [None]:
# Question metadata: only the id and the `part` category are needed below.
questions = pd.read_csv('questions.csv').loc[:, ['question_id', 'part']]

### Preprocessing the data

In [None]:
# Keep only the rows whose content_type_id is 0.
data = data.loc[data['content_type_id'] == 0]

In [None]:
# Attach each question's `part` to the interaction rows (content_id -> question_id).
part_by_question = questions.set_index('question_id')[['part']]
data = data.join(part_by_question, on='content_id', how='left')

# The lookup tables are no longer needed once the column is attached.
del questions, part_by_question

# NOTE(review): this cast assumes every content_id found a match (no NaN in `part`).
data['part'] = data['part'].astype("uint8")

In [None]:
# Per-user sequential features, all computed from a single grouping by user_id.
by_user = data.groupby('user_id')

# Previous response of the same user (NaN for the user's first row).
previous_answer = by_user['answered_correctly'].shift(1)
# Timestamp difference to the user's previous row (NaN for the first row).
time_gap = by_user['timestamp'].diff()
# 1-based running index of the row within the user's history.
row_number = by_user['answered_correctly'].cumcount()

# Drop the grouping object so it does not keep a reference to `data` alive.
del by_user

data['answered_correctly_lagged'] = previous_answer.astype("float32")
data['time_between_questions'] = time_gap
data['counter'] = row_number.astype("uint16") + 1

In [None]:
# Split the frame into targets (y) and model inputs (x); both keep user_id so
# they can be grouped per user afterwards.
y = data[['user_id','answered_correctly']]

# Explicit copy: x is modified below and must not be a view into `data`.
x = data[['user_id','content_id','counter','part','answered_correctly_lagged',
          'time_between_questions']].copy()

del data

# A user's first row has no previous response; encode that as its own class (2)
# next to incorrect (0) and correct (1).
x['answered_correctly_lagged'] = x['answered_correctly_lagged'].fillna(2)

# Convert the timestamp gap from milliseconds to whole days, capped at 366.
# The missing gap of a user's first row is filled with the 366 sentinel AFTER
# the unit conversion; filling it before (in milliseconds) would turn the
# sentinel into 0 days.
ms_per_day = 1000*60*60*24
days_between = x['time_between_questions'] / ms_per_day
x['time_between_questions'] = days_between.fillna(366).clip(0, 366).astype("int64")

# Safety net: any remaining missing value becomes 0.
x = x.fillna(0)

In [None]:
# Build per-user numpy arrays: t_dict['x'][user_id] holds the feature rows and
# t_dict['y'][user_id] the targets, both without the user_id column.
t_dict = defaultdict(lambda: defaultdict(int))

x_lst = list(x.groupby('user_id'))
y_lst = list(y.groupby('user_id'))

# Both frames are grouped by the same key, so the groups are walked in lockstep.
for (x_user, x_frame), (y_user, y_frame) in zip(tqdm(x_lst), y_lst):
    t_dict['x'][x_user] = x_frame.iloc[:, 1:].values
    t_dict['y'][y_user] = y_frame.iloc[:, 1:].values

# The grouped frames are only needed to fill the dictionary.
del x_lst, y_lst

### The model

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder transformer that outputs a probability per sequence step.

    The encoder input is the sum of question-id, position and part embeddings;
    the decoder input is the sum of (lagged) correctness, position and
    time-between-questions embeddings. The decoder output is projected to a
    single value and squashed with a sigmoid.

    Embedding table sizes and the transformer hyper-parameters are hard-coded;
    only the model dimension is configurable.
    """

    def __init__(self, d_model):
        """Create the embedding tables, the transformer and the output head.

        Args:
            d_model: size of every embedding and of the transformer model dimension.
        """
        super().__init__()
        self.d_model = d_model

        # Input embeddings (all share the model dimension so they can be summed).
        self.question_id_embedding = nn.Embedding(13523 + 1, d_model)
        self.positional_embedding = nn.Embedding(100 + 1, d_model)
        self.part_embedding = nn.Embedding(7 + 1, d_model)
        self.correctness_embedding = nn.Embedding(2 + 1, d_model)
        self.time_between_questions_embedding = nn.Embedding(365 + 2, d_model)

        # Sequence model; built with nn.Transformer's default (sequence-first) layout.
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=4,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=512,
            dropout=0.0,
            activation='relu',
        )

        # Output head: one probability per sequence step.
        self.linear2output = nn.Linear(d_model, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, question_id, part, correctness, time_between_que, position, mask_triu,
                padding_mask):
        """Run the model on a batch of index tensors.

        Args:
            question_id, part, correctness, time_between_que, position: integer
                index tensors for the respective embedding tables (assumed to
                share one shape, sequence-first -- TODO confirm against callers).
            mask_triu: attention mask used as source, target and memory mask.
            padding_mask: key padding mask used for source, target and memory.

        Returns:
            Sigmoid of the linear projection of the transformer output
            (last dimension of size 1).
        """
        position_emb = self.positional_embedding(position)

        # Encoder side: what is being asked.
        exercise_emb = (
            self.question_id_embedding(question_id) + position_emb + self.part_embedding(part)
        )
        # Decoder side: how the previous step was answered, and when.
        response_emb = (
            self.correctness_embedding(correctness)
            + position_emb
            + self.time_between_questions_embedding(time_between_que)
        )

        decoded = self.transformer(
            exercise_emb,
            response_emb,
            src_mask=mask_triu,
            tgt_mask=mask_triu,
            memory_mask=mask_triu,
            src_key_padding_mask=padding_mask,
            tgt_key_padding_mask=padding_mask,
            memory_key_padding_mask=padding_mask,
        )
        return self.sigmoid(self.linear2output(decoded))

In [None]:
# Model, loss, optimizer and learning-rate schedule.
batch_size = 128
transformer = Transformer(128)
transformer.to(device)

learning_rate = 0.001

# The model already ends in a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
criterion.to(device)
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)


def lambda1(epoch):
    """Return the learning-rate multiplier for a 0-based scheduler epoch.

    LambdaLR starts counting at 0, so the factor is 1.0 at the start (the
    optimizer then begins at exactly `learning_rate`) and decays as
    1 / (1 + 0.05 * epoch) after each scheduler step.
    """
    return 1 / (1 + 0.05*epoch)


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

### Splitting the data into training and validation data

In [None]:
# Shuffle all users once, then hold out the last 10% of them for validation.
user_list = random.sample(list(t_dict['x']), len(t_dict['x']))

train_split = int(0.9 * len(user_list))
user_train, user_test = user_list[:train_split], user_list[train_split:]

# Only full batches are used; a trailing partial batch of users is dropped.
training_size, test_size = len(user_train), len(user_test)
training_batches = training_size // batch_size
test_batches = test_size // batch_size

### Training the model

In [None]:
# Train for a fixed number of epochs. After every epoch the model is evaluated
# on the held-out users and loss / ROC AUC are printed for both splits.
#
# Column layout of the per-user arrays in t_dict['x'] (user_id already removed):
#   0 content_id, 1 counter, 2 part, 3 lagged response, 4 time between questions
# The counter starts at 1, so a 0 in column 1 marks a padded position.
#
# ROC AUC is computed only on the last real step of every sequence.
# NOTE(review): the loss is averaged over padded positions too (targets are
# padded with 0) -- confirm this is intended.
epochs = 30
window_size = 100   # number of steps per sequence fed to the model
max_history = 500   # training windows are cut from at most this many recent steps

for epoch in tqdm(range(1, epochs+1)):

    # Reshuffle the training users so the batches differ between epochs.
    user_train_epoch = random.sample(list(user_train), len(user_train))

    ################################# Training ##################################
    loss = 0.0
    train_pred_chunks = []
    train_true_chunks = []

    for batch in range(1, training_batches + 1):
        first_user = (batch - 1) * batch_size

        input_seq_l = []
        y_tensor_l = []
        y_tensor_len = []

        for user in user_train_epoch[first_user:first_user + batch_size]:
            seq = t_dict['x'][user]
            target = t_dict['y'][user]

            if len(seq) > window_size:
                # Long history: keep the most recent part and cut it into
                # consecutive windows starting at a random offset. Inputs and
                # targets share the same spans so they stay aligned.
                seq = seq[-max_history:]
                target = target[-max_history:]
                offset = random.randint(0, window_size)
                spans = [(start, start + window_size)
                         for start in range(offset, len(seq) - window_size + 1, window_size)]
            else:
                # Short history: use it as a single (shorter) sequence.
                spans = [(0, len(seq))]

            for start, stop in spans:
                input_seq_l.append(torch.from_numpy(seq[start:stop]).long().to(device))
                y_window = torch.tensor(target[start:stop], dtype=torch.float, device=device)
                y_tensor_l.append(y_window)
                y_tensor_len.append(y_window.size()[0])

        # Pad to a common length; tensors are sequence-first.
        input_seq_p = pad_sequence(input_seq_l, padding_value=0)
        y_tensor_p = pad_sequence(y_tensor_l)
        y_tensor_p = y_tensor_p.view(y_tensor_p.size()[0], y_tensor_p.size()[1], 1)

        padded_len, n_sequences = input_seq_p.shape[0], input_seq_p.shape[1]

        # 1-based position of every step, repeated for each sequence.
        position = torch.arange(1, padded_len + 1).to(device).unsqueeze(0).repeat(
            n_sequences, 1).transpose(0, 1)
        # Causal mask: a step must not attend to later steps.
        mask_triu = torch.triu(torch.ones(padded_len, padded_len) == 1, diagonal=1)
        mask_triu = mask_triu.to(device)
        padding_mask = (input_seq_p[:, :, 1] == 0).transpose(0, 1)

        optimizer.zero_grad()

        output = transformer(input_seq_p[:, :, 0], input_seq_p[:, :, 2], input_seq_p[:, :, 3],
                             input_seq_p[:, :, 4], position, mask_triu, padding_mask)

        loss_batch = criterion(output, y_tensor_p)
        loss_batch.backward()
        optimizer.step()

        loss += loss_batch.item()

        # Prediction and target at the last real step of every sequence.
        last_step = torch.tensor(y_tensor_len, device=device) - 1
        sequence_idx = torch.arange(n_sequences, device=device)
        train_pred_chunks.append(output.detach()[last_step, sequence_idx, 0])
        train_true_chunks.append(y_tensor_p[last_step, sequence_idx, 0])

        del loss_batch, output, y_tensor_p

    ################################ Validation #########################################
    loss_test = 0.0
    val_pred_chunks = []
    val_true_chunks = []

    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every validation batch.
    with torch.no_grad():
        for batch in range(1, test_batches + 1):
            first_user = (batch - 1) * batch_size

            input_seq_l = []
            y_tensor_l = []
            y_tensor_len = []

            for user in user_test[first_user:first_user + batch_size]:
                # Validation uses only the most recent window of each user.
                seq_val = t_dict['x'][user][-window_size:]
                target_val = t_dict['y'][user][-window_size:]

                input_seq_l.append(torch.from_numpy(seq_val).long().to(device))
                y_tensor_val = torch.tensor(target_val, dtype=torch.float, device=device)
                y_tensor_l.append(y_tensor_val)
                y_tensor_len.append(y_tensor_val.size()[0])

            input_seq_p_val = pad_sequence(input_seq_l, padding_value=0)
            y_tensor_p_val = pad_sequence(y_tensor_l)
            y_tensor_p_val = y_tensor_p_val.view(y_tensor_p_val.size()[0],
                                                 y_tensor_p_val.size()[1], 1)

            padded_len, n_sequences = input_seq_p_val.shape[0], input_seq_p_val.shape[1]

            position = torch.arange(1, padded_len + 1).to(device).unsqueeze(0).repeat(
                n_sequences, 1).transpose(0, 1)
            mask_triu = torch.triu(torch.ones(padded_len, padded_len) == 1, diagonal=1)
            mask_triu = mask_triu.to(device)
            padding_mask = (input_seq_p_val[:, :, 1] == 0).transpose(0, 1)

            output_val = transformer(input_seq_p_val[:, :, 0], input_seq_p_val[:, :, 2],
                                     input_seq_p_val[:, :, 3], input_seq_p_val[:, :, 4],
                                     position, mask_triu, padding_mask)

            loss_test += criterion(output_val, y_tensor_p_val).item()

            last_step = torch.tensor(y_tensor_len, device=device) - 1
            sequence_idx = torch.arange(n_sequences, device=device)
            val_pred_chunks.append(output_val[last_step, sequence_idx, 0])
            val_true_chunks.append(y_tensor_p_val[last_step, sequence_idx, 0])

            del output_val, y_tensor_p_val

    ################################ Epoch metrics ######################################
    y_hat_train = torch.cat(train_pred_chunks).detach().cpu().numpy()
    y_true_train = torch.cat(train_true_chunks).detach().cpu().numpy()

    auc_score_train = roc_auc_score(y_true_train, y_hat_train)

    loss = loss / training_batches

    y_hat_test = torch.cat(val_pred_chunks).detach().cpu().numpy()
    y_true_test = torch.cat(val_true_chunks).detach().cpu().numpy()

    auc_score_val = roc_auc_score(y_true_test, y_hat_test)

    loss_test = loss_test / test_batches

    print('Epoch: {}/{}, Batches: {}/{}.............'.format(epoch, epochs, training_batches, training_batches
                                                            ), end=' ')
    print("Loss: {:.4f}, ROC AUC: {:.4f}, Loss val: {:.4f}, ROC AUC val: {:.4f}".format(
        loss, auc_score_train, loss_test, auc_score_val))

    # Decay the learning rate once per epoch.
    scheduler.step()

### Saving the model for inference

In [None]:
# Persist the whole trained module object (not just its state_dict).
model_path = "/kaggle/working/transformer_model.pt"
torch.save(transformer, model_path)