# JKT

In [1]:
import numpy as np
import pandas as pd
import os
import joblib
from tqdm import notebook, tqdm
from scipy.sparse import coo_matrix
from functools import partial
# from coo import co_acc_probs, co_acc_skills


In [2]:
# multiprocessing to speedup matrix extraction
import multiprocessing
cores = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=cores)

## preprocess data csv

In [4]:
# preprocess assist2009
read_col = ['order_id', 'assignment_id', 'user_id', 'assistment_id', 'problem_id', 'correct', 
            'sequence_id', 'base_sequence_id', 'skill_id', 'skill_name', 'original']
target = 'correct'

# read in the data
df = pd.read_csv('./data/a09/skill_builder_data.csv', low_memory=False, encoding="ISO-8859-1")[read_col]
# df

In [5]:

# delete empty skill_id
df = df.dropna(subset=['skill_id'])
df = df[~df['skill_id'].isin(['noskill'])]
df.skill_id = df.skill_id.astype('int')
print('After removing empty skill_id, records number %d' % len(df))

# delete scaffolding problems
df = df[df['original'].isin([1])]
print('After removing scaffolding problems, records number %d' % len(df))

min_inter_num = 3
users = df.groupby(['user_id'], as_index=True)
delete_users = []
for u in users:
    if len(u[1]) < min_inter_num:
        delete_users.append(u[0])
print('deleted user number based min-inters %d' % len(delete_users))
df = df[~df['user_id'].isin(delete_users)]
df = df[[ 'user_id', 'problem_id', 'skill_id', 'correct']]
print('After deleting some users, records number %d' % len(df))

After removing empty skill_id, records number 459208
After removing scaffolding problems, records number 433161
deleted user number based min-inters 290
After deleting some users, records number 432702


In [6]:
# 5,"[55969, 55970, ...
skill_df = df[['skill_id', 'problem_id']].groupby(['skill_id'], as_index=True).apply(lambda r: np.array(list(set(r['problem_id'].values))))
# joblib.dump(skill_df, 'Data/assist2009/skill_prob.pkl.zip')
user_prob = df[['user_id', 'problem_id', 'correct']].groupby(['user_id', 'problem_id'])['correct'].agg('mean')
# print(user_prob)

## Extract concept-to-concept matrix and exercise -to-exercise matrix

In [7]:
def co_acc_probs(user_prob, prob1, prob2):
    count = 0
    agg = 0
    for user, prob in user_prob.index:
        if prob == prob1:
            if prob2 in user_prob[user].index:
                count += 1
                agg += user_prob[user][prob1] * user_prob[user][prob2]
#                 print('user {} answered two questions, with {} answered {}, and {} answered {}'
#                   .format(user, prob1, user_prob[user][prob1], prob2, user_prob[user][prob2]))
    if count == 0:
        return -1
    else:
        return agg/count


def co_acc_skills(user_skill, skill1, skill2):
    count = 0
    agg = 0
    for user, skill in user_skill.index:
        if skill1 == skill: 
            if skill2 in user_skill[user].index:
                count += 1
                agg += user_skill[user][skill1] * user_skill[user][skill2]
    if count == 0:
        return -1
    else:
        return agg/count

In [8]:
# e2e matrix
skill_mats = []
e2e = partial(co_acc_probs, user_prob)
# e2e

In [None]:

for skill in tqdm(skill_df.index):
    print('processing skill: ', skill)
    skill_probs = skill_df[skill]
    row = []
    col = []
    args = ((prob1, prob2) for i, prob1 in enumerate(skill_probs) for prob2 in skill_probs[i:])
    val = pool.starmap(e2e, args)
    print('matrix size is: ', len(val))
    for i in range(len(skill_probs)):
        for j in range(i, len(skill_probs)):
            row.append(i)
            col.append(j)
    assert(len(val)==len(row))
    mat = coo_matrix((val, (row, col)), shape=(len(skill_probs), len(skill_probs)))
    skill_mats.append(mat)
# joblib.dump(skill_mats, 'para_e2e.pkl.zip')

In [None]:
skill_acc = df[['skill_id', 'correct']].groupby('skill_id')['correct'].agg('mean')
user_skill = df[['user_id', 'skill_id', 'correct']].groupby(['user_id', 'skill_id'])['correct'].agg('mean')

for user, skill in tqdm(user_skill.index):
    if user_skill[user][skill]>=skill_acc[skill]:
        user_skill[user][skill] = 1
    else:
        user_skill[user][skill] = 0
        
# c2c matrix
row = []
col = []
val = []
print('Extracting c2c matrix...')
arg_skills = ((skill1, skill2) for (i, skill1) in enumerate(skill_df.index) for skill2 in skill_df.index[i:])
c2c = partial(co_acc_skills, user_skill)
val = pool.starmap(c2c, arg_skills)
for i in range(len(skill_df.index)):
    for j in range(i, len(skill_df.index)):
        row.append(i)
        col.append(j)
mat_skill = coo_matrix((val, (row, col)), shape=(len(skill_df.index), len(skill_df.index)))
joblib.dump(mat_skill,'c2c.pkl.zip')

## Training matrix in Autoencoder

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# import torch_geometric.nn as pyg_nn
# import torch_geometric.utils as pyg_utils

import time, joblib
from datetime import datetime
from tqdm import tqdm

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.data import Data, DataLoader
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges

from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

In [None]:
c2c = joblib.load('./data/test/c2c.pkl.zip')
skill_df = joblib.load('./data/test/skill_prob.pkl.zip')
c2c_t = c2c.transpose()
c2c_t.setdiag(0)
c2c_add = c2c+c2c_t
# c2c_add.setdiag(0)
c2c

In [None]:

class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = pyg_nn.GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv2 = pyg_nn.GCNConv(2 * out_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        x = (self.conv1(x, edge_index))
        self.feature = self.conv2(x, edge_index)
        return self.feature

def train(epoch):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    optimizer.step()

#     writer.add_scalar("loss", loss.item(), epoch)

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [None]:
# train c2c embedding
x = torch.tensor(c2c_add.toarray().tolist(), dtype=torch.float)
edge_index, edge_weight = pyg_utils.convert.from_scipy_sparse_matrix(c2c_add)
data = Data(edge_index=edge_index, x=x)

channels = 10
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA availability:', torch.cuda.is_available())

model = pyg_nn.GAE(Encoder(123, channels)).to(dev)
labels = data.y
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data, val_ratio=0, test_ratio=0.2)

x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

for epoch in range(1, 2001):
    train(epoch)
    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    if epoch % 100 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))
c2c_emb = {}
for i, emb in enumerate(model.encoder.feature.cpu()):
    c2c_emb[skill_df.index[i]] = emb
joblib.dump(c2c_emb, 'Data/assist2009/c2c_emb.pkl.zip') 

In [None]:
# train e2e embedding
e2e_mat = joblib.load('Data/assist2009/para_e2e.pkl.zip')
e2e_emb = {}
channels = 10
for i, e2e in enumerate(e2e_mat):
    skill_id = skill_df.index[i]
    if len(skill_df[skill_id]) >= channels:
        
        print('processing questions in skill: {} ======'.format(skill_id))
        e2e_t = e2e.transpose()
        e2e_t.setdiag(0)
        e2e_add = e2e + e2e_t

        x = torch.tensor(e2e_add.toarray().tolist(), dtype=torch.float)
        edge_index, edge_weight = pyg_utils.convert.from_scipy_sparse_matrix(e2e_add)
        data = Data(edge_index=edge_index, x=x)

        # encoder: written by us; decoder: default (inner product)
        model = pyg_nn.GAE(Encoder(e2e.shape[0], channels)).to(dev)
        # model = pyg_nn.GAE(Encoder(dataset.num_features, channels)).to(dev)

        labels = data.y
        data.train_mask = data.val_mask = data.test_mask = data.y = None
        # data = model.split_edges(data)

        data = train_test_split_edges(data, val_ratio=0, test_ratio=0.2)

        x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
        for epoch in range(1, 2001):
            train(epoch)
            auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
            # writer.add_scalar("AUC", auc, epoch)
            # writer.add_scalar("AP", ap, epoch)
            if epoch % 500 == 0:
                print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))
        e2e_emb[skill_id] = model.encoder.feature.cpu()
joblib.dump(e2e_emb, 'Data/assist2009/e2e_emb.pkl.zip')

## Embedding + DKT

In [9]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

In [10]:
e2e_emb = joblib.load('./data/a09/e2e_emb.pkl.zip')
c2c_emb = joblib.load('./data/a09/c2c_emb.pkl.zip')
skill_prob = joblib.load('./data/a09/skill_prob.pkl.zip')

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
filtered_skill_prob = {}
channel = 10
for i, skill_id in enumerate(skill_prob.index):
    if len(skill_prob[skill_id])>= channel:
        filtered_skill_prob[skill_id] = skill_prob[skill_id]
# joblib.dump(filtered_skill_prob, 'Data/assist2009/filtered_skill_prob.pkl.zip')

In [12]:
# normalization
scaler = StandardScaler()
all_c_v = []
for k,v in c2c_emb.items():
    all_c_v.extend(list(v.numpy()))
all_c_v = scaler.fit_transform(np.array(all_c_v).reshape(-1,1))
all_c_v1 = {}
for i, (k,v) in enumerate(c2c_emb.items()):
    all_c_v1[k] = all_c_v[i*10:(i+1)*10].reshape(-1,)
all_e_v = {}
for skill,qu_embs in e2e_emb.items():
    q_num = qu_embs.shape[0]
    temp_all_v = qu_embs.numpy().reshape(-1,)
    temp_all_v = scaler.fit_transform(np.array(temp_all_v).reshape(-1,1))
    all_e_v[skill] = temp_all_v.reshape(-1,10)

In [13]:
skill_emb = {}
for skill in tqdm(filtered_skill_prob.keys()):
    temp_c = (np.array(all_c_v1[skill]))
    temp_e = np.array(np.mean(all_e_v[skill], axis=0))
    skill_emb[skill] = np.append(temp_c, temp_e)
prob_emb = {}
for skill in tqdm(filtered_skill_prob.keys()):
    for i, prob in enumerate(filtered_skill_prob[skill]):
        temp_c = (np.array(all_c_v1[skill]))
        temp_e = (np.array(all_e_v[skill][i]))
        new_emb = np.append(temp_c, temp_e)
        if prob in prob_emb.keys():
            prob_emb[prob] = np.row_stack((prob_emb[prob], new_emb)).squeeze()
#             print(prob_emb[prob].shape)
        else: prob_emb[prob] = new_emb
for prob in tqdm(prob_emb.keys()):
    if len(prob_emb[prob].shape) > 1:
        prob_emb[prob] = np.mean(prob_emb[prob], axis=0)

100%|██████████| 112/112 [00:00<00:00, 15378.34it/s]
100%|██████████| 112/112 [00:00<00:00, 731.16it/s]
100%|██████████| 15879/15879 [00:00<00:00, 410430.35it/s]


In [31]:
# Train/Test data
read_col = ['order_id', 'assignment_id', 'user_id', 'assistment_id', 'problem_id', 'correct', 
            'sequence_id', 'base_sequence_id', 'skill_id', 'skill_name', 'original']
target = 'correct'
# read in the data
# df = pd.read_csv('./data/a09/skill_builder_data.csv', low_memory=False, encoding="ISO-8859-1")[read_col]
# df = df.sort_values(['order_id', 'user_id'])
# # delete empty skill_id
# df = df.dropna(subset=['skill_id'])
# df = df[~df['skill_id'].isin(['noskill'])]
# df.skill_id = df.skill_id.astype('int')
# print('After removing empty skill_id, records number %d' % len(df))

# # delete scaffolding problems
# df = df[df['original'].isin([1])]
# print('After removing scaffolding problems, records number %d' % len(df))

# #delete the users whose interaction number is less than min_inter_num
# min_inter_num = 3
# users = df.groupby(['user_id'], as_index=True)
# delete_users = []
# for u in users:
#     if len(u[1]) < min_inter_num:
#         delete_users.append(u[0])
# print('deleted user number based min-inters %d' % len(delete_users))
# df = df[~df['user_id'].isin(delete_users)]
# df = df[['user_id', 'problem_id', 'skill_id', 'correct']]
# print('After deleting some users, records number %d' % len(df))
# # print('features: ', df['assistment_id'].unique(), df['answer_type'].unique())

# df = df[df['skill_id'].isin(filtered_skill_prob.keys())]
# df['skill_cat'] = df['skill_id'].astype('category').cat.codes
# df['e_emb'] = df['problem_id'].apply(lambda r: prob_emb[r])
# df['c_emb'] = df['skill_id'].apply(lambda r: skill_emb[r])

df = pd.read_csv('./to_result.csv')

group_c = df[['user_id', 'c_emb', 'correct']].groupby('user_id').apply(lambda r: (np.array(r['c_emb'].tolist()).squeeze(), r['correct'].values))
train_group_c = group_c.sample(frac=0.8, random_state=2020)
test_group_c = group_c[~group_c.index.isin(train_group_c.index)]
# joblib.dump(train_group_c, 'Data/assist2009/train_group_c.pkl.zip')
# joblib.dump(test_group_c, 'Data/assist2009/test_group_c.pkl.zip')

### DKT Models

In [32]:
from torch.utils.data import Dataset
class DKTDataset(Dataset):
    def __init__(self, group, min_samples=3, max_seq=100):
        super(DKTDataset, self).__init__()
        self.max_seq = max_seq
        self.samples = {}
        
        self.user_ids = []
        for user_id in group.index:
            q, qa = group[user_id]
            if len(q) < min_samples:
                continue
            
            # Main Contribution
            if len(q) > self.max_seq:
                total_questions = len(q)
                initial = total_questions % self.max_seq
                if initial >= min_samples:
                    self.user_ids.append(f"{user_id}_0")
                    self.samples[f"{user_id}_0"] = (q[:initial], qa[:initial])
                for seq in range(total_questions // self.max_seq):
                    self.user_ids.append(f"{user_id}_{seq+1}")
                    start = initial + seq * self.max_seq
                    end = start + self.max_seq
                    self.samples[f"{user_id}_{seq+1}"] = (q[start:end], qa[start:end])
            else:
                user_id = str(user_id)
                self.user_ids.append(user_id)
                self.samples[user_id] = (q, qa)
    
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        q_, qa_ = self.samples[user_id]
        seq_len = len(q_)

        q = np.zeros((self.max_seq, q_.shape[1]))
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len == self.max_seq:
            q[:] = q_
            qa[:] = qa_
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_
        
        x_emb = self.onehot(q[:-1], qa[:-1])
        q_next = q[1:]
        labels = qa[1:]
        
        return x_emb, q_next, labels

    
    def onehot(self, questions, answers):
        emb_num = questions.shape[-1]
        result = np.zeros(shape=[self.max_seq-1, 2*emb_num])
        for i in range(self.max_seq-1):
            if answers[i] > 0:
                result[i][:emb_num] = questions[i]
            elif answers[i] == 0:
                result[i][emb_num:] = questions[i]
        return result

In [33]:
from torch.autograd import Variable
import torch
from torch import nn
class DKT(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, device):
        super(DKT, self).__init__()
        self.device = device
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        
        # Number of hidden layers
        self.layer_dim = layer_dim
        
        # RNN
        self.rnn = nn.RNN(input_dim*2, hidden_dim, layer_dim, batch_first=True, nonlinearity='tanh')
        # Readout layer
        self.fc = nn.Linear(hidden_dim+input_dim, output_dim)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, q_next):
        
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)).to(self.device)
            
        # One time step
        out_next, hn = self.rnn(x, h0)
        out_next = torch.cat((out_next, q_next), axis=2)
#         out_next = torch.cat((out.reshape(-1,hidden_dim), q_next.reshape(-1,20)), axis=1)
        out = (self.fc(out_next))
        return out

In [34]:
def train_fn(model, dataloader, optimizer, criterion, scheduler=None,  device="cpu"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for x_emb, q_next, y in (dataloader):
        x = x_emb.to(device).float()
        y = y.to(device).float()
        q_next = q_next.to(device).float()
        
        out = model(x, q_next).squeeze()#[:, :-1]
        
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
#         scheduler.step()
        train_loss.append(loss.item())
        
        target_mask = (q_next!=0).unique(dim=2).squeeze()
#         target_mask = (y!=-1)
    
        filtered_out = torch.masked_select(out, target_mask)
        filtered_label = torch.masked_select(y, target_mask)
        filtered_pred = (torch.sigmoid(filtered_out) >= 0.5).long()
        
        num_corrects += (filtered_pred == filtered_label).sum().item()
        num_total += len(filtered_label)

        labels.extend(filtered_label.view(-1).data.cpu().numpy())
        outs.extend(filtered_pred.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

In [35]:
def valid_fn(model, dataloader, criterion, device="cpu"):
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    for x_emb, q_next, y in (dataloader):
        x = x_emb.to(device).float()
        y = y.to(device).float()
        q_next = q_next.to(device).float()
        out = model(x, q_next).squeeze()#[:, :-1]
        
        loss = criterion(out, y)
        valid_loss.append(loss.item())
        
        target_mask = (q_next!=0).unique(dim=2).squeeze()
#         target_mask = (y!=-1)
    
        filtered_out = torch.masked_select(out, target_mask)
        filtered_label = torch.masked_select(y, target_mask)
        filtered_pred = (torch.sigmoid(filtered_out) >= 0.5).long()
        
        num_corrects += (filtered_pred == filtered_label).sum().item()
        num_total += len(filtered_label)

        labels.extend(filtered_label.view(-1).data.cpu().numpy())
        outs.extend(filtered_pred.view(-1).data.cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    pre = precision_score(labels, outs)
    f1 = f1_score(labels, outs)
    rec = recall_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, pre, rec, f1, auc

In [36]:
# Model settings
input_dim = 20    # input dimension
hidden_dim = 128  # hidden layer dimension
layer_dim = 1     # number of hidden layers
output_dim = 1   # output dimension
MAX_LEARNING_RATE = 1e-3
LEARNING_RATE = 1e-4
EPOCHS = 200
BATCH_SIZE = 128
MAX_SEQ = 50

In [37]:
from torch.utils.data import DataLoader
train_dataset = DKTDataset(train_group_c, max_seq=MAX_SEQ)
train_dataloader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataset = DKTDataset(test_group_c, max_seq=MAX_SEQ)
valid_dataloader  = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [38]:
device = torch.device('cpu')#("cuda" if torch.cuda.is_available() else "cpu")

model = DKT(input_dim, hidden_dim, layer_dim, output_dim, device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=MAX_LEARNING_RATE, steps_per_epoch=len(train_dataloader), epochs=EPOCHS
)

model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

In [39]:
for epoch in (range(EPOCHS)):
    loss, acc, auc = train_fn(model, train_dataloader, optimizer, criterion, device)
#     print("epoch - {}/{} train: - {:.3f} acc - {:.3f} auc - {:.3f}".format(epoch+1, EPOCHS, loss, acc, auc))
    loss, acc, pre, rec, f1, auc = valid_fn(model, valid_dataloader, criterion, device)

    res = "epoch - {}/{} valid: - {:.3f} acc - {:.3f} pre - {:.3f} rec - {:.3f} f1 - {:3f} auc - {:.3f}".format(epoch+1, EPOCHS, loss, acc, pre, rec, f1, auc)
    print(res)

IndexError: tuple index out of range

## Embedding viz

In [None]:
num_skill = 100
viz_data = []
ran = np.random.randint(low=0, high=112, size=num_skill)
for i in ran:
    c_em = skill_emb[list(skill_emb.keys())[i]]
    viz_data.append(c_em.reshape(-1,))

In [None]:
qu_len = []
for i, k in enumerate(skill_emb.keys()):
    if i<num_skill:
        questions = skill_prob[k]
        qu_len.append(len(questions))
        q_emb = []
        for j, q in enumerate(questions):
            if j<100:
                q_emb.append((prob_emb[q].reshape(-1,)))
#         q_emb = scaler.fit_transform(q_emb)
        viz_data.extend(q_emb)
viz_data = scaler.fit_transform(viz_data)

In [None]:
colors = ['red' for i in range(num_skill)] + ['blue' for i in range(num_skill, len(viz_data))]
s = [50 for i in range(num_skill)] + [1 for j in range(num_skill,len(viz_data))]
xs, ys = zip(*TSNE(n_components=2, init='random', perplexity=30).fit_transform(viz_data))
plt.figure(figsize=(6,4))
plt.scatter(xs, ys, color=colors, s=s)
plt.savefig('cluster.eps', format='eps')