In [75]:
import numpy as np
import pandas as pd

import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(0)
np.random.seed(0)

class SAKTDataset(Dataset):
    def __init__(self, group, n_skill, max_seq=100, is_test=False):
        super(SAKTDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = group
        self.is_test = is_test

        self.user_ids = []
        for user_id in group.index:
            q, qa, is_val = group[user_id]
            if not is_test:
                self.user_ids.append([user_id, -1])
            else:
                for i in range(len(q)):
                    if is_val[i]:
                        self.user_ids.append([user_id, i+1])

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index][0]
        end = self.user_ids[index][1]
        q_, qa_, _ = self.samples[user_id]

        if not self.is_test:
            seq_len = len(q_)
        else:
            start = np.max([0, end - self.max_seq])
            q_ = q_[start:end]
            qa_ = qa_[start:end]
            seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        if seq_len >= self.max_seq:
            q[:] = q_[-self.max_seq:]
            qa[:] = qa_[-self.max_seq:]
        else:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_

        target_id = q[1:]
        label = qa[1:]

        x = np.zeros(self.max_seq - 1, dtype=int)
        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill

        return x, target_id, label


class FFN(nn.Module):
    def __init__(self, state_size=200):
        super(FFN, self).__init__()
        self.state_size = state_size

        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        return self.dropout(x)


def future_mask(seq_length):
    future_mask = np.triu(np.ones((seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


class SAKTModel(nn.Module):
    def __init__(self, n_skill, max_seq=100, embed_dim=128):
        super(SAKTModel, self).__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill + 1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        device = x.device
        x = self.embedding(x)

        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)

        pos_x = self.pos_embedding(pos_id)
        x = x + pos_x

        e = self.e_embedding(question_ids)

        x = x.permute(1, 0, 2)  # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = e.permute(1, 0, 2)
        att_mask = future_mask(x.size(0)).to(device)
        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)
        att_output = self.layer_normal(att_output + e)
        att_output = att_output.permute(1, 0, 2)  # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        x = self.pred(x)

        return x.squeeze(-1), att_weight


df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle")
df = df[df.content_type_id == False]

train_idx = []
val_idx = []
np.random.seed(0)
for _, w_df in df.groupby("user_id"):
    if np.random.random() < 0.1:
        # all val
        val_idx.extend(w_df.index.tolist())
    else:
        train_num = int(len(w_df) * 0.9)
        train_idx.extend(w_df[:train_num].index.tolist())
        val_idx.extend(w_df[train_num:].index.tolist())

df["is_val"] = 0
df["is_val"].loc[val_idx] = 1

group = df[['user_id', 'content_id', 'answered_correctly', 'is_val']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values,
            r["is_val"].values))

dataset_train = SAKTDataset(group, 13523)
dataset_val = SAKTDataset(group, 13523, is_test=True)

dataloader_train = DataLoader(dataset_train, batch_size=1024, shuffle=True, num_workers=1)
dataloader_val = DataLoader(dataset_val, batch_size=1024, shuffle=False, num_workers=1)

device = torch.device("cuda")

model = SAKTModel(13523, embed_dim=128)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model.to(device)
criterion.to(device)

def train_epoch(model, train_iterator, optim, criterion, device="cuda"):
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(train_iterator)
    for item in tbar:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()

        optim.zero_grad()
        output, atten_weight = model(x, target_id)
        loss = criterion(output, label)
        loss.backward()
        optim.step()
        train_loss.append(loss.item())

        output = output[:, -1]
        label = label[:, -1]
        pred = (torch.sigmoid(output) >= 0.5).long()

        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        labels.extend(label.view(-1).data.cpu().numpy())
        outs.extend(output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

epochs = 20
for epoch in range(epochs):
    loss, acc, auc = train_epoch(model, dataloader_train, optimizer, criterion, device)
    print("epoch - {} train_loss - {:.2f} acc - {:.3f} auc - {:.4f}".format(epoch, loss, acc, auc))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
loss - 0.3656: 100%|██████████| 39/39 [00:05<00:00,  6.99it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 0 train_loss - 0.39 acc - 0.547 auc - 0.5598


loss - 0.3530: 100%|██████████| 39/39 [00:05<00:00,  7.17it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 1 train_loss - 0.36 acc - 0.576 auc - 0.6114


loss - 0.3060: 100%|██████████| 39/39 [00:05<00:00,  7.02it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 2 train_loss - 0.33 acc - 0.608 auc - 0.6535


loss - 0.2840: 100%|██████████| 39/39 [00:05<00:00,  7.02it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 3 train_loss - 0.29 acc - 0.629 auc - 0.6825


loss - 0.2623: 100%|██████████| 39/39 [00:05<00:00,  7.07it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 4 train_loss - 0.27 acc - 0.643 auc - 0.7005


loss - 0.2646: 100%|██████████| 39/39 [00:05<00:00,  6.88it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 5 train_loss - 0.26 acc - 0.650 auc - 0.7085


loss - 0.2562: 100%|██████████| 39/39 [00:05<00:00,  7.10it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 6 train_loss - 0.25 acc - 0.652 auc - 0.7104


loss - 0.2593: 100%|██████████| 39/39 [00:05<00:00,  6.90it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 7 train_loss - 0.24 acc - 0.653 auc - 0.7087


loss - 0.2358: 100%|██████████| 39/39 [00:05<00:00,  6.95it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 8 train_loss - 0.23 acc - 0.650 auc - 0.7076


loss - 0.2130: 100%|██████████| 39/39 [00:05<00:00,  6.87it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 9 train_loss - 0.22 acc - 0.651 auc - 0.7075


loss - 0.2136: 100%|██████████| 39/39 [00:05<00:00,  6.88it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 10 train_loss - 0.21 acc - 0.652 auc - 0.7078


loss - 0.2070: 100%|██████████| 39/39 [00:05<00:00,  6.95it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 11 train_loss - 0.20 acc - 0.650 auc - 0.7073


loss - 0.1742: 100%|██████████| 39/39 [00:05<00:00,  6.96it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 12 train_loss - 0.19 acc - 0.653 auc - 0.7068


loss - 0.1560: 100%|██████████| 39/39 [00:05<00:00,  6.86it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 13 train_loss - 0.18 acc - 0.655 auc - 0.7097


loss - 0.1718: 100%|██████████| 39/39 [00:05<00:00,  7.00it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 14 train_loss - 0.17 acc - 0.657 auc - 0.7110


loss - 0.1686: 100%|██████████| 39/39 [00:05<00:00,  7.02it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 15 train_loss - 0.16 acc - 0.659 auc - 0.7131


loss - 0.1589: 100%|██████████| 39/39 [00:05<00:00,  6.99it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 16 train_loss - 0.16 acc - 0.661 auc - 0.7152


loss - 0.1463: 100%|██████████| 39/39 [00:05<00:00,  6.70it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 17 train_loss - 0.15 acc - 0.661 auc - 0.7164


loss - 0.1420: 100%|██████████| 39/39 [00:05<00:00,  6.77it/s]
  0%|          | 0/39 [00:00<?, ?it/s]

epoch - 18 train_loss - 0.14 acc - 0.659 auc - 0.7184


loss - 0.1269: 100%|██████████| 39/39 [00:05<00:00,  6.92it/s]

epoch - 19 train_loss - 0.13 acc - 0.665 auc - 0.7211





In [76]:
preds = []
labels = []
for d in tqdm(dataloader_val):
    x = d[0].to(device).long()
    target_id = d[1].to(device).long()
    label = d[2].to(device).long()
    
    output, atten_weight = model(x, target_id)
    
    preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
    labels.extend(label[:, -1].view(-1).data.cpu().numpy())

100%|██████████| 1783/1783 [02:16<00:00, 13.08it/s]


In [77]:
from sklearn.metrics import roc_auc_score

In [78]:
roc_auc_score(labels, preds)

0.6814781319825192

In [79]:
roc_auc_score(df.iloc[val_idx]["answered_correctly"].values, preds)

0.6814781319825192

In [48]:
df_oof = pd.DataFrame()
df_oof["row_id"] = df.loc[val_idx].index
df_oof["predict"] = preds
df_oof["target"] = df.loc[val_idx]["answered_correctly"].values

In [49]:
roc_auc_score(df_oof["target"].values, df_oof["predict"].values)

0.6822792295122452

In [50]:
df_oof.to_csv("transformers1.csv", index=False)

In [57]:
df_oof2 = pd.read_csv("../output/ex_172/20201202080625/oof_train_1_lgbm.csv")
df_oof2.columns = ["row_id", "predict_lgbm", "target"]

In [59]:
df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

In [63]:
roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)

0.7909067186967411

In [66]:
roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*0.9 + df_oof2["predict"].values*0.1)

0.8074526256515147

In [67]:
roc_auc_score(df_oof2["target"].values, df_oof2["predict"].values*0.1)

0.6831381395586142

In [None]:
np.corrcoef(df)