In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


from transformers import *
import os
import sys
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import numpy as np
import re
import pickle
import time
import pandas as pd
from pathlib import Path
import random
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('./data/hackathon_train.csv', encoding='cp949', index_col=0)

# split train and test dataframe
train_df_list = []
test_df_list = []
for idx in df['User_ID'].unique():
    train_df_list.append(df[df['User_ID']==idx][0:40])
    test_df_list.append(df[df['User_ID']==idx][40:])
    
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

In [3]:
# load embeddings
train_question_embeddings = torch.load('./embedded/train_result_question.pt')
train_answer_embeddings = torch.load('./embedded/train_result_answer.pt')
test_question_embeddings = torch.load('./embedded/test_result_question.pt')
test_answer_embeddings = torch.load('./embedded/test_result_answer.pt')

In [4]:
# combine the question and the answer embeddings for both train and test
combined_train_embeddings = torch.cat([train_question_embeddings[0], train_answer_embeddings[0]], dim=1)
combined_test_embeddings = torch.cat([test_question_embeddings[0], test_answer_embeddings[0]], dim=1)
    

In [5]:
# print the shape of the combined embeddings
print(combined_train_embeddings.shape)
print(combined_test_embeddings.shape)

torch.Size([9600, 1536])
torch.Size([1920, 1536])


In [6]:
def set_random(SEED=0):
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

class MyDataset(Dataset):
    def __init__(self, data, label, label_idx=0):
        self.data = data
        self.label = label
        self.label_idx = label_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], torch.tensor(self.label[idx][self.label_idx])
    
def convert_mbti_to_label(mbti: str):
    """
    :param mbti: string. length=4
    :return:
    """
    stand = 'ISTJ'  # [0, 0, 0, 0]
    result = []
    for i in range(4):
        if stand[i] == mbti[i]:
            result.append(0)
        else:
            result.append(1)

    return result

# def convert_label_to_mbti(num, label_idx):
#     stand = 'ISTJ'
#     mbti = stand[label_idx]

In [7]:
def train(model, dl, optimizer, criterion, device=1):
    model = model.cuda(device)
    model.train()
    loss_all, acc_all = 0, 0
    
    for x, y in dl:
        x, y = x.cuda(device), y.cuda(device)
        output = model(x)
        loss = criterion(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        acc = (output.argmax(axis=1) == y).sum() / len(y)

        loss_all += loss.item()
        acc_all += acc.item()


    loss = loss_all / len(dl)
    acc = acc_all / len(dl)

    return loss, acc

def valid(model, dl, optimizer=None, criterion=None, device=1):
    model = model.cuda(device)
    model.eval()
    loss_all, acc_all = 0, 0
    
    output_list = []
    for x, y in dl:
        x, y = x.cuda(device), y.cuda(device)
        output = model(x)
        loss = criterion(output, y)

        acc = (output.argmax(axis=1) == y).sum() / len(y)

        loss_all += loss.item()
        acc_all += acc.item()

        output_list.append(output.argmax(dim=1).cpu())
        
    loss = loss_all / len(dl)
    acc = acc_all / len(dl)
    
    
#     # userid accuracy
#     result = 0
#     a = torch.cat(output_list)
#     for uid in test_df['User_ID'].unique():
#         idx = test_df[test_df['User_ID']==uid].index
#         if a[idx].count_nonzero().item() > len(a[idx])//2:
#             label = 1
#         else:
#             label = 0
            
#         result += convert_mbti_to_label(test_df[test_df['User_ID']==uid]['MBTI'].unique()[0])[label_idx] == label
        
    
    return loss, acc


In [8]:
def forward(model, dl, device=0):
    pooled = []
    hidden = []
    model.cuda(device)
    model.eval()
    for data in dl:
        data = {k:v.cuda(device) for k,v in data.items()}
        with torch.no_grad():
            output = model(**data, output_hidden_states=True)
        p, h = output.pooler_output, output.hidden_states
        pooled.append(p) # pooler output
        hidden.append(h[-1][:,0,:]) # only [CLS] token embedding 
    return torch.cat(pooled), torch.cat(hidden)

In [10]:
def main(label_idx=0, device=1, name='test', epochs=500):
    
    model = nn.Sequential(nn.Linear(768*2, 50),
                              nn.ReLU(),
                              nn.Linear(50, 2))
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
        
    # dataset / dataloader
    train_data = combined_train_embeddings # pooled_output of question and answer embeddings
    train_label = train_df['MBTI'].map(convert_mbti_to_label)
    
    test_data = combined_test_embeddings
    test_label = test_df['MBTI'].map(convert_mbti_to_label)
    
    train_ds = MyDataset(train_data, train_label, label_idx)
    test_ds = MyDataset(test_data, test_label, label_idx)

    train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=1024, shuffle=False)

    # train
    train_final = []
    val_final = []
    
    save_dir = f'./ckpt/{name}'
    for epoch in range(1, epochs+1):
        train_loss, train_acc = train(model, train_dl, optimizer, criterion, device=0)
        # validation
        val_loss, val_acc = valid(model, test_dl, criterion=criterion, device=1)

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Acc/Train', train_acc, epoch)
#         wandb.log({'train_loss': train_loss, 'train_acc': train_acc, 'epoch': epoch})
        writer.add_scalar('Loss/Test', val_loss, epoch)
        writer.add_scalar('Acc/Test', val_acc, epoch)
#         writer.add_scalar('Acc/userid', acc, epoch)
#         wandb.log({'val_loss': val_loss, 'val_acc': val_acc, 'epoch': epoch})

        train_final.append([train_loss, train_acc])
        val_final.append([val_loss, val_acc])
        
        os.makedirs(save_dir, exist_ok=True)
        
        if epoch % 50 == 0:
                print(f"Epoch {epoch}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
                torch.save(model, f"{save_dir}/epoch_{epoch}.pt")

    return train_final, val_final

In [11]:
# Train all
test_number = 1
MBTI = ['IE', 'SN', 'TF', 'JP']
set_random(422)
for i in range(4):
    writer = SummaryWriter(f'./tensorboard/{test_number}/{MBTI[i]}/')
    result = main(i, 7, MBTI[i], 500)

Epoch 50/500 | Train Loss: 0.6814 | Train Acc: 0.5614 | Val Loss: 0.7020 | Val Acc: 0.4951
Epoch 100/500 | Train Loss: 0.6820 | Train Acc: 0.5586 | Val Loss: 0.7122 | Val Acc: 0.5001
Epoch 150/500 | Train Loss: 0.6776 | Train Acc: 0.5750 | Val Loss: 0.7041 | Val Acc: 0.5023
Epoch 200/500 | Train Loss: 0.6774 | Train Acc: 0.5740 | Val Loss: 0.7052 | Val Acc: 0.5072
Epoch 250/500 | Train Loss: 0.6749 | Train Acc: 0.5781 | Val Loss: 0.7064 | Val Acc: 0.5046
Epoch 300/500 | Train Loss: 0.6743 | Train Acc: 0.5777 | Val Loss: 0.7070 | Val Acc: 0.5035
Epoch 350/500 | Train Loss: 0.6715 | Train Acc: 0.5869 | Val Loss: 0.7071 | Val Acc: 0.5092
Epoch 400/500 | Train Loss: 0.6722 | Train Acc: 0.5818 | Val Loss: 0.7117 | Val Acc: 0.5005
Epoch 450/500 | Train Loss: 0.6683 | Train Acc: 0.5921 | Val Loss: 0.7086 | Val Acc: 0.5151
Epoch 500/500 | Train Loss: 0.6682 | Train Acc: 0.5905 | Val Loss: 0.7082 | Val Acc: 0.5052
Epoch 50/500 | Train Loss: 0.6825 | Train Acc: 0.5632 | Val Loss: 0.7044 | Val Ac