In [None]:
#################################
####### TRAINING CODE CELL ######
#################################

import pandas as pd
from collections import Counter
import random
from sklearn.model_selection import KFold
import numpy as np
import torch
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, PreTrainedModel, AutoConfig
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim.lr_scheduler import LambdaLR

# Module-level sigmoid layer; applied to raw logits by the accuracy helper.
sig = torch.nn.Sigmoid()

# PREP ELEMENT
def prep_element(x):
    """Stack a sequence of tensors, swap the two axes and move to ``device``.

    ``x`` is stacked along a new leading axis and the result is transposed
    with ``permute(1, 0)``, so the stacked tensor must be 2-D. The target
    ``device`` is read from module scope.
    # assumes x is the per-position list of (batch,) tensors produced by the
    # default DataLoader collate — TODO confirm
    """
    stacked = torch.stack(x)
    return stacked.permute(1, 0).to(device)

# GET ACC
def get_acc(preds, labels, threshold=0.5):
    """Return the mean per-sample label accuracy ("weak" accuracy) of a batch.

    Args:
        preds: 2-D tensor of raw logits (sigmoid is applied here).
        labels: Tensor of 0/1 targets with the same shape as ``preds``.
        threshold: Probability at or above which a label counts as predicted 1.

    Returns:
        A Python float: for every row the fraction of labels predicted
        correctly, averaged over the rows of the batch.
    """
    with torch.no_grad():
        probabilities = sig(preds)

    probs_np = probabilities.detach().cpu().numpy()
    labels_np = labels.detach().cpu().numpy()

    # Hard 0/1 decisions, kept as float32 so they compare against float labels.
    hard_preds = (probs_np >= threshold).astype(np.float32)

    # Fraction of correctly predicted labels for each sample (row).
    per_sample = (hard_preds == labels_np).mean(axis=1).astype(np.float32)

    return (per_sample.sum() / len(labels_np)).item()


# FIX RANDOM SEED
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

# PROCESS CODE
def process(example):
    """Turn one raw training row into tokenized inputs plus four binary labels.

    Reads the module-level ``questions`` list and ``tokenizer``. The prompt is
    "<gender>, <age>세 질문: <question> 답변: <answer>", padded/truncated to
    144 tokens. Each character of ``example['MBTI']`` becomes one label:
    0 for I/S/T/J, 1 for anything else.

    Returns:
        The same ``example`` mapping, with the tokenizer outputs and a
        ``labels`` entry added.
    """
    question = questions[example['Q_number'] - 1]  # Q_number is 1-based
    answer = example['Answer']
    gender_code, age = example['Gender'], example['Age']
    gender_word = '여자' if gender_code != 0 else '남자'

    prompt = f'{gender_word}, {age}세' + ' 질문: ' + question + ' 답변: ' + answer
    # Original note on this call: "96~97%" — presumably the share of samples
    # that fit into 144 tokens; confirm before changing max_length.
    encoded = tokenizer(
        prompt,
        max_length=144,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Drop the leading batch axis that return_tensors='pt' adds.
    for key, value in encoded.items():
        example[key] = value.squeeze(0)

    example['labels'] = torch.Tensor(
        [int(letter not in "ISTJ") for letter in example['MBTI']]
    )
    return example


### MODEL CODE START ###
class ClfModel(PreTrainedModel):
    """Pretrained encoder with four independent single-logit heads.

    The representation at the first token position of the encoder's last
    hidden state is fed to four ``Dropout -> Linear(out_size, 1)`` heads.
    Sub-module names (``model``, ``nn_1`` ... ``nn_4``) define the
    ``state_dict`` keys and must stay stable for saved checkpoints to load.

    Args:
        config: Hugging Face config of the backbone; its ``name_or_path`` is
            used to load the pretrained weights.
        out_size: Hidden size of the backbone, i.e. the input width of the
            heads (1024 by default).
    """

    def __init__(self, config, out_size=1024):
        super().__init__(config)
        self.config = config
        # Load the backbone described by the config that was passed in rather
        # than by a module-level global, so the class works with any config.
        self.model = AutoModel.from_pretrained(config.name_or_path)

        self.out = out_size

        # One head per target; created in a fixed order so that parameter
        # initialisation under a fixed seed is reproducible.
        self.nn_1 = self._make_head(self.out)
        self.nn_2 = self._make_head(self.out)
        self.nn_3 = self._make_head(self.out)
        self.nn_4 = self._make_head(self.out)

    @staticmethod
    def _make_head(in_features):
        """Build one ``Dropout(0.1) -> Linear(in_features, 1)`` head."""
        return nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(in_features, 1),
        )

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return a 4-tuple of logits, one ``(batch, 1)`` tensor per head.

        Extra keyword arguments are accepted and ignored.
        """
        hidden = self.model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        first_token = hidden[:, 0, :]  # select first token.

        out1 = self.nn_1(first_token)
        out2 = self.nn_2(first_token)
        out3 = self.nn_3(first_token)
        out4 = self.nn_4(first_token)

        return out1, out2, out3, out4

### END OF MODEL CODE ###

rseed = 123
set_seed(rseed)

# Best Training setting would be to give New Data's 49~60 + Conventional K-Fold.
df = pd.read_csv("/workspace/final_QIA/phase1/train.csv")  # 0 ~ 240 are phase 1.

# Question ids that take part in the K-fold split, and the auxiliary range
# (49-60). The auxiliary ids are not used by the active split below; earlier
# experiments added them to either the training or the validation frame.
q_numbers = list(range(1, 49))
q_numbers_aux = list(range(49, 61))

SPLIT_NUM = 5
EPOCHS = 10

model_name = "klue/roberta-large"
model_path = f"/workspace/final_QIA/models/{model_name}"
os.makedirs(model_path, exist_ok=True)

# Fold-invariant resources: loaded once instead of once per fold.
questions = pd.read_excel("/workspace/final_QIA/phase1/Question.xlsx")
questions = questions.Question.tolist()
tokenizer = AutoTokenizer.from_pretrained(model_name)
config_ = AutoConfig.from_pretrained(model_name)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

columns_to_remove = ['MBTI', 'Answer', 'Short_Answer', '__index_level_0__']

kf = KFold(n_splits=SPLIT_NUM, shuffle=True, random_state=rseed)

# The split is done over question ids, so a validation fold only contains
# questions that were never seen during training of that fold.
for CURR_IDX, (train_pos, val_pos) in enumerate(kf.split(q_numbers)):
    train = [q_numbers[i] for i in train_pos]  # ~38 question ids
    val = [q_numbers[i] for i in val_pos]      # ~10 question ids

    train_df = df[df.Q_number.isin(train)]
    val_df = df[df.Q_number.isin(val)]

    train_dset = Dataset.from_pandas(train_df)
    val_dset = Dataset.from_pandas(val_df)

    # A fresh model per fold, starting from the pretrained backbone.
    model = ClfModel(config_, out_size=1024)

    train_dset = train_dset.map(process, remove_columns=columns_to_remove)
    val_dset = val_dset.map(process, remove_columns=columns_to_remove)

    train_loader = DataLoader(train_dset, batch_size=32, shuffle=True)
    # Validation order is fixed: the metrics below are averaged per batch, so
    # shuffling would make them depend on which rows land in the last batch.
    val_loader = DataLoader(val_dset, batch_size=32, shuffle=False)

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)

    best_acc = 0

    print("Initiate Training ...")
    for epoch in range(EPOCHS):

        # Running sums over batches for this epoch.
        train_loss, test_loss, train_wacc, test_wacc = 0.0, 0.0, 0.0, 0.0

        ### TRAINING ###
        model.train()

        with tqdm(train_loader, unit="batch") as t_epoch:
            t_epoch.set_description(f"Training at Epoch {epoch+1}")
            for batch in t_epoch:
                optimizer.zero_grad()

                output = model(
                    input_ids=prep_element(batch["input_ids"]),
                    attention_mask=prep_element(batch["attention_mask"])
                )

                # Four (batch, 1) head outputs -> one (batch, 4) logit matrix.
                logits = torch.permute(torch.stack(output).squeeze(-1), dims=[1, 0])
                labels = prep_element(batch["labels"]).squeeze(-1)

                loss = criterion(logits, labels)

                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                train_wacc += get_acc(logits, labels, 0.5)

        tr_len = len(train_loader)

        # Trivial Error: mean of per-batch means (the last batch may be smaller).
        avg_train_loss = train_loss / tr_len
        avg_train_wacc = train_wacc / tr_len

        print(f"Epoch {epoch}:")
        print("Train Loss: {:.4f}".format(avg_train_loss))
        print("Train Weak Acc: {:.4f}".format(avg_train_wacc))

        ### VALIDATION ###
        model.eval()

        with torch.no_grad():
            with tqdm(val_loader, unit="batch") as v_epoch:
                for batch in v_epoch:
                    output = model(
                        input_ids=prep_element(batch["input_ids"]),
                        attention_mask=prep_element(batch["attention_mask"])
                    )

                    logits = torch.permute(torch.stack(output).squeeze(-1), dims=[1, 0])
                    labels = prep_element(batch["labels"]).squeeze(-1)

                    test_loss += criterion(logits, labels).item()
                    test_wacc += get_acc(logits, labels, 0.5)

        te_len = len(val_loader)

        # Trivial Error: mean of per-batch means (the last batch may be smaller).
        avg_test_loss = test_loss / te_len
        avg_test_wacc = test_wacc / te_len

        print(f"Epoch {epoch}:")
        print("Test Loss: {:.4f}".format(avg_test_loss))
        print("Test Weak Acc: {:.4f}".format(avg_test_wacc))

        ## Saving: keep only the best checkpoint (by validation accuracy) per fold.
        if avg_test_wacc > best_acc:
            best_acc = avg_test_wacc
            ckpt = {'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'train_acc': avg_train_wacc,
                    'test_acc': avg_test_wacc,
                    }
            # NOTE: the file name says "base" although model_name is the large
            # model; it is kept unchanged so existing checkpoints and any code
            # that looks them up keep working. The directory carries the real name.
            torch.save(ckpt, f"{model_path}/klue_roberta_base_{CURR_IDX}.pth")

        # Decay the learning rate only after this epoch's optimizer steps, so
        # the first epoch really runs at the configured initial rate.
        scheduler.step()

In [None]:
#################################
### CHECK VALIDATION ACCURACY ###
#################################
import glob
import torch

# Print the stored train / validation accuracy of every saved checkpoint,
# in sorted path order.
checkpoint_paths = glob.glob("/workspace/final_QIA/models/*/*/*")
checkpoint_paths.sort()

for ckpt_path in checkpoint_paths:
    # Load onto the CPU; only the stored metrics are read here.
    ckpt = torch.load(ckpt_path, map_location='cpu')
    file_name = ckpt_path.split("/")[-1]
    print(file_name, ckpt['train_acc'], ckpt['test_acc'])

In [None]:
#################################
####### TESTING CODE CELL ######
#################################

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset 
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
import torch
import glob
import csv
import re
import torch.nn as nn
import statistics

# Identifier of the pretrained model whose tokenizer is loaded in this cell.
model_name = 'klue/roberta-large'
# Suffix of the prediction CSV path written at the end of this cell.
fname = '0521_final_test_file.csv'

# INFERENCE

# df_test = pd.merge(df_test1, questions, left_on='Q_number', right_on='index', how='inner')

# PREP ELEMENT
def prep_element(x):
    """Stack a sequence of tensors, swap the two axes and move to ``device``.

    ``x`` is stacked along a new leading axis and the result is transposed
    with ``permute(1, 0)``, so the stacked tensor must be 2-D. The target
    ``device`` is read from module scope.
    # assumes x is the per-position list of (batch,) tensors produced by the
    # default DataLoader collate — TODO confirm
    """
    stacked = torch.stack(x)
    return stacked.permute(1, 0).to(device)


def test_process(example, tokenizer):
    """Tokenize one test row in place and return it.

    Reads the module-level ``questions`` list. The prompt is
    "<gender>, <age>세 질문: <question> 답변: <answer>", padded/truncated to
    144 tokens; the tokenizer outputs are stored on ``example`` without the
    leading batch axis.
    """
    question = questions[example['Q_number'] - 1]  # Q_number is 1-based
    answer = example['Answer']

    gender_code, age = example['Gender'], example['Age']
    gender_word = '여자' if gender_code != 0 else '남자'

    prompt = f'{gender_word}, {age}세 질문: ' + question + " 답변: " + answer
    # Original note on this call: "96~97%" — presumably the share of samples
    # that fit into 144 tokens; confirm before changing max_length.
    encoded = tokenizer(
        prompt,
        max_length=144,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    for key, value in encoded.items():
        example[key] = value.squeeze(0)

    return example


### MODEL CODE START ###
class ClfModel(PreTrainedModel):
    """Pretrained encoder with four independent single-logit heads.

    The representation at the first token position of the encoder's last
    hidden state is fed to four ``Dropout -> Linear(out_size, 1)`` heads.
    Sub-module names (``model``, ``nn_1`` ... ``nn_4``) define the
    ``state_dict`` keys and must stay stable for saved checkpoints to load.

    Args:
        config: Hugging Face config of the backbone; its ``name_or_path`` is
            used to load the pretrained weights.
        out_size: Hidden size of the backbone, i.e. the input width of the
            heads (1024 by default).
    """

    def __init__(self, config, out_size=1024):
        super().__init__(config)
        self.config = config
        # Load the backbone described by the config that was passed in rather
        # than by a module-level global, so the class works with any config.
        self.model = AutoModel.from_pretrained(config.name_or_path)

        self.out = out_size

        # One head per target, created in a fixed order.
        self.nn_1 = self._make_head(self.out)
        self.nn_2 = self._make_head(self.out)
        self.nn_3 = self._make_head(self.out)
        self.nn_4 = self._make_head(self.out)

    @staticmethod
    def _make_head(in_features):
        """Build one ``Dropout(0.1) -> Linear(in_features, 1)`` head."""
        return nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(in_features, 1),
        )

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return a 4-tuple of logits, one ``(batch, 1)`` tensor per head.

        Extra keyword arguments are accepted and ignored.
        """
        hidden = self.model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        first_token = hidden[:, 0, :]  # select first token.

        out1 = self.nn_1(first_token)
        out2 = self.nn_2(first_token)
        out3 = self.nn_3(first_token)
        out4 = self.nn_4(first_token)

        return out1, out2, out3, out4

### END OF MODEL CODE ###


# Load the test rows and the question texts, then tokenize every row once.
df_test = pd.read_csv("/workspace/final_QIA/phase1/test.csv")
questions = pd.read_excel("/workspace/final_QIA/phase1/Question.xlsx")
questions = questions.Question.tolist()
tokenizer = AutoTokenizer.from_pretrained(model_name)

test_set = Dataset.from_pandas(df_test)
test_set = test_set.map(lambda x: test_process(x, tokenizer))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Order must stay fixed so row i of every model's output is the same sample.
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

# One (num_rows, 4) tensor of sigmoid probabilities per loaded checkpoint.
ensemble_final_preds = []
sigmoid = nn.Sigmoid()

model_path = f"/workspace/final_QIA/models/{model_name}"

for test_model_path in sorted(glob.glob("/workspace/final_QIA/models/*/*/*")):

    # Pick the backbone and head width from the checkpoint path.
    if 'roberta-base' in test_model_path:
        continue
    if 'roberta-large' in test_model_path:
        backbone_name, hidden_size = 'klue/roberta-large', 1024
    elif 'kobigbird-bert-base' in test_model_path:
        # NOTE(review): the inputs above were tokenized with the `model_name`
        # tokenizer only; confirm that this is intended for this backbone.
        backbone_name, hidden_size = 'monologg/kobigbird-bert-base', 768
    else:
        # Without this branch an unrecognised file would silently reuse the
        # model built for the previous checkpoint (or fail with NameError).
        print("Skipping unrecognised checkpoint", test_model_path)
        continue

    print("Loading", test_model_path)
    config_ = AutoConfig.from_pretrained(backbone_name)
    model = ClfModel(config_, out_size=hidden_size)

    # map_location lets checkpoints saved on a GPU load on a CPU-only host.
    checkpoint = torch.load(test_model_path, map_location=device)
    model.load_state_dict(checkpoint['model'])
    model.to(device)

    # Get all the results by running the model on the dataset
    model.eval()

    results = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            output = model(
                input_ids=prep_element(batch["input_ids"]),
                attention_mask=prep_element(batch["attention_mask"])
            )

            # (4, batch, 1) -> (batch, 4). Only the trailing axis is squeezed,
            # so a final batch holding a single row keeps its batch axis.
            batch_logits = torch.stack(output).squeeze(-1).permute(1, 0)
            results.append(batch_logits.detach().cpu())

    # Concatenate along the batch axis; unlike stacking, this also works when
    # the last batch is smaller than the others.
    final_results = sigmoid(torch.cat(results, dim=0))
    ensemble_final_preds.append(final_results)


# Header of the output CSV: a row id followed by one column per MBTI axis.
fieldnames = [
    'idx',
    'I/E',
    'S/N',
    'T/F',
    'J/P',
]

def avg2(li, num_stdev=2.5):
    """Return the mean of ``li`` after discarding values far from the median.

    A value is kept when it lies within ``num_stdev`` sample standard
    deviations of the median (bounds inclusive).

    Args:
        li: Non-empty list of numbers.
        num_stdev: Half-width of the accepted band, in standard deviations.

    Returns:
        The arithmetic mean of the kept values. A single-element list has no
        spread to measure, so its plain mean is returned.

    Raises:
        statistics.StatisticsError: If ``li`` is empty.
    """
    median = statistics.median(li)  # raises StatisticsError on empty input
    if len(li) < 2:
        # statistics.stdev needs at least two data points.
        return sum(li) / len(li)

    half_width = num_stdev * statistics.stdev(li)
    lower_bound = median - half_width
    upper_bound = median + half_width
    non_outliers = [x for x in li if lower_bound <= x <= upper_bound]
    return sum(non_outliers) / len(non_outliers)

def avg(li):
    """Return the arithmetic mean of the numbers in ``li``."""
    total = sum(li)
    return total / len(li)


# Fail before the output file is opened (and truncated) when no model
# produced predictions.
if not ensemble_final_preds:
    raise RuntimeError("No model predictions were collected; nothing to write.")

# Score columns, in the same order as the last axis of each prediction tensor.
axis_columns = ('I/E', 'S/N', 'T/F', 'J/P')

# open the CSV file for writing
with open(f"/workspace/final_QIA_{fname}", 'w', newline='', encoding='utf-8') as csvfile:
    # create a writer object
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # write the header row
    writer.writeheader()

    num_rows = len(ensemble_final_preds[0])
    for idx in tqdm(range(num_rows)):

        # Soft voting: each written score is the plain average of the
        # per-model scores for that row and axis.
        row = {'idx': idx + 1}  # ids in the file are 1-based
        for axis, column in enumerate(axis_columns):
            row[column] = avg([preds[idx][axis].item() for preds in ensemble_final_preds])

        writer.writerow(row)