In [None]:
# Mount Google Drive at /content/drive; the data and output paths used in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Runtime dependencies. Only transformers is pinned (4.28.0); datasets,
# evaluate and konlpy are installed at whatever version pip resolves.
# NOTE(review): consider pinning the remaining packages for reproducibility.
!pip install transformers==4.28.0
!pip install datasets evaluate
!pip install konlpy

# Importing Packages

In [None]:
import collections
import numpy as np
import string
import pandas as pd

import logging
import json
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizerFast,
    AlbertModel,
    DataCollatorWithPadding,
    PreTrainedTokenizerFast,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

# Loading Data + Preprocessing

In [None]:
# Build the stopword list: one entry per line of the stopwords file, plus the
# two punctuation marks that should also be dropped from tokenized text.
stopwords_file_path = '/content/drive/MyDrive/QIA2023_phase1/data/stopwords.txt'

# Use a context manager so the file handle is closed as soon as it is read
# (the previous bare open() left it open for the life of the kernel).
with open(stopwords_file_path, "r") as lines:
  # Strip the newline and any byte-order mark left at the start of the file.
  filter_list = [word.replace("\n", "").replace("\ufeff", "") for word in lines]

filter_list.extend([".", ","])

from konlpy.tag import Okt
from konlpy.utils import pprint

# Shared Okt tagger instance; preprocess_str calls okt.morphs() on it.
okt = Okt()

# remove stopwords and punctuation
def preprocess_str(text):
  """Tokenize `text` into morphemes and drop stopwords/punctuation.

  The text is split with `okt.morphs(..., norm=True)` (normalized, not
  lemmatized), every token found in `filter_list` is removed, and the
  remaining tokens are joined with single spaces.

  Returns:
    The space-joined remaining tokens, or an empty string when every token
    was filtered out (previously this case raised IndexError on `text[0]`).
  """
  tokens = okt.morphs(text, norm=True) # not lemma
  kept = [word for word in tokens if word not in filter_list]
  return " ".join(kept)

# Question texts: the 'Question' column of the spreadsheet, as a Series.
# preprocess_data looks entries up with question_df[Q_number - 1].
question_df = pd.read_excel('/content/drive/MyDrive/QIA2023_phase1/data/Question.xlsx')['Question']
# question_df = question_df.apply(preprocess_str)


# Move the leading <tag> of an answer behind the answer text
def preprocess_ans(word):
  """Rewrite '<tag>answer' as 'answer [SEP]tag'.

  The string is split at the first '>' only, so any later '>' characters in
  the answer body are preserved (an unbounded split used to discard everything
  after the second '>'). The first character of the tag part (the '<') is
  dropped.

  Raises:
    IndexError: if `word` contains no '>' at all.
  """
  lst = word.split(">", 1)
  return lst[1] + " [SEP]" + lst[0][1:]

def preprocess_data(data_df):
  """Build the model input text for every row of `data_df`.

  Reads the 'Answer', 'Q_number', 'Age' and 'Gender' columns and returns a
  new DataFrame in which:
    * 'Q_number' holds the question text looked up as question_df[Q_number - 1]
    * 'Answer' holds
      "<question> [SEP] <preprocess_ans(answer)> [SEP] <age> [SEP] <gender>"

  The input frame is copied first, so the caller's DataFrame is left
  untouched and the function can safely be re-run on the same raw frame
  (mutating in place made a second call fail on the already rewritten
  columns).
  """
  data_df = data_df.copy()
  data_df["Answer"] = data_df["Answer"].apply(preprocess_ans)
  data_df["Q_number"] = data_df["Q_number"].apply(lambda idx: question_df[idx - 1])
  data_df['Answer'] = (
    data_df["Q_number"]
    + " [SEP] " + data_df['Answer']
    + " [SEP] " + data_df['Age'].astype(str)
    + " [SEP] " + data_df['Gender'].astype(str)
  )
  return data_df

In [None]:
# Read the cp949-encoded training CSV (first column as the index) and turn
# each row into model input text in one step.
df = preprocess_data(
    pd.read_csv(
        '/content/drive/MyDrive/QIA2023_phase1/data/train_data_s1_v1.csv',
        encoding='cp949',
        index_col=0,
    )
)

In [None]:
# Show the last rows of the preprocessed training frame as a sanity check.
df.tail()

# Loading pretrained model checkpoint

In [None]:
# The 16 MBTI types; a type's position in this list is its class id.
labels = [
    "ESTJ", "ENTJ", "ESFJ", "ENFJ",
    "ISTJ", "ISFJ", "INTJ", "INFJ",
    "ESTP", "ESFP", "ENTP", "ENFP",
    "ISTP", "ISFP", "INTP", "INFP",
]

# Forward (name -> id) and reverse (id -> name) lookups handed to the model config.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}

# TensorFlow model

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from transformers import TFBertModel

# NOTE(review): AUTO, EPOCHS, BATCH_SIZE and MAX_LEN are not referenced by any
# other cell visible in this notebook; the TrainingArguments cell hard-codes
# its own batch size and epoch count. Confirm whether this section is still needed.
AUTO = tf.data.experimental.AUTOTUNE
# Configuration
EPOCHS = 3
BATCH_SIZE = 8
MAX_LEN = 512

# Hugging Face model

In [None]:
# Hub checkpoint to fine-tune.
model_checkpoint = "klue/roberta-large"

# Load config, tokenizer and a sequence-classification model from the
# checkpoint. The classifier gets 16 outputs, named through id2label/label2id.
# `config` is loaded but not passed to the model call below.
config    = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model     = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=16, id2label=id2label, label2id=label2id
)

In [None]:
# Wrap the training frame as a datasets.Dataset with two columns:
#   "text"  - the assembled input string from the 'Answer' column
#   "label" - the class id of the row's 'MBTI' value (KeyError on an unknown type)
dataset = datasets.Dataset.from_dict({
    "text": df["Answer"].tolist(),
    "label": [label2id[mbti] for mbti in df["MBTI"]],
})

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the examples for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so evaluation numbers from different runs
# are comparable (the split used to be re-drawn randomly on every run).
dataset = dataset.train_test_split(test_size = 0.05, seed = 42)

In [None]:
# Peek at the first training example (a dict with "text" and "label").
example = dataset['train'][0]
example

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch, with truncation enabled.

    Written for use with `dataset.map(..., batched=True)`: `examples["text"]`
    is passed to the module-level `tokenizer` as-is and its result is returned
    unchanged.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation = True)
    return encoded

In [None]:
# Tokenize both splits in batches; the result keeps the 'train'/'test' keys.
tokenized_ds = dataset.map(preprocess_function, batched = True)

# Metrics

In [None]:
import evaluate

# Accuracy metric from the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `accuracy` metric.

    `eval_pred` unpacks into (scores, references). The predicted class for
    each row is the argmax of its scores along axis 1; the metric's result
    is returned as-is.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Training Arguments + Trainer

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tuning hyper-parameters. Evaluation runs every 1000 steps; checkpoints
# are saved on a step schedule too, with save_steps left at the library default.
training_args = TrainingArguments(
    output_dir = "checkpoints",
    # Must be a real bool: the string 'True' only worked because any non-empty
    # string is truthy (and so would 'False' have been).
    overwrite_output_dir = True,
    learning_rate = 1e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
    evaluation_strategy = "steps",
    save_strategy = "steps",
    eval_steps = 1000
)

In [None]:
# Wire the model, arguments, tokenized splits, tokenizer, padding collator and
# metric function into a Trainer. The 'test' split from train_test_split
# serves as the evaluation set.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds["train"],
    eval_dataset = tokenized_ds["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [None]:
import torch, gc
import os
# Run the garbage collector and release cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

# Training

In [None]:
# Run fine-tuning; checkpoints are written under the "checkpoints" output_dir.
trainer.train()

# Deploy Model

In [None]:
from tqdm import tqdm
import torch

# Run inference on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Read the cp949-encoded test CSV and build its input text the same way as
# for the training data.
test_df = preprocess_data(
    pd.read_csv(
        '/content/drive/MyDrive/QIA2023_phase1/data/hackathon_test_for_user.csv',
        encoding='cp949',
    )
)

In [None]:
# Fine-tuned checkpoint to load for inference.
# NOTE(review): the step number is hard-coded; confirm this directory exists
# after the training run above before executing the cells below.
model_checkpoint = "/content/checkpoints/checkpoint-7500"

In [None]:
# Reload config, tokenizer and classifier from the fine-tuned checkpoint and
# move the model to the selected device. This rebinds the `config`,
# `tokenizer`, `model` and `data_collator` names used during training.
config    = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model     = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=16, id2label=id2label, label2id=label2id
).to(device)
from transformers import DataCollatorWithPadding

# Padding collator built on the reloaded tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Input strings for inference, in the row order of the test frame.
texts = test_df['Answer'].tolist()

In [None]:
# Per-axis scores for the submission file. Each value is the summed softmax
# probability of every MBTI type that contains the second letter of the axis
# name (E, N, F and P respectively).
final_logits = {
    'I/E': [],
    'S/N': [],
    'T/F': [],
    'J/P': []
}

for i in tqdm(range(0, len(texts), 16)):
    # The last batch may hold fewer than 16 texts.
    current_batch_size = min(16, len(texts) - i)

    inputs = texts[i: i + current_batch_size]
    # Truncate exactly as preprocess_function does for training, so an
    # over-long input cannot exceed the model's maximum sequence length.
    inputs = tokenizer(inputs, truncation = True)
    inputs = data_collator(inputs).to(device)

    # Inference only: skip autograd bookkeeping so no graph or intermediate
    # activations are kept alive for a backward pass that never happens.
    with torch.no_grad():
        logits = model(**inputs).logits
    logits = torch.nn.Softmax(dim = 1)(logits)
    # Transpose so that logits[class_id] is the row of per-example
    # probabilities for that class.
    logits = torch.permute(logits, (1, 0)).cpu().data
    one_logits = torch.zeros(4, current_batch_size)

    gc.collect()
    torch.cuda.empty_cache()

    # Accumulate each class's probability into every axis whose letter
    # appears in the class name.
    for label in labels:
        if ('E' in label):  one_logits[0] += logits[label2id[label]]
        if ('N' in label):  one_logits[1] += logits[label2id[label]]
        if ('F' in label):  one_logits[2] += logits[label2id[label]]
        if ('P' in label):  one_logits[3] += logits[label2id[label]]

    final_logits['I/E'] += one_logits[0].tolist()
    final_logits['S/N'] += one_logits[1].tolist()
    final_logits['T/F'] += one_logits[2].tolist()
    final_logits['J/P'] += one_logits[3].tolist()

In [None]:
# Collect the four score columns into a DataFrame, switch to a 1-based index,
# and write the CSV with the index column named "idx".
result = pd.DataFrame(final_logits)
result.index += 1
result.to_csv('/content/drive/MyDrive/to_submit/Results/Phase1/klue-roberta-large.csv', index_label="idx")

In [None]:
# test = pd.DataFrame({
#     'I/E': [0 if MBTI[0] == 'I' else 1 for MBTI in MBTIs], 
#     'S/N': [0 if MBTI[1] == 'S' else 1 for MBTI in MBTIs], 
#     'T/F': [0 if MBTI[2] == 'T' else 1 for MBTI in MBTIs], 
#     'J/P': [0 if MBTI[3] == 'J' else 1 for MBTI in MBTIs], 
# })
# test.index += 1
# test.to_csv('result.csv', index_label="idx")

In [None]:
# %cp -av /content/Single-Flow-model/checkpoint-7500 /content/drive/MyDrive/QIA2023_phase1/checkpoint