In [None]:
# Mount Google Drive so the data and checkpoint paths under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies; transformers is pinned to 4.28.0.
# NOTE(review): datasets and evaluate are unpinned, so a fresh runtime may pull
# different versions — consider pinning them as well.
!pip install transformers==4.28.0
!pip install datasets evaluate
# !pip install konlpy

# Importing Packages

In [None]:
import collections
import numpy as np
import string
import pandas as pd

import logging
import json
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizerFast,
    AlbertModel,
    DataCollatorWithPadding,
    PreTrainedTokenizerFast,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [None]:

# Configuration shared by the training and inference cells.

# Number of consecutive question/answer pairs concatenated into one model input.
COMBINE_SIZE = 2
# Seed for the train/validation split and for the global RNG seeding below.
RANDOM_SEED = 80
TRAIN_PATH = '/content/drive/MyDrive/QIA2023_phase2/data/train_data.xlsx'
TEST_PATH = '/content/drive/MyDrive/QIA2023_phase2/data/test_data.xlsx'

# Seed the global RNGs up front so that anything random that runs before the
# Trainer is created (e.g. the freshly initialised classification head) is
# reproducible after "Restart & Run All".
set_seed(RANDOM_SEED)

# Loading Data + Preprocessing

In [None]:
# Load the training spreadsheet into a DataFrame.
df = pd.read_excel(TRAIN_PATH)

In [None]:
# Display the last rows as a quick sanity check of the loaded sheet.
df.tail()

In [None]:
# Show how many distinct users the training sheet contains, and their IDs.
unique_user_ids = df['User_ID'].unique()
print(len(unique_user_ids))
print(unique_user_ids)

import random
# Shuffle a separate array of the IDs with a locally seeded generator: the order
# is then reproducible across kernel restarts and the global RNG state is left
# untouched.
user_id_list = df['User_ID'].unique()
random.Random(RANDOM_SEED).shuffle(user_id_list)

# Label Mapping + Dataset Construction

In [None]:
# Binary encoding of each MBTI axis (first letter -> 0, second letter -> 1),
# listed in the same order as `label_class`:
# I = 0, E = 1
# S = 0, N = 1
# T = 0, F = 1
# J = 0, P = 1
label_class = [['I', 'E'], ['S', 'N'], ['T', 'F'], ['J', 'P']]

# The 16 MBTI types; a type's position in this list is its class id.
labels = [
    "ESTJ", "ENTJ", "ESFJ", "ENFJ",
    "ISTJ", "ISFJ", "INTJ", "INFJ",
    "ESTP", "ESFP", "ENTP", "ENFP",
    "ISTP", "ISFP", "INTP", "INFP",
]

# Lookup tables between MBTI type strings and integer class ids.
label2id = {mbti: class_id for class_id, mbti in enumerate(labels)}
id2label = {class_id: mbti for class_id, mbti in enumerate(labels)}

In [None]:
# 'Question' column of the question sheet (a Series, despite the `_df` name).
# read_df / read_test_df look questions up in it with `Q_number - 1`.
question_df = pd.read_excel('/content/drive/MyDrive/QIA2023_phase2/data/Question.xlsx')['Question']

def remove_trash(short_ans):
  """Return the part of ``short_ans`` that precedes its first '/'.

  An answer that contains no '/' is returned unchanged.
  """
  return short_ans.split('/')[0] if '/' in short_ans else short_ans

def read_df(df):
  """Group the rows of a labelled answer sheet by user.

  Args:
    df: DataFrame providing the columns 'User_ID', 'Age', 'Gender', 'MBTI',
      'Q_number', 'Short_Answer' and 'Long_Answer'.

  Returns:
    dict keyed by User_ID in order of first appearance. Each value stores the
    'Age', 'Gender' and 'MBTI' of the user's first row together with parallel
    'questions' and 'responses' lists in row order.
  """
  user_infos = {}

  for _, row in df.iterrows():
    user_id = row['User_ID']

    # The first row seen for a user supplies the per-user metadata.
    if user_id not in user_infos:
      user_infos[user_id] = {
          'Age': row['Age'],
          'Gender': row['Gender'],
          'questions': [],
          'responses': [],
          'MBTI': row['MBTI']
      }
    record = user_infos[user_id]

    # Q_number is shifted by one to index into question_df.
    record['questions'].append(question_df[row['Q_number'] - 1])

    # A response is the cleaned short answer followed by the long answer.
    short_part = remove_trash(row['Short_Answer'])
    record['responses'].append(short_part + ' , ' + row['Long_Answer'])

  return user_infos

def read_test_df(df):
  """Group the rows of an unlabelled (test) answer sheet by user.

  Args:
    df: DataFrame providing the columns 'User_ID', 'Age', 'Gender',
      'Q_number', 'Short_Answer' and 'Long_Answer'.

  Returns:
    dict keyed by User_ID in order of first appearance. Each value stores the
    'Age' and 'Gender' of the user's first row together with parallel
    'questions' and 'responses' lists in row order.
  """
  user_infos = {}

  for _, row in df.iterrows():
    user_id = row['User_ID']

    # The first row seen for a user supplies the per-user metadata.
    if user_id not in user_infos:
      user_infos[user_id] = {
          'Age': row['Age'],
          'Gender': row['Gender'],
          'questions': [],
          'responses': []
      }
    record = user_infos[user_id]

    # Q_number is shifted by one to index into question_df.
    record['questions'].append(question_df[row['Q_number'] - 1])

    # A response is the cleaned short answer followed by the long answer.
    short_part = remove_trash(row['Short_Answer'])
    record['responses'].append(short_part + ' , ' + row['Long_Answer'])

  return user_infos

def get_dataset(data_df):
  """Build the labelled text-classification dataset from the training sheet.

  Each user's question/answer pairs are grouped into consecutive chunks of
  COMBINE_SIZE pairs; a trailing incomplete chunk is dropped. Every chunk
  becomes one example whose text is the ' [SEP] '-joined questions and answers
  followed by the user's age and gender, and whose label is the class id of
  the user's MBTI type.

  Args:
    data_df: training DataFrame in the format expected by read_df.

  Returns:
    datasets.Dataset with the columns "text" and "label".
  """
  texts = []
  label_ids = []

  for profile in read_df(data_df).values():
    questions = profile['questions']
    responses = profile['responses']
    label_str = profile['MBTI']
    info_str = str(profile['Age']) + ' [SEP] ' + profile['Gender']

    # Only start offsets that leave room for a full chunk are visited.
    last_start = len(questions) - COMBINE_SIZE
    for start in range(0, last_start + 1, COMBINE_SIZE):
      parts = []
      for k in range(start, start + COMBINE_SIZE):
        parts.append('질문 : ' + questions[k])
        parts.append('답변 : ' + responses[k])
      parts.append(info_str)

      texts.append(' [SEP] '.join(parts))
      label_ids.append(label2id[label_str])

  return datasets.Dataset.from_dict({"text": texts, "label": label_ids})
  # return datasets.DatasetDict({"train":datasets.Dataset.from_dict(train_dataset),"test":datasets.Dataset.from_dict(val_dataset)})

def get_test_dataset(test_df):
  """Build the model inputs for the test sheet.

  Each user's question/answer pairs are grouped into consecutive chunks of
  COMBINE_SIZE pairs (a trailing incomplete chunk is dropped) and rendered the
  same way as in get_dataset, without a label.

  Args:
    test_df: test DataFrame in the format expected by read_test_df.

  Returns:
    (texts, user_ids): ``texts`` holds the chunk texts of all users, user by
    user; ``user_ids`` lists every user once, in the same order.
  """
  texts = []
  user_ids = []

  for user_id, profile in read_test_df(test_df).items():
    questions = profile['questions']
    responses = profile['responses']
    info_str = str(profile['Age']) + ' [SEP] ' + profile['Gender']

    # Only start offsets that leave room for a full chunk are visited.
    last_start = len(questions) - COMBINE_SIZE
    for start in range(0, last_start + 1, COMBINE_SIZE):
      parts = []
      for k in range(start, start + COMBINE_SIZE):
        parts.append('질문 : ' + questions[k])
        parts.append('답변 : ' + responses[k])
      parts.append(info_str)
      texts.append(' [SEP] '.join(parts))

    # A user is recorded even when no full chunk could be built.
    user_ids.append(user_id)
  return texts, user_ids

In [None]:
# Earlier approach, kept for reference: one example per row using only Long_Answer.
# dataset = {
#     "text": [],
#     "label": []
# }

# for index, row in df.iterrows():
#     dataset["text"].append(row["Long_Answer"])
#     dataset["label"].append(label2id[row["MBTI"]])
# df = pd.read_csv(TRAIN_PATH, encoding="cp949", index_col=0)
# Re-read the training sheet and turn it into chunked, labelled examples.
df = pd.read_excel(TRAIN_PATH)
dataset = get_dataset(df)

In [None]:
# NOTE(review): this sklearn import is not used here; the split below is the
# `datasets.Dataset.train_test_split` method.
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for evaluation, seeded for reproducibility.
# NOTE(review): examples are chunks, not users, so chunks of one user can land in
# both splits — confirm this is intended when interpreting eval accuracy.
# NOTE(review): `dataset` is rebound to the split result, so this cell is
# presumably not safe to run twice without re-running the previous cell.
dataset = dataset.train_test_split(test_size = 0.1, seed=RANDOM_SEED)
dataset

In [None]:
# Display the first training example to check the assembled text/label format.
example = dataset['train'][0]
example

# TensorFlow model

In [None]:
# Imports and constants for a TensorFlow/Keras setup.
# NOTE(review): the TensorFlow/Keras names and AUTO / EPOCHS / BATCH_SIZE / MAX_LEN
# are not referenced by any other cell visible in this notebook — confirm before
# removing.
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import AutoModelForMaskedLM

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from transformers import TFBertModel

AUTO = tf.data.experimental.AUTOTUNE
# Configuration
EPOCHS = 3
BATCH_SIZE = 8
MAX_LEN = 512

In [None]:
from sklearn.model_selection import train_test_split

# Hugging Face model

In [None]:
# Checkpoint to fine-tune; the commented lines are alternative checkpoints kept for reference.
# model_checkpoint = "klue/roberta-large"
# model_checkpoint = "klue/bert-base"
model_checkpoint = "beomi/kcbert-large"
# model_checkpoint = "/content/drive/MyDrive/QIA2023_phase2/temp-checkpoints/kcbert-large/checkpoint-8500-lr1e5-031"
# model_checkpoint = "beomi/kcbert-large"

# Load config, tokenizer and a sequence-classification model with one output per
# MBTI type (16 classes), passing the id <-> label maps along.
config    = AutoConfig.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model     = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=16, id2label=id2label, label2id=label2id
)

In [None]:
# Display the model architecture.
model

In [None]:
# dataset = get_dataset()
# dataset

In [None]:
# example = dataset['train'][0]
# example

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Truncation is enabled, so inputs longer than the tokenizer's maximum length
    are cut. Padding is left to the data collator.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True)

In [None]:
# Tokenize both splits, passing examples to preprocess_function in batches.
tokenized_ds = dataset.map(preprocess_function, batched = True)

# Metrics

In [None]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation step.

    Args:
        eval_pred: pair ``(scores, references)`` where ``scores`` has one row
            per example and one column per class.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class Training Arguments + Trainer

In [None]:
from transformers import DataCollatorWithPadding

# Pads each batch dynamically; the tokenization step above applies no padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tuning hyper-parameters. Evaluation and checkpointing both run every 200
# optimizer steps, so every saved checkpoint has a matching evaluation entry.
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/QIA2023_phase2/temp-checkpoints/kcbert-large",
    # Must be a real bool: the former string 'True' only worked because every
    # non-empty string is truthy (the string 'False' would have enabled it too).
    overwrite_output_dir = True,
    learning_rate = 1e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 20,
    weight_decay = 0.01,
    evaluation_strategy = "steps",
    save_strategy = "steps",
    eval_steps = 200,
    save_steps = 200,
)

In [None]:
# Wire model, data splits, tokenizer, collator and metric into the Trainer.
# The held-out "test" split of the training data serves as the evaluation set.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds["train"],
    eval_dataset = tokenized_ds["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [None]:
import torch, gc
import os
# Release unreferenced Python objects and cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

# Training

In [None]:
# Run fine-tuning; checkpoints go to training_args.output_dir every save_steps steps.
trainer.train()
# trainer.save_model("checkpoints")

# Deploy Model

In [None]:
from tqdm import tqdm
import torch, gc
from transformers import DataCollatorWithPadding

# Run inference on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the test spreadsheet into a DataFrame.
test_df = pd.read_excel(TEST_PATH)
# test_df = preprocess_data(test_df)

In [None]:
# Reload a saved fine-tuning checkpoint for inference; this rebinds
# model_checkpoint, config, tokenizer, model and data_collator.
# NOTE(review): the checkpoint step (2000) is hard-coded — confirm it is the
# checkpoint that should be used for the submission.
model_checkpoint = '/content/drive/MyDrive/QIA2023_phase2/temp-checkpoints/kcbert-large/checkpoint-2000'

config    = AutoConfig.from_pretrained(model_checkpoint)
# model_max_length=300 caps tokenized inputs at 300 tokens when truncation is on.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=300)
model     = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=16, id2label=id2label, label2id=label2id
).to(device)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Earlier approach, kept for reference: one input per row using only Long_Answer.
# texts = []

# for index, row in test_df.iterrows():
#     texts.append(row['Long_Answer'])
# len(texts)

# Chunked model inputs for all test users, plus the user IDs in the same order.
texts, user_ids = get_test_dataset(test_df)

In [None]:
# Display the first test input to check the assembled text format.
texts[0]

In [None]:
# Per-chunk probability mass assigned to the second letter of each MBTI axis
# (E, N, F, P), obtained by summing the softmax over all types with that letter.
raw_logits = {
    'I/E': [],
    'S/N': [],
    'T/F': [],
    'J/P': []
}

# Per-user scores for the same four axes; filled by the aggregation step.
final_logits = {
    'User_ID': user_ids,
    'I/E': [],
    'S/N': [],
    'T/F': [],
    'J/P': []
}

# Number of chunk texts sent through the model per forward pass.
INFERENCE_BATCH_SIZE = 16

# Make sure dropout is disabled even if `model` is still the instance that was
# just trained rather than one freshly loaded from a checkpoint.
model.eval()

for i in tqdm(range(0, len(texts), INFERENCE_BATCH_SIZE)):
    batch_texts = texts[i: i + INFERENCE_BATCH_SIZE]
    current_batch_size = len(batch_texts)

    inputs = tokenizer(batch_texts, truncation=True)
    inputs = data_collator(inputs).to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept alive on
    # the GPU for every batch.
    with torch.no_grad():
        logits = model(**inputs).logits
        logits = torch.softmax(logits, dim=1)
    # Transpose to (num_classes, batch) so each row is the probability of one
    # MBTI type across the batch.
    logits = torch.permute(logits, (1, 0)).cpu()
    one_logits = torch.zeros(4, current_batch_size)

    gc.collect()
    torch.cuda.empty_cache()

    # Marginalise the 16-way distribution onto the four binary axes.
    for label in labels:
        class_probs = logits[label2id[label]]
        if ('E' in label):  one_logits[0] += class_probs
        if ('N' in label):  one_logits[1] += class_probs
        if ('F' in label):  one_logits[2] += class_probs
        if ('P' in label):  one_logits[3] += class_probs

    raw_logits['I/E'] += one_logits[0].tolist()
    raw_logits['S/N'] += one_logits[1].tolist()
    raw_logits['T/F'] += one_logits[2].tolist()
    raw_logits['J/P'] += one_logits[3].tolist()

# Chunks per user under the assumption of 60 answers per user.
# NOTE(review): 60 is hard-coded — confirm it matches the test sheet.
combine_steps = int(60 / COMBINE_SIZE)

def calc(row):
  """Collapse a sequence of probabilities into one score by majority vote.

  Args:
    row: non-empty sequence of numbers; a value >= 0.5 counts as one vote.

  Returns:
    max(row) when at least half of the values are votes, otherwise min(row).

  Raises:
    ValueError: if ``row`` is empty.
  """
  votes = sum(1 for x in row if x >= 0.5)
  # Compare 2 * votes with len(row) rather than votes with int(len(row) / 2):
  # the truncating division let a minority win for odd lengths (1 vote out of 3).
  # Even lengths, including exact ties, behave as before.
  if 2 * votes >= len(row):
    return max(row)
  return min(row)

# Collapse the chunk-level scores into one score per user and axis.
# get_test_dataset emits len(rows) // COMBINE_SIZE chunks per user, in the same
# order as user_ids. The slice boundaries are therefore derived from each user's
# actual row count instead of assuming a fixed number of answers per user, which
# would misalign users as soon as one of them has a different count.
rows_per_user = collections.Counter(test_df['User_ID'])

chunk_start = 0
for user_id in tqdm(user_ids):
    chunk_stop = chunk_start + rows_per_user[user_id] // COMBINE_SIZE
    for axis in ('I/E', 'S/N', 'T/F', 'J/P'):
        user_scores = raw_logits[axis][chunk_start:chunk_stop]
        # A user with fewer than COMBINE_SIZE answers has no chunk to vote on.
        final_logits[axis].append(calc(user_scores) if user_scores else float('nan'))
    chunk_start = chunk_stop

In [None]:
# One row per test user; shift the default index so it starts at 1, then write
# the submission CSV.
result = pd.DataFrame(final_logits)
result.index += 1
result.to_csv('/content/drive/MyDrive/to_submit/Results/Phase2/kcbert_combine_2_ver2.csv')

In [None]:
# result = pd.DataFrame(raw_logits)
# result.index += 1
# result.to_csv('/content/drive/MyDrive/QIA2023_phase2/result/raw_kcbert_combine_2_2000_phase2only_lr1e5_v2.csv', index_label="User_ID")

In [None]:
# test = pd.DataFrame({
#     'I/E': [MBTI[0] for MBTI in MBTIs], 
#     'S/N': [MBTI[1] for MBTI in MBTIs], 
#     'T/F': [MBTI[2] for MBTI in MBTIs], 
#     'J/P': [MBTI[3] for MBTI in MBTIs], 
# })
# test.index += 1
# test.to_csv(f'/content/drive/MyDrive/QIA2023_phase1/result/single-flow-1/result-{checkpoint_idx}.csv', index_label="idx")

In [None]:
# %cp -av /content/Single-Flow-model/checkpoint-9000 /content/drive/MyDrive/QIA2023_phase1/checkpoint