# 환경 설정 & KoBERT 설치
- https://github.com/ChangZero/koBERT-finetuning-demo/blob/main/kobert_colab.ipynb

In [4]:
# wandb 설치
!pip install wandb



In [5]:
# # install and restart
# !pip install mxnet
# !pip install gluonnlp==0.8.0
# !pip install tqdm pandas
# !pip install sentencepiece
# !pip install transformers
# !pip install torch>=1.8.1
# !pip install transformers
# !pip install wandb # wandb 설치

# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [6]:
import gluonnlp as nlp
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import seaborn as sns
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import datetime
import wandb
import time

In [7]:
# Select the torch device: CUDA when torch reports it as available, otherwise CPU
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)
device

device(type='cuda')

# git clone data

In [8]:
!git clone https://github.com/huijunam/CJ_AI_hackathon.git

fatal: destination path 'CJ_AI_hackathon' already exists and is not an empty directory.


In [9]:
import pandas as pd
# Read the dataset CSV from the cloned CJ_AI_hackathon repository
df_data = pd.read_csv('/content/CJ_AI_hackathon/data/final_dataset.csv')

In [10]:
df_data

Unnamed: 0,Sentence,Emotion
0,언니 동생으로 부르는게 맞는 일인가요..??,공포
1,그냥 내 느낌일뿐겠지?,공포
2,아직너무초기라서 그런거죠?,공포
3,유치원버스 사고 낫다던데,공포
4,근데 원래이런거맞나요,공포
...,...,...
90090,얘긴 다 끝났냐? 원예부,중립
90091,"예. 그거 때문에, 부탁이 있......는......데요.",중립
90092,여자 숨겨달라는거면 사절이다.,중립
90093,아무래도 안되나요?,중립


In [11]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90095 entries, 0 to 90094
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  90095 non-null  object
 1   Emotion   90095 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


# target classes

In [12]:
# Maps each emotion name (the string values compared against the `Emotion`
# column) to the integer class id used as the training label.
target_classes ={
    '공포': 0,
    '놀람': 1,
    '분노': 2,
    '슬픔': 3,
    '중립': 4,
    '행복': 5,
    '혐오': 6
  }

In [13]:
# Labeling: overwrite each emotion name in the `Emotion` column with its class id
for emotion_name, class_id in target_classes.items():
  rows_with_emotion = df_data['Emotion'] == emotion_name
  df_data.loc[rows_with_emotion, 'Emotion'] = class_id

In [14]:
# Convert the dataframe into a list of [sentence, class] pairs, with the class
# id stored as a string.
data_list = [
    [sentence, str(emotion_id)]
    for sentence, emotion_id in zip(df_data['Sentence'], df_data['Emotion'])
]

In [15]:
print(data_list[6000])

['36도라고...미쳤다', '1']


In [16]:
model_list =['koBERT','KoELECTRA', 'KcBERT', 'KcELECTRA']

In [17]:
# Hyperparameter settings
max_length = 100  # maximum token sequence length passed to the dataset classes
batch_size = 64
warmup_ratio = 0.1  # fraction of the total training steps used for LR warmup
num_epochs = 5
max_grad_norm = 1  # threshold passed to clip_grad_norm_ in the KoBERT loop
log_interval = 200  # print running metrics every `log_interval` batches
learning_rate =  5e-5
random_seed = 42  # random_state for train_test_split

# wandb 설정

In [48]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# wandb.login() uses the WANDB_API_KEY environment variable when it is set and
# otherwise prompts for the key interactively.
# SECURITY: the key that used to be hard-coded in this cell has been exposed and
# should be revoked and regenerated in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [24]:
# # config
# wandb.config ={
#   "learning_rate": learning_rate,
#   "epochs": num_epochs,
#   "batch_size": batch_size,
#   "seed": random_seed
# }

In [19]:
# wandb init function
def wandb_init(model,max_length=100,
               batch_size=64,num_epochs=5,
               log_interval=200,learning_rate=5e-5,extra='',random_seed=42):
  """Start a Weights & Biases run and register the tracked summary metrics.

  Parameters
  ----------
  model : str
      Model name; used as the prefix of the run name and of the run notes.
  max_length, log_interval : int
      Accepted for call compatibility; not recorded in the run config.
  batch_size, num_epochs, learning_rate, random_seed
      Hyperparameters stored in the run config and embedded in the run
      name / notes.
  extra : str
      Optional tag inserted into the run name (e.g. ``'drop_data'`` for runs
      on the reduced dataset). Empty for the original data.
  """
  # Hyperparameters are handed to wandb.init via `config=` so they are attached
  # to the run that is being created.
  run_config = {
    "learning_rate": learning_rate,
    "epochs": num_epochs,
    "batch_size": batch_size,
    "seed": random_seed
  }
  # project: folder name, entity: team_5g (team), name: run name
  test_time = datetime.datetime.now()
  wandb.init(project="KoBERT",
             name=f"{model}_epoch_{num_epochs}_{extra}_batch_size_{batch_size}_learning_rate_{learning_rate}/{test_time}",
             # The notes carry the actual model name rather than a fixed "koBERT".
             notes=f"{model}-epoch_{num_epochs}-batch_size_{batch_size}-random_seed{random_seed}-learning_rate_{learning_rate}",
             entity='team_5g',
             config=run_config)
  # Track the best (minimum) loss values in the run summary.
  wandb.define_metric('train_loss', summary='min')
  wandb.define_metric('val_loss', summary='min')
  # Track the best (maximum) accuracy values; the names must match the keys
  # passed to wandb.log by the training loops ("train_acc" / "val_acc").
  wandb.define_metric('train_acc', summary='max')
  wandb.define_metric('val_acc', summary='max')


In [26]:
# # project: 폴더 이름, entity: team_5g (팀), name: 기록명
# test_time = datetime.datetime.now()
# extra ='' # 원본 데이터 일 경우
# # extra ='drop_data_' # drop data 일 경우
# wandb.init(project="KoBERT",name=f"{model}_epoch_{num_epochs}_{extra}batch_size_{batch_size}_learning_rate_{learning_rate}/{test_time}",notes=f"koBERT-epoch_{num_epochs}-batch_size_{batch_size}-random_seed{random_seed}-learning_rate_{learning_rate}", entity='team_5g')
# # loss 추적
# wandb.define_metric('train_loss', summary='min')
# wandb.define_metric('val_loss', summary='min')
# # accuracy score 추적
# wandb.define_metric('train_accuracy', summary='max')
# wandb.define_metric('val_accuracy', summary='max')

In [25]:
# Modified BERTSentenceTransform
class BERTSentenceTransform:
    r"""BERT style data transformation.

    Parameters
    ----------
    tokenizer : object
        Tokenizer for the sentences. Must provide ``tokenize(text)`` and
        ``convert_tokens_to_ids(tokens)``.
    max_seq_length : int.
        Maximum sequence length of the sentences.
    vocab : object
        Vocabulary providing ``cls_token``, ``sep_token``, ``padding_token``
        and item lookup (``vocab[token]`` -> id) for the padding token.
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length, vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a pair of token lists in place to a combined length limit.

        Tokens are removed one at a time from the end of whichever list is
        currently longer (from ``tokens_b`` on ties) until
        ``len(tokens_a) + len(tokens_b) <= max_length``.
        """
        # A negative budget cannot be met; clamp so the loop stops once both
        # lists are empty instead of popping from an empty list.
        max_length = max(max_length, 0)
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', 1-D; padded up to
            ``max_seq_length`` when ``pad`` is True
        np.array: valid length in 'int32', 0-d (number of non-padding tokens)
        np.array: input token type ids in 'int32', same length as the token ids

        """

        # convert to unicode
        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            # Tokenize the second sentence the same way as the first one;
            # calling the tokenizer object itself does not yield a token list.
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real  tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Pad up to the sequence length with the padding token id;
            # padded positions get segment id 0.
            padding_length = self._max_seq_length - valid_length
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

In [26]:
class BERTDataset(Dataset):
    """Dataset of pre-tokenized sentences and their integer labels.

    Every row of ``dataset`` is transformed once at construction time:
    ``row[sent_idx]`` goes through ``BERTSentenceTransform`` (as a
    single-element list) and ``row[label_idx]`` is converted to ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )
        self.sentences = []
        for row in dataset:
            self.sentences.append(to_features([row[sent_idx]]))
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        # The transform output tuple followed by the label as a fourth element.
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        return len(self.labels)

In [27]:
# get_kobert_model as declared in the official KoBERT git repository
def get_kobert_model(model_path, ctx=device):
    """Load the KoBERT tokenizer, BERT model and vocabulary for ``model_path``.

    The model is moved to ``ctx`` (defaults to the notebook-level ``device``)
    and put in eval mode. Returns ``(bertmodel, vocab, tokenizer)``, where the
    vocabulary is built from the tokenizer's sentencepiece file with '[PAD]'
    as the padding token.
    """
    tokenizer = KoBERTTokenizer.from_pretrained(model_path)
    bertmodel = BertModel.from_pretrained(model_path)
    target_device = torch.device(ctx)
    bertmodel.to(target_device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(
        tokenizer.vocab_file,
        padding_token='[PAD]',
    )
    return bertmodel, vocab_b_obj, tokenizer

In [28]:
bertmodel, vocab, tokenizer = get_kobert_model('skt/kobert-base-v1')
# tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [29]:
train_list, test_list = train_test_split(data_list, test_size = 0.2, shuffle = True, random_state = random_seed)

In [30]:
data_train = BERTDataset(train_list, 0, 1, tokenizer, vocab, max_length, True, False)
data_test = BERTDataset(test_list, 0, 1, tokenizer, vocab, max_length, True, False)

In [31]:
data_train[0] # 토큰화와 패딩 확인

(array([   2, 2926, 7670, 6122, 7523,  680,  329,  329,  835,  202,    0,
          93, 6844, 4128, 6295, 7266, 2135, 6037, 5659,  835, 3990, 6459,
        6825,  786,    3,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1], dtype=int32),
 array(25, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [32]:
# Reshuffle the training samples at every epoch so the model does not see the
# exact same batch composition and order each time; the evaluation loader keeps
# a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, shuffle = True, num_workers = 2)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, num_workers = 2)

In [33]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Parameters
    ----------
    bert : nn.Module
        Encoder called with ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``return_dict=False``; its second return value
        is used as the sentence representation.
    hidden_size : int
        Size of the pooled output fed to the linear classifier.
    num_classes : int
        Number of output classes (defaults to the number of target classes,
        currently 7).
    dr_rate : float or None
        Dropout probability applied to the pooled output; no dropout when
        None or 0.
    params
        Unused; kept for call compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = len(target_classes),
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with 1 for the first
        ``valid_length[i]`` positions of row ``i`` and 0 elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict = False)
        # Without a dropout rate the pooled output goes straight to the
        # classifier (previously `out` was left undefined in that case).
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [34]:
model = BERTClassifier(bertmodel, dr_rate = 0.5).to(device)

In [35]:
# Parameters whose name contains one of these substrings get no weight decay;
# all other parameters use weight_decay=0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr = learning_rate)
loss_fn = nn.CrossEntropyLoss() # loss function for multi-class classification

# Total number of optimizer steps (batches per epoch * epochs) and the number
# of those steps used for warmup.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps = warmup_step, num_training_steps = t_total)

In [34]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose highest-scoring column
    index equals the corresponding entry of ``Y``."""
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_total = predicted.size()[0]
    return n_correct / n_total

# KoBERT 모델 학습

In [None]:
# Predicted class ids and true labels of the most recent evaluation pass
# (used for the confusion matrix).
KoBERT_y_preds =[]
KoBERT_label_list =[]

# Start the W&B run for this model and let W&B watch the model.
wandb_init(model='koBERT') # model_list =['koBERT','KoELECTRA', 'KcBERT', 'KcELECTRA']
wandb.watch(model)
for e in range(num_epochs):
  train_acc = 0.0
  test_acc = 0.0
  train_loss =0.0
  test_loss =0.0
  # Collected per epoch so results of different epochs are never mixed.
  epoch_preds = []
  epoch_labels = []

  model.train()
  start_time = time.time()

  for batch_id, (token_ids, valid_length, segment_ids, labels) in enumerate(tqdm(train_dataloader)):
    optimizer.zero_grad()
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    labels = labels.long().to(device)

    y_pred = model(token_ids, valid_length, segment_ids)

    loss = loss_fn(y_pred, labels)
    loss.backward()
    # Accumulate the loss over the whole epoch.
    train_loss += loss.item()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()  # Update learning rate schedule
    train_acc += calc_accuracy(y_pred, labels)

    if batch_id % log_interval == 0:
      print("epoch {} batch id {} loss {:.4f} train acc {:.4f}".format(e+1,batch_id+1,train_loss/(batch_id+1),train_acc/(batch_id+1)))

  # Log the epoch-level training metrics once, after the last batch; they are
  # committed together with the validation metrics below.
  wandb.log({"train_acc": train_acc/len(train_dataloader)}, commit=False)
  wandb.log({"train_loss": train_loss/len(train_dataloader)}, commit=False)

  print("epoch {} train acc {:.4f}".format(e+1, train_acc / len(train_dataloader)))

  model.eval()
  # Evaluation only needs forward passes: no gradients are built and no
  # backward pass is run on the test data.
  with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids, labels) in enumerate(tqdm(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      labels = labels.long().to(device)
      test_pred = model(token_ids, valid_length, segment_ids)

      # Store class ids (argmax over the logits), which is what a confusion
      # matrix expects, next to the true labels.
      epoch_preds.extend(test_pred.argmax(dim=1).cpu().numpy())
      epoch_labels.extend(labels.cpu().numpy())

      loss = loss_fn(test_pred, labels)
      test_loss += loss.item()
      test_acc += calc_accuracy(test_pred, labels)

  end_time = time.time()

  KoBERT_y_preds = epoch_preds
  KoBERT_label_list = epoch_labels

  # wandb log
  wandb.log({"val_acc": test_acc/len(test_dataloader)}, commit=False)
  wandb.log({"val_loss": test_loss/len(test_dataloader)})

  print("epoch {} test acc {:.4f}".format(e+1, test_acc / len(test_dataloader)))
  print("Time: {:.4f}sec".format((end_time - start_time)))

KoBERT_label_list = np.array(KoBERT_label_list)
KoBERT_y_preds = np.array(KoBERT_y_preds)

In [None]:
# torch.save(model.state_dict(), "./base_model_koBERT_epoch3_.pt")

# KcBERT 모델
https://github.com/Beomi/KcBERT

# KcELECTRA 모델
https://github.com/Beomi/KcELECTRA

# KoELECTRA 모델
https://github.com/monologg/KoELECTRA

## Dataset

In [21]:
# Custom dataset that tokenizes one text per item on access
class custom_dataset(Dataset):
  """Pairs texts with labels and tokenizes lazily in ``__getitem__``.

  Each item is a dict with ``input_ids`` and ``attention_mask`` (the
  tokenizer outputs with size-1 dimensions squeezed away) and ``labels``
  (the label converted to an int and wrapped in a ``torch.long`` tensor).
  """

  def __init__(self, texts, labels, tokenizer, max_length):
    self.texts, self.labels = texts, labels
    self.tokenizer, self.max_length = tokenizer, max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]
    # Pad/truncate every text to max_length and request PyTorch tensors.
    encoded = self.tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
        return_tensors='pt',
    )
    item = {}
    item['input_ids'] = encoded['input_ids'].squeeze()
    item['attention_mask'] = encoded['attention_mask'].squeeze()
    item['labels'] = torch.tensor(int(label), dtype=torch.long)
    return item

In [40]:
# # Define the custom collate function
# def collate_fn(batch):
#  input_ids = [item['input_ids'] for item in batch]
#  attention_masks = [item['attention_mask'] for item in batch]
#  labels = [item['labels'] for item in batch]

#  # Pad or truncate sequences to the same length within each batch
#  input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
#  attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
#  labels = torch.stack(labels)

#  return {
#  'input_ids': input_ids,
#  'attention_mask': attention_masks,
#  'labels': labels
#  }

In [22]:
# Split the [sentence, label] pairs 80/20 with a fixed random_state, then
# rebind train_list / test_list from plain lists to DataFrames with named columns.
train_list, test_list = train_test_split(data_list, test_size = 0.2, shuffle = True, random_state = random_seed)
train_list = pd.DataFrame(train_list, columns=['Sentence', 'Emotion'])
test_list = pd.DataFrame(test_list, columns=['Sentence', 'Emotion'])

# Training texts and labels
X_train = train_list['Sentence']
Y_train = train_list['Emotion']

# Test texts and labels
X_test = test_list['Sentence']
Y_test = test_list['Emotion']

In [23]:
# The datasets keep a reference to the tokenizer they are built with, so the
# tokenizer must belong to the checkpoint that is fine-tuned on them. Load the
# KoELECTRA tokenizer here instead of reusing whatever `tokenizer` is in scope
# (on a fresh top-to-bottom run that would be the KoBERT tokenizer, whose
# vocabulary does not match the KoELECTRA model).
model_name = "monologg/koelectra-small-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# dataset
train_dataset = custom_dataset(X_train, Y_train, tokenizer, max_length)
test_dataset = custom_dataset(X_test, Y_test, tokenizer, max_length)

# dataloader: shuffle only the training data; evaluation keeps a fixed order
train_data_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_data_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False)

In [53]:
# text, attention_mask, y = train_dataset[0]
# attention_mask.shape

# 학습

In [49]:
# Load the pre-trained KoELECTRA tokenizer and a sequence-classification model
# with one output label per entry of target_classes, then move the model to `device`.
model_name = "monologg/koelectra-small-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels =len(target_classes))
model.to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Li

In [50]:
KoELECTRA_y_preds =[] # confusion matrix
KoELECTRA_label_list =[] # confusion matrix

In [51]:
# AdamW over all model parameters with the shared learning rate, and
# cross-entropy as the classification loss. No LR scheduler is created here.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# Start the W&B run for this model
wandb_init(model='KoELECTRA') # model_list =['koBERT','KoELECTRA', 'KcBERT', 'KcELECTRA']

# watch model
wandb.watch(model)
for e in range(num_epochs):
  # Sums of per-batch accuracy / loss; divided by the batch count when reported.
  train_acc = 0.0
  test_acc = 0.0
  train_loss =0.0
  test_loss =0.0
  # Collected per epoch so results of different epochs are never mixed.
  epoch_preds = []
  epoch_labels = []

  start_time = time.time()

  model.train()
  for batch_id, batch in enumerate(tqdm(train_data_loader)):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Reset gradients, run the forward pass and take the classification logits.
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    loss = criterion(logits, labels)
    # Backpropagate and update the weights.
    loss.backward()
    train_loss += loss.item()
    optimizer.step()
    # Accuracy of this batch; the running average is the sum divided by the
    # number of batches seen so far.
    train_acc += (logits.argmax(dim=1)==labels).sum().item() / labels.size(0)

    if batch_id % log_interval == 0:
      print("epoch {} batch id {} loss {:.4f} train acc {:.4f}".format(e+1, batch_id+1,train_loss/(batch_id+1),train_acc/(batch_id+1)))

  # wandb log (committed together with the validation metrics below)
  wandb.log({"train_acc": train_acc/len(train_data_loader)}, commit=False)
  wandb.log({"train_loss": train_loss/len(train_data_loader)},commit=False)

  print("epoch {} train acc {:.4f}".format(e+1, train_acc / len(train_data_loader)))

  # Evaluation: forward passes only, without building gradients.
  model.eval()
  with torch.no_grad():
    for batch_id, batch in enumerate(test_data_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      outputs = model(input_ids, attention_mask=attention_mask)
      logits = outputs.logits

      # Move predictions to the CPU before storing them so they can be turned
      # into a numpy array for the confusion matrix.
      epoch_preds.extend(logits.argmax(dim=1).cpu().numpy())
      epoch_labels.extend(labels.cpu().numpy())

      loss = criterion(logits,labels)
      test_loss += loss.item()

      test_acc += (logits.argmax(dim=1)==labels).sum().item() / labels.size(0)
      if batch_id % log_interval == 0:
        print("epoch {} batch id {} loss {:.4f} test acc {:.4f}".format(e+1, batch_id+1,test_loss/(batch_id+1),test_acc/(batch_id+1)))

  end_time = time.time()

  KoELECTRA_y_preds = epoch_preds
  KoELECTRA_label_list = epoch_labels

  # wandb log
  wandb.log({"val_acc": test_acc/len(test_data_loader)}, commit=False)
  wandb.log({"val_loss": test_loss/len(test_data_loader)})

  print("epoch {} test acc {:.4f}".format(e+1, test_acc / len(test_data_loader)))
  print("Time: {:.4f}sec".format((end_time - start_time)))

KoELECTRA_label_list = np.array(KoELECTRA_label_list)
KoELECTRA_y_preds = np.array(KoELECTRA_y_preds)

In [None]:
torch.save(model.state_dict(), "./KoELECTRA_small_v3_epoch5_.pt")

# 평가

In [None]:
# Integer class ids in target_classes order. Passing them as `labels` keeps one
# row/column per class even if a class never occurs in the evaluated data.
class_labels = list(target_classes.values())
cm = confusion_matrix(KoELECTRA_label_list, KoELECTRA_y_preds, labels=class_labels)

# plot_confusion_matrix creates its own figure, so the size is passed to it
# directly instead of opening a separate (empty) figure first.
fig, ax = plot_confusion_matrix(conf_mat=cm, figsize=(8, 6), colorbar=True,
                                show_absolute=False, show_normed=True, class_names=class_labels)
plt.show()

# Create a heatmap for the confusion matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.title('Confusion Matrix')
# plt.show()

In [None]:
# wandb_img = wandb.Image(cm, caption=f"basemodel_epoch_{num_epochs}_figure")

# wandb.log({"Confusion Matrix": wandb_img})

# 키워드 추출
- 명사, 동사, 형용사

In [None]:
# !pip install konlpy

In [None]:
# from konlpy.tag import Okt

In [None]:
# okt = Okt()
# kor_sentence = '너무 배고파 ㅜㅜ'
# print('OKT 품사 태깅 :', okt.pos(kor_sentence))

OKT 품사 태깅 : [('너무', 'Adverb'), ('배고파', 'Adjective'), ('ㅜㅜ', 'KoreanParticle')]


In [None]:
# # 명사, 동사, 형용사 추출하기
# keyword = [x for (x, y) in okt.pos(kor_sentence) if y in ['Verb', 'Noun', 'Adjective']]
# print('\nOKT 명사, 동사, 형용사 추출 :', keyword)


OKT 명사, 동사, 형용사 추출 : ['배고파']


# 새로운 문장 테스트

In [None]:
# target_classes ={
#     '공포': 0,
#     '놀람': 1,
#     '분노': 2,
#     '슬픔': 3,
#     '중립': 4,
#     '행복': 5,
#     '혐오': 6
#   }

In [None]:
# reverse =dict(map(reversed, target_classes.items()))
# print(reverse)

{0: '공포', 1: '놀람', 2: '분노', 3: '슬픔', 4: '중립', 5: '행복', 6: '혐오'}


In [None]:
# list(reverse.keys())

[0, 1, 2, 3, 4, 5, 6]

In [None]:
# # 토큰화
# tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# # tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

# def predict_and_keword(predict_sentence):
#   new_data =[[predict_sentence, '0']]
#   # print(new_data)

#   new_dataset = BERTDataset(new_data, 0, 1, tokenizer, vocab ,max_length, True, False)
#   new_dataloader = torch.utils.data.DataLoader(new_dataset, batch_size = batch_size, num_workers =5)

#   model.eval()
#   for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(new_dataloader)):
#     token_ids = token_ids.long().to(device)
#     segment_ids = segment_ids.long().to(device)
#     valid_length = valid_length
#     label = label.long().to(device)
#     out = model(token_ids, valid_length, segment_ids)

#     test_eval =[]
#     for logit in out:
#       test_eval.append(reverse[np.argmax(logit.detach().cpu().numpy())])

#     # 키워드 추출: 명사, 동사, 형용사 추출하기
#     keyword = [x for (x, y) in okt.pos(predict_sentence) if y in ['Verb', 'Noun', 'Adjective']]
#     print('\nOKT 명사, 동사, 형용사 추출 :', keyword)
#     test_eval += keyword

#     return test_eval

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
# predict_and_keword('어제 헤어졌어 ㅜ')



  0%|          | 0/1 [00:00<?, ?it/s]


OKT 명사, 동사, 형용사 추출 : ['어제', '헤어졌어']


['슬픔', '어제', '헤어졌어']

In [None]:
# predict_and_keword('진짜 짜증나')

  0%|          | 0/1 [00:00<?, ?it/s]


OKT 명사, 동사, 형용사 추출 : ['진짜', '짜증나']


['분노', '진짜', '짜증나']

In [None]:
# predict_and_keword('너무 무서워')

  0%|          | 0/1 [00:00<?, ?it/s]


OKT 명사, 동사, 형용사 추출 : ['무서워']


['공포', '무서워']

## 1. 중립 데이터 제거

In [None]:
# df_data_neut = df_data[df_data['Emotion'] == '중립'] # 확인

In [None]:
# df_data_drop = df_data[df_data['Emotion'] != '중립']
# df_data_drop # 중복 데이터 제거한 데이터

Unnamed: 0,Sentence,Emotion
0,언니 동생으로 부르는게 맞는 일인가요..??,공포
1,그냥 내 느낌일뿐겠지?,공포
2,아직너무초기라서 그런거죠?,공포
3,유치원버스 사고 낫다던데,공포
4,근데 원래이런거맞나요,공포
...,...,...
94132,뭐? 다시 한 번 말해봐.,분노
94152,어? 정말요?,놀람
94154,"혹시, 다들 은행 계좌없는 거예요?",놀람
94169,자네는 대체 뭘 하러 왔나! 젖은 생쥐 꼴이 된 나를 보면서 비웃으러 왔나?,분노


In [None]:
# len(df_data), len(df_data_neut)+len(df_data_drop) # 확인

45578

In [None]:
# df_data = df_data_drop

In [None]:
# target_classes ={
#     '공포': 0,
#     '놀람': 1,
#     '분노': 2,
#     '슬픔': 3,
#     # '중립': 4,
#     '행복': 4,
#     '혐오': 5
#   }