In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 슬픔 - 0  - 5267
- 중립 - 1 - 4830
- 행복 - 2 - 6037
- 혐오 - 3 - 5429
- 분노 - 4 - 5665
- 공포 - 5 - 5468
- 놀람 - 6 - 5898


In [4]:
data_df = pd.read_csv('/content/drive/MyDrive/sentiment_kcbert/emotion_data.csv', encoding = 'cp949')

In [5]:
data_df

Unnamed: 0,text,labels,Unnamed: 2,Unnamed: 3,Unnamed: 4,공포,5468
0,언니 동생으로 부르는게 맞는 일인가요..??,5,,,,놀람,5898.0
1,그냥 내 느낌일뿐겠지?,5,,,,분노,5665.0
2,아직너무초기라서 그런거죠?,5,,,,슬픔,5267.0
3,유치원버스 사고 낫다던데,5,,,,중립,4830.0
4,근데 원래이런거맞나요,5,,,,행복,6037.0
...,...,...,...,...,...,...,...
38589,솔직히 예보 제대로 못하는 데 세금이라도 아끼게 그냥 폐지해라..,3,,,,,
38590,재미가 없으니 망하지,3,,,,,
38591,공장 도시락 비우생적임 아르바이트했는데 화장실가성 손도 않씯고 재료 담고 바닥 떨어...,3,,,,,
38592,코딱지 만한 나라에서 지들끼리 피터지게 싸우는 센징 클래스 ㅉㅉㅉ,3,,,,,


In [6]:
train_df = data_df.sample(frac=0.8,random_state=11)
len(train_df)

30875

In [7]:
for i in range(6):
    print(len(train_df.loc[train_df['labels'] == i])) 

4205
3868
4820
4370
4553
4344


In [8]:
test_df = data_df.drop(train_df.index)
len(test_df)

7719

In [9]:
for i in range(6):
    print(len(test_df.loc[test_df['labels'] == i])) 

1062
962
1217
1059
1112
1124


In [10]:
tokenized_train_sentences = tokenizer(
    list(train_df["text"]),
    return_tensors= "pt",        # pytorch의 tensor 형태로 return
    max_length = 128,            # 최대 토큰 길이
    padding = True,              # 제로패팅
    truncation = True,           # 최대 토큰 길이 초과하면 자름
    add_special_tokens = True,   # special token 추가
)

In [11]:
print(tokenized_train_sentences[0])
print(tokenized_train_sentences[0].tokens)
print(tokenized_train_sentences[0].ids)
print(tokenized_train_sentences[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '최순실', '특별', '##법을', '입법', '##해서', '.', '.', '극', '##형에', '처', '##해야', '한다', '이말', '##입니다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [12]:
tokenized_test_sentences = tokenizer(
    list(test_df["text"]),
    return_tensors= "pt",        # pytorch의 tensor 형태로 return
    max_length = 128,            # 최대 토큰 길이
    padding = True,              # 제로패팅
    truncation = True,           # 최대 토큰 길이 초과하면 자름
    add_special_tokens = True,   # special token 추가가
)

In [13]:
print(tokenized_test_sentences[0])
print(tokenized_test_sentences[0].tokens)
print(tokenized_test_sentences[0].ids)
print(tokenized_test_sentences[0].attention_mask)

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', '내가', '##불안', '##해서', '##꾸', '##는걸', '##까', '.', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [14]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [15]:
train_label = train_df["labels"].values
test_label = test_df["labels"].values

train_dataset = SentimentDataset(tokenized_train_sentences, train_label)
test_dataset = SentimentDataset(tokenized_test_sentences, test_label)

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("beomi/kcbert-base", num_labels=7)
model.to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--beomi--kcbert-base/snapshots/99fc27ea7d643d8377ade8912c6c445a5e3861be/config.json
Model config BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_si

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--beomi--kcbert-base/snapshots/99fc27ea7d643d8377ade8912c6c445a5e3861be/pytorch_model.bin
Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
args = TrainingArguments(
    output_dir='/content/drive/MyDrive/sentiment_kcbert',           # 학습결과 저장경로
    num_train_epochs=5,                                                 # 학습 에포크 설정
    per_device_train_batch_size=32,                                     # 학습 배치 사이즈 설정
    per_device_eval_batch_size=64,                                      # 테스트 배치 사이즈 설정  
    logging_dir='/content/drive/MyDrive/sentiment_kcbert/logs',      # 학습 log 저장 경로
    logging_steps=250,                                                  # 학습 log 기록 단위
    save_total_limit=10,                                                 # 학습 결과 저장 최대 갯수
    resume_from_checkpoint=True
)   

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [25]:
# 평가지표를 위한 함수
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy' : acc,
        'f1' : f1,
        'precision' : precision,
        'recall' : recall
    }

In [26]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics=compute_metrics,
)

In [27]:
trainer.train()

***** Running training *****
  Num examples = 30875
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4825
  Number of trainable parameters = 108923911
  item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
250,1.3357
500,1.2257
750,1.1944
1000,1.151
1250,0.8763
1500,0.8837
1750,0.8491
2000,0.7738
2250,0.4986
2500,0.489


Saving model checkpoint to /content/drive/MyDrive/sentiment_kcbert/checkpoint-500
Configuration saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-500/pytorch_model.bin
  item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to /content/drive/MyDrive/sentiment_kcbert/checkpoint-1000
Configuration saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-1000/pytorch_model.bin
  item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to /content/drive/MyDrive/sentiment_kcbert/checkpoint-1500
Configuration saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/sentiment_kcbert/checkpoint-1500/pytorch_model.bin
  item = {key : torch.tensor(val

TrainOutput(global_step=4825, training_loss=0.581881980303038, metrics={'train_runtime': 3192.533, 'train_samples_per_second': 48.355, 'train_steps_per_second': 1.511, 'total_flos': 1.0154898156e+16, 'train_loss': 0.581881980303038, 'epoch': 5.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 7719
  Batch size = 64
  item = {key : torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
def sentence_predict(sent):
    # 평가모드로 변경
    model.eval()

    # 입력된 문장 토크나이징
    tokenized_sent = tokenizer(
        sent,
        return_tensors="pt",
        truncation=True,
        add_special_tokens=True,
        max_length=128
    )

    # 모델이 위치한 GPU로 이동
    tokenized_sent.to(device)

    # 예측
    with torch.no_grad():
        outputs = model(
            input_ids = tokenized_sent["input_ids"],
            attention_mask=tokenized_sent["attention_mask"],
            token_type_ids=tokenized_sent["token_type_ids"]
        )

    # 결과 return
    logits = outputs[0]
    logits = logits.detach().cpu()
    prob = logits.softmax(dim=1)
    print(prob)
    result = logits.argmax(-1)
    if result == 0:
        result = "슬픔"
    elif result == 1:
        result = "중립"
    elif result == 2:
        result = "행복"
    elif result == 3: 
        result = "혐오"
    elif result == 4:
        result = "분노"
    elif result == 5:
        result = "공포"
    elif result == 6:
        result = "놀람"
    return result

#0 입력시 종료
while True:
     sentence = input("문장을 입력해주세요: ")
     if sentence == "0": 
         break
     print(sentence_predict(sentence))
     print("\n")
    

문장을 입력해주세요: 안녕하세요
tensor([[4.0922e-03, 5.6184e-03, 9.8639e-01, 1.5649e-04, 3.0154e-04, 2.5095e-03,
         9.3442e-04]])
행복


문장을 입력해주세요: 헉
tensor([[3.9641e-04, 4.4009e-04, 4.2540e-04, 4.8594e-04, 7.0759e-04, 3.3625e-03,
         9.9418e-01]])
놀람


문장을 입력해주세요: 시발
tensor([[5.2062e-03, 7.5220e-04, 7.6104e-04, 5.4827e-03, 9.8099e-01, 3.0987e-03,
         3.7059e-03]])
분노


문장을 입력해주세요: 장난해?
tensor([[1.2689e-03, 2.9341e-03, 3.6187e-04, 1.8164e-01, 7.4208e-01, 8.9744e-03,
         6.2743e-02]])
분노


문장을 입력해주세요: 집에 가고 싶어..
tensor([[9.9820e-01, 3.1445e-04, 3.0976e-04, 3.3057e-04, 1.5858e-04, 4.4775e-04,
         2.3802e-04]])
슬픔




KeyboardInterrupt: ignored