In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# A later cell reads the trained checkpoint from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch

!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.1
    Uninstalling graphviz-0.20.1:
      Successfully uninstalled graphviz-0.20.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-li

In [3]:
import torch
from torch import nn
from torch.utils.data import Dataset
import gluonnlp as nlp
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

# Device used for inference. The classifier is placed on this device when it
# is constructed further down in this cell.
device = torch.device("cpu")

class BERTClassifier(nn.Module):
    """Sentence classifier: a BERT encoder followed by dropout and a linear layer.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``return_dict=False`` and must return a pair whose second element
            is the pooled output fed to the linear layer.
        hidden_size: Number of input features of the linear layer.
        num_classes: Number of logits produced per input row.
        dr_rate: Dropout probability applied to the pooled output. A falsy
            value (``0`` or ``None``) disables dropout.
    """

    def __init__(self, bert, hidden_size=768, num_classes=4, dr_rate=0.3):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        # `dr_rate or 0.0` keeps construction valid when dr_rate is None;
        # forward() bypasses the layer entirely in that case.
        self.dropout = nn.Dropout(p=dr_rate or 0.0)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on real tokens and 0.0 on padding.

        Args:
            token_ids: Tensor of shape ``(batch, seq_len)``.
            valid_length: Either a single length applied to every row (int or
                0-dim tensor), or one length per row (1-D tensor / sequence
                with ``batch`` entries).

        Returns:
            Float tensor of shape ``(batch, seq_len)``.
        """
        batch_size, seq_len = token_ids.size(0), token_ids.size(1)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(seq_len, device=token_ids.device).unsqueeze(0)
        # Position p of a row is "real" iff p < that row's valid length.
        return (positions < lengths).expand(batch_size, seq_len).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape ``(batch, num_classes)``.

        Args:
            token_ids: Tensor of token ids, shape ``(batch, seq_len)``.
            valid_length: Number of non-padding tokens (see ``gen_attention_mask``).
            segment_ids: Tensor used only as the dtype/device template for the
                token type ids; its values are not read.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        # Token type ids are all ones ("second sentence"), shaped like token_ids
        # so padded and batched inputs work as well as a single unpadded row.
        # NOTE(review): the values of `segment_ids` are ignored here; confirm
        # that all-ones token types match what was used during training.
        token_type_ids = segment_ids.new_full(tuple(token_ids.shape), 1)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )

        # Without dropout, classify the pooled output directly (previously
        # `out` was left undefined in this case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)


# Load the pretrained KoBERT tokenizer and encoder.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bert_model = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# gluonnlp vocabulary built from the tokenizer's SentencePiece model file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

# Wrap the encoder with the classification head and place it on `device`.
model = BERTClassifier(bert_model, dr_rate=0.5).to(device)

# Restore the trained parameters from Google Drive.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict needs and avoids executing arbitrary pickled code.
CHECKPOINT_PATH = '/content/drive/MyDrive/인지프_프로젝트/best-param-earlystop.pth'
state_dict = torch.load(CHECKPOINT_PATH, map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)
# Switch to inference mode (disables dropout); as the last expression of the
# cell this also displays the model structure.
model.eval()

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [21]:
# 감정 예측 함수 정의
def predict_emotion(sentence, model, tokenizer):
    """Predict the class index for a single sentence.

    Args:
        sentence: Text to classify.
        model: Callable as ``model(token_ids, valid_length, segment_ids)`` that
            returns logits of shape ``(1, num_classes)``.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.

    Returns:
        int: Index of the highest-scoring class.

    Raises:
        ValueError: If the sentence produces no tokens (e.g. empty input).
    """
    # NOTE(review): tokens are converted as-is, without [CLS]/[SEP] special
    # tokens; confirm this matches the preprocessing used during training.
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if not token_ids:
        raise ValueError("sentence produced no tokens; cannot predict on empty input")
    valid_length = len(token_ids)

    # Build the inputs on the model's own device instead of relying on a
    # notebook-level global; fall back to CPU for parameter-free models.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Batch of one: shape (1, valid_length); ids must be integer typed.
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=target_device)
    segment_ids = torch.zeros_like(input_ids)

    # Inference only, so gradient tracking is not needed.
    with torch.no_grad():
        outputs = model(input_ids, valid_length, segment_ids)

    # Pick the class with the largest logit for the single input row.
    return outputs.argmax(dim=1).item()

# Class index -> emotion label shown to the user.
# NOTE(review): assumes this index order matches the labels used in training; confirm.
emotion_dict = {
    0: '분노',
    1: '슬픔',
    2: '불안',
    3: '행복',
}

# Example: read one sentence from the user and classify it.
input_sentence = input()
predicted_emotion = predict_emotion(
    input_sentence,
    model,
    tokenizer,
)

# Echo the input and report the predicted emotion label.
print("입력 문장:", input_sentence)
print("예측된 감정:", emotion_dict[predicted_emotion])

아 개빡쳐 친구새끼
입력 문장: 아 개빡쳐 친구새끼
예측된 감정: 분노
