In [None]:
!pip install transformers
!pip install kss

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the checkpoint and the movie table under
# /content/drive can be read by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

# The 44 emotion label names. Position i in this list is paired with
# element i of the model's probability vector when results are reported.
LABELS = [
    '불평/불만', '환영/호의', '감동/감탄', '지긋지긋',
    '고마움', '슬픔', '화남/분노', '존경',
    '기대감', '우쭐댐/무시함', '안타까움/실망', '비장함',
    '의심/불신', '뿌듯함', '편안/쾌적', '신기함/관심',
    '아껴주는', '부끄러움', '공포/무서움', '절망',
    '한심함', '역겨움/징그러움', '짜증', '어이없음',
    '없음', '패배/자기혐오', '귀찮음', '힘듦/지침',
    '즐거움/신남', '깨달음', '죄책감', '증오/혐오',
    '흐뭇함(귀여움/예쁨)', '당황/난처', '경악', '부담/안_내킴',
    '서러움', '재미없음', '불쌍함/연민', '놀람',
    '행복', '불안/걱정', '기쁨', '안심/신뢰',
]

In [None]:
import torch
import torch.nn as nn
from transformers import ElectraConfig, ElectraModel
import numpy as np


class ELECTRALSTMClassification(nn.Module):
    """KcELECTRA encoder followed by a BiLSTM over the [SEP] token embeddings.

    The encoder output at position 0 (the <CLS> slot) is concatenated with the
    first and last BiLSTM outputs computed over the per-sample [SEP]
    embeddings and projected to ``num_labels`` logits. Two auxiliary heads
    project the first and last BiLSTM outputs on their own.

    Args:
        num_labels: Number of output logits per head (default 44).
        device: Device the sub-modules are placed on. Defaults to ``'cuda'``
            when CUDA is available, otherwise ``'cpu'``.
    """

    def __init__(self, num_labels=44, device=None):
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.config = ElectraConfig.from_pretrained("beomi/KcELECTRA-base",
                                                    problem_type="multi_label_classification",
                                                    num_labels=num_labels)

        self.embedding_size = 768
        self.batch_size = 32

        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", config=self.config)
        self.lstm = nn.LSTM(self.embedding_size, self.embedding_size, batch_first=True, bidirectional=True)
        # Input width: <CLS> (1x) + first BiLSTM output (2x) + last BiLSTM output (2x).
        self.fc1 = nn.Linear(self.embedding_size * 5, num_labels)
        self.fc2 = nn.Linear(self.embedding_size * 2, num_labels)
        self.gelu = nn.GELU()

        # Move every sub-module (including the linear heads) in one place.
        self.to(self.device)

    @staticmethod
    def _gather_sep_embeddings(hidden, sep_rows, sep_cols):
        """Pack the [SEP] embeddings of each sample into a zero-padded tensor.

        Args:
            hidden: Encoder output indexed as ``hidden[row, col]``.
            sep_rows: 1-D tensor with the batch row of every [SEP] token.
            sep_cols: 1-D tensor with the sequence position of every [SEP] token.

        Returns:
            Tensor of shape ``(batch, longest, hidden_size)`` where ``longest``
            is the largest number of [SEP] tokens in any row (at least 1).
            Rows with fewer [SEP] tokens keep trailing zero vectors.
        """
        batch = hidden.size(0)
        rows = sep_rows.tolist()
        cols = sep_cols.tolist()

        per_row = [0] * batch
        for r in rows:
            per_row[r] += 1
        # At least one step so the LSTM never sees an empty sequence.
        longest = max(max(per_row, default=0), 1)

        # new_zeros keeps the device and dtype of the encoder output.
        packed = hidden.new_zeros(batch, longest, hidden.size(-1))

        # Each row has its own slot counter, so every [SEP] embedding lands in
        # its own slot regardless of where the row sits in the batch.
        next_slot = [0] * batch
        for r, c in zip(rows, cols):
            packed[r, next_slot[r]] = hidden[r, c]
            next_slot[r] += 1
        return packed

    def forward(self, input_ids=None, attention_mask=None, sep_idx=None):
        """Run the encoder, the BiLSTM and the three linear heads.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            sep_idx: Pair ``(rows, cols)`` locating every [SEP] token in
                ``input_ids``, e.g. the result of ``torch.where(input_ids == sep_id)``.

        Returns:
            Tuple ``(output, first_output, last_output)`` of logits: the main
            head, and the auxiliary head applied to the first and to the last
            BiLSTM output.
        """
        electra_output = self.electra(input_ids, attention_mask)[0]

        cls = electra_output[:, 0, :]  # <CLS> embeddings

        # Collect the [SEP] embeddings per sample.
        sep_embeddings = self._gather_sep_embeddings(electra_output, sep_idx[0], sep_idx[1])

        # BiLSTM over the [SEP] embeddings.
        lstm_output, _ = self.lstm(sep_embeddings)

        # First and last BiLSTM outputs.
        # NOTE(review): for rows with fewer [SEP] tokens than the longest row,
        # the last step is a zero-padded slot; kept as-is to match the
        # existing behaviour.
        sep_first = lstm_output[:, 0, :]
        sep_last = lstm_output[:, -1, :]

        # Concatenate <CLS> with the BiLSTM outputs and project to the logits.
        concats = torch.cat((cls, sep_first, sep_last), dim=1)
        output = self.fc1(self.gelu(concats))

        first_output = self.fc2(sep_first)
        last_output = self.fc2(sep_last)

        return output, first_output, last_output

In [None]:
def cos_similiarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)`` as a float. If either vector has
        zero norm the similarity is undefined; 0.0 is returned instead of NaN.
    """
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    l2_norm = np.linalg.norm(a) * np.linalg.norm(b)
    if l2_norm == 0.0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0

    return float(np.dot(a, b) / l2_norm)

In [None]:
import kss

def kss_sentence(sent):
    """Split ``sent`` into sentences with kss and re-join them with ' [SEP] '.

    Returns the joined string; an empty split result gives an empty string.
    """
    return ' [SEP] '.join(kss.split_sentences(sent))

In [None]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same 'beomi/KcELECTRA-base' name the model's encoder uses;
# embedding() below calls it.
tokenizer = AutoTokenizer.from_pretrained('beomi/KcELECTRA-base', do_lower_case=False)

def embedding(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    The input is truncated and padded to 512 tokens, special tokens are
    added, and the attention mask is returned while token type ids are not.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=512,
        padding="max_length",
        return_token_type_ids=False,
        return_attention_mask=True,
        add_special_tokens=True,
    )
    return tokenizer(text, **tokenizer_options)

In [None]:
# Sample diary text: join its sentences with ' [SEP] ' and tokenize the result.
dic_ori = "계절학기를 마치고 오빠와 여의나루 역에 갔다. 도착해서 빠삐코, 탱크보이 하나씩 빨아주면서 돗자리를 폈다. 쨍한 여름날씨. 가방에서 책을 꺼내 손잡고 읽기 시작했다.이렇게 평화로울수가 없었다."
dic_data = kss_sentence(dic_ori)
dic_emb = embedding(dic_data)

# Movie table; the '제명' and 'pb_emotion' columns are read further down.
# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
movie_path = '/content/drive/MyDrive/final_project/영화_52_data.pkl'
movie_ori = pd.read_pickle(movie_path)


In [None]:
PATH = '/content/drive/MyDrive/final_project/data_processing/best_model_52.pth'

model = ELECTRALSTMClassification()

# Load the checkpoint onto the CPU first; load_state_dict copies the values
# into the model's parameters wherever they live.
checkpoint = torch.load(PATH, map_location='cpu')

# strict=False tolerates key mismatches, so report them instead of hiding them.
load_result = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"missing keys: {load_result.missing_keys}")
    print(f"unexpected keys: {load_result.unexpected_keys}")

model.to('cuda')
# Inference only: switch off dropout so predictions are deterministic.
# eval() returns the module, so the cell still displays the model summary.
model.eval()

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ELECTRALSTMClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [None]:
# Build a batch of one from the tokenized diary text.
input_id = torch.LongTensor(dic_emb['input_ids']).unsqueeze(0).to('cuda')
mask = torch.LongTensor(dic_emb['attention_mask']).unsqueeze(0).to('cuda')
# Locate the separator tokens.
# NOTE(review): 3 is presumably the [SEP] id of this tokenizer - verify against tokenizer.sep_token_id.
sep_idx = torch.where(input_id == 3)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    y_pred = model(input_id, mask, sep_idx)

In [None]:
# Sigmoid over the main-head logits; y is the probability vector of the single sample.
probs = torch.sigmoid(y_pred[0])
y = probs[0]
print(probs)

# Binary indicator vector (float64, 1.0 where probability >= 0.3), computed in
# one vectorized step instead of converting the tensor on every loop iteration.
# The cast to float64 keeps the comparison identical to comparing Python floats.
arr = (y.detach().cpu().numpy().astype(np.float64) >= 0.3).astype(np.float64)

tensor([[0.0439, 0.2028, 0.5314, 0.0362, 0.2874, 0.2047, 0.0095, 0.0250, 0.5479,
         0.0197, 0.1156, 0.0580, 0.0091, 0.4463, 0.3528, 0.2080, 0.0917, 0.0249,
         0.0083, 0.0555, 0.0126, 0.0030, 0.0433, 0.0151, 0.1810, 0.0516, 0.0239,
         0.2902, 0.7801, 0.2125, 0.0073, 0.0039, 0.0772, 0.0383, 0.0040, 0.0286,
         0.0797, 0.0453, 0.0204, 0.0416, 0.8634, 0.0355, 0.8710, 0.2546]],
       device='cuda:0', grad_fn=<SigmoidBackward0>)


In [None]:
# Show the probability vector as a NumPy array.
y.detach().cpu().numpy()

array([0.04387942, 0.20282224, 0.5314014 , 0.03619223, 0.287428  ,
       0.20470592, 0.00953157, 0.02502867, 0.54787314, 0.01972395,
       0.11557244, 0.05797869, 0.00908455, 0.44626296, 0.35281157,
       0.2079633 , 0.09165575, 0.02492141, 0.00826655, 0.05549975,
       0.01256068, 0.00301333, 0.04330639, 0.01512295, 0.18095808,
       0.0515527 , 0.02390566, 0.29018793, 0.7801142 , 0.21250196,
       0.00733664, 0.00394556, 0.07715964, 0.03831826, 0.00395051,
       0.02859649, 0.07966546, 0.04529065, 0.0204489 , 0.04158196,
       0.86337024, 0.03547783, 0.87102807, 0.2546464 ], dtype=float32)

In [None]:
# Print every emotion whose probability is at least 0.4.
for label, prob in zip(LABELS, y.tolist()):
    if not prob >= 0.4:
        continue
    print(f"{label}, {prob}")


# Cosine similarity between the diary's emotion vector and the stored
# 'pb_emotion' vector of a few hand-picked rows of the movie table.
diary_vector = np.array(y.detach().cpu())
for movie_id in (27925, 40687, 28689, 37771, 18398):
    movie_row = movie_ori.loc[movie_id]
    print(movie_row['제명'])
    print(cos_similiarity(np.array(movie_row['pb_emotion']), diary_vector))
    print('---')
    print('---')

감동/감탄, 0.5314013957977295
기대감, 0.5478731393814087
뿌듯함, 0.4462629556655884
즐거움/신남, 0.7801141738891602
행복, 0.8633702397346497
기쁨, 0.8710280656814575
명량
0.3663338233903888
---
극한직업
0.3140481055563002
---
인사이드 아웃
0.663657035257374
---
리틀 포레스트
0.8035442775314633
---
써니
0.7397084641332972
---
