In [None]:
!pip install transformers
!pip install kss

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 86.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kss
  Downloading kss-3.6.4.tar.gz (42.4 MB)
[K     |████████████████████████████████| 42.4 MB 1.2 MB/s 
[?25hCollectin

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount re-attaches it
# even when a mount from an earlier run is still present.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import torch.nn as nn
from transformers import ElectraModel, AutoTokenizer
from transformers import ElectraConfig
import torch

import kss

import pandas as pd

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from tqdm import tqdm
import time

def kss_sentence(sent):
    """Split ``sent`` into sentences with kss and glue them back with ``' [SEP] '``.

    Args:
        sent: Text to split (passed unchanged to ``kss.split_sentences``).

    Returns:
        The sentences joined by the literal separator ``' [SEP] '``. A single
        sentence is returned as-is; no sentences yields an empty string.
    """
    return ' [SEP] '.join(kss.split_sentences(sent))

def one_hot_encode(data, n_label=44):
    """Turn a comma-separated string of label indices into a multi-hot vector.

    Args:
        data: Label indices joined by commas, e.g. ``"0,3,17"``. Blank tokens
            (an empty string, a trailing comma, surrounding whitespace) are
            ignored, so an unlabeled sample encodes to all zeros.
        n_label: Number of labels, i.e. the length of the returned vector.

    Returns:
        ``torch.LongTensor`` of length ``n_label`` holding 1 at every listed
        index and 0 elsewhere.

    Raises:
        ValueError: If a non-blank token is not an integer.
        IndexError: If an index falls outside ``[0, n_label)``.
    """
    one_hot = [0] * n_label
    for token in data.split(','):
        token = token.strip()
        if not token:
            continue
        idx = int(token)
        # A negative index would silently wrap around and flag a label counted
        # from the end of the vector, so reject it like any other bad index.
        if not 0 <= idx < n_label:
            raise IndexError(f"label index {idx} is out of range for {n_label} labels")
        one_hot[idx] = 1
    return torch.LongTensor(one_hot)

def data_preprocessing(df, electra_model, threshold=0.3, max_length=512, sep_token_id=3):
    """Run the classifier over every synopsis in ``df['줄거리']``.

    Each text is split into sentences joined by ``' [SEP] '``, tokenized with
    the ``beomi/KcELECTRA-base`` tokenizer, and passed through ``electra_model``
    one sample at a time. The first model output is squashed with a sigmoid and
    thresholded into 0/1 labels.

    Args:
        df: DataFrame with a ``'줄거리'`` column of texts.
        electra_model: Callable taking ``(input_ids, attention_mask, sep_idx)``
            and returning a 3-tuple whose first item holds the logits, one row
            per sample. The caller is responsible for putting it in eval mode.
        threshold: A label is set to 1 when its sigmoid score is >= this value.
        max_length: Token length every input is truncated/padded to.
        sep_token_id: Token id searched for in ``input_ids`` to build ``sep_idx``
            (presumably the tokenizer's [SEP] id — verify if the tokenizer changes).

    Returns:
        Tuple ``(pr_x, pr_y, pred_y)`` of equal-length lists: the [SEP]-joined
        texts, the thresholded 0/1 label lists, and the raw sigmoid scores.
    """
    tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base", do_lower_case=False)

    # Place the inputs wherever the model's weights live instead of assuming a GPU.
    try:
        model_device = next(electra_model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    pr_x = []    # texts with [SEP] inserted between sentences
    pr_y = []    # thresholded predictions (0/1 per label)
    pred_y = []  # raw sigmoid scores, no threshold applied

    # Pure inference: without no_grad every forward pass would build an
    # autograd graph that is never used.
    with torch.no_grad():
        for x in tqdm(df['줄거리'][:]):
            sp_x = kss_sentence(x)
            encoded = tokenizer(
                sp_x,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_token_type_ids=False,
                return_attention_mask=True,
                add_special_tokens=True,
            )

            # unsqueeze(0) adds the batch dimension (batch of one sample).
            input_id = torch.LongTensor(encoded['input_ids']).unsqueeze(0).to(model_device)
            mask = torch.LongTensor(encoded['attention_mask']).unsqueeze(0).to(model_device)
            sep_idx = torch.where(input_id == sep_token_id)

            pred, _, _ = electra_model(input_id, mask, sep_idx)
            scores = torch.sigmoid(pred[0]).tolist()

            pr_x.append(sp_x)
            pr_y.append([1 if score >= threshold else 0 for score in scores])
            pred_y.append(scores)

    return pr_x, pr_y, pred_y

device = "cuda" if torch.cuda.is_available() else "cpu"

class ELECTRALSTMClassification(nn.Module):
    """KcELECTRA encoder followed by a BiLSTM over the [SEP] token embeddings.

    The [CLS] embedding is concatenated with the BiLSTM outputs at the first
    and last [SEP] slots and projected to ``num_labels`` logits by ``fc1``.
    ``fc2`` additionally projects the first and last BiLSTM outputs on their own.
    Submodule names (``electra``, ``lstm``, ``fc1``, ``fc2``) are kept as-is so
    existing checkpoints still load.
    """

    def __init__(self, num_labels=44):
        """Build the network and move it to the module-level ``device``.

        Args:
            num_labels: Number of output labels (width of ``fc1``/``fc2``).
        """
        super().__init__()
        # Use the detected device rather than assuming CUDA is present.
        self.device = device
        self.config = ElectraConfig.from_pretrained("beomi/KcELECTRA-base",
                                                    problem_type="multi_label_classification",
                                                    num_labels=num_labels)

        self.embedding_size = 768
        self.batch_size = 32

        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", config=self.config)
        self.lstm = nn.LSTM(self.embedding_size, self.embedding_size, batch_first=True, bidirectional=True)
        # cls (1x) + first BiLSTM output (2x) + last BiLSTM output (2x) = 5x embedding_size
        self.fc1 = nn.Linear(self.embedding_size * 5, num_labels)
        self.fc2 = nn.Linear(self.embedding_size * 2, num_labels)
        self.gelu = nn.GELU()

        # Move every submodule (including the linear heads) in one step.
        self.to(self.device)

    def forward(self, input_ids=None, attention_mask=None, sep_idx=None):
        """Compute label logits for a batch.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Attention mask matching ``input_ids``.
            sep_idx: Pair ``(rows, cols)`` as returned by
                ``torch.where(input_ids == <sep id>)``; entries must be grouped
                by row in ascending order, which ``torch.where`` guarantees.

        Returns:
            Tuple ``(output, first_output, last_output)``: ``fc1`` logits from
            the concatenated features, and ``fc2`` logits from the first and
            last BiLSTM outputs respectively.

        Raises:
            ValueError: If ``sep_idx`` contains no positions at all.
        """
        electra_output = self.electra(input_ids, attention_mask)[0]

        cls = electra_output[:, 0, :]  # [CLS] embeddings
        batch = cls.size(0)

        # Positions of the [SEP] tokens.
        sep_rows = sep_idx[0]
        sep_cols = sep_idx[1]

        # Number of [SEP] tokens per sample; the largest count fixes the LSTM length.
        per_row = torch.bincount(sep_rows, minlength=batch)
        longest = int(per_row.max())
        if longest == 0:
            raise ValueError("sep_idx contains no separator positions")

        # Slot of each [SEP] inside its own row: running position minus the
        # offset at which that row's entries start. The previous hand-written
        # counter was not advanced when it moved on to a new row, so for every
        # row after the first the second [SEP] was added onto slot 0.
        row_start = torch.cumsum(per_row, dim=0) - per_row
        slot = torch.arange(sep_rows.numel(), device=sep_rows.device) - row_start[sep_rows]

        # Zero-initialised; rows with fewer [SEP] tokens stay zero-padded at the end.
        sep_embeddings = electra_output.new_zeros(batch, longest, self.embedding_size)
        sep_embeddings[sep_rows, slot] = electra_output[sep_rows, sep_cols]

        # Run the BiLSTM: output is (batch, longest, 2 * embedding_size).
        lstm_output, (h, c) = self.lstm(sep_embeddings)

        # First and last time steps of the BiLSTM output.
        # NOTE(review): for rows shorter than `longest` the last step comes from
        # zero padding, not from that row's final [SEP] — confirm this is intended.
        sep_first = lstm_output[:, 0, :]
        sep_last = lstm_output[:, -1, :]

        # Concatenate the [CLS] embedding with the BiLSTM features.
        concats = torch.cat((cls, sep_first, sep_last), dim=1)
        # Activation, then project to the label logits.
        x = self.gelu(concats)
        output = self.fc1(x)

        first_output = self.fc2(sep_first)
        last_output = self.fc2(sep_last)

        return output, first_output, last_output

# Checkpoint with the fine-tuned weights, and the spreadsheet to label.
electra_model_path = "/content/drive/MyDrive/final_project/data_processing/best_model_52.pth"

data_type = '영화_52'
data_path = '/content/drive/MyDrive/final_project/영화데이터.xlsx'

# Only the title and synopsis columns are needed downstream.
df = pd.read_excel(data_path)
df_movie = df[['제명', '줄거리']]

trained_model = ELECTRALSTMClassification()

# map_location lets a checkpoint saved on GPU load on whatever device is available.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(electra_model_path, map_location=device)
load_result = trained_model.load_state_dict(checkpoint['model_state_dict'], strict=False)
# strict=False skips mismatched keys silently; surface them so a partially
# loaded (partly randomly initialised) model does not go unnoticed.
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"state_dict mismatch - missing: {load_result.missing_keys}, unexpected: {load_result.unexpected_keys}")
trained_model.to(device)
# Inference only: switch off dropout and other train-time behaviour.
trained_model.eval()


x_data, y_data, y_ori = data_preprocessing(df_movie, trained_model)
# Reuse df_movie's index so the column-wise concat lines rows up by construction.
data_xy = pd.DataFrame(list(zip(x_data, y_data, y_ori)),
                       columns=['sep_text', 'emotion', 'pb_emotion'],
                       index=df_movie.index)
data_set = pd.concat([df_movie, data_xy], axis=1)

data_set.to_pickle(f"/content/drive/MyDrive/final_project/{data_type}_data.pkl")



Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/396k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

100%|██████████| 53650/53650 [44:45<00:00, 19.98it/s]


In [None]:
# Show the synopsis of every row whose title is '명량'.
title_matches = data_set['제명'] == '명량'
data_set.loc[title_matches, '줄거리']

27925    1597년 임진왜란 6년 오랜 전쟁으로 인해 혼란이 극에 달한 조선 무서운 속도로 ...
Name: 줄거리, dtype: object

In [None]:
# Show the raw per-label sigmoid scores stored for the row with index label 28689.
row_label = 28689
data_set.loc[row_label, 'pb_emotion']

[0.03824764862656593,
 0.287714421749115,
 0.3486618399620056,
 0.07613112032413483,
 0.1442570984363556,
 0.39879825711250305,
 0.03241504356265068,
 0.1055382788181305,
 0.7053400874137878,
 0.06376766413450241,
 0.3627794086933136,
 0.24338549375534058,
 0.10541623085737228,
 0.19157084822654724,
 0.1649988889694214,
 0.559218168258667,
 0.6308729648590088,
 0.08410441875457764,
 0.13497766852378845,
 0.14769236743450165,
 0.09804009646177292,
 0.042275745421648026,
 0.046480193734169006,
 0.07248089462518692,
 0.23257319629192352,
 0.13526031374931335,
 0.029720952734351158,
 0.2928175628185272,
 0.2843879759311676,
 0.5433635115623474,
 0.06906551867723465,
 0.02992640621960163,
 0.24720409512519836,
 0.19350947439670563,
 0.05730673298239708,
 0.14604967832565308,
 0.14438873529434204,
 0.024886898696422577,
 0.47817888855934143,
 0.2259160876274109,
 0.354506254196167,
 0.6694902181625366,
 0.3358496427536011,
 0.4066759943962097]