In [2]:
import math
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

device = torch.device("cuda:0")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/inmo/tide/.cache/kobert_v1.zip
using cached model. /home/inmo/tide/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [4]:
class BERTClassifier(nn.Module): ## 클래스를 상속
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [5]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))  

In [6]:
max_len = 64   # 텍스트 데이터 최대 길이
batch_size = 64

In [7]:
PATH = './models/'
model = torch.load(PATH + '6emotions_model.pt')  # 전체 모델을 통째로 불러옴, 클래스 선언 필수
model.load_state_dict(torch.load(PATH + '6emotions_model_state_dict.pt'))  # state_dict를 불러 온 후, 모델에 저장

<All keys matched successfully>

In [8]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/inmo/tide/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [9]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050


In [10]:
emotion_list = ['분노','슬픔', '불안', '상처', '당황', '기쁨']

In [11]:
def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=2)
    
    model.eval()

    for (token_ids, valid_length, segment_ids, label) in test_dataloader:
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)

        # for i, e in zip(out[0], emotion_list):
        #     print(f'{e}: {round(float(i),4)}')
        return out

In [12]:
predict('')

tensor([[-0.0885, -0.0890, -0.5017,  0.1378, -0.1167, -0.5003]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

### 가사집 데이터 불러오기

In [13]:
from konlpy.tag import Okt, Kkma

lyrics_df = pd.read_csv('song_lyrics_df.csv', index_col=0)
lyrics_df.head(5)

Unnamed: 0,song_id,title,lyrics
0,52441,너에게로 또 다시,그 얼마나 오랜 시간을 \n 짙은 어둠에서 서성거렸나 \n 내 마음을 닫아 둔채...
1,53060,솔아 솔아 푸르른 솔아,거센 바람이 불어와서 \n 어머님의 눈물이 \n 가슴속에 사무쳐 우는 \n 갈라진 ...
2,1017150,그 아픔까지 사랑한거야,너를 처음 만난 날 소리없이 \n 밤새 눈은 내리고 \n 끝도 없이 찾아드는 기...
3,53018,향기로운 추억 (응답하라 1988 삽입곡),한줌 젖은 바람은 \n 이젠 희미해진 옛 추억 \n 어느 거리로 \n 날 데리고 가...
4,1859404,잊지 말아요,이젠 모두 지나버린 일이야 \n 사랑했던 그 추억 마저도 \n 하지만 멀리서 \n ...


### 문장별 리스트 및 한 줄 가사 생성

In [20]:
title = '마음이 말하는 행복 (Happiness) (Feat.이라온)'

lyrics = lyrics_df[lyrics_df.title == title].lyrics.item()
lyrics_list = [l for l in lyrics.split(' \\n ') if l != '']
lyrics = lyrics.replace(' \\n ', '')

lyrics_df[lyrics_df.title == title]

Unnamed: 0,song_id,title,lyrics
8486,32282389,마음이 말하는 행복 (Happiness) (Feat.이라온),소란했던 오늘의 하루가 \n 너와 함께 고요해 \n 작은 입술을 열어 \n 우리의 ...


In [15]:
lyrics_df[lyrics_df.title == title]

Unnamed: 0,song_id,title,lyrics
8696,32224166,너의 번호를 누르고 (Prod. 영화처럼),우연히 너를 만나서 \n 너의 옆자리에 앉아 \n 그렇게 우린 친해졌어 \n 짧은 ...


In [16]:
okt = Okt()
kkma = Kkma()

In [21]:
res = predict(lyrics)[0]
pre_emo = [0]*6
s_len = len(kkma.sentences(lyrics))
for lyric in kkma.sentences(lyrics):
    emo_tensor = predict(lyric)
    for i in range(6):
        x = float(emo_tensor[0][i])/s_len
        pre_emo[i] += x
for i in range(6):
    res[i] -= round(pre_emo[i],4)

res

tensor([-0.3902,  0.4337,  1.1908, -0.6545, -0.0517, -0.3632], device='cuda:0',
       grad_fn=<AsStridedBackward0>)

In [24]:
predict(lyrics)

tensor([[ 2.2508,  1.6192,  0.0179,  0.2913,  0.0930, -4.4530]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

tensor([[-0.8521,  0.0367, -1.0053,  3.4312, -0.2189, -0.5578, -1.2379, -0.2323,
          1.5694,  0.0354]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [47]:
torch.cuda.memory_reserved()

937426944

In [46]:
torch.cuda.empty_cache()