In [3]:
from keybert import KeyBERT
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer
import torch
from torch import nn
from torch.utils.data import Dataset
import gluonnlp as nlp
import numpy as np
import pandas as pd
import math

from tqdm.notebook import tqdm

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64'

In [4]:
# Choose the compute device: the CUDA GPU when one is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1050


In [5]:
# Force CPU execution: this reassignment replaces whatever device the CUDA
# availability check selected earlier in the notebook.
device = torch.device("cpu")

In [6]:
# Load the KoBERT model together with its vocabulary (both are returned by a
# single call), then hand the same model object to KeyBERT so it can be used
# for keyword extraction.
bertmodel, vocab = get_pytorch_kobert_model()
kw_model = KeyBERT(bertmodel)

using cached model. /home/inmo/tide/data/emo/.cache/kobert_v1.zip
using cached model. /home/inmo/tide/data/emo/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [7]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The wrapped ``bert`` module is called with ``input_ids``,
    ``token_type_ids`` and ``attention_mask`` keyword arguments and must
    return a pair whose second element is the pooled sentence vector. That
    vector optionally goes through dropout and then a single linear layer.

    Args:
        bert: encoder module, called as described above.
        hidden_size: width of the pooled vector fed to the linear layer.
        num_classes: number of output scores (adjust to the label set).
        dr_rate: dropout probability; ``None`` or ``0`` disables dropout.
        params: accepted for call compatibility; not used by this class.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=10,   # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        # The dropout layer only exists when a non-zero rate was requested.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return one row of ``num_classes`` scores per input row.

        Args:
            token_ids: integer tensor of token ids, one row per sentence.
            valid_length: per-row count of leading positions to attend to.
            segment_ids: tensor shaped like ``token_ids``; cast to long here.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled vector so `out` is always defined. Previously
        # it was only assigned inside the dropout branch, so a model built
        # with the default dr_rate=None failed with UnboundLocalError.
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

class BERTDataset(Dataset):
    """Dataset that pairs BERT-transformed sentences with integer labels.

    Each input row is indexed with ``sent_idx`` to get the sentence and with
    ``label_idx`` to get the label. Sentences are transformed once, at
    construction time, by ``nlp.data.BERTSentenceTransform``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence (each wrapped in a 1-item list).
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert every label to a 32-bit integer.
        self.labels = list(map(lambda row: np.int32(row[label_idx]), dataset))

    def __getitem__(self, i):
        # Transformed sentence fields followed by the label as the last item.
        features = self.sentences[i]
        return features + (self.labels[i], )

    def __len__(self):
        return len(self.labels)

max_len = 64   # maximum length of the text data (forwarded to BERTDataset as max_len)
batch_size = 64  # batch size given to the DataLoader built in predict()

In [8]:
# Directory prefix of the saved checkpoint (relative to the working directory).
PATH = 'models/'
# Build the classifier around the KoBERT encoder using the constructor defaults.
model = BERTClassifier(bertmodel)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location='cpu' places the loaded tensors on the CPU.
model.load_state_dict(torch.load(PATH + '10emotions_model_state_dict_2_10epoch.pt', map_location='cpu'))  # load the state_dict, then store it in the model

<All keys matched successfully>

In [11]:
# Attempt to compile the classifier with TorchScript and save it to disk.
# NOTE(review): the recorded output of this cell is a RuntimeError raised
# during scripting, so the save call was presumably never reached. Nothing
# else visible in this notebook uses model_scripted.
model_scripted = torch.jit.script(model)
model_scripted.save('./model_scripted.pt')

RuntimeError: Can't redefine method: forward on class: __torch__.transformers.models.bert.modeling_bert.BertEmbeddings (of Python compilation unit at: 0x57608b0)

In [7]:
# Build the tokenizer that predict() passes to BERTDataset: the KoBERT
# tokenizer resource combined with the vocabulary loaded alongside the model.
# NOTE(review): lower=False presumably disables lower-casing; confirm against
# the gluonnlp BERTSPTokenizer documentation.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/inmo/tide_pjt/data/emo/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [8]:
def predict(predict_sentence):
    """Run the emotion classifier on a single sentence.

    Relies on the notebook globals ``tok``, ``max_len``, ``batch_size``,
    ``model`` and ``device``.

    Args:
        predict_sentence: the text to classify.

    Returns:
        The model output for the batch containing the sentence (one row of
        class scores per input), computed without gradient tracking, or
        ``None`` if the loader yields no batch.
    """
    # BERTDataset expects (sentence, label) rows; '0' is a placeholder label
    # that is never used for inference.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sample does not justify worker subprocesses, so load in-process.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: skip autograd bookkeeping so the result carries no graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            # Only one batch exists, so return its output directly.
            return model(token_ids, valid_length, segment_ids)
    return None

In [9]:
def emo_rank(pre_result):
    """Pick three emotion codes from a vector of 10 class scores.

    The input is never modified: scores are first copied into a list of
    floats. The earlier in-place version rescaled the caller's tensor on every
    call, so ranking the same prediction twice operated on altered scores.

    Steps:
      1. A score at position 8 or 9 that reaches sqrt(sum(|scores|) / 10) is
         replaced by max(score ** 2, score). Position 9 is checked after
         position 8 has been updated.
      2. Scores at positions 0-7 are scaled by 4/5.
      3. Walking the scores from highest to lowest, a score above the mean
         absolute score is a primary pick coded ``index + 1``; otherwise a
         score of at least sqrt(|sum(scores)|) / len(scores) becomes a
         fallback candidate coded ``index + 11``.
      4. At most three primary picks are kept. Missing slots are filled with
         fallback candidates in ranked order, then with 0.

    Args:
        pre_result: 1-D sequence of 10 numeric scores (tensor, array or list).

    Returns:
        A list of exactly three ints, each 0, 1-10 (primary) or 11-20 (fallback).
    """
    scores = [float(value) for value in pre_result]
    count = len(scores)

    # Step 1: boost the last two classes when they are strong enough. The
    # threshold is recomputed per index because the first boost changes the sum.
    for idx in (8, 9):
        boost_floor = math.sqrt(sum(abs(s) for s in scores) / 10)
        if scores[idx] >= boost_floor:
            scores[idx] = max(scores[idx] ** 2, scores[idx])

    # Step 2: damp the first eight classes.
    for idx in range(8):
        scores[idx] = (scores[idx] / 5) * 4

    # Both thresholds depend only on the adjusted scores, so compute them once.
    primary_floor = sum(abs(s) for s in scores) / count
    # NOTE(review): this is the sqrt of |sum|, not of sum(|.|) as in step 1.
    # Kept exactly as the original formula; confirm that it is intended.
    fallback_floor = math.sqrt(abs(sum(scores))) / count

    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    picked = []
    fallbacks = []
    for idx, score in ranked:
        if len(picked) >= 3:
            break
        if score > primary_floor:
            picked.append(idx + 1)
        elif score >= fallback_floor:
            fallbacks.append(idx + 11)

    # Fill remaining slots with fallback candidates, then pad with zeros.
    picked.extend(fallbacks[:3 - len(picked)])
    picked.extend([0] * (3 - len(picked)))
    return picked

In [10]:
# Sample input: classify one Korean text and display the raw score tensor
# returned by predict().
text = '''아 프로젝트 너무 힘들다. 개빡친다! 자고 싶다! 너무 힘들다!'''
emotion = predict(text)
emotion

tensor([[ 2.3929,  0.0689, -0.1116,  0.7096,  0.7330,  2.0293, -0.7949, -0.8528,
         -2.4520, -1.7598]], grad_fn=<AddmmBackward0>)

In [11]:
# Extract the top 3 single-word keywords (n-gram range 1 to 1, no stop-word
# list) from the same text. The result is a list of (keyword, score) pairs.
keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=3)
keywords

[('자고', 0.733), ('너무', 0.5021), ('힘들다', 0.4998)]

In [128]:
# Rank the scores of the single input text (row 0 of the batch output) and
# show the three resulting emotion codes.
emo_rank(emotion[0])

[9, 10, 0]

In [12]:
# Per-song emotion/keyword table. The scoring cell reads its song_id,
# emotion_1, emotion_2, emotion_3 and key_sentence columns.
df = pd.read_csv('./song_emotion_keyword.csv', index_col=0)
# Song table read with cp949 encoding. It is not referenced by any other cell
# visible in this notebook.
song = pd.read_csv('../song/song_data/song.csv', encoding='cp949',  index_col=0)

In [13]:
import random

# Keep only the keyword strings from the (keyword, score) pairs.
keyword_list = [i[0] for i in keywords]
# Three emotion codes for the input text. Codes above 10 are the fallback
# picks produced by emo_rank, and 0 is padding.
e1, e2, e3 = emo_rank(emotion[0])
# song_id -> accumulated recommendation score.
song_dic = {}

# Song-side emotion slots, in the order the weight tuples below refer to.
EMOTION_COLUMNS = ('emotion_1', 'emotion_2', 'emotion_3')
# Base scores when the text's 1st / 2nd / 3rd emotion code matches a song's
# emotion_1 / emotion_2 / emotion_3 slot, in that order.
FIRST_EMOTION_WEIGHTS = (100, 80, 50)
SECOND_EMOTION_WEIGHTS = (70, 90, 50)
THIRD_EMOTION_WEIGHTS = (40, 50, 60)
# Score added for every keyword found in a song's key_sentence.
KEYWORD_WEIGHT = 50


def add_score(i, score, is_sec):
    """Add ``score / is_sec`` to song ``i`` in ``song_dic``.

    When the song already has an entry, a random jitter in [0, 1) is added to
    ``score`` before dividing, which breaks ties between equally scored songs.
    A new entry receives exactly ``score / is_sec``.

    Args:
        i: song id used as the dictionary key.
        score: base score to add.
        is_sec: divisor applied to the score (larger for fallback emotions).
    """
    # Membership test rather than truthiness, so an entry whose current score
    # is 0 is still treated as existing.
    if i in song_dic:
        song_dic[i] += (score + random.random()) / is_sec
    else:
        song_dic[i] = score / is_sec


# First emotion: scores are assigned (not accumulated) with a small rounded
# jitter. A fallback code (> 10) halves the score.
is_sec = 2 if e1 > 10 else 1
for column, base in zip(EMOTION_COLUMNS, FIRST_EMOTION_WEIGHTS):
    for i in df.loc[df[column] == e1, 'song_id']:
        rs = round(random.random(), 3)
        song_dic[i] = (base + rs) / is_sec

# Second and third emotions: scores accumulate on top of what is already
# there. A fallback code (> 10) divides the score by 3.
for emotion_code, weights in ((e2, SECOND_EMOTION_WEIGHTS), (e3, THIRD_EMOTION_WEIGHTS)):
    is_sec = 3 if emotion_code > 10 else 1
    for column, base in zip(EMOTION_COLUMNS, weights):
        for i in df.loc[df[column] == emotion_code, 'song_id']:
            add_score(i, base, is_sec)

# Keyword bonus: every song whose key_sentence contains the keyword.
for word in keyword_list:
    # regex=False matches the keyword literally instead of as a regular
    # expression. na=False treats a missing key_sentence as "no match" rather
    # than producing a NaN mask that cannot be used for row selection.
    matches = df['key_sentence'].str.contains(word, regex=False, na=False)
    for i in df.loc[matches, 'song_id']:
        add_score(i, KEYWORD_WEIGHT, 1)

In [14]:
# Turn the score dictionary into [song_id, score] pairs, order them from the
# highest score down, and show the ten best recommendations.
rec_list = [[song_id, score] for song_id, score in song_dic.items()]
rec_list.sort(key=lambda pair: pair[1], reverse=True)
rec_list[:10]

[[56251, 302.97320243830063],
 [8204539, 262.80269885445523],
 [78853, 252.78768658161476],
 [3549031, 252.24612169124487],
 [4635701, 252.03869593643628],
 [3573326, 251.84312789662755],
 [31266290, 251.7537439968295],
 [2654948, 251.63220805897603],
 [891991, 251.60079168872485],
 [5644826, 251.54448350750968]]

100.104

In [97]:
# Ad-hoc lookup: song_ids of the rows whose key_sentence contains '오늘도'.
df[df.loc[:, 'key_sentence'].str.contains('오늘도')].song_id

584       607928
1251       85508
1849      316664
2193      621812
2595      928705
2931     1314788
3811     1898142
3907     1932254
4285     2247067
5054     3164052
5681     3764815
6013     3978258
6066     4001753
6216     4100521
6935     5450271
7085     5683227
7869    30657307
8144    31331750
8202    31455159
8521    32399832
8544     3118484
Name: song_id, dtype: int64

In [92]:
import re

# Scratch check: re.search finds the pattern anywhere in the string, so it
# still matches although the string starts with an extra leading character.
string, pattern = "aHello, World!", "Hello"

if re.search(pattern, string) is not None:
    print('fds')

fds
