In [1]:
import torch
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer

In [2]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = BertModel.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [3]:
feelings=['기쁨','슬픔','설렘','그리움']

In [4]:
def get_l1_distance(x1, x2):
    return ((x1 - x2).abs()).sum()

In [5]:
import pandas as pd
from tqdm import tqdm

In [7]:
df=pd.read_json('meta_list.json')

In [8]:
df=df[df['has_lyric']==1]
df=df[~df['album_genre'].str.contains('팝')]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 908 entries, 0 to 1012
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   album_genre   908 non-null    object        
 1   album_title   908 non-null    object        
 2   artists       908 non-null    object        
 3   has_lyric     908 non-null    int64         
 4   lyric         908 non-null    object        
 5   play_time     908 non-null    datetime64[ns]
 6   release_date  908 non-null    object        
 7   track_id      908 non-null    int64         
 8   track_title   908 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 70.9+ KB


In [10]:
device = torch.device("mps")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )

In [11]:
encoded_feelings=[]
for mood in feelings:
    inputs=[tokenizer.encode(mood)]
    a1=torch.tensor(inputs,device=device)
    a2=torch.ones(1,len(inputs),device=device)
    out = model(input_ids=a1,attention_mask=a2)
    encoded_feelings.append(out.pooler_output)

In [12]:
def get_cosine_similarity(x1, x2):
    return (x1 * x2).sum() / ((x1**2).sum()**.5 * (x2**2).sum()**.5)

In [13]:
def get_mood(out):
    angles=[]
    for i in range(len(encoded_feelings)):
        angle=get_cosine_similarity(out,encoded_feelings[i])
        #print(feelings[i],angle)
        angles.append(angle)
    ret=max(angles)
    idx=angles.index(ret)
    return idx;

In [20]:
def parseLyric(lyric):
    bag=lyric.split('\n')
    size=len(bag)
    
    ret='';
    for i in range(size):
        if i<=size//6:
            ret+=bag[i]
        elif i>=2*size//6 and i<=3*size//6:
            ret+=bag[i]
        elif i>=5*size//6:
            ret+=bag[i];
    if len(ret) > 512:
        ret=ret[:512]
    return ret;
#for idx,lyric in enumerate(df['lyric'].iloc[:10]):
#    lyric=parseLyric(lyric)
#    print(lyric)

In [21]:
expect=[]
for idx,lyric in enumerate(tqdm(df['lyric'])):
    lyric=parseLyric(lyric)
    inputs=[tokenizer.encode(lyric)]
    a1=torch.tensor(inputs,device=device)
    a2=torch.ones(1,len(inputs),device=device)
    out = model(input_ids = a1,attention_mask=a2)
    mood=get_mood(out.pooler_output)
    #print(feelings[mood])
    expect.append(feelings[mood])
df['mood']=expect

100%|███████████████████████████████████████| 908/908 [1:06:11<00:00,  4.37s/it]


In [22]:
df.to_excel('auto_labeling.xlsx')

In [23]:
df.head(100)

Unnamed: 0,album_genre,album_title,artists,has_lyric,lyric,play_time,release_date,track_id,track_title,mood
0,발라드,밤편지,[아이유(IU)],1,이 밤 그날의 반딧불을 \n당신의 창 가까이 보낼게요\n음 사랑한다는 말이에요\n\...,2022-07-20 04:13:00,2017.3.24,16676937,밤편지,그리움
1,드라마음악,'키스 먼저 할까요?' OST Part 3,[폴킴],1,네가 없이 웃을 수 있을까\n생각만 해도 눈물이나\n힘든 시간 날 지켜준 사람\n이...,2022-07-20 03:30:00,2018.3.20,21064399,"모든 날, 모든 순간 (Every day, Every Moment)",그리움
2,드라마음악,도깨비 OST Part 9,[에일리(Ailee)],1,널 품기 전 알지 못했다\n내 머문 세상 이토록 \n찬란한 것을\n\n작은 숨결로 ...,2022-07-20 03:50:00,2017.1.7,16039008,첫눈처럼 너에게 가겠다,기쁨
3,락,버스커 버스커,[버스커 버스커],1,그대여 그대여 그대여 그대여 그대여\n\n\n오늘은 우리 같이 걸어요 이 거리를\n...,2022-07-20 04:22:00,2012.3.29,3182429,벚꽃 엔딩,그리움
4,랩/힙합,YOU NEVER WALK ALONE,[방탄소년단],1,보고 싶다 이렇게 \n말하니까 더 보고 싶다\n너희 사진을 \n보고 있어도 보고 싶...,2022-07-20 04:34:00,2017.2.13,16091695,봄날,그리움
...,...,...,...,...,...,...,...,...,...,...
112,소울,21,[Adele],1,There's a fire starting in my heart\r\nReachin...,2022-07-20 03:52:00,2011.1.21,2407510,Rolling In The Deep,기쁨
113,발라드,고독의 의미,[이적],1,다시 돌아올 거라고 했잖아 \n잠깐이면 될 거라고 했잖아 \n여기 서 있으라 말했었...,2022-07-20 04:32:00,2013.11.15,3999137,거짓말 거짓말 거짓말,그리움
114,드라마음악,도깨비 OST Part 7,[소유 (SOYOU)],1,바라보면 자꾸 눈물이 나는 건\n왠지 몰라도\n돌고 돌아 내게 오고 있었나요\n피해...,2022-07-20 02:49:00,2016.12.31,16023533,I Miss You,그리움
116,댄스,THE STORY BEGINS,[TWICE(트와이스)],1,모두 나를 가지고\r매일 가만 안 두죠\r내가 너무 예쁘죠\r나 때문에 다 힘들죠\...,2022-07-20 03:35:00,2015.10.20,5773242,OOH-AHH하게,기쁨
