In [1]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

In [2]:
# Fix the random seeds (Python `random`, PyTorch CPU and all CUDA generators)
# so that repeated runs are reproducible.
import random

seed = 7777
for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

In [3]:
# Initialise the root logger: timestamped INFO-level messages routed through
# the LoggingHandler imported from sentence_transformers.
log_settings = dict(
    format="%(asctime)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logging.basicConfig(**log_settings)

In [4]:
import pandas as pd
import numpy as np

# Load the track-pair table and replace whatever index was stored in the file
# with a clean 0..n-1 index, so later positional/label lookups agree.
file1 = pd.read_parquet('rep_track.parquet')
file1 = file1.reset_index(drop=True)

In [37]:
# Build the text to embed for each side of a pair by joining two columns with
# a single space; both results are stored back on file1.
track_text = file1['track_nm_notbracspace'] + ' ' + file1['artist_ids']
similar_text = file1['similar_track_nm_notbracspace'] + ' ' + file1['similar_artist_ids']
file1['merge_key'] = track_text
file1['similar_merge_key'] = similar_text

In [38]:
# Sentence-pair table: the two merge keys side by side, with 'label'
# initialised to 0 for every row.
pair_columns = {
    'sentence1': file1['merge_key'],
    'sentence2': file1['similar_merge_key'],
    'label': 0,
}
new_df = pd.DataFrame(pair_columns)

In [39]:
# Materialise the columns as plain Python lists (the encoder below takes lists).
# Series.tolist() is positional, so it does not depend on the index being
# exactly 0..n-1, and it avoids a per-row Python loop of label lookups.
sentence1 = new_df['sentence1'].tolist()
sentence2 = new_df['sentence2'].tolist()
labels = new_df['label'].tolist()

In [41]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the pretrained multilingual sentence encoder onto the selected device.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1', device=device)

# Embed both sides of every pair as tensors: sentence1 -> corpus_embeddings,
# sentence2 -> query_embeddings.
corpus_embeddings, query_embeddings = (
    model.encode(texts, convert_to_tensor=True) for texts in (sentence1, sentence2)
)

def cosine_similarity_manual(x, y, small_number=1e-8):
    """Return the cosine similarity of two 1-D tensors as a 0-d tensor.

    Args:
        x: First embedding vector (1-D tensor, as required by ``torch.dot``).
        y: Second embedding vector, same length as ``x``.
        small_number: Constant added to the product of the norms so that
            zero-length vectors do not cause a division by zero.

    Returns:
        ``dot(x, y) / (||x|| * ||y|| + small_number)``.
    """
    numerator = torch.dot(x, y)
    denominator = torch.linalg.norm(x) * torch.linalg.norm(y) + small_number
    return numerator / denominator

# Cosine similarity of each (sentence1[i], sentence2[i]) embedding pair, using
# the formula dot / (||a|| * ||b|| + 1e-8). It is computed for all rows at once
# on the embeddings' device and copied to the host a single time, instead of
# one device-to-host transfer per row.
# Assumes both embedding tensors are 2-D (n_pairs, dim), which is what indexing
# them by row and feeding the rows to torch.dot requires.
pair_dots = (corpus_embeddings * query_embeddings).sum(dim=1)
pair_norms = torch.linalg.norm(corpus_embeddings, dim=1) * torch.linalg.norm(query_embeddings, dim=1)
test_scores = (pair_dots / (pair_norms + 1e-8)).detach().cpu().numpy()  # model predictions

# KLUE builds its binary labels with a cut-off of 3.0, so on the normalised
# scale the threshold is 0.6.
y_pred = np.where(test_scores >= 0.6, 1, 0)
labels = np.array(labels)
# NOTE(review): 'label' is initialised to 0 for every row when new_df is built,
# so y_label is presumably all zeros here. Confirm before using it as ground truth.
y_label = np.where(labels >= 0.6, 1, 0)

2023/08/21 01:36:47 - Load pretrained SentenceTransformer: distiluse-base-multilingual-cased-v1


Batches:   0%|          | 0/28191 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr

In [None]:
# Move the embeddings to host numpy arrays for the sklearn pairwise helpers.
# The guards make this cell safe to re-run: once converted, the arrays no
# longer have .cpu()/.detach() and the unguarded call would raise.
if torch.is_tensor(corpus_embeddings):
    corpus_embeddings = corpus_embeddings.cpu().detach().numpy()
if torch.is_tensor(query_embeddings):
    query_embeddings = query_embeddings.cpu().detach().numpy()

# Row-wise similarity measures between sentence1 and sentence2 embeddings.
# Distances are negated so that, for every measure, larger means more similar.
cosine_scores = 1 - (paired_cosine_distances(corpus_embeddings, query_embeddings))
manhattan_distances = -paired_manhattan_distances(corpus_embeddings, query_embeddings)
euclidean_distances = -paired_euclidean_distances(corpus_embeddings, query_embeddings)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(corpus_embeddings, query_embeddings)]

In [None]:
# Sanity check: number of embedded rows (first dimension of the embeddings).
corpus_embeddings.shape[0]

In [None]:
# Attach the pairwise cosine similarity to the sentence-pair table.
score_column = 'sentence_bert_notblacspace_artist_nm_label'
new_df[score_column] = cosine_scores

In [None]:
# Binarise the similarity score: 1 when the score is at least 0.6, else 0.
# A vectorized comparison replaces the row-by-row Python lambda.
new_df['label'] = (new_df['sentence_bert_notblacspace_artist_nm_label'] >= 0.6).astype('int64')

In [None]:
# Persist the scored pairs (index included) for later inspection.
result_csv = 'pretrained_model_track_nm_not_bracspace_artist_nm_result.csv'
new_df.to_csv(result_csv)

In [35]:
# Pairs the pretrained model scores below the 0.6 threshold. The column name
# matches the one created earlier in this notebook
# ('sentence_bert_notblacspace_artist_nm_label'); the previous spelling raised a
# KeyError when the notebook is run top to bottom.
new_df[new_df['sentence_bert_notblacspace_artist_nm_label'] < 0.6]

Unnamed: 0,sentence1,sentence2,label,sentence_bert_notblac_artist_nm_label
5684,대동강편지 이수진,대동강 편지 이수진,0,0.589508
41126,복지만리 나훈아,복지 만리 나훈아,0,0.592601
48193,차마고도 main theme 양방언,차마고도 양방언,0,0.557088
49783,exterminate tv 애니메이션 [전희절창 심포기어 gx] 오프닝 테마 Mi...,exterminate Mizuki Nana,0,0.583998
75776,elegy 이루마,elegy내 마음에 비친 내 모습 이루마,0,0.444181
...,...,...,...,...
878583,슬램덩크 너와 함께라면 박응식,슬램덩크 엔딩 박응식,0,0.436692
878585,슬램덩크 너와 함께라면 박응식,슬램덩크 엔딩 박응식,0,0.436692
880797,ユキトキyukitoki 역시 내 청춘 러브코메디는 잘못됐다 1기 op Variou...,ユキトキ yukitoki Various Artists,0,0.579756
889964,사랑의 밧줄 진성,사랑의밧줄 진성,0,0.545908


In [44]:
import os

# List the files under data/ and load the first one as the starting frame.
# os.listdir returns entries in arbitrary order.
lst_file = os.listdir('data/')
first_name = lst_file[0]
file1 = pd.read_parquet(f'./data/{first_name}')

In [46]:
# Append every remaining file under data/ to file1 with a single concat.
# Concatenating inside a loop re-copies the accumulated frame on every
# iteration (quadratic in the number of files).
remaining_parts = [pd.read_parquet(f'./data/{name}') for name in lst_file[1:]]
file1 = pd.concat([file1, *remaining_parts])

In [93]:
# Write the combined table to rep_track.parquet (gzip-compressed). This is the
# same file name that the loading cell near the top of the notebook reads.
file1.to_parquet(path='rep_track.parquet', compression='gzip')

In [90]:
# Remove fully duplicated rows, keeping the first occurrence of each.
file1 = file1.drop_duplicates(keep='first')

In [91]:
# Frequency of each value in the 'notspace_distance' column.
file1.loc[:, 'notspace_distance'].value_counts()

notspace_distance
0    901451
2       620
1        13
3        12
4         3
Name: count, dtype: int64

In [92]:
# Frequency of each value in the 'notbracspace_distance' column.
file1.loc[:, 'notbracspace_distance'].value_counts()

notbracspace_distance
0     901471
3         94
7         72
8         54
2         52
6         51
11        42
10        42
5         39
9         38
4         33
12        16
13        16
17        15
1         13
14        10
16         8
19         7
15         7
18         6
20         5
28         3
30         1
33         1
69         1
24         1
38         1
Name: count, dtype: int64