In [6]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# SEED 설정
import random
seed = 7777
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
# LOGGER 초기화
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [3]:
import pandas as pd
import numpy as np

In [9]:
file1 = pd.read_csv('cbf_track.csv')
file1 = file1.set_index(np.arange(len(file1)))
new_df = pd.DataFrame({'sentence1':file1['merge_key'], 'sentence2':file1['similar_merge_key'],'label':0})

In [10]:
%%time
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr

device = 'cuda'
model = SentenceTransformer('distiluse-base-multilingual-cased-v1',device = device)

corpus_embeddings = model.encode(file1['merge_key'].values, convert_to_tensor=True) # senetence1 유사도

CPU times: user 5min 28s, sys: 53.7 s, total: 6min 22s
Wall time: 49.4 s


In [37]:
track_ids = file1['seed_track_id']
result = list()
for i in range(len(track_ids)):
    result.append({"track_id": str(track_ids[i]), "vector": corpus_embeddings[i].cpu().detach().numpy().tolist()})

In [38]:
import os
filename = f"similar_track_0.parquet"
print(os.path.join('./', filename), flush = True)

file_df = pd.DataFrame(result, columns=['track_id', 'vector'])
print(file_df.head(10), flush = True)
file_df.to_parquet(os.path.join('./', filename), engine="pyarrow", compression="gzip")

./similar_track_0.parquet
    track_id                                             vector
0  456609510  [-0.0033900258131325245, 0.05714113265275955, ...
1  462264041  [0.0371885672211647, 0.10016318410634995, -0.0...
2  456609510  [-0.0033900258131325245, 0.05714113265275955, ...
3  435796657  [-0.0026813074946403503, -0.03216226398944855,...
4   31278931  [-0.04989597201347351, -0.011758898384869099, ...
5   30413068  [-0.004028111696243286, 0.017778100445866585, ...
6  456609510  [-0.0033900258131325245, 0.05714113265275955, ...
7  450406118  [0.04241568595170975, 0.022643975913524628, -0...
8  445405279  [0.016840310767292976, -0.0575084462761879, 0....
9   30089640  [-0.03719403222203255, -0.026945270597934723, ...


In [39]:
import gzip, json
with gzip.open(os.path.join('./', filename), "wt") as gz_file:
    json_str = json.dumps(result)
    gz_file.write(json_str + "\n")

In [40]:
file3 = pd.read_parquet('similar_track_0.parquet')

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [None]:
result[0]

In [4]:
file2 = pd.read_parquet('rep_track.parquet')
file2=file2.set_index(np.arange(len(file2)))
file2['merge_key'] = file2['track_nm_notbracspace']+file2['artist_ids']
file2['similar_merge_key'] = file2['similar_track_nm_notbracspace']+file2['similar_artist_ids']
new_df = pd.DataFrame({'sentence1':file2['merge_key'], 'sentence2':file2['similar_merge_key'],'label':0})

In [6]:
sentence1 = []
# sentence2 = []
# labels = []
for i in range(len(new_df)):
    sentence1.append(new_df['sentence1'][i])
    # sentence2.append(new_df['sentence2'][i])
    # labels.append(new_df['label'][i])

In [7]:
%%time
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr

device = 'cuda'
model = SentenceTransformer('distiluse-base-multilingual-cased-v1',device = device)

corpus_embeddings = model.encode(sentence1, convert_to_tensor=True) # senetence1 유사도
query_embeddings = model.encode(sentence2, convert_to_tensor=True) # sentence2 유사도

def cosine_similarity_manual(x, y, small_number=1e-8): # sentence1과 sentence2의 임베딩값으로 유사도 계산
    result =  torch.dot(x, y) / (torch.linalg.norm(x) * torch.linalg.norm(y) + small_number)
    return result

test_scores = []
for i in range(len(sentence1)):
    score = cosine_similarity_manual(corpus_embeddings[i],query_embeddings[i])
    score=score.cpu().detach().numpy()
    test_scores.append(score)

test_scores = np.array(test_scores) # 모델 예측값
y_pred = np.where(test_scores>=0.6, 1, 0) # klue에서 3.0을 기준으로 binary label을 만들었기에, normalize 기준 threshold: 0.6
labels = np.array(labels)
y_label = np.where(labels >= 0.6, 1, 0)

corpus_embeddings = corpus_embeddings.cpu().detach().numpy()
query_embeddings = query_embeddings.cpu().detach().numpy()

cosine_scores = 1 - (paired_cosine_distances(corpus_embeddings, query_embeddings))
manhattan_distances = -paired_manhattan_distances(corpus_embeddings, query_embeddings)
euclidean_distances = -paired_euclidean_distances(corpus_embeddings, query_embeddings)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(corpus_embeddings, query_embeddings)]

2023/08/22 05:48:57 - Load pretrained SentenceTransformer: distiluse-base-multilingual-cased-v1


Batches: 100%|██████████| 2557/2557 [00:43<00:00, 59.23it/s]
Batches: 100%|██████████| 2557/2557 [00:43<00:00, 59.32it/s]


CPU times: user 10min 54s, sys: 2min 8s, total: 13min 3s
Wall time: 1min 48s


In [8]:
new_df['sentence_bert']= cosine_scores

In [9]:
new_df['sentence_bert_label'] = new_df['sentence_bert'].apply(lambda x:1 if x>=0.6 else 0)

In [13]:
file1

Unnamed: 0,seed_track_id,seed_track_nm,seed_track_artist_nm_list,similar_track_id,similar_track_nm,similar_track_artist_nm_list,seed_track_nm_rnm,similar_track_nm_rnm,prep1,prep2,track_artist,similar_track_artist,merge_key,similar_merge_key,leven,artist_same
0,456609510,Healinf,['Subway'],458369695,Heali,['Subway'],healinf,heali,Healinf,Heali,Subway,Subway,Healinf Subway,Heali Subway,2,1
1,462264041,Hwangmo,['Hwangmi'],472111675,Hwangq,['Hwangmi'],hwangmo,hwangq,Hwangmo,Hwangq,Hwangmi,Hwangmi,Hwangmo Hwangmi,Hwangq Hwangmi,2,1
2,456609510,Healinf,['Subway'],456609518,Healinn,['Subway'],healinf,healinn,Healinf,Healinn,Subway,Subway,Healinf Subway,Healinn Subway,1,1
3,435796657,Trigger the fever,['NCT DREAM'],30585945,Trigger the fever,['NCT DREAM'],triggerthefever,triggerthefever,Triggerthefever,Triggerthefever,NCT DREAM,NCT DREAM,Triggerthefever NCT DREAM,Triggerthefever NCT DREAM,0,1
4,31278931,Beautiful Day,['그_냥'],80266210,Beautiful Day,['넬 (NELL)'],beautifulday,beautifulday,BeautifulDay,BeautifulDay,그_냥,넬 (NELL),BeautifulDay 그_냥,BeautifulDay 넬 (NELL),8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81789,437589198,Stronger,['The Score'],62242533,Stronger,['The Score'],stronger,stronger,Stronger,Stronger,The Score,The Score,Stronger The Score,Stronger The Score,0,1
81790,2977025,사랑의 계절,['럼블피쉬'],3082175,사랑의 계절,['럼블피쉬'],사랑의계절,사랑의계절,사랑의계절,사랑의계절,럼블피쉬,럼블피쉬,사랑의계절 럼블피쉬,사랑의계절 럼블피쉬,0,1
81791,424686924,아기돼지 삼형제,['여름동요'],427029386,아기돼지 삼형제 (유아동요),['동요'],아기돼지삼형제,아기돼지삼형제,아기돼지삼형제,아기돼지삼형제,여름동요,동요,아기돼지삼형제 여름동요,아기돼지삼형제 동요,2,0
81792,431329,Funky Tonight (Funky Mix),['클론'],431301,Funky Tonight,['클론'],funkytonight,funkytonight,FunkyTonight,FunkyTonight,클론,클론,FunkyTonight 클론,FunkyTonight 클론,0,1


In [12]:

del file1['Unnamed: 0']

In [20]:
file1.to_csv('cbf_track.csv')

In [16]:
result = pd.read_csv('cbf_track_result.csv')
result[(result['leven']>=1)&(result['leven']<=3)&(result['artist_same']==1)&(result['sentence_bert_label']==1)]

Unnamed: 0.1,Unnamed: 0,seed_track_id,seed_track_nm,seed_track_artist_nm_list,similar_track_id,similar_track_nm,similar_track_artist_nm_list,seed_track_nm_rnm,similar_track_nm_rnm,prep1,prep2,track_artist,similar_track_artist,merge_key,similar_merge_key,leven,artist_same,sentence_bert_label
0,0,456609510,Healinf,['Subway'],458369695,Heali,['Subway'],healinf,heali,Healinf,Heali,Subway,Subway,Healinf Subway,Heali Subway,2,1,1
1,1,462264041,Hwangmo,['Hwangmi'],472111675,Hwangq,['Hwangmi'],hwangmo,hwangq,Hwangmo,Hwangq,Hwangmi,Hwangmi,Hwangmo Hwangmi,Hwangq Hwangmi,2,1,1
2,2,456609510,Healinf,['Subway'],456609518,Healinn,['Subway'],healinf,healinn,Healinf,Healinn,Subway,Subway,Healinf Subway,Healinn Subway,1,1,1
6,6,456609510,Healinf,['Subway'],457380146,Healif,['Subway'],healinf,healif,Healinf,Healif,Subway,Subway,Healinf Subway,Healif Subway,1,1,1
13,13,455431651,Dancef,['SCVM'],458364774,Dancu,['SCVM'],dancef,dancu,Dancef,Dancu,SCVM,SCVM,Dancef SCVM,Dancu SCVM,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81773,81773,459065217,Morai,['Morae'],459065214,Moraf,['Morae'],morai,moraf,Morai,Moraf,Morae,Morae,Morai Morae,Moraf Morae,1,1,1
81775,81775,459065217,Morai,['Morae'],459065230,Morav,['Morae'],morai,morav,Morai,Morav,Morae,Morae,Morai Morae,Morav Morae,1,1,1
81780,81780,458369704,Healr,['Subway'],457380151,Healik,['Subway'],healr,healik,Healr,Healik,Subway,Subway,Healr Subway,Healik Subway,2,1,1
81787,81787,461329023,Hwangmb,['Hwangmi'],464046850,Hwangmv,['Hwangmi'],hwangmb,hwangmv,Hwangmb,Hwangmv,Hwangmi,Hwangmi,Hwangmb Hwangmi,Hwangmv Hwangmi,1,1,1


In [17]:
result[(result['leven']>=1)&(result['leven']<=3)&(result['artist_same']==1)&(result['sentence_bert_label']==1)].to_csv('same_song.csv')

In [16]:
new_df.to_csv('cbf_track.csv')

In [147]:
len(new_df[new_df['sentence_bert2_track_label']>=0.6])/len(new_df)

0.9172316246402965

In [148]:
new_df

Unnamed: 0,sentence1,sentence2,label,sentence_bert2_track_label
0,Seasons,Seasons,1,1.000000
1,Believer,Believer,1,1.000000
2,SmoothCriminal,SmoothCriminal,1,1.000000
3,AYo,AYo,1,1.000000
4,하루이틀,하루일과,1,0.879729
...,...,...,...,...
61504,MyLove,Love,1,0.829423
61505,LuckySerendipitg,LuckySerendipityu,1,0.978678
61506,TakeonMe,TakeOnMe,1,0.890553
61507,Putcha,Putcha,1,1.000000


In [67]:
new_df.to_csv('sentence_bert2.csv')

In [68]:
new_df['sentence_bert2_notblac_label'].describe()

count    902099.000000
mean          0.999899
std           0.004906
min           0.300607
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: sentence_bert2_notblac_label, dtype: float64