In [1]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

In [2]:
# SEED 설정
import random
seed = 7777
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
# LOGGER 초기화
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [4]:
pretrained_model_name = 'klue/roberta-base'
nli_num_epochs = 1
sts_num_epochs = 4
train_batch_size = 32

nli_model_save_path = 'output/training_nli_by_Softmaxloss'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_model_save_path = 'output/training_sts_by_Softmaxloss'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [5]:
# load KLUE-NLI Dataset
klue_nli_train = load_dataset("klue", "nli", split='train')
print('Length of Train : ',len(klue_nli_train))

Length of Train :  24998


In [7]:
def make_nli_input_example(dataset):
    ''' 
    Transform to InputExample
    ''' 
    input_examples = []
    for i, data in enumerate(dataset):
        sentence1 = data['hypothesis']
        sentence2 = data['premise']
        label = data['label'] # 0(entailment), 1(neutral), 2(contradiction)
        input_examples.append(InputExample(texts=[sentence1, sentence2], label=label))

    return input_examples

In [8]:
nli_train_examples = make_nli_input_example(klue_nli_train)

In [16]:
# Train Dataloader
train_dataloader = DataLoader(
    nli_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

In [17]:
# Load Embedding Model
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name, 
    max_seq_length=256,
    do_lower_case=True
)

# Only use Mean Pooling -> Pooling all token embedding vectors of sentence.
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[embedding_model, pooling_model])

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023/08/08 02:23:08 - Use pytorch device: cuda


In [18]:
# Use SoftmaxLoss, because NLI is Multi-class Classification task.
train_loss = losses.SoftmaxLoss(
    model=model, 
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(), 
    num_labels=3 # entailment, neutral, contradiction
)

# warmup steps
warmup_steps = math.ceil(len(nli_train_examples) * nli_num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=nli_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps,
    output_path=nli_model_save_path
)

2023/08/08 02:23:08 - Softmax loss: #Vectors concatenated: 3
2023/08/08 02:23:08 - Warmup-steps: 79


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/782 [00:00<?, ?it/s]

2023/08/08 02:23:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 78 steps:
2023/08/08 02:23:39 - Cosine-Similarity :	Pearson: 0.8331	Spearman: 0.8476
2023/08/08 02:23:39 - Manhattan-Distance:	Pearson: 0.8240	Spearman: 0.8369
2023/08/08 02:23:39 - Euclidean-Distance:	Pearson: 0.8224	Spearman: 0.8355
2023/08/08 02:23:39 - Dot-Product-Similarity:	Pearson: 0.7367	Spearman: 0.7445
2023/08/08 02:23:39 - Save model to output/training_nli_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25
2023/08/08 02:24:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 156 steps:
2023/08/08 02:24:04 - Cosine-Similarity :	Pearson: 0.5904	Spearman: 0.6869
2023/08/08 02:24:04 - Manhattan-Distance:	Pearson: 0.6507	Spearman: 0.6990
2023/08/08 02:24:04 - Euclidean-Distance:	Pearson: 0.6451	Spearman: 0.6978
2023/08/08 02:24:04 - Dot-Product-Similarity:	Pearson: 0.5515	Spearman: 0.5637
2023/08/08 02:24:25 - EmbeddingSimilarityEval

In [19]:
# load KLUE-STS Dataset
klue_sts_train = load_dataset("klue", "sts", split='train[:90%]')
klue_sts_valid = load_dataset("klue", "sts", split='train[-10%:]') # train의 10%를 validation set으로 사용
klue_sts_test = load_dataset("klue", "sts", split='validation')

print('Length of Train : ',len(klue_sts_train))
print('Length of Valid : ',len(klue_sts_valid))
print('Length of Test : ',len(klue_sts_test))

def make_sts_input_example(dataset):
    ''' 
    Transform to InputExample
    ''' 
    input_examples = []
    for i, data in enumerate(dataset):
        sentence1 = data['sentence1']
        sentence2 = data['sentence2']
        score = (data['labels']['label']) / 5.0  # normalize 0 to 5
        input_examples.append(InputExample(texts=[sentence1, sentence2], label=score))

    return input_examples

sts_train_examples = make_sts_input_example(klue_sts_train)
sts_valid_examples = make_sts_input_example(klue_sts_valid)
sts_test_examples = make_sts_input_example(klue_sts_test)

# Train Dataloader
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluator by sts-validation
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

# Evaluator by sts-test
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_test_examples,
    name="sts-test",
)

Length of Train :  10501
Length of Valid :  1167
Length of Test :  519


In [20]:
# Load model of fine-tuning by NLI
model = SentenceTransformer(nli_model_save_path)

2023/08/08 02:27:30 - Load pretrained SentenceTransformer: output/training_nli_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25
2023/08/08 02:27:32 - Use pytorch device: cuda


In [21]:
# Use CosineSimilarityLoss
train_loss = losses.CosineSimilarityLoss(model=model)

# warmup steps
warmup_steps = math.ceil(len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Trainingㅁ
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=sts_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path
)

2023/08/08 02:27:32 - Warmup-steps: 132


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2023/08/08 02:27:43 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 32 steps:
2023/08/08 02:27:47 - Cosine-Similarity :	Pearson: 0.8871	Spearman: 0.8798
2023/08/08 02:27:47 - Manhattan-Distance:	Pearson: 0.8660	Spearman: 0.8667
2023/08/08 02:27:47 - Euclidean-Distance:	Pearson: 0.8655	Spearman: 0.8664
2023/08/08 02:27:47 - Dot-Product-Similarity:	Pearson: 0.8571	Spearman: 0.8485
2023/08/08 02:27:47 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25
2023/08/08 02:27:57 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 64 steps:
2023/08/08 02:28:01 - Cosine-Similarity :	Pearson: 0.9245	Spearman: 0.8981
2023/08/08 02:28:01 - Manhattan-Distance:	Pearson: 0.9160	Spearman: 0.8956
2023/08/08 02:28:01 - Euclidean-Distance:	Pearson: 0.9160	Spearman: 0.8958
2023/08/08 02:28:01 - Dot-Product-Similarity:	Pearson: 0.9164	Spearman: 0.8894
2023/08/08 02:28:01 - Save model to output/tra

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2023/08/08 02:30:09 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 32 steps:
2023/08/08 02:30:13 - Cosine-Similarity :	Pearson: 0.9594	Spearman: 0.9187
2023/08/08 02:30:13 - Manhattan-Distance:	Pearson: 0.9535	Spearman: 0.9181
2023/08/08 02:30:13 - Euclidean-Distance:	Pearson: 0.9535	Spearman: 0.9185
2023/08/08 02:30:13 - Dot-Product-Similarity:	Pearson: 0.9523	Spearman: 0.9049
2023/08/08 02:30:23 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 64 steps:
2023/08/08 02:30:26 - Cosine-Similarity :	Pearson: 0.9600	Spearman: 0.9202
2023/08/08 02:30:26 - Manhattan-Distance:	Pearson: 0.9547	Spearman: 0.9191
2023/08/08 02:30:26 - Euclidean-Distance:	Pearson: 0.9546	Spearman: 0.9192
2023/08/08 02:30:26 - Dot-Product-Similarity:	Pearson: 0.9526	Spearman: 0.9051
2023/08/08 02:30:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 96 steps:
2023/08/08 02:30:40 - Cosine-Simila

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2023/08/08 02:32:32 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 32 steps:
2023/08/08 02:32:36 - Cosine-Similarity :	Pearson: 0.9621	Spearman: 0.9241
2023/08/08 02:32:36 - Manhattan-Distance:	Pearson: 0.9547	Spearman: 0.9222
2023/08/08 02:32:36 - Euclidean-Distance:	Pearson: 0.9546	Spearman: 0.9221
2023/08/08 02:32:36 - Dot-Product-Similarity:	Pearson: 0.9540	Spearman: 0.9107
2023/08/08 02:32:36 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25
2023/08/08 02:32:48 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 64 steps:
2023/08/08 02:32:52 - Cosine-Similarity :	Pearson: 0.9614	Spearman: 0.9235
2023/08/08 02:32:52 - Manhattan-Distance:	Pearson: 0.9531	Spearman: 0.9211
2023/08/08 02:32:52 - Euclidean-Distance:	Pearson: 0.9530	Spearman: 0.9210
2023/08/08 02:32:52 - Dot-Product-Similarity:	Pearson: 0.9526	Spearman: 0.9089
2023/08/08 02:33:01 - EmbeddingSimilarityEvalu

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2023/08/08 02:35:05 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 32 steps:
2023/08/08 02:35:09 - Cosine-Similarity :	Pearson: 0.9640	Spearman: 0.9276
2023/08/08 02:35:09 - Manhattan-Distance:	Pearson: 0.9563	Spearman: 0.9247
2023/08/08 02:35:09 - Euclidean-Distance:	Pearson: 0.9564	Spearman: 0.9248
2023/08/08 02:35:09 - Dot-Product-Similarity:	Pearson: 0.9546	Spearman: 0.9111
2023/08/08 02:35:09 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25
2023/08/08 02:35:20 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 64 steps:
2023/08/08 02:35:25 - Cosine-Similarity :	Pearson: 0.9638	Spearman: 0.9275
2023/08/08 02:35:25 - Manhattan-Distance:	Pearson: 0.9557	Spearman: 0.9243
2023/08/08 02:35:25 - Euclidean-Distance:	Pearson: 0.9558	Spearman: 0.9245
2023/08/08 02:35:25 - Dot-Product-Similarity:	Pearson: 0.9546	Spearman: 0.9117
2023/08/08 02:35:34 - EmbeddingSimilarityEvalu

In [22]:
# evaluation sts-test
test_evaluator(model, output_path=sts_model_save_path)

2023/08/08 02:37:19 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2023/08/08 02:37:20 - Cosine-Similarity :	Pearson: 0.8855	Spearman: 0.8874
2023/08/08 02:37:20 - Manhattan-Distance:	Pearson: 0.8822	Spearman: 0.8789
2023/08/08 02:37:20 - Euclidean-Distance:	Pearson: 0.8832	Spearman: 0.8803
2023/08/08 02:37:20 - Dot-Product-Similarity:	Pearson: 0.8725	Spearman: 0.8715


0.8873581486439149

In [75]:
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr

In [48]:
import json
import pandas as pd

dset = []

with open("cbf_track_names.json") as fp:
    for l in fp:
        d = json.loads(l)
        dset.append(d)

print(dset[0])
df = pd.DataFrame(dset)

{'seed_track_nm': 'Seasons (Feat. Harley Bird) (Futuristik & Whogaux Remix)', 'seed_track_artist_nm_list': ['Cadmium', 'Rival'], 'similar_track_nm': 'Seasons (Futuristik & Whogaux Remix)', 'similar_track_artist_nm_list': ['Cadmium', 'Harley Bird', 'Rival'], 'seed_track_nm_rnm': 'seasons', 'similar_track_nm_rnm': 'seasons'}


import pandas as pd
import numpy as np
a= pd.read_parquet('./dataset.parquet')
a['track_id']= a['track_id'].astype('str')
a['string']=a['track_nm']+' '+a['artist_nm_list']
a= a.set_index(np.arange(len(a)))
b = a.sample(frac=1).reset_index(drop=True)
original = a['string'].values
similar = b['string'].values
new_df=pd.DataFrame({'sentence1':original, 'sentence2':similar,'label':0})

In [None]:
file1 = pd.read_parquet('rep_track.parquet')
file1 = file1.set_index(np.arange(len(file1)))

In [55]:
file1.columns

Index(['similar_track_id', 'similar_track_nm', 'similar_track_nm_notbrac',
       'similar_track_nm_notspace', 'similar_track_nm_notbracspace',
       'similar_artist_ids', 'similar_track_nm_origin', 'track_id',
       'rep_track_id', 'track_nm', 'track_nm_notbrac', 'track_nm_notspace',
       'track_nm_notbracspace', 'artist_ids', 'track_nm_origin',
       'notspace_distance', 'notbracspace_distance'],
      dtype='object')

In [96]:
new_df = pd.DataFrame({'sentence1':file1['track_nm_notbrac'], 'sentence2':file1['similar_track_nm_notbrac'],'label':0})

In [99]:
sentence1 = []
sentence2 = []
labels = []
for i in range(len(new_df)):
    sentence1.append(new_df['sentence1'][i])
    sentence2.append(new_df['sentence2'][i])
    labels.append(new_df['label'][i])

In [100]:
device = 'cuda:2'

In [101]:
model = SentenceTransformer(sts_model_save_path,device = device)

corpus_embeddings = model.encode(sentence1, convert_to_tensor=True) # senetence1 유사도
query_embeddings = model.encode(sentence2, convert_to_tensor=True) # sentence2 유사도

def cosine_similarity_manual(x, y, small_number=1e-8): # sentence1과 sentence2의 임베딩값으로 유사도 계산
    result =  torch.dot(x, y) / (torch.linalg.norm(x) * torch.linalg.norm(y) + small_number)
    return result

test_scores = []
for i in range(len(sentence1)):
    score = cosine_similarity_manual(corpus_embeddings[i],query_embeddings[i])
    score=score.cpu().detach().numpy()
    test_scores.append(score)

test_scores = np.array(test_scores) # 모델 예측값
y_pred = np.where(test_scores>=0.6, 1, 0) # klue에서 3.0을 기준으로 binary label을 만들었기에, normalize 기준 threshold: 0.6
labels = np.array(labels)
y_label = np.where(labels >= 0.6, 1, 0)

2023/08/08 06:31:51 - Load pretrained SentenceTransformer: output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_02-22-25


Batches:   0%|          | 0/28191 [00:00<?, ?it/s]

Batches:   0%|          | 0/28191 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.58 GiB (GPU 2; 79.10 GiB total capacity; 5.58 GiB already allocated; 775.94 MiB free; 5.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
corpus_embeddings = corpus_embeddings.cpu().detach().numpy()
query_embeddings = query_embeddings.cpu().detach().numpy()

cosine_scores = 1 - (paired_cosine_distances(corpus_embeddings, query_embeddings))
manhattan_distances = -paired_manhattan_distances(corpus_embeddings, query_embeddings)
euclidean_distances = -paired_euclidean_distances(corpus_embeddings, query_embeddings)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(corpus_embeddings, query_embeddings)]

In [None]:
len(corpus_embeddings)

In [None]:
new_df['sentence_bert_notblac_label']= cosine_scores

In [36]:
new_df[new_df['sentence_bert_label']>=0.7]

Unnamed: 0,sentence1,sentence2,label,sentence_bert_label
26432,Love Somebody Maroon 5,What Lovers Do Maroon 5,0,0.777964
38914,Snowman J.Fla,Snowman Sia,0,0.803596
43413,Burning Thursdz Burning Thursday,Burning Thursdah Burning Thursday,0,0.975578
88338,"Won't Bite (Explicit Ver.) Doja Cat,Smino",JACKBOYS (Explicit Ver.) JACKBOYS,0,0.727241
101580,Cry In My Gucci (Explicit Ver.) Margaret,You (Explicit Ver.) The 1975,0,0.713326
122111,Too Deep (Explicit Ver.) Kehlani,How (Explicit Ver.) Ella Mai,0,0.722489
170480,Deepnb Deepnoid,Deepu Deepnoid,0,0.92883
185948,Abracadabra (Explicit Ver.) Nas,Blah Blah Blah (Explicit Ver.) Kesha,0,0.775345
207041,꼭두각시 핑크퐁 (Pinkfong),프리도를 따라 해 핑크퐁 (Pinkfong),0,0.710193
255369,Level of Concern Twenty One Pilots,Never Take It Twenty One Pilots,0,0.731473


import os
lst_file= os.listdir('data/')
file1=pd.read_parquet(f'./data/{lst_file[0]}')
for i in range(1,len(lst_file)):
    file2 = pd.read_parquet(f'./data/{lst_file[i]}')
    file1 = pd.concat([file1,file2])
file1.to_parquet('rep_track.parquet', compression='gzip')
file1=file1.drop_duplicates()