In [3]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# SEED 설정
import random
seed = 7777
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [5]:
# LOGGER 초기화
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [6]:
pretrained_model_name = 'klue/roberta-base'
nli_num_epochs = 1
sts_num_epochs = 4
train_batch_size = 32

nli_model_save_path = 'output/training_nli_by_Softmaxloss'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_model_save_path = 'output/training_sts_by_Softmaxloss'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [7]:
# load KLUE-NLI Dataset
klue_nli_train = load_dataset("klue", "nli", split='train')
print('Length of Train : ',len(klue_nli_train))

Downloading builder script: 100%|██████████| 23.3k/23.3k [00:00<00:00, 9.33MB/s]
Downloading metadata: 100%|██████████| 22.7k/22.7k [00:00<00:00, 11.5MB/s]
Downloading readme: 100%|██████████| 21.5k/21.5k [00:00<00:00, 11.3MB/s]
Downloading data: 100%|██████████| 1.26M/1.26M [00:00<00:00, 24.2MB/s]
Generating train split: 100%|██████████| 24998/24998 [00:02<00:00, 8865.21 examples/s] 
Generating validation split: 100%|██████████| 3000/3000 [00:00<00:00, 9593.25 examples/s] 

Length of Train :  24998





In [8]:
def make_nli_input_example(dataset):
    ''' 
    Transform to InputExample
    ''' 
    input_examples = []
    for i, data in enumerate(dataset):
        sentence1 = data['hypothesis']
        sentence2 = data['premise']
        label = data['label'] # 0(entailment), 1(neutral), 2(contradiction)
        input_examples.append(InputExample(texts=[sentence1, sentence2], label=label))

    return input_examples

In [9]:
nli_train_examples = make_nli_input_example(klue_nli_train)

In [12]:
# Train Dataloader
train_dataloader = DataLoader(
    nli_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

In [13]:
# Load Embedding Model
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name, 
    max_seq_length=256,
    do_lower_case=True
)

# Only use Mean Pooling -> Pooling all token embedding vectors of sentence.
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[embedding_model, pooling_model])

Downloading (…)lve/main/config.json: 100%|██████████| 546/546 [00:00<00:00, 60.8kB/s]
Downloading model.safetensors: 100%|██████████| 443M/443M [00:04<00:00, 91.6MB/s] 
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)okenizer_config.json: 100%|██████████| 375/375 [00:00<00:00, 49.1kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 1.45MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 752k/752k [00:00<00:00, 4.27MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 173/173 [00:00<00:00, 85.0kB/s]


2023/08/08 07:08:50 - Use pytorch device: cuda


In [14]:
# Use SoftmaxLoss, because NLI is Multi-class Classification task.
train_loss = losses.SoftmaxLoss(
    model=model, 
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(), 
    num_labels=3 # entailment, neutral, contradiction
)

# warmup steps
warmup_steps = math.ceil(len(nli_train_examples) * nli_num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=nli_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps,
    output_path=nli_model_save_path
)

2023/08/08 07:08:50 - Softmax loss: #Vectors concatenated: 3
2023/08/08 07:08:50 - Warmup-steps: 79


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/782 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/782 [00:00<06:50,  1.90it/s][A
Iteration:   0%|          | 2/782 [00:00<04:23,  2.96it/s][A
Iteration:   0%|          | 3/782 [00:00<03:38,  3.56it/s][A
Iteration:   1%|          | 4/782 [00:01<03:16,  3.96it/s][A
Iteration:   1%|          | 5/782 [00:01<02:59,  4.33it/s][A
Iteration:   1%|          | 6/782 [00:01<02:48,  4.61it/s][A
Iteration:   1%|          | 7/782 [00:01<02:41,  4.80it/s][A
Iteration:   1%|          | 8/782 [00:01<02:38,  4.87it/s][A
Iteration:   1%|          | 9/782 [00:01<02:33,  5.03it/s][A
Iteration:   1%|▏         | 10/782 [00:02<02:30,  5.14it/s][A
Iteration:   1%|▏         | 11/782 [00:02<02:29,  5.17it/s][A
Iteration:   2%|▏         | 12/782 [00:02<02:26,  5.26it/s][A
Iteration:   2%|▏         | 13/782 [00:02<02:24,  5.33it/s][A
Iteration:   2%|▏         | 14/782 [00:02<02:21,  5.42it/s][A
Iteration:   2%|▏         | 

2023/08/08 07:09:05 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 78 steps:


                                            
Epoch:   0%|          | 0/1 [00:16<?, ?it/s]               
                                            :56,  6.06it/s][A
Epoch:   0%|          | 0/1 [00:16<?, ?it/s]               
                                            :56,  6.06it/s][A
Epoch:   0%|          | 0/1 [00:16<?, ?it/s]               
                                            :56,  6.06it/s][A
Epoch:   0%|          | 0/1 [00:16<?, ?it/s]               
                                            :56,  6.06it/s][A
Epoch:   0%|          | 0/1 [00:16<?, ?it/s]               
Iteration:  10%|▉         | 77/782 [00:16<01:56,  6.06it/s][A

2023/08/08 07:09:08 - Cosine-Similarity :	Pearson: 0.8382	Spearman: 0.8563
2023/08/08 07:09:08 - Manhattan-Distance:	Pearson: 0.8317	Spearman: 0.8473
2023/08/08 07:09:08 - Euclidean-Distance:	Pearson: 0.8278	Spearman: 0.8445
2023/08/08 07:09:08 - Dot-Product-Similarity:	Pearson: 0.7066	Spearman: 0.7190
2023/08/08 07:09:09 - Save model to output/training_nli_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  10%|▉         | 78/782 [00:17<04:36,  2.55it/s][A
Iteration:  10%|█         | 79/782 [00:17<04:28,  2.62it/s][A
Iteration:  10%|█         | 80/782 [00:18<04:20,  2.70it/s][A
Iteration:  10%|█         | 81/782 [00:18<04:12,  2.77it/s][A
Iteration:  10%|█         | 82/782 [00:18<04:05,  2.85it/s][A
Iteration:  11%|█         | 83/782 [00:18<03:58,  2.93it/s][A
Iteration:  11%|█         | 84/782 [00:18<03:52,  3.01it/s][A
Iteration:  11%|█         | 85/782 [00:18<03:45,  3.09it/s][A
Iteration:  11%|█         | 86/782 [00:19<03:39,  3.17it/s][A
Iteration:  11%|█         | 87/782 [00:19<03:34,  3.25it/s][A
Iteration:  11%|█▏        | 88/782 [00:19<03:28,  3.33it/s][A
Iteration:  11%|█▏        | 89/782 [00:19<03:23,  3.40it/s][A
Iteration:  12%|█▏        | 90/782 [00:19<03:18,  3.49it/s][A
Iteration:  12%|█▏        | 91/782 [00:19<03:14,  3.55it/s][A
Iteration:  12%|█▏        | 92/782 [00:20<03:10,  3.63it/s][A
Iteration:  12%|█▏        | 93/782 [00:20<03:05,  3.70

2023/08/08 07:09:22 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 156 steps:


                                            
Epoch:   0%|          | 0/1 [00:34<?, ?it/s]                
                                            1:45,  5.93it/s][A
Epoch:   0%|          | 0/1 [00:34<?, ?it/s]                
                                            1:45,  5.93it/s][A
Epoch:   0%|          | 0/1 [00:34<?, ?it/s]                
                                            1:45,  5.93it/s][A
Epoch:   0%|          | 0/1 [00:34<?, ?it/s]                
Iteration:  20%|█▉        | 155/782 [00:34<01:45,  5.93it/s][A
Iteration:  20%|█▉        | 156/782 [00:34<03:43,  2.80it/s][A

2023/08/08 07:09:26 - Cosine-Similarity :	Pearson: 0.5477	Spearman: 0.6574
2023/08/08 07:09:26 - Manhattan-Distance:	Pearson: 0.6125	Spearman: 0.6628
2023/08/08 07:09:26 - Euclidean-Distance:	Pearson: 0.6070	Spearman: 0.6623
2023/08/08 07:09:26 - Dot-Product-Similarity:	Pearson: 0.5068	Spearman: 0.4982



Iteration:  20%|██        | 157/782 [00:34<03:37,  2.87it/s][A
Iteration:  20%|██        | 158/782 [00:34<03:31,  2.95it/s][A
Iteration:  20%|██        | 159/782 [00:34<03:26,  3.02it/s][A
Iteration:  20%|██        | 160/782 [00:34<03:20,  3.10it/s][A
Iteration:  21%|██        | 161/782 [00:35<03:14,  3.19it/s][A
Iteration:  21%|██        | 162/782 [00:35<03:10,  3.26it/s][A
Iteration:  21%|██        | 163/782 [00:35<03:05,  3.34it/s][A
Iteration:  21%|██        | 164/782 [00:35<03:00,  3.41it/s][A
Iteration:  21%|██        | 165/782 [00:35<02:56,  3.49it/s][A
Iteration:  21%|██        | 166/782 [00:35<02:52,  3.57it/s][A
Iteration:  21%|██▏       | 167/782 [00:36<02:48,  3.64it/s][A
Iteration:  21%|██▏       | 168/782 [00:36<02:45,  3.72it/s][A
Iteration:  22%|██▏       | 169/782 [00:36<02:41,  3.79it/s][A
Iteration:  22%|██▏       | 170/782 [00:36<02:38,  3.86it/s][A
Iteration:  22%|██▏       | 171/782 [00:36<02:35,  3.93it/s][A
Iteration:  22%|██▏       | 172/782 [00

2023/08/08 07:09:39 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 234 steps:


                                            
Epoch:   0%|          | 0/1 [00:51<?, ?it/s]                
                                            1:33,  5.90it/s][A
Epoch:   0%|          | 0/1 [00:51<?, ?it/s]                
                                            1:33,  5.90it/s][A
Epoch:   0%|          | 0/1 [00:51<?, ?it/s]                
                                            1:33,  5.90it/s][A
Epoch:   0%|          | 0/1 [00:51<?, ?it/s]                
Iteration:  30%|██▉       | 233/782 [00:51<01:33,  5.90it/s][A
Iteration:  30%|██▉       | 234/782 [00:51<03:16,  2.79it/s][A

2023/08/08 07:09:43 - Cosine-Similarity :	Pearson: 0.5873	Spearman: 0.6857
2023/08/08 07:09:43 - Manhattan-Distance:	Pearson: 0.6509	Spearman: 0.6999
2023/08/08 07:09:43 - Euclidean-Distance:	Pearson: 0.6446	Spearman: 0.6984
2023/08/08 07:09:43 - Dot-Product-Similarity:	Pearson: 0.5318	Spearman: 0.5404



Iteration:  30%|███       | 235/782 [00:51<03:10,  2.87it/s][A
Iteration:  30%|███       | 236/782 [00:51<03:05,  2.94it/s][A
Iteration:  30%|███       | 237/782 [00:51<03:01,  3.01it/s][A
Iteration:  30%|███       | 238/782 [00:51<02:56,  3.09it/s][A
Iteration:  31%|███       | 239/782 [00:51<02:51,  3.17it/s][A
Iteration:  31%|███       | 240/782 [00:52<02:46,  3.25it/s][A
Iteration:  31%|███       | 241/782 [00:52<02:42,  3.32it/s][A
Iteration:  31%|███       | 242/782 [00:52<02:39,  3.39it/s][A
Iteration:  31%|███       | 243/782 [00:52<02:35,  3.47it/s][A
Iteration:  31%|███       | 244/782 [00:52<02:31,  3.54it/s][A
Iteration:  31%|███▏      | 245/782 [00:52<02:28,  3.61it/s][A
Iteration:  31%|███▏      | 246/782 [00:53<02:25,  3.69it/s][A
Iteration:  32%|███▏      | 247/782 [00:53<02:22,  3.75it/s][A
Iteration:  32%|███▏      | 248/782 [00:53<02:19,  3.83it/s][A
Iteration:  32%|███▏      | 249/782 [00:53<02:16,  3.89it/s][A
Iteration:  32%|███▏      | 250/782 [00

2023/08/08 07:09:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 312 steps:


                                            
Epoch:   0%|          | 0/1 [01:07<?, ?it/s]                
                                            1:20,  5.86it/s][A
Epoch:   0%|          | 0/1 [01:07<?, ?it/s]                
                                            1:20,  5.86it/s][A
Epoch:   0%|          | 0/1 [01:07<?, ?it/s]                
                                            1:20,  5.86it/s][A
Epoch:   0%|          | 0/1 [01:07<?, ?it/s]                
Iteration:  40%|███▉      | 311/782 [01:07<01:20,  5.86it/s][A
Iteration:  40%|███▉      | 312/782 [01:07<02:48,  2.79it/s][A
Iteration:  40%|████      | 313/782 [01:08<02:43,  2.86it/s][A

2023/08/08 07:09:59 - Cosine-Similarity :	Pearson: 0.5990	Spearman: 0.6821
2023/08/08 07:09:59 - Manhattan-Distance:	Pearson: 0.6651	Spearman: 0.6996
2023/08/08 07:09:59 - Euclidean-Distance:	Pearson: 0.6569	Spearman: 0.6991
2023/08/08 07:09:59 - Dot-Product-Similarity:	Pearson: 0.5535	Spearman: 0.5609



Iteration:  40%|████      | 314/782 [01:08<02:39,  2.94it/s][A
Iteration:  40%|████      | 315/782 [01:08<02:34,  3.02it/s][A
Iteration:  40%|████      | 316/782 [01:08<02:30,  3.11it/s][A
Iteration:  41%|████      | 317/782 [01:08<02:26,  3.18it/s][A
Iteration:  41%|████      | 318/782 [01:08<02:22,  3.27it/s][A
Iteration:  41%|████      | 319/782 [01:09<02:18,  3.34it/s][A
Iteration:  41%|████      | 320/782 [01:09<02:15,  3.41it/s][A
Iteration:  41%|████      | 321/782 [01:09<02:12,  3.49it/s][A
Iteration:  41%|████      | 322/782 [01:09<02:09,  3.56it/s][A
Iteration:  41%|████▏     | 323/782 [01:09<02:06,  3.63it/s][A
Iteration:  41%|████▏     | 324/782 [01:09<02:03,  3.71it/s][A
Iteration:  42%|████▏     | 325/782 [01:10<02:01,  3.78it/s][A
Iteration:  42%|████▏     | 326/782 [01:10<01:58,  3.85it/s][A
Iteration:  42%|████▏     | 327/782 [01:10<01:56,  3.92it/s][A
Iteration:  42%|████▏     | 328/782 [01:10<01:53,  3.99it/s][A
Iteration:  42%|████▏     | 329/782 [01

2023/08/08 07:10:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 390 steps:


                                            
Epoch:   0%|          | 0/1 [01:24<?, ?it/s]                
                                            1:06,  5.92it/s][A
Epoch:   0%|          | 0/1 [01:24<?, ?it/s]                
                                            1:06,  5.92it/s][A
Epoch:   0%|          | 0/1 [01:24<?, ?it/s]                
                                            1:06,  5.92it/s][A
Epoch:   0%|          | 0/1 [01:24<?, ?it/s]                
Iteration:  50%|████▉     | 389/782 [01:24<01:06,  5.92it/s][A
Iteration:  50%|████▉     | 390/782 [01:24<02:18,  2.83it/s][A
Iteration:  50%|█████     | 391/782 [01:24<02:14,  2.91it/s][A

2023/08/08 07:10:16 - Cosine-Similarity :	Pearson: 0.6206	Spearman: 0.6976
2023/08/08 07:10:16 - Manhattan-Distance:	Pearson: 0.6782	Spearman: 0.7083
2023/08/08 07:10:16 - Euclidean-Distance:	Pearson: 0.6723	Spearman: 0.7076
2023/08/08 07:10:16 - Dot-Product-Similarity:	Pearson: 0.5779	Spearman: 0.5903



Iteration:  50%|█████     | 392/782 [01:24<02:10,  2.98it/s][A
Iteration:  50%|█████     | 393/782 [01:25<02:07,  3.06it/s][A
Iteration:  50%|█████     | 394/782 [01:25<02:03,  3.14it/s][A
Iteration:  51%|█████     | 395/782 [01:25<02:00,  3.20it/s][A
Iteration:  51%|█████     | 396/782 [01:25<01:57,  3.28it/s][A
Iteration:  51%|█████     | 397/782 [01:25<01:54,  3.36it/s][A
Iteration:  51%|█████     | 398/782 [01:25<01:51,  3.43it/s][A
Iteration:  51%|█████     | 399/782 [01:26<01:49,  3.51it/s][A
Iteration:  51%|█████     | 400/782 [01:26<01:46,  3.58it/s][A
Iteration:  51%|█████▏    | 401/782 [01:26<01:44,  3.64it/s][A
Iteration:  51%|█████▏    | 402/782 [01:26<01:42,  3.72it/s][A
Iteration:  52%|█████▏    | 403/782 [01:26<01:39,  3.80it/s][A
Iteration:  52%|█████▏    | 404/782 [01:26<01:37,  3.87it/s][A
Iteration:  52%|█████▏    | 405/782 [01:27<01:35,  3.94it/s][A
Iteration:  52%|█████▏    | 406/782 [01:27<01:33,  4.00it/s][A
Iteration:  52%|█████▏    | 407/782 [01

2023/08/08 07:10:29 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 468 steps:


                                            
Epoch:   0%|          | 0/1 [01:41<?, ?it/s]                
                                            0:53,  5.85it/s][A
Epoch:   0%|          | 0/1 [01:41<?, ?it/s]                
                                            0:53,  5.85it/s][A
Epoch:   0%|          | 0/1 [01:41<?, ?it/s]                
                                            0:53,  5.85it/s][A
Epoch:   0%|          | 0/1 [01:41<?, ?it/s]                
Iteration:  60%|█████▉    | 467/782 [01:41<00:53,  5.85it/s][A
Iteration:  60%|█████▉    | 468/782 [01:41<01:52,  2.78it/s][A

2023/08/08 07:10:33 - Cosine-Similarity :	Pearson: 0.6103	Spearman: 0.6841
2023/08/08 07:10:33 - Manhattan-Distance:	Pearson: 0.6665	Spearman: 0.6958
2023/08/08 07:10:33 - Euclidean-Distance:	Pearson: 0.6593	Spearman: 0.6950
2023/08/08 07:10:33 - Dot-Product-Similarity:	Pearson: 0.5702	Spearman: 0.5865



Iteration:  60%|█████▉    | 469/782 [01:41<01:49,  2.86it/s][A
Iteration:  60%|██████    | 470/782 [01:41<01:46,  2.94it/s][A
Iteration:  60%|██████    | 471/782 [01:41<01:43,  3.01it/s][A
Iteration:  60%|██████    | 472/782 [01:42<01:40,  3.09it/s][A
Iteration:  60%|██████    | 473/782 [01:42<01:37,  3.16it/s][A
Iteration:  61%|██████    | 474/782 [01:42<01:35,  3.24it/s][A
Iteration:  61%|██████    | 475/782 [01:42<01:32,  3.31it/s][A
Iteration:  61%|██████    | 476/782 [01:42<01:30,  3.40it/s][A
Iteration:  61%|██████    | 477/782 [01:42<01:27,  3.47it/s][A
Iteration:  61%|██████    | 478/782 [01:43<01:25,  3.56it/s][A
Iteration:  61%|██████▏   | 479/782 [01:43<01:23,  3.63it/s][A
Iteration:  61%|██████▏   | 480/782 [01:43<01:21,  3.71it/s][A
Iteration:  62%|██████▏   | 481/782 [01:43<01:19,  3.78it/s][A
Iteration:  62%|██████▏   | 482/782 [01:43<01:17,  3.86it/s][A
Iteration:  62%|██████▏   | 483/782 [01:43<01:15,  3.94it/s][A
Iteration:  62%|██████▏   | 484/782 [01

2023/08/08 07:10:46 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 546 steps:


                                            
Epoch:   0%|          | 0/1 [01:58<?, ?it/s]                
                                            0:40,  5.87it/s][A
Epoch:   0%|          | 0/1 [01:58<?, ?it/s]                
                                            0:40,  5.87it/s][A
Epoch:   0%|          | 0/1 [01:58<?, ?it/s]                
                                            0:40,  5.87it/s][A
Epoch:   0%|          | 0/1 [01:58<?, ?it/s]                
Iteration:  70%|██████▉   | 545/782 [01:58<00:40,  5.87it/s][A
Iteration:  70%|██████▉   | 546/782 [01:58<01:24,  2.79it/s][A
Iteration:  70%|██████▉   | 547/782 [01:58<01:21,  2.87it/s][A

2023/08/08 07:10:50 - Cosine-Similarity :	Pearson: 0.5772	Spearman: 0.6648
2023/08/08 07:10:50 - Manhattan-Distance:	Pearson: 0.6500	Spearman: 0.6821
2023/08/08 07:10:50 - Euclidean-Distance:	Pearson: 0.6402	Spearman: 0.6810
2023/08/08 07:10:50 - Dot-Product-Similarity:	Pearson: 0.5429	Spearman: 0.5617



Iteration:  70%|███████   | 548/782 [01:58<01:19,  2.95it/s][A
Iteration:  70%|███████   | 549/782 [01:58<01:17,  3.02it/s][A
Iteration:  70%|███████   | 550/782 [01:58<01:14,  3.11it/s][A
Iteration:  70%|███████   | 551/782 [01:58<01:12,  3.19it/s][A
Iteration:  71%|███████   | 552/782 [01:59<01:10,  3.27it/s][A
Iteration:  71%|███████   | 553/782 [01:59<01:08,  3.34it/s][A
Iteration:  71%|███████   | 554/782 [01:59<01:06,  3.42it/s][A
Iteration:  71%|███████   | 555/782 [01:59<01:04,  3.50it/s][A
Iteration:  71%|███████   | 556/782 [01:59<01:03,  3.58it/s][A
Iteration:  71%|███████   | 557/782 [01:59<01:01,  3.65it/s][A
Iteration:  71%|███████▏  | 558/782 [02:00<01:00,  3.72it/s][A
Iteration:  71%|███████▏  | 559/782 [02:00<00:58,  3.80it/s][A
Iteration:  72%|███████▏  | 560/782 [02:00<00:57,  3.86it/s][A
Iteration:  72%|███████▏  | 561/782 [02:00<00:56,  3.92it/s][A
Iteration:  72%|███████▏  | 562/782 [02:00<00:55,  3.98it/s][A
Iteration:  72%|███████▏  | 563/782 [02

2023/08/08 07:11:03 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 624 steps:


                                            
Epoch:   0%|          | 0/1 [02:14<?, ?it/s]                
                                            0:27,  5.88it/s][A
Epoch:   0%|          | 0/1 [02:14<?, ?it/s]                
                                            0:27,  5.88it/s][A
Epoch:   0%|          | 0/1 [02:14<?, ?it/s]                
                                            0:27,  5.88it/s][A
Epoch:   0%|          | 0/1 [02:14<?, ?it/s]                
Iteration:  80%|███████▉  | 623/782 [02:14<00:27,  5.88it/s][A
Iteration:  80%|███████▉  | 624/782 [02:14<00:56,  2.81it/s][A
Iteration:  80%|███████▉  | 625/782 [02:14<00:54,  2.89it/s][A

2023/08/08 07:11:06 - Cosine-Similarity :	Pearson: 0.6125	Spearman: 0.6935
2023/08/08 07:11:06 - Manhattan-Distance:	Pearson: 0.6772	Spearman: 0.7065
2023/08/08 07:11:06 - Euclidean-Distance:	Pearson: 0.6700	Spearman: 0.7053
2023/08/08 07:11:06 - Dot-Product-Similarity:	Pearson: 0.5744	Spearman: 0.5934



Iteration:  80%|████████  | 626/782 [02:15<00:52,  2.97it/s][A
Iteration:  80%|████████  | 627/782 [02:15<00:50,  3.05it/s][A
Iteration:  80%|████████  | 628/782 [02:15<00:49,  3.12it/s][A
Iteration:  80%|████████  | 629/782 [02:15<00:47,  3.19it/s][A
Iteration:  81%|████████  | 630/782 [02:15<00:46,  3.27it/s][A
Iteration:  81%|████████  | 631/782 [02:15<00:45,  3.34it/s][A
Iteration:  81%|████████  | 632/782 [02:16<00:43,  3.42it/s][A
Iteration:  81%|████████  | 633/782 [02:16<00:42,  3.49it/s][A
Iteration:  81%|████████  | 634/782 [02:16<00:41,  3.57it/s][A
Iteration:  81%|████████  | 635/782 [02:16<00:40,  3.65it/s][A
Iteration:  81%|████████▏ | 636/782 [02:16<00:39,  3.73it/s][A
Iteration:  81%|████████▏ | 637/782 [02:16<00:38,  3.80it/s][A
Iteration:  82%|████████▏ | 638/782 [02:17<00:37,  3.87it/s][A
Iteration:  82%|████████▏ | 639/782 [02:17<00:36,  3.94it/s][A
Iteration:  82%|████████▏ | 640/782 [02:17<00:35,  4.01it/s][A
Iteration:  82%|████████▏ | 641/782 [02

2023/08/08 07:11:19 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 702 steps:


                                            
Epoch:   0%|          | 0/1 [02:31<?, ?it/s]                
                                            0:13,  5.86it/s][A
Epoch:   0%|          | 0/1 [02:31<?, ?it/s]                
                                            0:13,  5.86it/s][A
Epoch:   0%|          | 0/1 [02:31<?, ?it/s]                
                                            0:13,  5.86it/s][A
Epoch:   0%|          | 0/1 [02:31<?, ?it/s]                
Iteration:  90%|████████▉ | 701/782 [02:31<00:13,  5.86it/s][A
Iteration:  90%|████████▉ | 702/782 [02:31<00:28,  2.82it/s][A

2023/08/08 07:11:23 - Cosine-Similarity :	Pearson: 0.6036	Spearman: 0.6875
2023/08/08 07:11:23 - Manhattan-Distance:	Pearson: 0.6730	Spearman: 0.6990
2023/08/08 07:11:23 - Euclidean-Distance:	Pearson: 0.6645	Spearman: 0.6980
2023/08/08 07:11:23 - Dot-Product-Similarity:	Pearson: 0.5740	Spearman: 0.5945



Iteration:  90%|████████▉ | 703/782 [02:31<00:27,  2.89it/s][A
Iteration:  90%|█████████ | 704/782 [02:31<00:26,  2.97it/s][A
Iteration:  90%|█████████ | 705/782 [02:31<00:25,  3.05it/s][A
Iteration:  90%|█████████ | 706/782 [02:32<00:24,  3.12it/s][A
Iteration:  90%|█████████ | 707/782 [02:32<00:23,  3.19it/s][A
Iteration:  91%|█████████ | 708/782 [02:32<00:22,  3.28it/s][A
Iteration:  91%|█████████ | 709/782 [02:32<00:21,  3.35it/s][A
Iteration:  91%|█████████ | 710/782 [02:32<00:21,  3.42it/s][A
Iteration:  91%|█████████ | 711/782 [02:32<00:20,  3.49it/s][A
Iteration:  91%|█████████ | 712/782 [02:33<00:19,  3.56it/s][A
Iteration:  91%|█████████ | 713/782 [02:33<00:18,  3.63it/s][A
Iteration:  91%|█████████▏| 714/782 [02:33<00:18,  3.71it/s][A
Iteration:  91%|█████████▏| 715/782 [02:33<00:17,  3.79it/s][A
Iteration:  92%|█████████▏| 716/782 [02:33<00:17,  3.86it/s][A
Iteration:  92%|█████████▏| 717/782 [02:33<00:16,  3.94it/s][A
Iteration:  92%|█████████▏| 718/782 [02

2023/08/08 07:11:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 780 steps:


                                            
Epoch:   0%|          | 0/1 [02:48<?, ?it/s]                
                                            0:00,  5.81it/s][A
Epoch:   0%|          | 0/1 [02:48<?, ?it/s]                
                                            0:00,  5.81it/s][A
Epoch:   0%|          | 0/1 [02:48<?, ?it/s]                
                                            0:00,  5.81it/s][A
Epoch:   0%|          | 0/1 [02:48<?, ?it/s]                
Iteration: 100%|█████████▉| 779/782 [02:48<00:00,  5.81it/s][A
Iteration: 100%|█████████▉| 780/782 [02:48<00:00,  2.80it/s][A


2023/08/08 07:11:40 - Cosine-Similarity :	Pearson: 0.6064	Spearman: 0.6883
2023/08/08 07:11:40 - Manhattan-Distance:	Pearson: 0.6761	Spearman: 0.7012
2023/08/08 07:11:40 - Euclidean-Distance:	Pearson: 0.6674	Spearman: 0.7003
2023/08/08 07:11:40 - Dot-Product-Similarity:	Pearson: 0.5751	Spearman: 0.5950


Iteration: 100%|█████████▉| 781/782 [02:48<00:00,  2.87it/s][A
Iteration: 100%|██████████| 782/782 [02:48<00:00,  4.64it/s][A
Epoch:   0%|          | 0/1 [02:48<?, ?it/s]

2023/08/08 07:11:40 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:


Epoch: 100%|██████████| 1/1 [02:52<00:00, 172.32s/it]

2023/08/08 07:11:44 - Cosine-Similarity :	Pearson: 0.6064	Spearman: 0.6883
2023/08/08 07:11:44 - Manhattan-Distance:	Pearson: 0.6761	Spearman: 0.7012
2023/08/08 07:11:44 - Euclidean-Distance:	Pearson: 0.6674	Spearman: 0.7003
2023/08/08 07:11:44 - Dot-Product-Similarity:	Pearson: 0.5751	Spearman: 0.5950





In [15]:
# load KLUE-STS Dataset
klue_sts_train = load_dataset("klue", "sts", split='train[:90%]')
klue_sts_valid = load_dataset("klue", "sts", split='train[-10%:]') # train의 10%를 validation set으로 사용
klue_sts_test = load_dataset("klue", "sts", split='validation')

print('Length of Train : ',len(klue_sts_train))
print('Length of Valid : ',len(klue_sts_valid))
print('Length of Test : ',len(klue_sts_test))

def make_sts_input_example(dataset):
    ''' 
    Transform to InputExample
    ''' 
    input_examples = []
    for i, data in enumerate(dataset):
        sentence1 = data['sentence1']
        sentence2 = data['sentence2']
        score = (data['labels']['label']) / 5.0  # normalize 0 to 5
        input_examples.append(InputExample(texts=[sentence1, sentence2], label=score))

    return input_examples

sts_train_examples = make_sts_input_example(klue_sts_train)
sts_valid_examples = make_sts_input_example(klue_sts_valid)
sts_test_examples = make_sts_input_example(klue_sts_test)

# Train Dataloader
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluator by sts-validation
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

# Evaluator by sts-test
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_test_examples,
    name="sts-test",
)

Length of Train :  10501
Length of Valid :  1167
Length of Test :  519


In [16]:
# Load model of fine-tuning by NLI
model = SentenceTransformer(nli_model_save_path)

2023/08/08 07:11:53 - Load pretrained SentenceTransformer: output/training_nli_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05
2023/08/08 07:11:56 - Use pytorch device: cuda


In [17]:
# Use CosineSimilarityLoss
train_loss = losses.CosineSimilarityLoss(model=model)

# warmup steps
warmup_steps = math.ceil(len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Trainingㅁ
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=sts_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path
)

2023/08/08 07:11:56 - Warmup-steps: 132


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
Iteration:   0%|          | 0/329 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/329 [00:00<01:04,  5.12it/s][A
Iteration:   1%|          | 2/329 [00:00<01:04,  5.03it/s][A
Iteration:   1%|          | 3/329 [00:00<01:01,  5.29it/s][A
Iteration:   1%|          | 4/329 [00:00<00:58,  5.54it/s][A
Iteration:   2%|▏         | 5/329 [00:00<00:59,  5.43it/s][A
Iteration:   2%|▏         | 6/329 [00:01<00:59,  5.40it/s][A
Iteration:   2%|▏         | 7/329 [00:01<00:59,  5.40it/s][A
Iteration:   2%|▏         | 8/329 [00:01<01:00,  5.35it/s][A
Iteration:   3%|▎         | 9/329 [00:01<01:00,  5.28it/s][A
Iteration:   3%|▎         | 10/329 [00:01<01:00,  5.24it/s][A
Iteration:   3%|▎         | 11/329 [00:02<01:00,  5.30it/s][A
Iteration:   4%|▎         | 12/329 [00:02<00:59,  5.29it/s][A
Iteration:   4%|▍         | 13/329 [00:02<01:00,  5.24it/s][A
Iteration:   4%|▍         | 14/329 [00:02<00:59,  5.26it/s][A
Iteration:   5%|▍         | 

2023/08/08 07:12:02 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 32 steps:



                                                           
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]:55,  5.33it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]:55,  5.33it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]:55,  5.33it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]:55,  5.33it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:09<?, ?it/s]:55,  5.33it/s][A

2023/08/08 07:12:06 - Cosine-Similarity :	Pearson: 0.8717	Spearman: 0.8755
2023/08/08 07:12:06 - Manhattan-Distance:	Pearson: 0.8418	Spearman: 0.8549
2023/08/08 07:12:06 - Euclidean-Distance:	Pearson: 0.8409	Spearman: 0.8543
2023/08/08 07:12:06 - Dot-Product-Similarity:	Pearson: 0.8312	Spearman: 0.8292
2023/08/08 07:12:06 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  10%|▉         | 32/329 [00:10<02:14,  2.21it/s][A
Iteration:  10%|█         | 33/329 [00:10<02:08,  2.30it/s][A
Iteration:  10%|█         | 34/329 [00:10<02:04,  2.38it/s][A
Iteration:  11%|█         | 35/329 [00:10<01:59,  2.46it/s][A
Iteration:  11%|█         | 36/329 [00:11<01:55,  2.53it/s][A
Iteration:  11%|█         | 37/329 [00:11<01:51,  2.63it/s][A
Iteration:  12%|█▏        | 38/329 [00:11<01:47,  2.71it/s][A
Iteration:  12%|█▏        | 39/329 [00:11<01:44,  2.78it/s][A
Iteration:  12%|█▏        | 40/329 [00:11<01:40,  2.87it/s][A
Iteration:  12%|█▏        | 41/329 [00:11<01:37,  2.95it/s][A
Iteration:  13%|█▎        | 42/329 [00:12<01:34,  3.03it/s][A
Iteration:  13%|█▎        | 43/329 [00:12<01:32,  3.10it/s][A
Iteration:  13%|█▎        | 44/329 [00:12<01:30,  3.16it/s][A
Iteration:  14%|█▎        | 45/329 [00:12<01:27,  3.24it/s][A
Iteration:  14%|█▍        | 46/329 [00:12<01:25,  3.32it/s][A
Iteration:  14%|█▍        | 47/329 [00:13<01:24,  3.36

2023/08/08 07:12:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 64 steps:



                                                           
Epoch:   0%|          | 0/4 [00:19<?, ?it/s]:02,  4.25it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:19<?, ?it/s]:02,  4.25it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:19<?, ?it/s]:02,  4.25it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:19<?, ?it/s]:02,  4.25it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:19<?, ?it/s]:02,  4.25it/s][A

2023/08/08 07:12:16 - Cosine-Similarity :	Pearson: 0.9250	Spearman: 0.9016
2023/08/08 07:12:16 - Manhattan-Distance:	Pearson: 0.9194	Spearman: 0.9010
2023/08/08 07:12:16 - Euclidean-Distance:	Pearson: 0.9193	Spearman: 0.9012
2023/08/08 07:12:16 - Dot-Product-Similarity:	Pearson: 0.9113	Spearman: 0.8855
2023/08/08 07:12:16 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  19%|█▉        | 64/329 [00:20<02:01,  2.19it/s][A
Iteration:  20%|█▉        | 65/329 [00:20<01:56,  2.26it/s][A
Iteration:  20%|██        | 66/329 [00:21<01:53,  2.33it/s][A
Iteration:  20%|██        | 67/329 [00:21<01:49,  2.40it/s][A
Iteration:  21%|██        | 68/329 [00:21<01:45,  2.47it/s][A
Iteration:  21%|██        | 69/329 [00:21<01:42,  2.54it/s][A
Iteration:  21%|██▏       | 70/329 [00:21<01:39,  2.61it/s][A
Iteration:  22%|██▏       | 71/329 [00:21<01:36,  2.68it/s][A
Iteration:  22%|██▏       | 72/329 [00:22<01:33,  2.75it/s][A
Iteration:  22%|██▏       | 73/329 [00:22<01:30,  2.82it/s][A
Iteration:  22%|██▏       | 74/329 [00:22<01:27,  2.90it/s][A
Iteration:  23%|██▎       | 75/329 [00:22<01:25,  2.98it/s][A
Iteration:  23%|██▎       | 76/329 [00:22<01:22,  3.05it/s][A
Iteration:  23%|██▎       | 77/329 [00:23<01:20,  3.13it/s][A
Iteration:  24%|██▎       | 78/329 [00:23<01:18,  3.19it/s][A
Iteration:  24%|██▍       | 79/329 [00:23<01:16,  3.26

2023/08/08 07:12:23 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 96 steps:



                                                           
Epoch:   0%|          | 0/4 [00:30<?, ?it/s]:56,  4.17it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:30<?, ?it/s]:56,  4.17it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:30<?, ?it/s]:56,  4.17it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:30<?, ?it/s]:56,  4.17it/s][A
                                                           
Epoch:   0%|          | 0/4 [00:30<?, ?it/s]:56,  4.17it/s][A

2023/08/08 07:12:26 - Cosine-Similarity :	Pearson: 0.9422	Spearman: 0.9094
2023/08/08 07:12:26 - Manhattan-Distance:	Pearson: 0.9375	Spearman: 0.9091
2023/08/08 07:12:26 - Euclidean-Distance:	Pearson: 0.9374	Spearman: 0.9092
2023/08/08 07:12:26 - Dot-Product-Similarity:	Pearson: 0.9306	Spearman: 0.8920
2023/08/08 07:12:26 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  29%|██▉       | 96/329 [00:31<01:47,  2.16it/s][A
Iteration:  29%|██▉       | 97/329 [00:31<01:44,  2.22it/s][A
Iteration:  30%|██▉       | 98/329 [00:31<01:41,  2.29it/s][A
Iteration:  30%|███       | 99/329 [00:31<01:37,  2.36it/s][A
Iteration:  30%|███       | 100/329 [00:31<01:34,  2.42it/s][A
Iteration:  31%|███       | 101/329 [00:32<01:31,  2.49it/s][A
Iteration:  31%|███       | 102/329 [00:32<01:28,  2.56it/s][A
Iteration:  31%|███▏      | 103/329 [00:32<01:25,  2.64it/s][A
Iteration:  32%|███▏      | 104/329 [00:32<01:23,  2.71it/s][A
Iteration:  32%|███▏      | 105/329 [00:32<01:21,  2.76it/s][A
Iteration:  32%|███▏      | 106/329 [00:32<01:18,  2.84it/s][A
Iteration:  33%|███▎      | 107/329 [00:33<01:16,  2.90it/s][A
Iteration:  33%|███▎      | 108/329 [00:33<01:14,  2.98it/s][A
Iteration:  33%|███▎      | 109/329 [00:33<01:12,  3.05it/s][A
Iteration:  33%|███▎      | 110/329 [00:33<01:10,  3.12it/s][A
Iteration:  34%|███▎      | 111/329 [00:33<

2023/08/08 07:12:33 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 128 steps:



                                                            
Epoch:   0%|          | 0/4 [00:40<?, ?it/s]0:49,  4.12it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:40<?, ?it/s]0:49,  4.12it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:40<?, ?it/s]0:49,  4.12it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:40<?, ?it/s]0:49,  4.12it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:40<?, ?it/s]0:49,  4.12it/s][A

2023/08/08 07:12:37 - Cosine-Similarity :	Pearson: 0.9479	Spearman: 0.9111
2023/08/08 07:12:37 - Manhattan-Distance:	Pearson: 0.9433	Spearman: 0.9108
2023/08/08 07:12:37 - Euclidean-Distance:	Pearson: 0.9432	Spearman: 0.9109
2023/08/08 07:12:37 - Dot-Product-Similarity:	Pearson: 0.9380	Spearman: 0.8932
2023/08/08 07:12:37 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  39%|███▉      | 128/329 [00:41<01:33,  2.16it/s][A
Iteration:  39%|███▉      | 129/329 [00:41<01:30,  2.22it/s][A
Iteration:  40%|███▉      | 130/329 [00:41<01:27,  2.28it/s][A
Iteration:  40%|███▉      | 131/329 [00:42<01:24,  2.35it/s][A
Iteration:  40%|████      | 132/329 [00:42<01:21,  2.42it/s][A
Iteration:  40%|████      | 133/329 [00:42<01:18,  2.50it/s][A
Iteration:  41%|████      | 134/329 [00:42<01:16,  2.56it/s][A
Iteration:  41%|████      | 135/329 [00:42<01:14,  2.62it/s][A
Iteration:  41%|████▏     | 136/329 [00:43<01:11,  2.69it/s][A
Iteration:  42%|████▏     | 137/329 [00:43<01:09,  2.76it/s][A
Iteration:  42%|████▏     | 138/329 [00:43<01:07,  2.83it/s][A
Iteration:  42%|████▏     | 139/329 [00:43<01:05,  2.89it/s][A
Iteration:  43%|████▎     | 140/329 [00:43<01:03,  2.96it/s][A
Iteration:  43%|████▎     | 141/329 [00:43<01:01,  3.03it/s][A
Iteration:  43%|████▎     | 142/329 [00:44<01:00,  3.09it/s][A
Iteration:  43%|████▎     | 143/329 [00

2023/08/08 07:12:43 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 160 steps:



                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]0:40,  4.19it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]0:40,  4.19it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]0:40,  4.19it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]0:40,  4.19it/s][A
                                                            
Epoch:   0%|          | 0/4 [00:51<?, ?it/s]0:40,  4.19it/s][A

2023/08/08 07:12:47 - Cosine-Similarity :	Pearson: 0.9502	Spearman: 0.9134
2023/08/08 07:12:47 - Manhattan-Distance:	Pearson: 0.9462	Spearman: 0.9123
2023/08/08 07:12:47 - Euclidean-Distance:	Pearson: 0.9460	Spearman: 0.9123
2023/08/08 07:12:47 - Dot-Product-Similarity:	Pearson: 0.9390	Spearman: 0.8944
2023/08/08 07:12:47 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  49%|████▊     | 160/329 [00:51<01:17,  2.18it/s][A
Iteration:  49%|████▉     | 161/329 [00:52<01:14,  2.25it/s][A
Iteration:  49%|████▉     | 162/329 [00:52<01:12,  2.31it/s][A
Iteration:  50%|████▉     | 163/329 [00:52<01:09,  2.38it/s][A
Iteration:  50%|████▉     | 164/329 [00:52<01:07,  2.45it/s][A
Iteration:  50%|█████     | 165/329 [00:52<01:05,  2.52it/s][A
Iteration:  50%|█████     | 166/329 [00:52<01:02,  2.59it/s][A
Iteration:  51%|█████     | 167/329 [00:53<01:00,  2.66it/s][A
Iteration:  51%|█████     | 168/329 [00:53<00:58,  2.73it/s][A
Iteration:  51%|█████▏    | 169/329 [00:53<00:56,  2.81it/s][A
Iteration:  52%|█████▏    | 170/329 [00:53<00:55,  2.87it/s][A
Iteration:  52%|█████▏    | 171/329 [00:53<00:53,  2.95it/s][A
Iteration:  52%|█████▏    | 172/329 [00:54<00:52,  3.00it/s][A
Iteration:  53%|█████▎    | 173/329 [00:54<00:50,  3.08it/s][A
Iteration:  53%|█████▎    | 174/329 [00:54<00:49,  3.16it/s][A
Iteration:  53%|█████▎    | 175/329 [00

2023/08/08 07:12:54 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 192 steps:



                                                            
Epoch:   0%|          | 0/4 [01:01<?, ?it/s]0:33,  4.14it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:01<?, ?it/s]0:33,  4.14it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:01<?, ?it/s]0:33,  4.14it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:01<?, ?it/s]0:33,  4.14it/s][A
Iteration:  58%|█████▊    | 192/329 [01:01<00:58,  2.34it/s][A

2023/08/08 07:12:58 - Cosine-Similarity :	Pearson: 0.9510	Spearman: 0.9110
2023/08/08 07:12:58 - Manhattan-Distance:	Pearson: 0.9471	Spearman: 0.9101
2023/08/08 07:12:58 - Euclidean-Distance:	Pearson: 0.9468	Spearman: 0.9101
2023/08/08 07:12:58 - Dot-Product-Similarity:	Pearson: 0.9401	Spearman: 0.8902



Iteration:  59%|█████▊    | 193/329 [01:01<00:56,  2.40it/s][A
Iteration:  59%|█████▉    | 194/329 [01:01<00:54,  2.48it/s][A
Iteration:  59%|█████▉    | 195/329 [01:02<00:52,  2.55it/s][A
Iteration:  60%|█████▉    | 196/329 [01:02<00:50,  2.61it/s][A
Iteration:  60%|█████▉    | 197/329 [01:02<00:49,  2.67it/s][A
Iteration:  60%|██████    | 198/329 [01:02<00:47,  2.74it/s][A
Iteration:  60%|██████    | 199/329 [01:02<00:46,  2.81it/s][A
Iteration:  61%|██████    | 200/329 [01:03<00:44,  2.87it/s][A
Iteration:  61%|██████    | 201/329 [01:03<00:43,  2.95it/s][A
Iteration:  61%|██████▏   | 202/329 [01:03<00:42,  3.01it/s][A
Iteration:  62%|██████▏   | 203/329 [01:03<00:41,  3.06it/s][A
Iteration:  62%|██████▏   | 204/329 [01:03<00:40,  3.12it/s][A
Iteration:  62%|██████▏   | 205/329 [01:04<00:38,  3.19it/s][A
Iteration:  63%|██████▎   | 206/329 [01:04<00:37,  3.26it/s][A
Iteration:  63%|██████▎   | 207/329 [01:04<00:36,  3.32it/s][A
Iteration:  63%|██████▎   | 208/329 [01

2023/08/08 07:13:04 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 224 steps:



                                                            
Epoch:   0%|          | 0/4 [01:11<?, ?it/s]0:25,  4.21it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:11<?, ?it/s]0:25,  4.21it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:11<?, ?it/s]0:25,  4.21it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:11<?, ?it/s]0:25,  4.21it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:11<?, ?it/s]0:25,  4.21it/s][A

2023/08/08 07:13:07 - Cosine-Similarity :	Pearson: 0.9535	Spearman: 0.9150
2023/08/08 07:13:07 - Manhattan-Distance:	Pearson: 0.9477	Spearman: 0.9131
2023/08/08 07:13:07 - Euclidean-Distance:	Pearson: 0.9474	Spearman: 0.9134
2023/08/08 07:13:07 - Dot-Product-Similarity:	Pearson: 0.9453	Spearman: 0.9007
2023/08/08 07:13:07 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  68%|██████▊   | 224/329 [01:12<00:48,  2.18it/s][A
Iteration:  68%|██████▊   | 225/329 [01:12<00:46,  2.24it/s][A
Iteration:  69%|██████▊   | 226/329 [01:12<00:44,  2.31it/s][A
Iteration:  69%|██████▉   | 227/329 [01:12<00:42,  2.38it/s][A
Iteration:  69%|██████▉   | 228/329 [01:12<00:41,  2.46it/s][A
Iteration:  70%|██████▉   | 229/329 [01:13<00:39,  2.52it/s][A
Iteration:  70%|██████▉   | 230/329 [01:13<00:38,  2.59it/s][A
Iteration:  70%|███████   | 231/329 [01:13<00:36,  2.66it/s][A
Iteration:  71%|███████   | 232/329 [01:13<00:35,  2.74it/s][A
Iteration:  71%|███████   | 233/329 [01:13<00:34,  2.78it/s][A
Iteration:  71%|███████   | 234/329 [01:13<00:33,  2.85it/s][A
Iteration:  71%|███████▏  | 235/329 [01:14<00:32,  2.91it/s][A
Iteration:  72%|███████▏  | 236/329 [01:14<00:31,  2.97it/s][A
Iteration:  72%|███████▏  | 237/329 [01:14<00:30,  3.02it/s][A
Iteration:  72%|███████▏  | 238/329 [01:14<00:29,  3.09it/s][A
Iteration:  73%|███████▎  | 239/329 [01

2023/08/08 07:13:14 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 256 steps:



                                                            
Epoch:   0%|          | 0/4 [01:21<?, ?it/s]0:18,  4.11it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:21<?, ?it/s]0:18,  4.11it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:21<?, ?it/s]0:18,  4.11it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:21<?, ?it/s]0:18,  4.11it/s][A
Iteration:  78%|███████▊  | 256/329 [01:21<00:31,  2.34it/s][A

2023/08/08 07:13:18 - Cosine-Similarity :	Pearson: 0.9538	Spearman: 0.9122
2023/08/08 07:13:18 - Manhattan-Distance:	Pearson: 0.9481	Spearman: 0.9112
2023/08/08 07:13:18 - Euclidean-Distance:	Pearson: 0.9480	Spearman: 0.9113
2023/08/08 07:13:18 - Dot-Product-Similarity:	Pearson: 0.9474	Spearman: 0.8987



Iteration:  78%|███████▊  | 257/329 [01:22<00:30,  2.39it/s][A
Iteration:  78%|███████▊  | 258/329 [01:22<00:28,  2.47it/s][A
Iteration:  79%|███████▊  | 259/329 [01:22<00:27,  2.54it/s][A
Iteration:  79%|███████▉  | 260/329 [01:22<00:26,  2.62it/s][A
Iteration:  79%|███████▉  | 261/329 [01:22<00:25,  2.68it/s][A
Iteration:  80%|███████▉  | 262/329 [01:22<00:24,  2.76it/s][A
Iteration:  80%|███████▉  | 263/329 [01:23<00:23,  2.82it/s][A
Iteration:  80%|████████  | 264/329 [01:23<00:22,  2.89it/s][A
Iteration:  81%|████████  | 265/329 [01:23<00:21,  2.95it/s][A
Iteration:  81%|████████  | 266/329 [01:23<00:20,  3.03it/s][A
Iteration:  81%|████████  | 267/329 [01:23<00:20,  3.08it/s][A
Iteration:  81%|████████▏ | 268/329 [01:24<00:19,  3.14it/s][A
Iteration:  82%|████████▏ | 269/329 [01:24<00:18,  3.21it/s][A
Iteration:  82%|████████▏ | 270/329 [01:24<00:17,  3.28it/s][A
Iteration:  82%|████████▏ | 271/329 [01:24<00:17,  3.35it/s][A
Iteration:  83%|████████▎ | 272/329 [01

2023/08/08 07:13:24 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 288 steps:



                                                            
Epoch:   0%|          | 0/4 [01:31<?, ?it/s]0:10,  4.18it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:31<?, ?it/s]0:10,  4.18it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:31<?, ?it/s]0:10,  4.18it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:31<?, ?it/s]0:10,  4.18it/s][A
Iteration:  88%|████████▊ | 288/329 [01:31<00:17,  2.34it/s][A

2023/08/08 07:13:28 - Cosine-Similarity :	Pearson: 0.9553	Spearman: 0.9119
2023/08/08 07:13:28 - Manhattan-Distance:	Pearson: 0.9492	Spearman: 0.9112
2023/08/08 07:13:28 - Euclidean-Distance:	Pearson: 0.9491	Spearman: 0.9113
2023/08/08 07:13:28 - Dot-Product-Similarity:	Pearson: 0.9475	Spearman: 0.8951



Iteration:  88%|████████▊ | 289/329 [01:31<00:16,  2.40it/s][A
Iteration:  88%|████████▊ | 290/329 [01:32<00:15,  2.46it/s][A
Iteration:  88%|████████▊ | 291/329 [01:32<00:14,  2.53it/s][A
Iteration:  89%|████████▉ | 292/329 [01:32<00:14,  2.61it/s][A
Iteration:  89%|████████▉ | 293/329 [01:32<00:13,  2.68it/s][A
Iteration:  89%|████████▉ | 294/329 [01:32<00:12,  2.74it/s][A
Iteration:  90%|████████▉ | 295/329 [01:33<00:12,  2.82it/s][A
Iteration:  90%|████████▉ | 296/329 [01:33<00:11,  2.90it/s][A
Iteration:  90%|█████████ | 297/329 [01:33<00:10,  2.96it/s][A
Iteration:  91%|█████████ | 298/329 [01:33<00:10,  3.03it/s][A
Iteration:  91%|█████████ | 299/329 [01:33<00:09,  3.10it/s][A
Iteration:  91%|█████████ | 300/329 [01:34<00:09,  3.14it/s][A
Iteration:  91%|█████████▏| 301/329 [01:34<00:08,  3.21it/s][A
Iteration:  92%|█████████▏| 302/329 [01:34<00:08,  3.27it/s][A
Iteration:  92%|█████████▏| 303/329 [01:34<00:07,  3.33it/s][A
Iteration:  92%|█████████▏| 304/329 [01

2023/08/08 07:13:34 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 320 steps:



                                                            
Epoch:   0%|          | 0/4 [01:41<?, ?it/s]0:02,  4.13it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:41<?, ?it/s]0:02,  4.13it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:41<?, ?it/s]0:02,  4.13it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:41<?, ?it/s]0:02,  4.13it/s][A
                                                            
Epoch:   0%|          | 0/4 [01:41<?, ?it/s]0:02,  4.13it/s][A

2023/08/08 07:13:38 - Cosine-Similarity :	Pearson: 0.9579	Spearman: 0.9172
2023/08/08 07:13:38 - Manhattan-Distance:	Pearson: 0.9529	Spearman: 0.9168
2023/08/08 07:13:38 - Euclidean-Distance:	Pearson: 0.9528	Spearman: 0.9165
2023/08/08 07:13:38 - Dot-Product-Similarity:	Pearson: 0.9522	Spearman: 0.9035
2023/08/08 07:13:38 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  97%|█████████▋| 320/329 [01:42<00:04,  2.17it/s][A
Iteration:  98%|█████████▊| 321/329 [01:42<00:03,  2.24it/s][A
Iteration:  98%|█████████▊| 322/329 [01:42<00:03,  2.30it/s][A
Iteration:  98%|█████████▊| 323/329 [01:42<00:02,  2.36it/s][A
Iteration:  98%|█████████▊| 324/329 [01:43<00:02,  2.42it/s][A
Iteration:  99%|█████████▉| 325/329 [01:43<00:01,  2.49it/s][A
Iteration:  99%|█████████▉| 326/329 [01:43<00:01,  2.56it/s][A
Iteration:  99%|█████████▉| 327/329 [01:43<00:00,  2.63it/s][A
Iteration: 100%|█████████▉| 328/329 [01:43<00:00,  2.69it/s][A
Iteration: 100%|██████████| 329/329 [01:44<00:00,  3.16it/s][A
Epoch:   0%|          | 0/4 [01:44<?, ?it/s]

2023/08/08 07:13:40 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:


Epoch:   0%|          | 0/4 [01:47<?, ?it/s]

2023/08/08 07:13:44 - Cosine-Similarity :	Pearson: 0.9585	Spearman: 0.9183
2023/08/08 07:13:44 - Manhattan-Distance:	Pearson: 0.9548	Spearman: 0.9178
2023/08/08 07:13:44 - Euclidean-Distance:	Pearson: 0.9547	Spearman: 0.9178
2023/08/08 07:13:44 - Dot-Product-Similarity:	Pearson: 0.9525	Spearman: 0.9044
2023/08/08 07:13:44 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05


Epoch:  25%|██▌       | 1/4 [01:48<05:25, 108.46s/it]
Iteration:   0%|          | 0/329 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/329 [00:00<00:56,  5.85it/s][A
Iteration:   1%|          | 2/329 [00:00<00:56,  5.81it/s][A
Iteration:   1%|          | 3/329 [00:00<00:59,  5.52it/s][A
Iteration:   1%|          | 4/329 [00:00<00:58,  5.53it/s][A
Iteration:   2%|▏         | 5/329 [00:00<00:59,  5.41it/s][A
Iteration:   2%|▏         | 6/329 [00:01<00:59,  5.47it/s][A
Iteration:   2%|▏         | 7/329 [00:01<00:58,  5.51it/s][A
Iteration:   2%|▏         | 8/329 [00:01<00:57,  5.60it/s][A
Iteration:   3%|▎         | 9/329 [00:01<00:58,  5.46it/s][A
Iteration:   3%|▎         | 10/329 [00:01<00:58,  5.48it/s][A
Iteration:   3%|▎         | 11/329 [00:02<00:58,  5.40it/s][A
Iteration:   4%|▎         | 12/329 [00:02<00:58,  5.41it/s][A
Iteration:   4%|▍         | 13/329 [00:02<00:58,  5.40it/s][A
Iteration:   4%|▍         | 14/329 [00:02<01:00,  5.24it/s][A
Iteration:   5%|▍  

2023/08/08 07:13:51 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 32 steps:



                                                           
Epoch:  25%|██▌       | 1/4 [01:58<05:25, 108.46s/it]1it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [01:58<05:25, 108.46s/it]1it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [01:58<05:25, 108.46s/it]1it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [01:58<05:25, 108.46s/it]1it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [01:58<05:25, 108.46s/it]1it/s][A

2023/08/08 07:13:54 - Cosine-Similarity :	Pearson: 0.9599	Spearman: 0.9202
2023/08/08 07:13:54 - Manhattan-Distance:	Pearson: 0.9541	Spearman: 0.9192
2023/08/08 07:13:54 - Euclidean-Distance:	Pearson: 0.9539	Spearman: 0.9189
2023/08/08 07:13:54 - Dot-Product-Similarity:	Pearson: 0.9515	Spearman: 0.9044
2023/08/08 07:13:54 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  10%|▉         | 32/329 [00:10<02:20,  2.12it/s][A
Iteration:  10%|█         | 33/329 [00:10<02:14,  2.21it/s][A
Iteration:  10%|█         | 34/329 [00:10<02:08,  2.29it/s][A
Iteration:  11%|█         | 35/329 [00:11<02:04,  2.37it/s][A
Iteration:  11%|█         | 36/329 [00:11<01:59,  2.44it/s][A
Iteration:  11%|█         | 37/329 [00:11<01:55,  2.53it/s][A
Iteration:  12%|█▏        | 38/329 [00:11<01:51,  2.61it/s][A
Iteration:  12%|█▏        | 39/329 [00:11<01:47,  2.69it/s][A
Iteration:  12%|█▏        | 40/329 [00:12<01:43,  2.78it/s][A
Iteration:  12%|█▏        | 41/329 [00:12<01:40,  2.86it/s][A
Iteration:  13%|█▎        | 42/329 [00:12<01:37,  2.93it/s][A
Iteration:  13%|█▎        | 43/329 [00:12<01:34,  3.01it/s][A
Iteration:  13%|█▎        | 44/329 [00:12<01:32,  3.09it/s][A
Iteration:  14%|█▎        | 45/329 [00:13<01:30,  3.14it/s][A
Iteration:  14%|█▍        | 46/329 [00:13<01:27,  3.22it/s][A
Iteration:  14%|█▍        | 47/329 [00:13<01:25,  3.29

2023/08/08 07:14:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 64 steps:



                                                           
Epoch:  25%|██▌       | 1/4 [02:08<05:25, 108.46s/it]2it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:08<05:25, 108.46s/it]2it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:08<05:25, 108.46s/it]2it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:08<05:25, 108.46s/it]2it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:08<05:25, 108.46s/it]2it/s][A

2023/08/08 07:14:05 - Cosine-Similarity :	Pearson: 0.9598	Spearman: 0.9207
2023/08/08 07:14:05 - Manhattan-Distance:	Pearson: 0.9544	Spearman: 0.9200
2023/08/08 07:14:05 - Euclidean-Distance:	Pearson: 0.9543	Spearman: 0.9198
2023/08/08 07:14:05 - Dot-Product-Similarity:	Pearson: 0.9508	Spearman: 0.9030
2023/08/08 07:14:05 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  19%|█▉        | 64/329 [00:20<02:02,  2.16it/s][A
Iteration:  20%|█▉        | 65/329 [00:21<01:58,  2.24it/s][A
Iteration:  20%|██        | 66/329 [00:21<01:53,  2.31it/s][A
Iteration:  20%|██        | 67/329 [00:21<01:49,  2.39it/s][A
Iteration:  21%|██        | 68/329 [00:21<01:46,  2.46it/s][A
Iteration:  21%|██        | 69/329 [00:21<01:42,  2.53it/s][A
Iteration:  21%|██▏       | 70/329 [00:22<01:39,  2.60it/s][A
Iteration:  22%|██▏       | 71/329 [00:22<01:36,  2.66it/s][A
Iteration:  22%|██▏       | 72/329 [00:22<01:34,  2.73it/s][A
Iteration:  22%|██▏       | 73/329 [00:22<01:31,  2.81it/s][A
Iteration:  22%|██▏       | 74/329 [00:22<01:28,  2.88it/s][A
Iteration:  23%|██▎       | 75/329 [00:23<01:26,  2.92it/s][A
Iteration:  23%|██▎       | 76/329 [00:23<01:24,  2.98it/s][A
Iteration:  23%|██▎       | 77/329 [00:23<01:22,  3.04it/s][A
Iteration:  24%|██▎       | 78/329 [00:23<01:20,  3.12it/s][A
Iteration:  24%|██▍       | 79/329 [00:23<01:18,  3.18

2023/08/08 07:14:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 96 steps:



                                                           
Epoch:  25%|██▌       | 1/4 [02:19<05:25, 108.46s/it]3it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:19<05:25, 108.46s/it]3it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:19<05:25, 108.46s/it]3it/s][A
                                                           
Epoch:  25%|██▌       | 1/4 [02:19<05:25, 108.46s/it]3it/s][A
Iteration:  29%|██▉       | 96/329 [00:30<01:39,  2.33it/s][A

2023/08/08 07:14:15 - Cosine-Similarity :	Pearson: 0.9579	Spearman: 0.9163
2023/08/08 07:14:15 - Manhattan-Distance:	Pearson: 0.9514	Spearman: 0.9161
2023/08/08 07:14:15 - Euclidean-Distance:	Pearson: 0.9514	Spearman: 0.9158
2023/08/08 07:14:15 - Dot-Product-Similarity:	Pearson: 0.9491	Spearman: 0.8986



Iteration:  29%|██▉       | 97/329 [00:31<01:37,  2.39it/s][A
Iteration:  30%|██▉       | 98/329 [00:31<01:34,  2.45it/s][A
Iteration:  30%|███       | 99/329 [00:31<01:30,  2.53it/s][A
Iteration:  30%|███       | 100/329 [00:31<01:28,  2.60it/s][A
Iteration:  31%|███       | 101/329 [00:31<01:25,  2.67it/s][A
Iteration:  31%|███       | 102/329 [00:31<01:22,  2.74it/s][A
Iteration:  31%|███▏      | 103/329 [00:32<01:20,  2.81it/s][A
Iteration:  32%|███▏      | 104/329 [00:32<01:17,  2.89it/s][A
Iteration:  32%|███▏      | 105/329 [00:32<01:15,  2.95it/s][A
Iteration:  32%|███▏      | 106/329 [00:32<01:14,  3.00it/s][A
Iteration:  33%|███▎      | 107/329 [00:32<01:12,  3.05it/s][A
Iteration:  33%|███▎      | 108/329 [00:33<01:10,  3.12it/s][A
Iteration:  33%|███▎      | 109/329 [00:33<01:09,  3.18it/s][A
Iteration:  33%|███▎      | 110/329 [00:33<01:07,  3.25it/s][A
Iteration:  34%|███▎      | 111/329 [00:33<01:05,  3.31it/s][A
Iteration:  34%|███▍      | 112/329 [00:33

2023/08/08 07:14:21 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 128 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [02:28<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:28<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:28<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:28<05:25, 108.46s/it]23it/s][A
Iteration:  39%|███▉      | 128/329 [00:40<01:23,  2.40it/s][A

2023/08/08 07:14:25 - Cosine-Similarity :	Pearson: 0.9597	Spearman: 0.9186
2023/08/08 07:14:25 - Manhattan-Distance:	Pearson: 0.9526	Spearman: 0.9180
2023/08/08 07:14:25 - Euclidean-Distance:	Pearson: 0.9527	Spearman: 0.9182
2023/08/08 07:14:25 - Dot-Product-Similarity:	Pearson: 0.9515	Spearman: 0.9028



Iteration:  39%|███▉      | 129/329 [00:40<01:21,  2.47it/s][A
Iteration:  40%|███▉      | 130/329 [00:40<01:18,  2.53it/s][A
Iteration:  40%|███▉      | 131/329 [00:41<01:16,  2.59it/s][A
Iteration:  40%|████      | 132/329 [00:41<01:13,  2.67it/s][A
Iteration:  40%|████      | 133/329 [00:41<01:11,  2.74it/s][A
Iteration:  41%|████      | 134/329 [00:41<01:09,  2.81it/s][A
Iteration:  41%|████      | 135/329 [00:41<01:07,  2.88it/s][A
Iteration:  41%|████▏     | 136/329 [00:41<01:05,  2.94it/s][A
Iteration:  42%|████▏     | 137/329 [00:42<01:03,  3.00it/s][A
Iteration:  42%|████▏     | 138/329 [00:42<01:02,  3.07it/s][A
Iteration:  42%|████▏     | 139/329 [00:42<01:00,  3.13it/s][A
Iteration:  43%|████▎     | 140/329 [00:42<00:58,  3.21it/s][A
Iteration:  43%|████▎     | 141/329 [00:42<00:57,  3.28it/s][A
Iteration:  43%|████▎     | 142/329 [00:43<00:56,  3.34it/s][A
Iteration:  43%|████▎     | 143/329 [00:43<00:54,  3.41it/s][A
Iteration:  44%|████▍     | 144/329 [00

2023/08/08 07:14:31 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 160 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [02:38<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:38<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:38<05:25, 108.46s/it]23it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:38<05:25, 108.46s/it]23it/s][A
Iteration:  49%|████▊     | 160/329 [00:50<01:11,  2.38it/s][A

2023/08/08 07:14:35 - Cosine-Similarity :	Pearson: 0.9613	Spearman: 0.9206
2023/08/08 07:14:35 - Manhattan-Distance:	Pearson: 0.9557	Spearman: 0.9195
2023/08/08 07:14:35 - Euclidean-Distance:	Pearson: 0.9557	Spearman: 0.9194
2023/08/08 07:14:35 - Dot-Product-Similarity:	Pearson: 0.9559	Spearman: 0.9091



Iteration:  49%|████▉     | 161/329 [00:50<01:08,  2.45it/s][A
Iteration:  49%|████▉     | 162/329 [00:50<01:06,  2.51it/s][A
Iteration:  50%|████▉     | 163/329 [00:50<01:04,  2.57it/s][A
Iteration:  50%|████▉     | 164/329 [00:51<01:02,  2.64it/s][A
Iteration:  50%|█████     | 165/329 [00:51<01:00,  2.70it/s][A
Iteration:  50%|█████     | 166/329 [00:51<00:58,  2.78it/s][A
Iteration:  51%|█████     | 167/329 [00:51<00:56,  2.85it/s][A
Iteration:  51%|█████     | 168/329 [00:51<00:55,  2.92it/s][A
Iteration:  51%|█████▏    | 169/329 [00:51<00:53,  3.00it/s][A
Iteration:  52%|█████▏    | 170/329 [00:52<00:51,  3.06it/s][A
Iteration:  52%|█████▏    | 171/329 [00:52<00:50,  3.13it/s][A
Iteration:  52%|█████▏    | 172/329 [00:52<00:49,  3.18it/s][A
Iteration:  53%|█████▎    | 173/329 [00:52<00:47,  3.26it/s][A
Iteration:  53%|█████▎    | 174/329 [00:52<00:46,  3.32it/s][A
Iteration:  53%|█████▎    | 175/329 [00:53<00:45,  3.38it/s][A
Iteration:  53%|█████▎    | 176/329 [00

2023/08/08 07:14:41 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 192 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [02:48<05:25, 108.46s/it]22it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:48<05:25, 108.46s/it]22it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:48<05:25, 108.46s/it]22it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:48<05:25, 108.46s/it]22it/s][A
Iteration:  58%|█████▊    | 192/329 [01:00<00:57,  2.38it/s][A

2023/08/08 07:14:45 - Cosine-Similarity :	Pearson: 0.9592	Spearman: 0.9192
2023/08/08 07:14:45 - Manhattan-Distance:	Pearson: 0.9530	Spearman: 0.9174
2023/08/08 07:14:45 - Euclidean-Distance:	Pearson: 0.9531	Spearman: 0.9174
2023/08/08 07:14:45 - Dot-Product-Similarity:	Pearson: 0.9535	Spearman: 0.9079



Iteration:  59%|█████▊    | 193/329 [01:00<00:55,  2.44it/s][A
Iteration:  59%|█████▉    | 194/329 [01:00<00:53,  2.51it/s][A
Iteration:  59%|█████▉    | 195/329 [01:00<00:51,  2.58it/s][A
Iteration:  60%|█████▉    | 196/329 [01:00<00:50,  2.64it/s][A
Iteration:  60%|█████▉    | 197/329 [01:01<00:48,  2.71it/s][A
Iteration:  60%|██████    | 198/329 [01:01<00:47,  2.79it/s][A
Iteration:  60%|██████    | 199/329 [01:01<00:45,  2.86it/s][A
Iteration:  61%|██████    | 200/329 [01:01<00:44,  2.93it/s][A
Iteration:  61%|██████    | 201/329 [01:01<00:42,  2.98it/s][A
Iteration:  61%|██████▏   | 202/329 [01:01<00:41,  3.07it/s][A
Iteration:  62%|██████▏   | 203/329 [01:02<00:40,  3.13it/s][A
Iteration:  62%|██████▏   | 204/329 [01:02<00:39,  3.20it/s][A
Iteration:  62%|██████▏   | 205/329 [01:02<00:37,  3.27it/s][A
Iteration:  63%|██████▎   | 206/329 [01:02<00:37,  3.31it/s][A
Iteration:  63%|██████▎   | 207/329 [01:02<00:36,  3.38it/s][A
Iteration:  63%|██████▎   | 208/329 [01

2023/08/08 07:14:51 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 224 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [02:58<05:25, 108.46s/it]20it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:58<05:25, 108.46s/it]20it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:58<05:25, 108.46s/it]20it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:58<05:25, 108.46s/it]20it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [02:58<05:25, 108.46s/it]20it/s][A

2023/08/08 07:14:54 - Cosine-Similarity :	Pearson: 0.9618	Spearman: 0.9227
2023/08/08 07:14:54 - Manhattan-Distance:	Pearson: 0.9545	Spearman: 0.9207
2023/08/08 07:14:54 - Euclidean-Distance:	Pearson: 0.9544	Spearman: 0.9206
2023/08/08 07:14:54 - Dot-Product-Similarity:	Pearson: 0.9538	Spearman: 0.9099
2023/08/08 07:14:54 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  68%|██████▊   | 224/329 [01:10<00:48,  2.18it/s][A
Iteration:  68%|██████▊   | 225/329 [01:10<00:46,  2.25it/s][A
Iteration:  69%|██████▊   | 226/329 [01:11<00:44,  2.31it/s][A
Iteration:  69%|██████▉   | 227/329 [01:11<00:43,  2.37it/s][A
Iteration:  69%|██████▉   | 228/329 [01:11<00:41,  2.44it/s][A
Iteration:  70%|██████▉   | 229/329 [01:11<00:40,  2.49it/s][A
Iteration:  70%|██████▉   | 230/329 [01:11<00:38,  2.54it/s][A
Iteration:  70%|███████   | 231/329 [01:12<00:37,  2.61it/s][A
Iteration:  71%|███████   | 232/329 [01:12<00:36,  2.68it/s][A
Iteration:  71%|███████   | 233/329 [01:12<00:34,  2.76it/s][A
Iteration:  71%|███████   | 234/329 [01:12<00:33,  2.82it/s][A
Iteration:  71%|███████▏  | 235/329 [01:12<00:32,  2.90it/s][A
Iteration:  72%|███████▏  | 236/329 [01:12<00:31,  2.96it/s][A
Iteration:  72%|███████▏  | 237/329 [01:13<00:30,  3.01it/s][A
Iteration:  72%|███████▏  | 238/329 [01:13<00:29,  3.05it/s][A
Iteration:  73%|███████▎  | 239/329 [01

2023/08/08 07:15:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 256 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [03:08<05:25, 108.46s/it]08it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:09<05:25, 108.46s/it]08it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:09<05:25, 108.46s/it]08it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:09<05:25, 108.46s/it]08it/s][A
Iteration:  78%|███████▊  | 256/329 [01:20<00:31,  2.32it/s][A

2023/08/08 07:15:05 - Cosine-Similarity :	Pearson: 0.9607	Spearman: 0.9203
2023/08/08 07:15:05 - Manhattan-Distance:	Pearson: 0.9566	Spearman: 0.9197
2023/08/08 07:15:05 - Euclidean-Distance:	Pearson: 0.9564	Spearman: 0.9193
2023/08/08 07:15:05 - Dot-Product-Similarity:	Pearson: 0.9535	Spearman: 0.9056



Iteration:  78%|███████▊  | 257/329 [01:20<00:30,  2.38it/s][A
Iteration:  78%|███████▊  | 258/329 [01:21<00:29,  2.44it/s][A
Iteration:  79%|███████▊  | 259/329 [01:21<00:27,  2.50it/s][A
Iteration:  79%|███████▉  | 260/329 [01:21<00:26,  2.57it/s][A
Iteration:  79%|███████▉  | 261/329 [01:21<00:25,  2.64it/s][A
Iteration:  80%|███████▉  | 262/329 [01:21<00:24,  2.71it/s][A
Iteration:  80%|███████▉  | 263/329 [01:21<00:23,  2.77it/s][A
Iteration:  80%|████████  | 264/329 [01:22<00:22,  2.84it/s][A
Iteration:  81%|████████  | 265/329 [01:22<00:22,  2.88it/s][A
Iteration:  81%|████████  | 266/329 [01:22<00:21,  2.94it/s][A
Iteration:  81%|████████  | 267/329 [01:22<00:20,  3.00it/s][A
Iteration:  81%|████████▏ | 268/329 [01:23<00:19,  3.07it/s][A
Iteration:  82%|████████▏ | 269/329 [01:23<00:19,  3.14it/s][A
Iteration:  82%|████████▏ | 270/329 [01:23<00:18,  3.22it/s][A
Iteration:  82%|████████▏ | 271/329 [01:23<00:17,  3.28it/s][A
Iteration:  83%|████████▎ | 272/329 [01

2023/08/08 07:15:11 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 288 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [03:18<05:25, 108.46s/it]14it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:18<05:25, 108.46s/it]14it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:18<05:25, 108.46s/it]14it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:18<05:25, 108.46s/it]14it/s][A
Iteration:  88%|████████▊ | 288/329 [01:30<00:17,  2.34it/s][A

2023/08/08 07:15:15 - Cosine-Similarity :	Pearson: 0.9603	Spearman: 0.9182
2023/08/08 07:15:15 - Manhattan-Distance:	Pearson: 0.9546	Spearman: 0.9174
2023/08/08 07:15:15 - Euclidean-Distance:	Pearson: 0.9547	Spearman: 0.9174
2023/08/08 07:15:15 - Dot-Product-Similarity:	Pearson: 0.9519	Spearman: 0.9024



Iteration:  88%|████████▊ | 289/329 [01:30<00:16,  2.41it/s][A
Iteration:  88%|████████▊ | 290/329 [01:30<00:15,  2.48it/s][A
Iteration:  88%|████████▊ | 291/329 [01:31<00:14,  2.55it/s][A
Iteration:  89%|████████▉ | 292/329 [01:31<00:14,  2.62it/s][A
Iteration:  89%|████████▉ | 293/329 [01:31<00:13,  2.68it/s][A
Iteration:  89%|████████▉ | 294/329 [01:31<00:12,  2.76it/s][A
Iteration:  90%|████████▉ | 295/329 [01:31<00:12,  2.82it/s][A
Iteration:  90%|████████▉ | 296/329 [01:32<00:11,  2.90it/s][A
Iteration:  90%|█████████ | 297/329 [01:32<00:10,  2.98it/s][A
Iteration:  91%|█████████ | 298/329 [01:32<00:10,  3.05it/s][A
Iteration:  91%|█████████ | 299/329 [01:32<00:09,  3.11it/s][A
Iteration:  91%|█████████ | 300/329 [01:32<00:09,  3.18it/s][A
Iteration:  91%|█████████▏| 301/329 [01:32<00:08,  3.24it/s][A
Iteration:  92%|█████████▏| 302/329 [01:33<00:08,  3.30it/s][A
Iteration:  92%|█████████▏| 303/329 [01:33<00:07,  3.36it/s][A
Iteration:  92%|█████████▏| 304/329 [01

2023/08/08 07:15:21 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 320 steps:



                                                            
Epoch:  25%|██▌       | 1/4 [03:28<05:25, 108.46s/it]16it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:28<05:25, 108.46s/it]16it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:28<05:25, 108.46s/it]16it/s][A
                                                            
Epoch:  25%|██▌       | 1/4 [03:28<05:25, 108.46s/it]16it/s][A
Iteration:  97%|█████████▋| 320/329 [01:40<00:03,  2.35it/s][A

2023/08/08 07:15:25 - Cosine-Similarity :	Pearson: 0.9616	Spearman: 0.9217
2023/08/08 07:15:25 - Manhattan-Distance:	Pearson: 0.9567	Spearman: 0.9213
2023/08/08 07:15:25 - Euclidean-Distance:	Pearson: 0.9565	Spearman: 0.9206
2023/08/08 07:15:25 - Dot-Product-Similarity:	Pearson: 0.9537	Spearman: 0.9066



Iteration:  98%|█████████▊| 321/329 [01:40<00:03,  2.41it/s][A
Iteration:  98%|█████████▊| 322/329 [01:40<00:02,  2.49it/s][A
Iteration:  98%|█████████▊| 323/329 [01:40<00:02,  2.56it/s][A
Iteration:  98%|█████████▊| 324/329 [01:41<00:01,  2.63it/s][A
Iteration:  99%|█████████▉| 325/329 [01:41<00:01,  2.71it/s][A
Iteration:  99%|█████████▉| 326/329 [01:41<00:01,  2.77it/s][A
Iteration:  99%|█████████▉| 327/329 [01:41<00:00,  2.84it/s][A
Iteration: 100%|█████████▉| 328/329 [01:41<00:00,  2.90it/s][A
Iteration: 100%|██████████| 329/329 [01:41<00:00,  3.23it/s][A
Epoch:  25%|██▌       | 1/4 [03:30<05:25, 108.46s/it]

2023/08/08 07:15:27 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 1:


Epoch:  50%|█████     | 2/4 [03:34<03:33, 106.85s/it]

2023/08/08 07:15:30 - Cosine-Similarity :	Pearson: 0.9615	Spearman: 0.9199
2023/08/08 07:15:30 - Manhattan-Distance:	Pearson: 0.9558	Spearman: 0.9197
2023/08/08 07:15:30 - Euclidean-Distance:	Pearson: 0.9557	Spearman: 0.9191
2023/08/08 07:15:30 - Dot-Product-Similarity:	Pearson: 0.9537	Spearman: 0.9052



Iteration:   0%|          | 0/329 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/329 [00:00<01:21,  4.04it/s][A
Iteration:   1%|          | 2/329 [00:00<01:12,  4.51it/s][A
Iteration:   1%|          | 3/329 [00:00<01:12,  4.49it/s][A
Iteration:   1%|          | 4/329 [00:00<01:08,  4.77it/s][A
Iteration:   2%|▏         | 5/329 [00:01<01:08,  4.71it/s][A
Iteration:   2%|▏         | 6/329 [00:01<01:07,  4.80it/s][A
Iteration:   2%|▏         | 7/329 [00:01<01:06,  4.87it/s][A
Iteration:   2%|▏         | 8/329 [00:01<01:04,  4.94it/s][A
Iteration:   3%|▎         | 9/329 [00:01<01:03,  5.06it/s][A
Iteration:   3%|▎         | 10/329 [00:01<01:02,  5.08it/s][A
Iteration:   3%|▎         | 11/329 [00:02<01:03,  4.98it/s][A
Iteration:   4%|▎         | 12/329 [00:02<01:03,  5.02it/s][A
Iteration:   4%|▍         | 13/329 [00:02<01:02,  5.02it/s][A
Iteration:   4%|▍         | 14/329 [00:02<01:02,  5.02it/s][A
Iteration:   5%|▍         | 15/329 [00:02<01:01,  5.12it/s][A
Iteration

2023/08/08 07:15:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 32 steps:



                                                           
Epoch:  50%|█████     | 2/4 [03:43<03:33, 106.85s/it]0it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:43<03:33, 106.85s/it]0it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:43<03:33, 106.85s/it]0it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:43<03:33, 106.85s/it]0it/s][A
Iteration:  10%|▉         | 32/329 [00:09<02:03,  2.40it/s][A

2023/08/08 07:15:40 - Cosine-Similarity :	Pearson: 0.9620	Spearman: 0.9220
2023/08/08 07:15:40 - Manhattan-Distance:	Pearson: 0.9573	Spearman: 0.9216
2023/08/08 07:15:40 - Euclidean-Distance:	Pearson: 0.9572	Spearman: 0.9213
2023/08/08 07:15:40 - Dot-Product-Similarity:	Pearson: 0.9549	Spearman: 0.9084



Iteration:  10%|█         | 33/329 [00:09<01:59,  2.48it/s][A
Iteration:  10%|█         | 34/329 [00:10<01:54,  2.57it/s][A
Iteration:  11%|█         | 35/329 [00:10<01:50,  2.66it/s][A
Iteration:  11%|█         | 36/329 [00:10<01:47,  2.74it/s][A
Iteration:  11%|█         | 37/329 [00:10<01:43,  2.82it/s][A
Iteration:  12%|█▏        | 38/329 [00:10<01:40,  2.91it/s][A
Iteration:  12%|█▏        | 39/329 [00:11<01:37,  2.97it/s][A
Iteration:  12%|█▏        | 40/329 [00:11<01:35,  3.03it/s][A
Iteration:  12%|█▏        | 41/329 [00:11<01:32,  3.12it/s][A
Iteration:  13%|█▎        | 42/329 [00:11<01:30,  3.17it/s][A
Iteration:  13%|█▎        | 43/329 [00:11<01:28,  3.25it/s][A
Iteration:  13%|█▎        | 44/329 [00:12<01:25,  3.32it/s][A
Iteration:  14%|█▎        | 45/329 [00:12<01:24,  3.38it/s][A
Iteration:  14%|█▍        | 46/329 [00:12<01:22,  3.43it/s][A
Iteration:  14%|█▍        | 47/329 [00:12<01:20,  3.49it/s][A
Iteration:  15%|█▍        | 48/329 [00:12<01:19,  3.55

2023/08/08 07:15:46 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 64 steps:



                                                           
Epoch:  50%|█████     | 2/4 [03:53<03:33, 106.85s/it]6it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:53<03:33, 106.85s/it]6it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:53<03:33, 106.85s/it]6it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:53<03:33, 106.85s/it]6it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [03:53<03:33, 106.85s/it]6it/s][A

2023/08/08 07:15:50 - Cosine-Similarity :	Pearson: 0.9626	Spearman: 0.9231
2023/08/08 07:15:50 - Manhattan-Distance:	Pearson: 0.9563	Spearman: 0.9217
2023/08/08 07:15:50 - Euclidean-Distance:	Pearson: 0.9564	Spearman: 0.9216
2023/08/08 07:15:50 - Dot-Product-Similarity:	Pearson: 0.9543	Spearman: 0.9088
2023/08/08 07:15:50 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  19%|█▉        | 64/329 [00:20<02:01,  2.18it/s][A
Iteration:  20%|█▉        | 65/329 [00:20<01:57,  2.25it/s][A
Iteration:  20%|██        | 66/329 [00:20<01:53,  2.32it/s][A
Iteration:  20%|██        | 67/329 [00:20<01:49,  2.39it/s][A
Iteration:  21%|██        | 68/329 [00:21<01:45,  2.46it/s][A
Iteration:  21%|██        | 69/329 [00:21<01:42,  2.53it/s][A
Iteration:  21%|██▏       | 70/329 [00:21<01:39,  2.61it/s][A
Iteration:  22%|██▏       | 71/329 [00:21<01:36,  2.68it/s][A
Iteration:  22%|██▏       | 72/329 [00:21<01:33,  2.76it/s][A
Iteration:  22%|██▏       | 73/329 [00:21<01:30,  2.83it/s][A
Iteration:  22%|██▏       | 74/329 [00:22<01:27,  2.90it/s][A
Iteration:  23%|██▎       | 75/329 [00:22<01:25,  2.97it/s][A
Iteration:  23%|██▎       | 76/329 [00:22<01:23,  3.04it/s][A
Iteration:  23%|██▎       | 77/329 [00:22<01:21,  3.11it/s][A
Iteration:  24%|██▎       | 78/329 [00:22<01:19,  3.16it/s][A
Iteration:  24%|██▍       | 79/329 [00:23<01:17,  3.23

2023/08/08 07:15:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 96 steps:



                                                           
Epoch:  50%|█████     | 2/4 [04:04<03:33, 106.85s/it]1it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [04:04<03:33, 106.85s/it]1it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [04:04<03:33, 106.85s/it]1it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [04:04<03:33, 106.85s/it]1it/s][A
                                                           
Epoch:  50%|█████     | 2/4 [04:04<03:33, 106.85s/it]1it/s][A

2023/08/08 07:16:00 - Cosine-Similarity :	Pearson: 0.9628	Spearman: 0.9251
2023/08/08 07:16:00 - Manhattan-Distance:	Pearson: 0.9573	Spearman: 0.9234
2023/08/08 07:16:00 - Euclidean-Distance:	Pearson: 0.9574	Spearman: 0.9232
2023/08/08 07:16:00 - Dot-Product-Similarity:	Pearson: 0.9553	Spearman: 0.9115
2023/08/08 07:16:00 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  29%|██▉       | 96/329 [00:30<01:46,  2.18it/s][A
Iteration:  29%|██▉       | 97/329 [00:30<01:43,  2.25it/s][A
Iteration:  30%|██▉       | 98/329 [00:31<01:39,  2.32it/s][A
Iteration:  30%|███       | 99/329 [00:31<01:36,  2.38it/s][A
Iteration:  30%|███       | 100/329 [00:31<01:33,  2.45it/s][A
Iteration:  31%|███       | 101/329 [00:31<01:30,  2.52it/s][A
Iteration:  31%|███       | 102/329 [00:31<01:27,  2.59it/s][A
Iteration:  31%|███▏      | 103/329 [00:31<01:24,  2.67it/s][A
Iteration:  32%|███▏      | 104/329 [00:32<01:22,  2.73it/s][A
Iteration:  32%|███▏      | 105/329 [00:32<01:20,  2.80it/s][A
Iteration:  32%|███▏      | 106/329 [00:32<01:17,  2.86it/s][A
Iteration:  33%|███▎      | 107/329 [00:32<01:15,  2.93it/s][A
Iteration:  33%|███▎      | 108/329 [00:32<01:13,  3.01it/s][A
Iteration:  33%|███▎      | 109/329 [00:33<01:11,  3.08it/s][A
Iteration:  33%|███▎      | 110/329 [00:33<01:09,  3.15it/s][A
Iteration:  34%|███▎      | 111/329 [00:33<

2023/08/08 07:16:07 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 128 steps:



                                                            
Epoch:  50%|█████     | 2/4 [04:14<03:33, 106.85s/it]13it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:14<03:33, 106.85s/it]13it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:14<03:33, 106.85s/it]13it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:14<03:33, 106.85s/it]13it/s][A
Iteration:  39%|███▉      | 128/329 [00:40<01:26,  2.31it/s][A

2023/08/08 07:16:11 - Cosine-Similarity :	Pearson: 0.9627	Spearman: 0.9241
2023/08/08 07:16:11 - Manhattan-Distance:	Pearson: 0.9572	Spearman: 0.9225
2023/08/08 07:16:11 - Euclidean-Distance:	Pearson: 0.9572	Spearman: 0.9223
2023/08/08 07:16:11 - Dot-Product-Similarity:	Pearson: 0.9544	Spearman: 0.9090



Iteration:  39%|███▉      | 129/329 [00:40<01:24,  2.38it/s][A
Iteration:  40%|███▉      | 130/329 [00:40<01:21,  2.45it/s][A
Iteration:  40%|███▉      | 131/329 [00:41<01:18,  2.52it/s][A
Iteration:  40%|████      | 132/329 [00:41<01:15,  2.59it/s][A
Iteration:  40%|████      | 133/329 [00:41<01:13,  2.67it/s][A
Iteration:  41%|████      | 134/329 [00:41<01:11,  2.74it/s][A
Iteration:  41%|████      | 135/329 [00:41<01:09,  2.81it/s][A
Iteration:  41%|████▏     | 136/329 [00:41<01:07,  2.87it/s][A
Iteration:  42%|████▏     | 137/329 [00:42<01:05,  2.94it/s][A
Iteration:  42%|████▏     | 138/329 [00:42<01:03,  3.00it/s][A
Iteration:  42%|████▏     | 139/329 [00:42<01:01,  3.08it/s][A
Iteration:  43%|████▎     | 140/329 [00:42<01:00,  3.14it/s][A
Iteration:  43%|████▎     | 141/329 [00:42<00:58,  3.21it/s][A
Iteration:  43%|████▎     | 142/329 [00:43<00:57,  3.27it/s][A
Iteration:  43%|████▎     | 143/329 [00:43<00:55,  3.33it/s][A
Iteration:  44%|████▍     | 144/329 [00

2023/08/08 07:16:17 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 160 steps:



                                                            
Epoch:  50%|█████     | 2/4 [04:24<03:33, 106.85s/it]23it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:24<03:33, 106.85s/it]23it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:24<03:33, 106.85s/it]23it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:24<03:33, 106.85s/it]23it/s][A
Iteration:  49%|████▊     | 160/329 [00:50<01:11,  2.38it/s][A

2023/08/08 07:16:20 - Cosine-Similarity :	Pearson: 0.9634	Spearman: 0.9241
2023/08/08 07:16:20 - Manhattan-Distance:	Pearson: 0.9579	Spearman: 0.9231
2023/08/08 07:16:20 - Euclidean-Distance:	Pearson: 0.9579	Spearman: 0.9230
2023/08/08 07:16:20 - Dot-Product-Similarity:	Pearson: 0.9551	Spearman: 0.9097



Iteration:  49%|████▉     | 161/329 [00:50<01:08,  2.45it/s][A
Iteration:  49%|████▉     | 162/329 [00:50<01:06,  2.51it/s][A
Iteration:  50%|████▉     | 163/329 [00:50<01:04,  2.58it/s][A
Iteration:  50%|████▉     | 164/329 [00:50<01:02,  2.64it/s][A
Iteration:  50%|█████     | 165/329 [00:51<01:00,  2.71it/s][A
Iteration:  50%|█████     | 166/329 [00:51<00:58,  2.76it/s][A
Iteration:  51%|█████     | 167/329 [00:51<00:57,  2.81it/s][A
Iteration:  51%|█████     | 168/329 [00:51<00:56,  2.85it/s][A
Iteration:  51%|█████▏    | 169/329 [00:52<00:54,  2.92it/s][A
Iteration:  52%|█████▏    | 170/329 [00:52<00:53,  2.99it/s][A
Iteration:  52%|█████▏    | 171/329 [00:52<00:51,  3.06it/s][A
Iteration:  52%|█████▏    | 172/329 [00:52<00:50,  3.13it/s][A
Iteration:  53%|█████▎    | 173/329 [00:52<00:48,  3.19it/s][A
Iteration:  53%|█████▎    | 174/329 [00:52<00:47,  3.25it/s][A
Iteration:  53%|█████▎    | 175/329 [00:53<00:46,  3.31it/s][A
Iteration:  53%|█████▎    | 176/329 [00

2023/08/08 07:16:27 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 192 steps:



                                                            
Epoch:  50%|█████     | 2/4 [04:34<03:33, 106.85s/it]19it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:34<03:33, 106.85s/it]19it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:34<03:33, 106.85s/it]19it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:34<03:33, 106.85s/it]19it/s][A
Iteration:  58%|█████▊    | 192/329 [01:00<00:58,  2.34it/s][A

2023/08/08 07:16:30 - Cosine-Similarity :	Pearson: 0.9634	Spearman: 0.9238
2023/08/08 07:16:30 - Manhattan-Distance:	Pearson: 0.9572	Spearman: 0.9225
2023/08/08 07:16:30 - Euclidean-Distance:	Pearson: 0.9572	Spearman: 0.9222
2023/08/08 07:16:30 - Dot-Product-Similarity:	Pearson: 0.9554	Spearman: 0.9099



Iteration:  59%|█████▊    | 193/329 [01:00<00:56,  2.41it/s][A
Iteration:  59%|█████▉    | 194/329 [01:00<00:54,  2.47it/s][A
Iteration:  59%|█████▉    | 195/329 [01:00<00:52,  2.54it/s][A
Iteration:  60%|█████▉    | 196/329 [01:00<00:51,  2.61it/s][A
Iteration:  60%|█████▉    | 197/329 [01:01<00:49,  2.67it/s][A
Iteration:  60%|██████    | 198/329 [01:01<00:47,  2.73it/s][A
Iteration:  60%|██████    | 199/329 [01:01<00:46,  2.79it/s][A
Iteration:  61%|██████    | 200/329 [01:01<00:44,  2.87it/s][A
Iteration:  61%|██████    | 201/329 [01:01<00:43,  2.93it/s][A
Iteration:  61%|██████▏   | 202/329 [01:02<00:42,  3.00it/s][A
Iteration:  62%|██████▏   | 203/329 [01:02<00:41,  3.06it/s][A
Iteration:  62%|██████▏   | 204/329 [01:02<00:39,  3.13it/s][A
Iteration:  62%|██████▏   | 205/329 [01:02<00:38,  3.19it/s][A
Iteration:  63%|██████▎   | 206/329 [01:02<00:37,  3.25it/s][A
Iteration:  63%|██████▎   | 207/329 [01:03<00:36,  3.30it/s][A
Iteration:  63%|██████▎   | 208/329 [01

2023/08/08 07:16:36 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 224 steps:



                                                            
Epoch:  50%|█████     | 2/4 [04:43<03:33, 106.85s/it]30it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:43<03:33, 106.85s/it]30it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:43<03:33, 106.85s/it]30it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:43<03:33, 106.85s/it]30it/s][A
Iteration:  68%|██████▊   | 224/329 [01:09<00:43,  2.43it/s][A

2023/08/08 07:16:40 - Cosine-Similarity :	Pearson: 0.9636	Spearman: 0.9245
2023/08/08 07:16:40 - Manhattan-Distance:	Pearson: 0.9576	Spearman: 0.9230
2023/08/08 07:16:40 - Euclidean-Distance:	Pearson: 0.9576	Spearman: 0.9229
2023/08/08 07:16:40 - Dot-Product-Similarity:	Pearson: 0.9558	Spearman: 0.9100



Iteration:  68%|██████▊   | 225/329 [01:09<00:41,  2.50it/s][A
Iteration:  69%|██████▊   | 226/329 [01:10<00:40,  2.56it/s][A
Iteration:  69%|██████▉   | 227/329 [01:10<00:38,  2.62it/s][A
Iteration:  69%|██████▉   | 228/329 [01:10<00:37,  2.68it/s][A
Iteration:  70%|██████▉   | 229/329 [01:10<00:36,  2.75it/s][A
Iteration:  70%|██████▉   | 230/329 [01:10<00:35,  2.82it/s][A
Iteration:  70%|███████   | 231/329 [01:11<00:33,  2.89it/s][A
Iteration:  71%|███████   | 232/329 [01:11<00:32,  2.97it/s][A
Iteration:  71%|███████   | 233/329 [01:11<00:31,  3.03it/s][A
Iteration:  71%|███████   | 234/329 [01:11<00:30,  3.09it/s][A
Iteration:  71%|███████▏  | 235/329 [01:11<00:29,  3.16it/s][A
Iteration:  72%|███████▏  | 236/329 [01:12<00:28,  3.23it/s][A
Iteration:  72%|███████▏  | 237/329 [01:12<00:27,  3.29it/s][A
Iteration:  72%|███████▏  | 238/329 [01:12<00:27,  3.35it/s][A
Iteration:  73%|███████▎  | 239/329 [01:12<00:26,  3.44it/s][A
Iteration:  73%|███████▎  | 240/329 [01

2023/08/08 07:16:46 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 256 steps:



                                                            
Epoch:  50%|█████     | 2/4 [04:53<03:33, 106.85s/it]25it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:53<03:33, 106.85s/it]25it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:53<03:33, 106.85s/it]25it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [04:53<03:33, 106.85s/it]25it/s][A
Iteration:  78%|███████▊  | 256/329 [01:19<00:30,  2.36it/s][A

2023/08/08 07:16:50 - Cosine-Similarity :	Pearson: 0.9633	Spearman: 0.9236
2023/08/08 07:16:50 - Manhattan-Distance:	Pearson: 0.9571	Spearman: 0.9223
2023/08/08 07:16:50 - Euclidean-Distance:	Pearson: 0.9572	Spearman: 0.9218
2023/08/08 07:16:50 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9093



Iteration:  78%|███████▊  | 257/329 [01:19<00:29,  2.42it/s][A
Iteration:  78%|███████▊  | 258/329 [01:20<00:28,  2.48it/s][A
Iteration:  79%|███████▊  | 259/329 [01:20<00:27,  2.55it/s][A
Iteration:  79%|███████▉  | 260/329 [01:20<00:26,  2.61it/s][A
Iteration:  79%|███████▉  | 261/329 [01:20<00:25,  2.68it/s][A
Iteration:  80%|███████▉  | 262/329 [01:20<00:24,  2.75it/s][A
Iteration:  80%|███████▉  | 263/329 [01:21<00:23,  2.82it/s][A
Iteration:  80%|████████  | 264/329 [01:21<00:22,  2.89it/s][A
Iteration:  81%|████████  | 265/329 [01:21<00:21,  2.94it/s][A
Iteration:  81%|████████  | 266/329 [01:21<00:20,  3.01it/s][A
Iteration:  81%|████████  | 267/329 [01:21<00:20,  3.09it/s][A
Iteration:  81%|████████▏ | 268/329 [01:22<00:19,  3.16it/s][A
Iteration:  82%|████████▏ | 269/329 [01:22<00:18,  3.23it/s][A
Iteration:  82%|████████▏ | 270/329 [01:22<00:17,  3.28it/s][A
Iteration:  82%|████████▏ | 271/329 [01:22<00:17,  3.35it/s][A
Iteration:  83%|████████▎ | 272/329 [01

2023/08/08 07:16:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 288 steps:



                                                            
Epoch:  50%|█████     | 2/4 [05:03<03:33, 106.85s/it]20it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:03<03:33, 106.85s/it]20it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:03<03:33, 106.85s/it]20it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:03<03:33, 106.85s/it]20it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:03<03:33, 106.85s/it]20it/s][A

2023/08/08 07:17:00 - Cosine-Similarity :	Pearson: 0.9638	Spearman: 0.9259
2023/08/08 07:17:00 - Manhattan-Distance:	Pearson: 0.9580	Spearman: 0.9239
2023/08/08 07:17:00 - Euclidean-Distance:	Pearson: 0.9579	Spearman: 0.9237
2023/08/08 07:17:00 - Dot-Product-Similarity:	Pearson: 0.9562	Spearman: 0.9119
2023/08/08 07:17:00 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  88%|████████▊ | 288/329 [01:30<00:18,  2.19it/s][A
Iteration:  88%|████████▊ | 289/329 [01:30<00:17,  2.25it/s][A
Iteration:  88%|████████▊ | 290/329 [01:30<00:16,  2.32it/s][A
Iteration:  88%|████████▊ | 291/329 [01:30<00:15,  2.40it/s][A
Iteration:  89%|████████▉ | 292/329 [01:30<00:15,  2.47it/s][A
Iteration:  89%|████████▉ | 293/329 [01:31<00:14,  2.53it/s][A
Iteration:  89%|████████▉ | 294/329 [01:31<00:13,  2.60it/s][A
Iteration:  90%|████████▉ | 295/329 [01:31<00:12,  2.67it/s][A
Iteration:  90%|████████▉ | 296/329 [01:31<00:12,  2.74it/s][A
Iteration:  90%|█████████ | 297/329 [01:31<00:11,  2.81it/s][A
Iteration:  91%|█████████ | 298/329 [01:32<00:10,  2.87it/s][A
Iteration:  91%|█████████ | 299/329 [01:32<00:10,  2.92it/s][A
Iteration:  91%|█████████ | 300/329 [01:32<00:09,  2.99it/s][A
Iteration:  91%|█████████▏| 301/329 [01:32<00:09,  3.04it/s][A
Iteration:  92%|█████████▏| 302/329 [01:32<00:08,  3.10it/s][A
Iteration:  92%|█████████▏| 303/329 [01

2023/08/08 07:17:07 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 320 steps:



                                                            
Epoch:  50%|█████     | 2/4 [05:14<03:33, 106.85s/it]98it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:14<03:33, 106.85s/it]98it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:14<03:33, 106.85s/it]98it/s][A
                                                            
Epoch:  50%|█████     | 2/4 [05:14<03:33, 106.85s/it]98it/s][A
Iteration:  97%|█████████▋| 320/329 [01:40<00:03,  2.31it/s][A

2023/08/08 07:17:10 - Cosine-Similarity :	Pearson: 0.9631	Spearman: 0.9248
2023/08/08 07:17:10 - Manhattan-Distance:	Pearson: 0.9577	Spearman: 0.9230
2023/08/08 07:17:10 - Euclidean-Distance:	Pearson: 0.9576	Spearman: 0.9229
2023/08/08 07:17:10 - Dot-Product-Similarity:	Pearson: 0.9560	Spearman: 0.9110



Iteration:  98%|█████████▊| 321/329 [01:40<00:03,  2.37it/s][A
Iteration:  98%|█████████▊| 322/329 [01:40<00:02,  2.44it/s][A
Iteration:  98%|█████████▊| 323/329 [01:40<00:02,  2.50it/s][A
Iteration:  98%|█████████▊| 324/329 [01:41<00:01,  2.57it/s][A
Iteration:  99%|█████████▉| 325/329 [01:41<00:01,  2.63it/s][A
Iteration:  99%|█████████▉| 326/329 [01:41<00:01,  2.70it/s][A
Iteration:  99%|█████████▉| 327/329 [01:41<00:00,  2.78it/s][A
Iteration: 100%|█████████▉| 328/329 [01:41<00:00,  2.86it/s][A
Iteration: 100%|██████████| 329/329 [01:41<00:00,  3.23it/s][A
Epoch:  50%|█████     | 2/4 [05:16<03:33, 106.85s/it]

2023/08/08 07:17:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 2:


Epoch:  75%|███████▌  | 3/4 [05:19<01:46, 106.25s/it]

2023/08/08 07:17:16 - Cosine-Similarity :	Pearson: 0.9633	Spearman: 0.9245
2023/08/08 07:17:16 - Manhattan-Distance:	Pearson: 0.9576	Spearman: 0.9229
2023/08/08 07:17:16 - Euclidean-Distance:	Pearson: 0.9575	Spearman: 0.9226
2023/08/08 07:17:16 - Dot-Product-Similarity:	Pearson: 0.9561	Spearman: 0.9108



Iteration:   0%|          | 0/329 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/329 [00:00<01:04,  5.06it/s][A
Iteration:   1%|          | 2/329 [00:00<01:03,  5.19it/s][A
Iteration:   1%|          | 3/329 [00:00<01:02,  5.20it/s][A
Iteration:   1%|          | 4/329 [00:00<01:03,  5.13it/s][A
Iteration:   2%|▏         | 5/329 [00:00<01:02,  5.21it/s][A
Iteration:   2%|▏         | 6/329 [00:01<01:02,  5.19it/s][A
Iteration:   2%|▏         | 7/329 [00:01<01:02,  5.19it/s][A
Iteration:   2%|▏         | 8/329 [00:01<01:00,  5.27it/s][A
Iteration:   3%|▎         | 9/329 [00:01<00:59,  5.39it/s][A
Iteration:   3%|▎         | 10/329 [00:01<00:59,  5.36it/s][A
Iteration:   3%|▎         | 11/329 [00:02<00:58,  5.41it/s][A
Iteration:   4%|▎         | 12/329 [00:02<00:58,  5.39it/s][A
Iteration:   4%|▍         | 13/329 [00:02<00:57,  5.48it/s][A
Iteration:   4%|▍         | 14/329 [00:02<00:57,  5.45it/s][A
Iteration:   5%|▍         | 15/329 [00:02<00:57,  5.46it/s][A
Iteration

2023/08/08 07:17:22 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 32 steps:



                                                           
Epoch:  75%|███████▌  | 3/4 [05:29<01:46, 106.25s/it]5it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:29<01:46, 106.25s/it]5it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:29<01:46, 106.25s/it]5it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:29<01:46, 106.25s/it]5it/s][A
Iteration:  10%|▉         | 32/329 [00:09<02:06,  2.36it/s][A

2023/08/08 07:17:26 - Cosine-Similarity :	Pearson: 0.9633	Spearman: 0.9246
2023/08/08 07:17:26 - Manhattan-Distance:	Pearson: 0.9581	Spearman: 0.9232
2023/08/08 07:17:26 - Euclidean-Distance:	Pearson: 0.9580	Spearman: 0.9230
2023/08/08 07:17:26 - Dot-Product-Similarity:	Pearson: 0.9566	Spearman: 0.9116



Iteration:  10%|█         | 33/329 [00:10<02:01,  2.44it/s][A
Iteration:  10%|█         | 34/329 [00:10<01:56,  2.52it/s][A
Iteration:  11%|█         | 35/329 [00:10<01:52,  2.60it/s][A
Iteration:  11%|█         | 36/329 [00:10<01:49,  2.67it/s][A
Iteration:  11%|█         | 37/329 [00:10<01:46,  2.75it/s][A
Iteration:  12%|█▏        | 38/329 [00:10<01:42,  2.83it/s][A
Iteration:  12%|█▏        | 39/329 [00:11<01:39,  2.92it/s][A
Iteration:  12%|█▏        | 40/329 [00:11<01:36,  3.01it/s][A
Iteration:  12%|█▏        | 41/329 [00:11<01:33,  3.08it/s][A
Iteration:  13%|█▎        | 42/329 [00:11<01:31,  3.15it/s][A
Iteration:  13%|█▎        | 43/329 [00:11<01:28,  3.22it/s][A
Iteration:  13%|█▎        | 44/329 [00:12<01:26,  3.29it/s][A
Iteration:  14%|█▎        | 45/329 [00:12<01:25,  3.34it/s][A
Iteration:  14%|█▍        | 46/329 [00:12<01:23,  3.41it/s][A
Iteration:  14%|█▍        | 47/329 [00:12<01:21,  3.47it/s][A
Iteration:  15%|█▍        | 48/329 [00:12<01:19,  3.54

2023/08/08 07:17:32 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 64 steps:



                                                           
Epoch:  75%|███████▌  | 3/4 [05:39<01:46, 106.25s/it]9it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:39<01:46, 106.25s/it]9it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:39<01:46, 106.25s/it]9it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:39<01:46, 106.25s/it]9it/s][A
Iteration:  19%|█▉        | 64/329 [00:19<01:51,  2.38it/s][A

2023/08/08 07:17:35 - Cosine-Similarity :	Pearson: 0.9634	Spearman: 0.9247
2023/08/08 07:17:35 - Manhattan-Distance:	Pearson: 0.9571	Spearman: 0.9229
2023/08/08 07:17:35 - Euclidean-Distance:	Pearson: 0.9571	Spearman: 0.9227
2023/08/08 07:17:35 - Dot-Product-Similarity:	Pearson: 0.9561	Spearman: 0.9115



Iteration:  20%|█▉        | 65/329 [00:19<01:48,  2.44it/s][A
Iteration:  20%|██        | 66/329 [00:20<01:44,  2.51it/s][A
Iteration:  20%|██        | 67/329 [00:20<01:41,  2.59it/s][A
Iteration:  21%|██        | 68/329 [00:20<01:37,  2.66it/s][A
Iteration:  21%|██        | 69/329 [00:20<01:34,  2.74it/s][A
Iteration:  21%|██▏       | 70/329 [00:20<01:31,  2.82it/s][A
Iteration:  22%|██▏       | 71/329 [00:20<01:29,  2.88it/s][A
Iteration:  22%|██▏       | 72/329 [00:21<01:27,  2.94it/s][A
Iteration:  22%|██▏       | 73/329 [00:21<01:25,  3.01it/s][A
Iteration:  22%|██▏       | 74/329 [00:21<01:23,  3.07it/s][A
Iteration:  23%|██▎       | 75/329 [00:21<01:21,  3.13it/s][A
Iteration:  23%|██▎       | 76/329 [00:21<01:19,  3.18it/s][A
Iteration:  23%|██▎       | 77/329 [00:22<01:17,  3.24it/s][A
Iteration:  24%|██▎       | 78/329 [00:22<01:16,  3.29it/s][A
Iteration:  24%|██▍       | 79/329 [00:22<01:14,  3.36it/s][A
Iteration:  24%|██▍       | 80/329 [00:22<01:12,  3.44

2023/08/08 07:17:42 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 96 steps:



                                                           
Epoch:  75%|███████▌  | 3/4 [05:49<01:46, 106.25s/it]3it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:49<01:46, 106.25s/it]3it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:49<01:46, 106.25s/it]3it/s][A
                                                           
Epoch:  75%|███████▌  | 3/4 [05:49<01:46, 106.25s/it]3it/s][A
Iteration:  29%|██▉       | 96/329 [00:29<01:38,  2.37it/s][A
Iteration:  29%|██▉       | 97/329 [00:29<01:34,  2.45it/s][A

2023/08/08 07:17:45 - Cosine-Similarity :	Pearson: 0.9631	Spearman: 0.9241
2023/08/08 07:17:45 - Manhattan-Distance:	Pearson: 0.9568	Spearman: 0.9225
2023/08/08 07:17:45 - Euclidean-Distance:	Pearson: 0.9567	Spearman: 0.9219
2023/08/08 07:17:45 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9100



Iteration:  30%|██▉       | 98/329 [00:29<01:31,  2.52it/s][A
Iteration:  30%|███       | 99/329 [00:30<01:29,  2.57it/s][A
Iteration:  30%|███       | 100/329 [00:30<01:26,  2.64it/s][A
Iteration:  31%|███       | 101/329 [00:30<01:24,  2.71it/s][A
Iteration:  31%|███       | 102/329 [00:30<01:21,  2.78it/s][A
Iteration:  31%|███▏      | 103/329 [00:30<01:19,  2.84it/s][A
Iteration:  32%|███▏      | 104/329 [00:31<01:17,  2.90it/s][A
Iteration:  32%|███▏      | 105/329 [00:31<01:15,  2.97it/s][A
Iteration:  32%|███▏      | 106/329 [00:31<01:13,  3.02it/s][A
Iteration:  33%|███▎      | 107/329 [00:31<01:11,  3.09it/s][A
Iteration:  33%|███▎      | 108/329 [00:31<01:10,  3.15it/s][A
Iteration:  33%|███▎      | 109/329 [00:32<01:08,  3.23it/s][A
Iteration:  33%|███▎      | 110/329 [00:32<01:06,  3.29it/s][A
Iteration:  34%|███▎      | 111/329 [00:32<01:05,  3.34it/s][A
Iteration:  34%|███▍      | 112/329 [00:32<01:03,  3.40it/s][A
Iteration:  34%|███▍      | 113/329 [00:3

2023/08/08 07:17:51 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 128 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [05:59<01:46, 106.25s/it]17it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [05:59<01:46, 106.25s/it]17it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [05:59<01:46, 106.25s/it]17it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [05:59<01:46, 106.25s/it]17it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [05:59<01:46, 106.25s/it]17it/s][A

2023/08/08 07:17:55 - Cosine-Similarity :	Pearson: 0.9639	Spearman: 0.9260
2023/08/08 07:17:55 - Manhattan-Distance:	Pearson: 0.9576	Spearman: 0.9242
2023/08/08 07:17:55 - Euclidean-Distance:	Pearson: 0.9575	Spearman: 0.9238
2023/08/08 07:17:55 - Dot-Product-Similarity:	Pearson: 0.9562	Spearman: 0.9120
2023/08/08 07:17:55 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  39%|███▉      | 128/329 [00:40<01:32,  2.18it/s][A
Iteration:  39%|███▉      | 129/329 [00:40<01:29,  2.24it/s][A
Iteration:  40%|███▉      | 130/329 [00:40<01:26,  2.31it/s][A
Iteration:  40%|███▉      | 131/329 [00:40<01:23,  2.37it/s][A
Iteration:  40%|████      | 132/329 [00:40<01:20,  2.44it/s][A
Iteration:  40%|████      | 133/329 [00:41<01:18,  2.50it/s][A
Iteration:  41%|████      | 134/329 [00:41<01:15,  2.57it/s][A
Iteration:  41%|████      | 135/329 [00:41<01:13,  2.64it/s][A
Iteration:  41%|████▏     | 136/329 [00:41<01:11,  2.72it/s][A
Iteration:  42%|████▏     | 137/329 [00:41<01:09,  2.77it/s][A
Iteration:  42%|████▏     | 138/329 [00:42<01:07,  2.85it/s][A
Iteration:  42%|████▏     | 139/329 [00:42<01:05,  2.92it/s][A
Iteration:  43%|████▎     | 140/329 [00:42<01:03,  2.97it/s][A
Iteration:  43%|████▎     | 141/329 [00:42<01:01,  3.04it/s][A
Iteration:  43%|████▎     | 142/329 [00:42<00:59,  3.12it/s][A
Iteration:  43%|████▎     | 143/329 [00

2023/08/08 07:18:02 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 160 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:09<01:46, 106.25s/it]12it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:09<01:46, 106.25s/it]12it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:09<01:46, 106.25s/it]12it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:09<01:46, 106.25s/it]12it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:09<01:46, 106.25s/it]12it/s][A

2023/08/08 07:18:06 - Cosine-Similarity :	Pearson: 0.9638	Spearman: 0.9262
2023/08/08 07:18:06 - Manhattan-Distance:	Pearson: 0.9577	Spearman: 0.9246
2023/08/08 07:18:06 - Euclidean-Distance:	Pearson: 0.9576	Spearman: 0.9240
2023/08/08 07:18:06 - Dot-Product-Similarity:	Pearson: 0.9560	Spearman: 0.9119
2023/08/08 07:18:06 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  49%|████▊     | 160/329 [00:50<01:18,  2.16it/s][A
Iteration:  49%|████▉     | 161/329 [00:50<01:15,  2.23it/s][A
Iteration:  49%|████▉     | 162/329 [00:50<01:13,  2.29it/s][A
Iteration:  50%|████▉     | 163/329 [00:51<01:10,  2.35it/s][A
Iteration:  50%|████▉     | 164/329 [00:51<01:08,  2.42it/s][A
Iteration:  50%|█████     | 165/329 [00:51<01:05,  2.49it/s][A
Iteration:  50%|█████     | 166/329 [00:51<01:03,  2.55it/s][A
Iteration:  51%|█████     | 167/329 [00:51<01:01,  2.62it/s][A
Iteration:  51%|█████     | 168/329 [00:52<00:59,  2.70it/s][A
Iteration:  51%|█████▏    | 169/329 [00:52<00:57,  2.77it/s][A
Iteration:  52%|█████▏    | 170/329 [00:52<00:56,  2.81it/s][A
Iteration:  52%|█████▏    | 171/329 [00:52<00:54,  2.88it/s][A
Iteration:  52%|█████▏    | 172/329 [00:52<00:53,  2.96it/s][A
Iteration:  53%|█████▎    | 173/329 [00:53<00:51,  3.03it/s][A
Iteration:  53%|█████▎    | 174/329 [00:53<00:50,  3.09it/s][A
Iteration:  53%|█████▎    | 175/329 [00

2023/08/08 07:18:13 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 192 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:20<01:46, 106.25s/it]01it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:20<01:46, 106.25s/it]01it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:20<01:46, 106.25s/it]01it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:20<01:46, 106.25s/it]01it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:20<01:46, 106.25s/it]01it/s][A

2023/08/08 07:18:16 - Cosine-Similarity :	Pearson: 0.9637	Spearman: 0.9264
2023/08/08 07:18:16 - Manhattan-Distance:	Pearson: 0.9579	Spearman: 0.9245
2023/08/08 07:18:16 - Euclidean-Distance:	Pearson: 0.9577	Spearman: 0.9240
2023/08/08 07:18:16 - Dot-Product-Similarity:	Pearson: 0.9557	Spearman: 0.9114
2023/08/08 07:18:16 - Save model to output/training_sts_by_Softmaxlossklue-roberta-base-2023-08-08_07-08-05



Iteration:  58%|█████▊    | 192/329 [01:01<01:03,  2.15it/s][A
Iteration:  59%|█████▊    | 193/329 [01:01<01:01,  2.22it/s][A
Iteration:  59%|█████▉    | 194/329 [01:01<00:58,  2.29it/s][A
Iteration:  59%|█████▉    | 195/329 [01:01<00:56,  2.36it/s][A
Iteration:  60%|█████▉    | 196/329 [01:01<00:54,  2.42it/s][A
Iteration:  60%|█████▉    | 197/329 [01:02<00:52,  2.49it/s][A
Iteration:  60%|██████    | 198/329 [01:02<00:51,  2.56it/s][A
Iteration:  60%|██████    | 199/329 [01:02<00:49,  2.63it/s][A
Iteration:  61%|██████    | 200/329 [01:02<00:47,  2.70it/s][A
Iteration:  61%|██████    | 201/329 [01:02<00:46,  2.78it/s][A
Iteration:  61%|██████▏   | 202/329 [01:02<00:44,  2.85it/s][A
Iteration:  62%|██████▏   | 203/329 [01:03<00:43,  2.92it/s][A
Iteration:  62%|██████▏   | 204/329 [01:03<00:41,  3.00it/s][A
Iteration:  62%|██████▏   | 205/329 [01:03<00:40,  3.07it/s][A
Iteration:  63%|██████▎   | 206/329 [01:03<00:38,  3.15it/s][A
Iteration:  63%|██████▎   | 207/329 [01

2023/08/08 07:18:23 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 224 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:30<01:46, 106.25s/it]13it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:30<01:46, 106.25s/it]13it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:30<01:46, 106.25s/it]13it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:30<01:46, 106.25s/it]13it/s][A
Iteration:  68%|██████▊   | 224/329 [01:10<00:44,  2.34it/s][A

2023/08/08 07:18:27 - Cosine-Similarity :	Pearson: 0.9638	Spearman: 0.9261
2023/08/08 07:18:27 - Manhattan-Distance:	Pearson: 0.9573	Spearman: 0.9239
2023/08/08 07:18:27 - Euclidean-Distance:	Pearson: 0.9573	Spearman: 0.9234
2023/08/08 07:18:27 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9112



Iteration:  68%|██████▊   | 225/329 [01:11<00:43,  2.40it/s][A
Iteration:  69%|██████▊   | 226/329 [01:11<00:41,  2.46it/s][A
Iteration:  69%|██████▉   | 227/329 [01:11<00:40,  2.54it/s][A
Iteration:  69%|██████▉   | 228/329 [01:11<00:38,  2.61it/s][A
Iteration:  70%|██████▉   | 229/329 [01:11<00:37,  2.66it/s][A
Iteration:  70%|██████▉   | 230/329 [01:11<00:36,  2.73it/s][A
Iteration:  70%|███████   | 231/329 [01:12<00:35,  2.79it/s][A
Iteration:  71%|███████   | 232/329 [01:12<00:34,  2.84it/s][A
Iteration:  71%|███████   | 233/329 [01:12<00:32,  2.92it/s][A
Iteration:  71%|███████   | 234/329 [01:12<00:31,  2.99it/s][A
Iteration:  71%|███████▏  | 235/329 [01:12<00:30,  3.08it/s][A
Iteration:  72%|███████▏  | 236/329 [01:13<00:29,  3.16it/s][A
Iteration:  72%|███████▏  | 237/329 [01:13<00:28,  3.21it/s][A
Iteration:  72%|███████▏  | 238/329 [01:13<00:27,  3.29it/s][A
Iteration:  73%|███████▎  | 239/329 [01:13<00:26,  3.35it/s][A
Iteration:  73%|███████▎  | 240/329 [01

2023/08/08 07:18:33 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 256 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:40<01:46, 106.25s/it]22it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:40<01:46, 106.25s/it]22it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:40<01:46, 106.25s/it]22it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:40<01:46, 106.25s/it]22it/s][A
Iteration:  78%|███████▊  | 256/329 [01:20<00:30,  2.37it/s][A

2023/08/08 07:18:36 - Cosine-Similarity :	Pearson: 0.9636	Spearman: 0.9257
2023/08/08 07:18:36 - Manhattan-Distance:	Pearson: 0.9571	Spearman: 0.9236
2023/08/08 07:18:36 - Euclidean-Distance:	Pearson: 0.9571	Spearman: 0.9231
2023/08/08 07:18:36 - Dot-Product-Similarity:	Pearson: 0.9555	Spearman: 0.9109



Iteration:  78%|███████▊  | 257/329 [01:20<00:29,  2.43it/s][A
Iteration:  78%|███████▊  | 258/329 [01:20<00:28,  2.49it/s][A
Iteration:  79%|███████▊  | 259/329 [01:21<00:27,  2.57it/s][A
Iteration:  79%|███████▉  | 260/329 [01:21<00:26,  2.64it/s][A
Iteration:  79%|███████▉  | 261/329 [01:21<00:25,  2.70it/s][A
Iteration:  80%|███████▉  | 262/329 [01:21<00:24,  2.76it/s][A
Iteration:  80%|███████▉  | 263/329 [01:21<00:23,  2.84it/s][A
Iteration:  80%|████████  | 264/329 [01:22<00:22,  2.92it/s][A
Iteration:  81%|████████  | 265/329 [01:22<00:21,  2.99it/s][A
Iteration:  81%|████████  | 266/329 [01:22<00:20,  3.05it/s][A
Iteration:  81%|████████  | 267/329 [01:22<00:19,  3.11it/s][A
Iteration:  81%|████████▏ | 268/329 [01:22<00:19,  3.18it/s][A
Iteration:  82%|████████▏ | 269/329 [01:23<00:18,  3.25it/s][A
Iteration:  82%|████████▏ | 270/329 [01:23<00:17,  3.31it/s][A
Iteration:  82%|████████▏ | 271/329 [01:23<00:17,  3.37it/s][A
Iteration:  83%|████████▎ | 272/329 [01

2023/08/08 07:18:42 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 288 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:49<01:46, 106.25s/it]25it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:49<01:46, 106.25s/it]25it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:49<01:46, 106.25s/it]25it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:49<01:46, 106.25s/it]25it/s][A
Iteration:  88%|████████▊ | 288/329 [01:30<00:17,  2.40it/s][A

2023/08/08 07:18:46 - Cosine-Similarity :	Pearson: 0.9637	Spearman: 0.9257
2023/08/08 07:18:46 - Manhattan-Distance:	Pearson: 0.9574	Spearman: 0.9237
2023/08/08 07:18:46 - Euclidean-Distance:	Pearson: 0.9573	Spearman: 0.9233
2023/08/08 07:18:46 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9109



Iteration:  88%|████████▊ | 289/329 [01:30<00:16,  2.47it/s][A
Iteration:  88%|████████▊ | 290/329 [01:30<00:15,  2.54it/s][A
Iteration:  88%|████████▊ | 291/329 [01:30<00:14,  2.61it/s][A
Iteration:  89%|████████▉ | 292/329 [01:31<00:13,  2.66it/s][A
Iteration:  89%|████████▉ | 293/329 [01:31<00:13,  2.74it/s][A
Iteration:  89%|████████▉ | 294/329 [01:31<00:12,  2.80it/s][A
Iteration:  90%|████████▉ | 295/329 [01:31<00:11,  2.86it/s][A
Iteration:  90%|████████▉ | 296/329 [01:31<00:11,  2.94it/s][A
Iteration:  90%|█████████ | 297/329 [01:32<00:10,  3.00it/s][A
Iteration:  91%|█████████ | 298/329 [01:32<00:10,  3.07it/s][A
Iteration:  91%|█████████ | 299/329 [01:32<00:09,  3.14it/s][A
Iteration:  91%|█████████ | 300/329 [01:32<00:08,  3.23it/s][A
Iteration:  91%|█████████▏| 301/329 [01:32<00:08,  3.28it/s][A
Iteration:  92%|█████████▏| 302/329 [01:32<00:08,  3.35it/s][A
Iteration:  92%|█████████▏| 303/329 [01:33<00:07,  3.39it/s][A
Iteration:  92%|█████████▏| 304/329 [01

2023/08/08 07:18:52 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 3 after 320 steps:



                                                            
Epoch:  75%|███████▌  | 3/4 [06:59<01:46, 106.25s/it]27it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:59<01:46, 106.25s/it]27it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:59<01:46, 106.25s/it]27it/s][A
                                                            
Epoch:  75%|███████▌  | 3/4 [06:59<01:46, 106.25s/it]27it/s][A
Iteration:  97%|█████████▋| 320/329 [01:40<00:03,  2.38it/s][A

2023/08/08 07:18:56 - Cosine-Similarity :	Pearson: 0.9637	Spearman: 0.9258
2023/08/08 07:18:56 - Manhattan-Distance:	Pearson: 0.9575	Spearman: 0.9239
2023/08/08 07:18:56 - Euclidean-Distance:	Pearson: 0.9574	Spearman: 0.9235
2023/08/08 07:18:56 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9110



Iteration:  98%|█████████▊| 321/329 [01:40<00:03,  2.44it/s][A
Iteration:  98%|█████████▊| 322/329 [01:40<00:02,  2.51it/s][A
Iteration:  98%|█████████▊| 323/329 [01:40<00:02,  2.58it/s][A
Iteration:  98%|█████████▊| 324/329 [01:40<00:01,  2.65it/s][A
Iteration:  99%|█████████▉| 325/329 [01:40<00:01,  2.72it/s][A
Iteration:  99%|█████████▉| 326/329 [01:41<00:01,  2.80it/s][A
Iteration:  99%|█████████▉| 327/329 [01:41<00:00,  2.88it/s][A
Iteration: 100%|█████████▉| 328/329 [01:41<00:00,  2.95it/s][A
Iteration: 100%|██████████| 329/329 [01:41<00:00,  3.24it/s][A
Epoch:  75%|███████▌  | 3/4 [07:01<01:46, 106.25s/it]

2023/08/08 07:18:57 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 3:


Epoch: 100%|██████████| 4/4 [07:05<00:00, 106.26s/it]

2023/08/08 07:19:01 - Cosine-Similarity :	Pearson: 0.9637	Spearman: 0.9258
2023/08/08 07:19:01 - Manhattan-Distance:	Pearson: 0.9575	Spearman: 0.9239
2023/08/08 07:19:01 - Euclidean-Distance:	Pearson: 0.9574	Spearman: 0.9235
2023/08/08 07:19:01 - Dot-Product-Similarity:	Pearson: 0.9556	Spearman: 0.9110





In [18]:
# evaluation sts-test
test_evaluator(model, output_path=sts_model_save_path)

2023/08/08 07:19:01 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2023/08/08 07:19:03 - Cosine-Similarity :	Pearson: 0.8838	Spearman: 0.8855
2023/08/08 07:19:03 - Manhattan-Distance:	Pearson: 0.8849	Spearman: 0.8817
2023/08/08 07:19:03 - Euclidean-Distance:	Pearson: 0.8847	Spearman: 0.8816
2023/08/08 07:19:03 - Dot-Product-Similarity:	Pearson: 0.8698	Spearman: 0.8679


0.8855367274673944

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr

In [20]:
import json
import pandas as pd

dset = []

with open("cbf_track_names.json") as fp:
    for l in fp:
        d = json.loads(l)
        dset.append(d)

print(dset[0])
df = pd.DataFrame(dset)

{'seed_track_nm': 'Seasons (Feat. Harley Bird) (Futuristik & Whogaux Remix)', 'seed_track_artist_nm_list': ['Cadmium', 'Rival'], 'similar_track_nm': 'Seasons (Futuristik & Whogaux Remix)', 'similar_track_artist_nm_list': ['Cadmium', 'Harley Bird', 'Rival'], 'seed_track_nm_rnm': 'seasons', 'similar_track_nm_rnm': 'seasons'}


In [2]:
import numpy as np
import pandas as pd
file1 = pd.read_csv('cbf_track.csv')
file1 = file1.set_index(np.arange(len(file1)))

In [5]:
file1['artist_same'] = 0
for i in range(len(file1)):
    if file1['track_artist'][i] == file1['similar_track_artist'][i]:
        file1['artist_same'][i]=1

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [11]:
file1.to_csv('cbf_track.csv')

In [72]:
file1 = pd.read_parquet('rep_track.parquet')
file1=file1.set_index(np.arange(len(file1)))
file1['merge_key'] = file1['track_nm_notbracspace']+file1['artist_ids']
file1['similar_merge_key'] = file1['similar_track_nm_notbracspace']+file1['similar_artist_ids']

In [7]:
new_df = pd.DataFrame({'sentence1':file1['merge_key'], 'sentence2':file1['similar_merge_key'],'label':0})

In [8]:
sentence1 = []
sentence2 = []
labels = []
for i in range(len(new_df)):
    sentence1.append(new_df['sentence1'][i])
    sentence2.append(new_df['sentence2'][i])
    labels.append(new_df['label'][i])

In [9]:
model = SentenceTransformer(sts_model_save_path)

corpus_embeddings = model.encode(sentence1, convert_to_tensor=True) # senetence1 유사도
query_embeddings = model.encode(sentence2, convert_to_tensor=True) # sentence2 유사도

def cosine_similarity_manual(x, y, small_number=1e-8): # sentence1과 sentence2의 임베딩값으로 유사도 계산
    result =  torch.dot(x, y) / (torch.linalg.norm(x) * torch.linalg.norm(y) + small_number)
    return result

test_scores = []
for i in range(len(sentence1)):
    score = cosine_similarity_manual(corpus_embeddings[i],query_embeddings[i])
    score=score.cpu().detach().numpy()
    test_scores.append(score)

test_scores = np.array(test_scores) # 모델 예측값
y_pred = np.where(test_scores>=0.6, 1, 0) # klue에서 3.0을 기준으로 binary label을 만들었기에, normalize 기준 threshold: 0.6
labels = np.array(labels)
y_label = np.where(labels >= 0.6, 1, 0)

NameError: name 'SentenceTransformer' is not defined

In [76]:
corpus_embeddings = corpus_embeddings.cpu().detach().numpy()
query_embeddings = query_embeddings.cpu().detach().numpy()

cosine_scores = 1 - (paired_cosine_distances(corpus_embeddings, query_embeddings))
manhattan_distances = -paired_manhattan_distances(corpus_embeddings, query_embeddings)
euclidean_distances = -paired_euclidean_distances(corpus_embeddings, query_embeddings)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(corpus_embeddings, query_embeddings)]

In [77]:
new_df['sentence_bert_notblac_label']= cosine_scores

In [79]:
new_df['sentence_bert_notblac_label'].describe()

count    902099.000000
mean          0.999852
std           0.006305
min           0.046246
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: sentence_bert_notblac_label, dtype: float64

In [71]:
len(new_df[new_df['sentence_bert_notblac_label']<=0.6])

6596

In [80]:
len(new_df[new_df['sentence_bert_notblac_label']<1])

756