Sentence similarity
Dataset: BIOSSES (an STS dataset, loaded from `tabilab/biosses`)

In [None]:
import torch
from datasets import load_dataset, Dataset
import nltk
import re
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses

from nltk.corpus import words
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
nltk.download('words', quiet=True)
nltk.download('wordnet', quiet=True)
# vocab = set(words.words())
from nltk.stem import *

from preprocessing import clean_sentence, vocab, clean_dataset

In [None]:
# Load the `tabilab/biosses` dataset with `datasets.load_dataset`; the cells
# below read its 'train' split.
ds = load_dataset("tabilab/biosses")

In [None]:
# Two-stage split with a fixed seed so the partitions are reproducible:
# first hold out 20% of the data as the test set ...
train_test = ds['train'].train_test_split(test_size=0.2, seed=42)
test_set = train_test['test']

# ... then carve 20% of the remaining rows off as the validation set.
train_validate = train_test['train'].train_test_split(test_size=0.2, seed=42)
train_set, validate_set = train_validate['train'], train_validate['test']

In [None]:
# Porter stemmer instance; used in the next cell to stem the NLTK `words` corpus.
stemmer = PorterStemmer()

In [None]:
# Stem every entry of the NLTK `words` corpus and collect the distinct stems;
# this set is what gets passed to `clean_dataset` as the vocabulary.
stemmed_words = {stemmer.stem(entry) for entry in words.words()}

In [None]:
# Run `clean_dataset` (imported from the local `preprocessing` module) on each
# split with the stemmed vocabulary. It returns a pair; the second element is
# kept only for the training split (as `unknown_words`) and discarded otherwise.
cleaned_train, unknown_words = clean_dataset(train_set, stemmed_words)
cleaned_validation, _ = clean_dataset(validate_set, stemmed_words)
cleaned_test, _ = clean_dataset(test_set, stemmed_words)

In [None]:
# Summarise the `unknown_words` collection returned by `clean_dataset` for the
# training split: the count first, then a sorted preview. Sorting makes the
# output stable across kernel restarts (iteration order of a set of strings is
# not), and the slice keeps the cell output short instead of dumping everything.
print(len(unknown_words))
print(sorted(unknown_words)[:50])

{'pRb', 'Arp4-related', 'miR-16−1', 'Transfection', 'RIP3', 'et', 'non-neural', '3', 'PPP', 'OSCC', 'downregulated', '22]', '(Mirus)', 'AML', 'ESCs', 'up-regulation', '95%', 'receptor-interacting', 'ligase', 'upregulation', 'upregulated', 'protein-α', 'rate-limiting', 'miR-24', 'NH2-terminal', 'hr', 'Kras-driven', 'let-7)', 'miR-223', 'IL-1', '1', 'CCAAT/enhancer', 'MiR-155', 'S2)', 'TFs', 'Sox11', 'SOX2', 'review)', 'Aurora-A', '[18]', 'IDH1/IDH2', 'signal-regulated', 'TET', 'proteasome', 'RNAi-mediated', 'I/A', 'miR-34a', '(GAPs)', 'GTPase', 'miR-143', 'hWts2', 'MiR-223', '(BRG-associated', 'microtubule-nucleating', '(GEFs)', '“cyclin', 'interest;', '(RIP', 'IDH1', 'RNAi', 'miR-204', 'TEL-AML1-positive', 'Arp', 'ductal', 'Up-regulation', 'OCT4', 'dose-dependent', 'GATA6', 'HeLa', '<20%', 'ubiquitin', '(together', 'TargetScan', 'RXXL', 'oncogene', 'Co-transfection', 'EC', 'Neuro-2a', 'qRT-PCR', 'RIP1', 'miR-146b', 'large-scale', '7.4-fold', 'EGFR', '43', 'K-Ras-driven', 'BCL-XL/MEK', 

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before the model
# is loaded in the next cell.
torch.cuda.empty_cache()

In [None]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.losses import CoSENTLoss

# Load a model to train/finetune.
# Use the GPU when torch can see one and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Initialize the CoSENTLoss
# This loss requires pairs of text and a float similarity score as a label
loss = CoSENTLoss(model)

In [None]:
# Pull the two sentence columns out of the cleaned training and validation splits.
train_s1, train_s2 = cleaned_train['sentence1'], cleaned_train['sentence2']
validation_s1, validation_s2 = (
    cleaned_validation['sentence1'],
    cleaned_validation['sentence2'],
)

In [None]:
# train_emb1 = model.encode(train_s1, normalize_embeddings=True)
# train_emb2 = model.encode(train_s2, normalize_embeddings=True)
# validation_emb1 = model.encode(validation_s1, normalize_embeddings=True)
# validation_emb2 = model.encode(validation_s2, normalize_embeddings=True)

In [None]:
# validation_similarities = model.similarity_pairwise(validation_emb1, validation_emb2)
# print(validation_similarities)
# Reference similarity scores for the validation split (its 'score' column).
ground_truth_validation = cleaned_validation["score"]
# print(ground_truth_validation)
# print(loss.forward(
#     {"sentence1": cleaned_validation["sentence1"],
#      "sentence2": cleaned_validation["sentence2"]}, 
#     ground_truth_validation))

In [None]:
# Compare the model's pairwise similarities on the cleaned test split with the
# reference scores. This cell sits before the training cell, so when run in
# order it reflects the model as loaded above.
ground_truth_test = cleaned_test["score"]

# Encode both sentence columns (normalized embeddings) and score them pair by pair.
test_similarities = model.similarity_pairwise(
    *(
        model.encode(cleaned_test[column], normalize_embeddings=True)
        for column in ('sentence1', 'sentence2')
    )
)

print(test_similarities)
print(ground_truth_test)

tensor([0.6227, 0.7996, 0.4542, 0.7373, 0.5717, 0.9421, 0.4272, 0.3671, 0.2034,
        0.7460, 0.5736, 0.5206, 0.8329, 0.6441, 0.5521, 0.7422, 0.3377, 0.7593,
        0.4291, 0.3967])
[0.20000004768371582, 0.7999999523162842, -0.30000001192092896, 0.6000000238418579, 0.5, 1.0, 0.0, -0.30000001192092896, -0.10000002384185791, 0.7000000476837158, 0.0, 0.0, 0.8999999761581421, 0.7000000476837158, 0.7999999523162842, 0.5, 0.20000004768371582, 0.6000000238418579, 0.20000004768371582, -0.30000001192092896]


  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
# Training configuration for the fine-tuning run.
# NOTE(review): the saved trainer output further down ends at epoch 100.0 while
# this cell asks for 300 epochs — confirm which epoch count is intended.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/all-MiniLM-L6-v2",
    # Optional training parameters:
    num_train_epochs=300,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=100,
    learning_rate=1e-6,
    warmup_ratio=0.1,
    # Only request FP16 mixed precision when a CUDA device is available, so the
    # cell still runs on CPU-only machines. Set to False if you get an error
    # that your GPU can't run on FP16.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    # Step values below 1 are given as a fraction of the total number of
    # training steps, so evaluating, saving and logging all happen every 5%.
    eval_strategy="steps",
    eval_steps=0.05,
    save_strategy="steps",
    save_steps=0.05,
    save_total_limit=3,
    load_best_model_at_end=True,
    logging_steps=0.05,
    run_name='all-MiniLM-L6-v2',  # Will be used in W&B if `wandb` is installed
)

In [None]:
# Evaluator over the cleaned validation split; its metrics are reported under
# the "BIOSSES_validate" prefix.
dev_evaluator = EmbeddingSimilarityEvaluator(
    name="BIOSSES_validate",
    scores=cleaned_validation["score"],
    sentences1=cleaned_validation["sentence1"],
    sentences2=cleaned_validation["sentence2"],
)

# Same evaluator type over the cleaned test split ("BIOSSES_test" prefix).
test_evaluator = EmbeddingSimilarityEvaluator(
    name="BIOSSES_test",
    scores=cleaned_test["score"],
    sentences1=cleaned_test["sentence1"],
    sentences2=cleaned_test["sentence2"],
)

# Score the current model on the validation split; shown in the next cell.
results = dev_evaluator(model)

In [None]:
# Display the validation metrics computed by `dev_evaluator` in the previous cell.
results

{'BIOSSES_validate_pearson_cosine': 0.8034244246101594,
 'BIOSSES_validate_spearman_cosine': 0.7961717622928737}

In [None]:
# Fine-tune `model` on the cleaned training split using the loss and training
# arguments configured above. The cleaned validation split is passed as the
# eval dataset, and `dev_evaluator` is attached so the BIOSSES_validate_*
# metrics are reported at each evaluation.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=cleaned_train,
    eval_dataset=cleaned_validation,
    evaluator=dev_evaluator,
)
trainer.train()

  0%|          | 0/1600 [00:00<?, ?it/s]

{'loss': 1.7895, 'grad_norm': 132.9878387451172, 'learning_rate': 5e-07, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.377745628356934, 'eval_BIOSSES_validate_pearson_cosine': 0.8051602243791965, 'eval_BIOSSES_validate_spearman_cosine': 0.784310730452384, 'eval_runtime': 0.1367, 'eval_samples_per_second': 117.028, 'eval_steps_per_second': 7.314, 'epoch': 5.0}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 2.0507, 'grad_norm': 62.082393646240234, 'learning_rate': 1e-06, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 7.120354175567627, 'eval_BIOSSES_validate_pearson_cosine': 0.8122595182515597, 'eval_BIOSSES_validate_spearman_cosine': 0.7887586173925677, 'eval_runtime': 0.1869, 'eval_samples_per_second': 85.612, 'eval_steps_per_second': 5.351, 'epoch': 10.0}
{'loss': 1.6099, 'grad_norm': 123.14497375488281, 'learning_rate': 9.444444444444444e-07, 'epoch': 15.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.738901138305664, 'eval_BIOSSES_validate_pearson_cosine': 0.821820646531903, 'eval_BIOSSES_validate_spearman_cosine': 0.8273069708741594, 'eval_runtime': 0.1121, 'eval_samples_per_second': 142.724, 'eval_steps_per_second': 8.92, 'epoch': 15.0}
{'loss': 1.3293, 'grad_norm': 126.5667724609375, 'learning_rate': 8.888888888888888e-07, 'epoch': 20.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.439163684844971, 'eval_BIOSSES_validate_pearson_cosine': 0.829804580517085, 'eval_BIOSSES_validate_spearman_cosine': 0.8391680027146491, 'eval_runtime': 0.1552, 'eval_samples_per_second': 103.087, 'eval_steps_per_second': 6.443, 'epoch': 20.0}
{'loss': 1.1596, 'grad_norm': 7.657847881317139, 'learning_rate': 8.333333333333333e-07, 'epoch': 25.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.25223970413208, 'eval_BIOSSES_validate_pearson_cosine': 0.8346190147114352, 'eval_BIOSSES_validate_spearman_cosine': 0.8347201157744654, 'eval_runtime': 0.1207, 'eval_samples_per_second': 132.539, 'eval_steps_per_second': 8.284, 'epoch': 25.0}
{'loss': 0.9193, 'grad_norm': 114.37818145751953, 'learning_rate': 7.777777777777778e-07, 'epoch': 30.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.134754180908203, 'eval_BIOSSES_validate_pearson_cosine': 0.8385801058493763, 'eval_BIOSSES_validate_spearman_cosine': 0.8347201157744654, 'eval_runtime': 0.1182, 'eval_samples_per_second': 135.396, 'eval_steps_per_second': 8.462, 'epoch': 30.0}
{'loss': 0.9124, 'grad_norm': 53.17500305175781, 'learning_rate': 7.222222222222221e-07, 'epoch': 35.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.028105735778809, 'eval_BIOSSES_validate_pearson_cosine': 0.8417979818569801, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1869, 'eval_samples_per_second': 85.602, 'eval_steps_per_second': 5.35, 'epoch': 35.0}
{'loss': 0.7661, 'grad_norm': 39.26213455200195, 'learning_rate': 6.666666666666666e-07, 'epoch': 40.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.011462211608887, 'eval_BIOSSES_validate_pearson_cosine': 0.8428962354566085, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.2282, 'eval_samples_per_second': 70.119, 'eval_steps_per_second': 4.382, 'epoch': 40.0}
{'loss': 0.6676, 'grad_norm': 103.1226577758789, 'learning_rate': 6.111111111111112e-07, 'epoch': 45.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 6.015958786010742, 'eval_BIOSSES_validate_pearson_cosine': 0.8438237390263221, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1379, 'eval_samples_per_second': 116.017, 'eval_steps_per_second': 7.251, 'epoch': 45.0}
{'loss': 0.6737, 'grad_norm': 77.2341537475586, 'learning_rate': 5.555555555555555e-07, 'epoch': 50.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.991096496582031, 'eval_BIOSSES_validate_pearson_cosine': 0.8457008004157961, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1604, 'eval_samples_per_second': 99.775, 'eval_steps_per_second': 6.236, 'epoch': 50.0}
{'loss': 0.7249, 'grad_norm': 48.72550964355469, 'learning_rate': 5e-07, 'epoch': 55.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.932181358337402, 'eval_BIOSSES_validate_pearson_cosine': 0.8474433752641244, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1161, 'eval_samples_per_second': 137.871, 'eval_steps_per_second': 8.617, 'epoch': 55.0}
{'loss': 0.5621, 'grad_norm': 20.066314697265625, 'learning_rate': 4.444444444444444e-07, 'epoch': 60.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.922840595245361, 'eval_BIOSSES_validate_pearson_cosine': 0.8478882233576817, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1034, 'eval_samples_per_second': 154.714, 'eval_steps_per_second': 9.67, 'epoch': 60.0}
{'loss': 0.5654, 'grad_norm': 50.50199508666992, 'learning_rate': 3.888888888888889e-07, 'epoch': 65.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.864381790161133, 'eval_BIOSSES_validate_pearson_cosine': 0.849756488631928, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1229, 'eval_samples_per_second': 130.14, 'eval_steps_per_second': 8.134, 'epoch': 65.0}
{'loss': 0.4994, 'grad_norm': 130.8285675048828, 'learning_rate': 3.333333333333333e-07, 'epoch': 70.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.857019424438477, 'eval_BIOSSES_validate_pearson_cosine': 0.8501359131079611, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.2271, 'eval_samples_per_second': 70.449, 'eval_steps_per_second': 4.403, 'epoch': 70.0}
{'loss': 0.5415, 'grad_norm': 45.098487854003906, 'learning_rate': 2.7777777777777776e-07, 'epoch': 75.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.853856563568115, 'eval_BIOSSES_validate_pearson_cosine': 0.8503467718783346, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1017, 'eval_samples_per_second': 157.266, 'eval_steps_per_second': 9.829, 'epoch': 75.0}
{'loss': 0.4312, 'grad_norm': 26.051889419555664, 'learning_rate': 2.222222222222222e-07, 'epoch': 80.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.844217777252197, 'eval_BIOSSES_validate_pearson_cosine': 0.8510057677066618, 'eval_BIOSSES_validate_spearman_cosine': 0.8569595504753837, 'eval_runtime': 0.1083, 'eval_samples_per_second': 147.787, 'eval_steps_per_second': 9.237, 'epoch': 80.0}
{'loss': 0.5843, 'grad_norm': 8.837791442871094, 'learning_rate': 1.6666666666666665e-07, 'epoch': 85.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.846545219421387, 'eval_BIOSSES_validate_pearson_cosine': 0.8510002730467289, 'eval_BIOSSES_validate_spearman_cosine': 0.8465811476149551, 'eval_runtime': 0.15, 'eval_samples_per_second': 106.649, 'eval_steps_per_second': 6.666, 'epoch': 85.0}
{'loss': 0.5052, 'grad_norm': 85.35380554199219, 'learning_rate': 1.111111111111111e-07, 'epoch': 90.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.84234619140625, 'eval_BIOSSES_validate_pearson_cosine': 0.851320818434129, 'eval_BIOSSES_validate_spearman_cosine': 0.8465811476149551, 'eval_runtime': 0.115, 'eval_samples_per_second': 139.181, 'eval_steps_per_second': 8.699, 'epoch': 90.0}
{'loss': 0.4542, 'grad_norm': 2.0444140434265137, 'learning_rate': 5.555555555555555e-08, 'epoch': 95.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.833976745605469, 'eval_BIOSSES_validate_pearson_cosine': 0.8515467225051898, 'eval_BIOSSES_validate_spearman_cosine': 0.8465811476149551, 'eval_runtime': 0.1328, 'eval_samples_per_second': 120.503, 'eval_steps_per_second': 7.531, 'epoch': 95.0}
{'loss': 0.4925, 'grad_norm': 11.6255464553833, 'learning_rate': 0.0, 'epoch': 100.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 5.8351335525512695, 'eval_BIOSSES_validate_pearson_cosine': 0.851519633019667, 'eval_BIOSSES_validate_spearman_cosine': 0.8465811476149551, 'eval_runtime': 0.2451, 'eval_samples_per_second': 65.28, 'eval_steps_per_second': 4.08, 'epoch': 100.0}
{'train_runtime': 174.7099, 'train_samples_per_second': 36.632, 'train_steps_per_second': 9.158, 'train_loss': 0.8619360375404358, 'epoch': 100.0}


TrainOutput(global_step=1600, training_loss=0.8619360375404358, metrics={'train_runtime': 174.7099, 'train_samples_per_second': 36.632, 'train_steps_per_second': 9.158, 'total_flos': 0.0, 'train_loss': 0.8619360375404358, 'epoch': 100.0})

In [None]:
# Score the model on the held-out test split with `test_evaluator` and display
# the resulting metrics.
test_results = test_evaluator(model)
test_results

{'BIOSSES_test_pearson_cosine': 0.8518288612627938,
 'BIOSSES_test_spearman_cosine': 0.8237669743411928}

In [None]:
# Save the current model to a "final" subfolder under the training output directory.
model.save_pretrained("models/all-MiniLM-L6-v2/final")

In [None]:
# Display the cleaned training split (its features and row count).
cleaned_train

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 64
})

In [None]:
def evaluate_model(model_, test_set_, name="BIOSSES_test"):
    """Score a model on a sentence-pair dataset with an EmbeddingSimilarityEvaluator.

    Args:
        model_: The model to evaluate. It is passed to the evaluator as-is.
        test_set_: Dataset (or mapping) exposing the "sentence1", "sentence2"
            and "score" columns.
        name: Evaluator name. In the outputs saved in this notebook it shows up
            as the prefix of the metric keys (e.g. "BIOSSES_test_pearson_cosine").

    Returns:
        Whatever the evaluator returns when called on ``model_``.
    """
    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=test_set_["sentence1"],
        sentences2=test_set_["sentence2"],
        scores=test_set_["score"],
        name=name,
    )
    # Evaluate the model that was passed in, not the notebook-level `model`.
    return evaluator(model_)

In [None]:
# Print the Pearson cosine correlation for the test, train and validation
# splits, in that order. `evaluate_model` is called with its default name, so
# the metric key carries the "BIOSSES_test" prefix for every split.
for split in (cleaned_test, cleaned_train, cleaned_validation):
    print(evaluate_model(model, split)['BIOSSES_test_pearson_cosine'])

0.8518288612627938
0.9721099006701728
0.8515467225051898
