Sentence similarity
Dataset: STS

In [1]:
import torch
from datasets import load_dataset, Dataset
import nltk
import re
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses

from nltk.corpus import words
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
nltk.download('words', quiet=True)
nltk.download('wordnet', quiet=True)
# vocab = set(words.words())
from nltk.stem import *

from preprocessing import clean_sentence, vocab, clean_dataset

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olyat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\olyat\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olyat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from transformers.integrations import TensorBoardCallback

In [3]:
ds = load_dataset("tabilab/biosses")

In [4]:
train_test = ds['train'].train_test_split(test_size=0.2, seed=42)
train_validate = train_test['train'].train_test_split(test_size=0.2, seed=42)
train_set = train_validate['train']
validate_set = train_validate['test']
test_set = train_test['test']

In [5]:
stemmer = PorterStemmer()

In [6]:
stemmed_words = set(map(lambda word: stemmer.stem(word), words.words()))

In [7]:
cleaned_train, unknown_words = clean_dataset(train_set, stemmed_words)
cleaned_validation, _ = clean_dataset(validate_set, stemmed_words)
cleaned_test, _ = clean_dataset(test_set, stemmed_words)

In [8]:
print(unknown_words)
print(len(unknown_words))

{'vitro', 'chromatin/nuclear', 'LATS1', 'OCT4', 'RHIMs)', '(GEFs)', 'Oct4', 'and/or', 'plasmid', 'POU5F1', 'NSCLC', 'caspase', 'I/A', 'miR-126', 'localisation', '23]', 'G-proteins', '(e.g', '95%', 'miRNA-regulated', 'deregulation', 'tumour', 'MiR-155', 'TEL-AML1-positive', 'SOX-17', 'RNAi-mediated', 'IDH1', '(RIP1', 'pRB', 'ligase', '16', 'SWI/SNF', 'ubiquitin', 'Neuro-2a', 'hWts2', 'HOPX', 'et', '(RIP', '(together', 'LATS2', 'loss-of-function', 'GTP-bound', 'PicTar', 'ESCs', 'electroporation', '3', '(BRG-associated', '(Mirus)', 'miRNAs', '24-well', 'let-7)', 'hr', 'oncogene-driven', 'Wts2', '(APC)', 'RNAhybrid', 'miR-223', 'clear-cell', 'Arp', '90%', 'SWI/SNF-like', 'SDS-PAGE', 'PPP', 'hepatocellular', 'NH2-terminal', 'ribonucleotide', 'stem/progenitor', 'Caco-2', 'prometastatic', 'upregulation', '(see', 'pCAG-GFP', '[8]', '(Fig', 'shRNA-based', 'miR-204-miRVec', '43', 'RNA', 'Aurora-A', 'miR-146a', 'chromatin-remodeling', '2004)', 'WT1', 'Toji', 'ubiquitination', 'miRVec-miR-204', 'm

In [9]:
torch.cuda.empty_cache()

In [10]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.losses import CoSENTLoss

# Load a model to train/finetune
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the CoSENTLoss
# This loss requires pairs of text and a float similarity score as a label
loss = CoSENTLoss(model)

In [11]:
def evaluate_model(model_, test_set_, name="BIOSSES_test"):
    test_evaluator = EmbeddingSimilarityEvaluator(
        sentences1=test_set_["sentence1"],
        sentences2=test_set_["sentence2"],
        scores=test_set_["score"],
        name=name,
    )
    return test_evaluator(model)

In [12]:
train_s1 = cleaned_train['sentence1']
train_s2 = cleaned_train['sentence2']
validation_s1 = cleaned_validation['sentence1']
validation_s2 = cleaned_validation['sentence2']

In [13]:
# train_emb1 = model.encode(train_s1, normalize_embeddings=True)
# train_emb2 = model.encode(train_s2, normalize_embeddings=True)
# validation_emb1 = model.encode(validation_s1, normalize_embeddings=True)
# validation_emb2 = model.encode(validation_s2, normalize_embeddings=True)

In [14]:
# validation_similarities = model.similarity_pairwise(validation_emb1, validation_emb2)
# print(validation_similarities)
ground_truth_validation = cleaned_validation["score"]
# print(ground_truth_validation)
# print(loss.forward(
#     {"sentence1": cleaned_validation["sentence1"],
#      "sentence2": cleaned_validation["sentence2"]}, 
#     ground_truth_validation))

In [15]:
test_similarities = model.similarity_pairwise(
    model.encode(cleaned_test['sentence1'], normalize_embeddings=True),
      model.encode(cleaned_test['sentence2'], normalize_embeddings=True))
print(test_similarities)
ground_truth_test = cleaned_test["score"]
print(ground_truth_test)

tensor([0.6227, 0.7996, 0.4542, 0.7373, 0.5717, 0.9421, 0.4272, 0.3671, 0.2034,
        0.7460, 0.5736, 0.5206, 0.8329, 0.6441, 0.5521, 0.7422, 0.3377, 0.7593,
        0.4291, 0.3967])
[0.20000004768371582, 0.7999999523162842, -0.30000001192092896, 0.6000000238418579, 0.5, 1.0, 0.0, -0.30000001192092896, -0.10000002384185791, 0.7000000476837158, 0.0, 0.0, 0.8999999761581421, 0.7000000476837158, 0.7999999523162842, 0.5, 0.20000004768371582, 0.6000000238418579, 0.20000004768371582, -0.30000001192092896]


In [16]:
import datetime

In [17]:
num_train_epochs=100
per_device_train_batch_size=2
learning_rate=1e-6

In [18]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/all-MiniLM-L6-v2/"+str(datetime.datetime.now()).split()[1].split('.')[0].replace(':', '-'),
    # Optional training parameters:
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=100,
    learning_rate=learning_rate,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=0.05,
    save_strategy="steps",
    save_steps=0.05,
    # save_total_limit=3,
    load_best_model_at_end=True,
    logging_steps=0.05,
    run_name='{num_train_epochs}_{per_device_train_batch_size}_{learning_rate}',  # Will be used in W&B if `wandb` is installed
)

In [19]:
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=cleaned_validation["sentence1"],
    sentences2=cleaned_validation["sentence2"],
    scores=cleaned_validation["score"],
    name="BIOSSES_validate",
)
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=cleaned_test["sentence1"],
    sentences2=cleaned_test["sentence2"],
    scores=cleaned_test["score"],
    name="BIOSSES_test",
)

In [20]:
print(evaluate_model(model, cleaned_test)['BIOSSES_test_pearson_cosine'])
print(evaluate_model(model, cleaned_train)['BIOSSES_test_pearson_cosine'])
print(evaluate_model(model, cleaned_validation)['BIOSSES_test_pearson_cosine'])

0.8255907202262405
0.8178702181441257
0.8034242921820968


In [21]:
tensor_board_callback = TensorBoardCallback()

In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=cleaned_train,
    eval_dataset=cleaned_validation,
    loss=loss,
    args=args,
    evaluator=dev_evaluator,
    callbacks=[tensor_board_callback]
)
trainer.train()

 50%|█████     | 1607/3200 [08:11<09:18,  2.85it/s]

In [None]:
test_results = test_evaluator(model)
test_results

{'BIOSSES_test_pearson_cosine': 0.8453659209668248,
 'BIOSSES_test_spearman_cosine': 0.807881660786404}

In [None]:
model.save_pretrained("models/all-MiniLM-L6-v2/final/2-100-1e-6")

In [None]:
print(evaluate_model(model, cleaned_test)['BIOSSES_test_pearson_cosine'])
print(evaluate_model(model, cleaned_train)['BIOSSES_test_pearson_cosine'])
print(evaluate_model(model, cleaned_validation)['BIOSSES_test_pearson_cosine'])

0.8453659209668248
0.9390772826974033
0.8379312901742688
