Sentence similarity
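Dataset: BIOSSES, a biomedical semantic textual similarity (STS) dataset (`tabilab/biosses` on the Hugging Face Hub)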

In [2]:
import torch
from datasets import load_dataset, Dataset
import nltk
import re
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses

from nltk.corpus import words
from nltk.corpus import stopwords
# Make sure the NLTK resources this notebook relies on are available locally.
# `quiet=True` is passed so the downloader does not log its progress.
for corpus_name in ('stopwords', 'words', 'wordnet'):
    nltk.download(corpus_name, quiet=True)
# vocab = set(words.words())
from nltk.stem import *

from preprocessing import clean_sentence, vocab, clean_dataset




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the `tabilab/biosses` dataset through the `datasets` library.
# Only its 'train' split is used further down, where it is re-split by hand.
ds = load_dataset("tabilab/biosses")

In [4]:
# First cut: hold out 20% of the original 'train' split as the test set.
train_test = ds['train'].train_test_split(test_size=0.2, seed=42)
test_set = train_test['test']

# Second cut: of the remaining 80%, keep another 20% aside for validation.
# The same fixed seed is used for both cuts so the partition is reproducible.
train_validate = train_test['train'].train_test_split(test_size=0.2, seed=42)
train_set, validate_set = train_validate['train'], train_validate['test']

In [5]:
# Porter stemmer used below to stem the NLTK word list.
# `PorterStemmer` is in scope via the wildcard `from nltk.stem import *` in the import cell.
stemmer = PorterStemmer()

In [6]:
# Stem every entry of the NLTK `words` corpus and keep the distinct stems.
# The resulting set is the vocabulary handed to `clean_dataset` below.
stemmed_words = {stemmer.stem(word) for word in words.words()}

In [7]:
# Run the project-local `clean_dataset` helper (imported from `preprocessing`) on the
# train and validation splits, passing the stemmed vocabulary as second argument.
# The helper returns a pair: the cleaned split and a second value that is kept as
# `unknown_words` for the training split and discarded for validation.
# NOTE(review): `test_set` is not cleaned here — confirm it gets the same treatment
# before it is used for a final evaluation.
cleaned_train, unknown_words = clean_dataset(train_set, stemmed_words)
cleaned_validation, _ = clean_dataset(validate_set, stemmed_words)

In [8]:
# Show the tokens collected as `unknown_words` for the training split, followed by
# how many there are (one value per line).
print(unknown_words, len(unknown_words), sep="\n")

{'pre-miRNA', 'senescence-like', 'tumorigenic', 'NSCLCs', 'GEFs', 'miR-126', 'H-RASV12', 'STK33', 'TAK1', 'vitro', 'Wts2', '(GEFs)', 'anaphase-promoting', 'downregulated', '(Fig', 'microtubule-nucleating', 'BAF', 'oncogenic', '(BRG-associated', 'K-Ras-driven', '(PTC)', 'LATS2', 'interest;', '72', 'tumorigenesis', 'POU5F1', 'miRVec-miR-204', 'HOPX', 'RNAhybrid', 'upregulation', 'ligase', 'et', 'vivo', '[8]', 'RHIMs)', 'miR-15a', 'Aurora-A', 'oncogenesis]', 'IL-1', '(RIP', '90%', 'EGFR', 'p53', 'hWts2', 'TFs', 'PLK1', 'caspase', 'K-ras-dependent', 'Thr288', 'self-renewal', '(e.g', 'granulopoiesis', 'SDS-PAGE', 'hmC', 'TargetScan', '(see', '10]', 'coverslips', '[', 'proteasome', 'miR-155', 'MOE430A', 'Renilla-3′', 'PPP', 'Caco-2', '<20%', 'Cyclin', '(AML)', 'IDH1', 'TRAF6', 'myeloid-specific', 'ATP-dependent', 'RNA', 'NIH3T3', 'miR-34a', 'E2F1', 'MiR-223', 'mdm2', 'PC9', '(RB1)', 'zebrafish', 'miR-204-miRVec', 'miR-24', 'dose-dependent', 'carcinoma]', 'oncogenes', 'MCF7', 'scrambled-miRVe

In [9]:
# Sanity check on the label range: largest gold score in the cleaned validation split.
# NOTE(review): the recorded value is 1.0, so scores are presumably rescaled to [0, 1]
# inside `clean_dataset` — confirm against the helper.
max(cleaned_validation['score'])

1.0

In [None]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.losses import CoSENTLoss

# Load the checkpoint to fine-tune. "xlm-roberta-base" is a plain transformer
# checkpoint rather than a sentence-transformers model, so the library wraps it
# with a mean-pooling layer (see the message logged by this cell).
# The device is picked at runtime: use the GPU when CUDA is available and fall
# back to the CPU otherwise, instead of failing on machines without a GPU.
model = SentenceTransformer(
    "xlm-roberta-base",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Initialize the CoSENTLoss
# This loss requires pairs of text and a float similarity score as a label
loss = CoSENTLoss(model)

No sentence-transformers model found with name xlm-roberta-base. Creating a new one with mean pooling.


In [11]:
# Pull the two sentence columns out of each cleaned split so they can be encoded
# side by side in the next cells.
train_s1, train_s2 = cleaned_train['sentence1'], cleaned_train['sentence2']
validation_s1, validation_s2 = cleaned_validation['sentence1'], cleaned_validation['sentence2']

In [12]:
# Encode both sides of every training pair with the model as loaded above
# (no fine-tuning has happened yet at this point in the notebook).
train_emb1, train_emb2 = model.encode(train_s1), model.encode(train_s2)

In [13]:
# Encode both sides of every validation pair with the same, not yet fine-tuned, model.
validation_emb1, validation_emb2 = model.encode(validation_s1), model.encode(validation_s2)

In [14]:
# Score each validation pair: similarity between the i-th embedding of the first
# list and the i-th embedding of the second list, one value per pair.
# NOTE(review): the similarity function is whatever the model is configured with
# (presumably cosine) — confirm if the exact metric matters.
validation_similarities = model.similarity_pairwise(validation_emb1, validation_emb2)

In [15]:
# Display the pre-training similarities. In the recorded run every value is above
# 0.99, i.e. the untuned embeddings barely distinguish the validation pairs.
validation_similarities

tensor([0.9988, 0.9987, 0.9981, 0.9977, 0.9968, 0.9985, 0.9984, 0.9983, 0.9969,
        0.9959, 0.9972, 0.9993, 0.9974, 0.9983, 0.9976, 0.9969])

In [22]:
# Training configuration for fine-tuning the model with CoSENTLoss.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/xlm-roberta-base",
    # Optional training parameters:
    num_train_epochs=5,
    # CoSENTLoss is computed by comparing the sentence pairs *within a batch*
    # against each other (pairs with a higher gold score should get a higher
    # similarity). A batch that holds a single pair has nothing to compare, so
    # the loss is a constant 0 and no gradient reaches the model. The batch size
    # therefore has to be at least 2; lower this value if memory is tight, but
    # never down to 1.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Mixed precision needs a CUDA device, so only enable it when one is present.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    logging_steps=1,
    # Will be used in W&B if `wandb` is installed. When W&B is installed but not
    # logged in, training stops at an interactive API-key prompt; add
    # `report_to="none"` here to turn experiment trackers off.
    run_name="xlm-roberta-base",
)

In [19]:
# Evaluator that embeds the validation sentence pairs and compares the resulting
# similarities with the gold scores; its metrics are reported under the
# "sts_dev" name prefix.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=cleaned_validation["sentence1"],
    sentences2=cleaned_validation["sentence2"],
    scores=cleaned_validation["score"],
    name="sts_dev",
)
# Run the evaluator once on the current model to obtain reference metrics that
# later results can be compared against.
results = dev_evaluator(model)

In [20]:
# Display the metrics returned by the evaluator call above
# (Pearson and Spearman correlation of the cosine similarities in the recorded run).
results

{'sts_dev_pearson_cosine': 0.1882809392202916,
 'sts_dev_spearman_cosine': 0.2639079584508967}

In [None]:
# Wire the model, the training arguments, the cleaned training split and the
# loss into a trainer. The dev evaluator is attached so that it is run at the
# evaluation steps configured in `args`.
# NOTE(review): assumes `cleaned_train` only has the two sentence columns plus
# the score column, as the loss expects — confirm against `clean_dataset`.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=cleaned_train,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: