<a href="https://colab.research.google.com/github/kor1999/Sentence-Similarity-LLM-course-/blob/main/Sentence_similarity_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing the required libraries and importing them

In [1]:
!pip install datasets
!pip install sentence-transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/547.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 k

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import shutil

Importing the STS dataset and dividing it into train, validation and test sets

In [14]:
# Fetch the English configuration of the STS Benchmark dataset
dataset = load_dataset('stsb_multi_mt', 'en')

# Show the available splits with their columns and row counts
print(dataset)

# Pull the train / validation / test splits out of the DatasetDict in one step
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ('train', 'dev', 'test')
)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


In [15]:
# Peek at the first two training rows (slicing yields a dict of column -> list of values)
first_train_rows = train_data[:2]
print(first_train_rows)

{'sentence1': ['A plane is taking off.', 'A man is playing a large flute.'], 'sentence2': ['An air plane is taking off.', 'A man is playing a flute.'], 'similarity_score': [5.0, 3.799999952316284]}


Transforming the data into the required format and batching it

In [16]:
# The dataset's similarity_score runs from 0 to 5, whereas CosineSimilarityLoss
# regresses the cosine similarity of the two sentence embeddings, which can never
# exceed 1. Labels are therefore rescaled to [0, 1] before training; feeding the
# raw scores gives the model unreachable targets.
MAX_SIMILARITY_SCORE = 5.0


def _to_input_examples(rows):
    """Wrap every sentence pair in an InputExample with its score rescaled to [0, 1]."""
    return [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['similarity_score']) / MAX_SIMILARITY_SCORE,
        )
        for row in rows
    ]


# Putting data in InputExample
train_examples = _to_input_examples(train_data)
val_examples = _to_input_examples(val_data)
test_examples = _to_input_examples(test_data)

# Putting InputExamples into DataLoader (only the training loader is shuffled)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)
test_dataloader = DataLoader(test_examples, shuffle=False, batch_size=16)

Loading the model that we will use as the base

In [17]:
# Start from a pretrained sentence-embedding checkpoint
model = SentenceTransformer(
    'paraphrase-MiniLM-L6-v2',
)

# Loss function taken from the official documentation
train_loss = losses.CosineSimilarityLoss(model=model)

Training the model

In [11]:
# Fine-tune for a single epoch on the training pairs.
# No evaluator and no output path are passed to this run.
training_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[training_objective],
    epochs=1,
    warmup_steps=100,
    evaluator=None,
    evaluation_steps=0,
    output_path=None,
)


Step,Training Loss


Model evaluation on validation data

In [18]:
# Build an evaluator from the validation pairs and run the model through it
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples,
    name='sts-dev',
)
results = model.evaluate(evaluator)

# Pearson is reported because it is the better choice for checking linear correlation
pearson_cosine = results['sts-dev_pearson_cosine']
print("Pearson correlation (cosine similarity):", pearson_cosine)


Pearson correlation (cosine similarity): 0.8694754836384828


In [9]:
# Cosine similarity between the embeddings of two words
def check_similarity(word1, word2, model):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        word1: First text; passed to ``model.encode``.
        word2: Second text; passed to ``model.encode``.
        model: Object whose ``encode(text, convert_to_tensor=True)`` returns
            the embedding of ``text``.

    Returns:
        The similarity as a plain Python scalar (the result of ``.item()``).
    """
    # Embed both inputs, first word1 and then word2
    first_vec, second_vec = (
        model.encode(text, convert_to_tensor=True) for text in (word1, word2)
    )

    # Collapse the similarity tensor to a plain number
    return util.pytorch_cos_sim(first_vec, second_vec).item()

In [20]:
# Example words for comparison
word1 = "cat"
word2 = "dog"
word3 = "table"
word4 = "tank"
word5 = "cats"

# Compare the reference word with every other word and, last, with itself
for other_word in (word2, word3, word4, word5, word1):
    print(f"Cosine similarity between '{word1}' and '{other_word}': {check_similarity(word1, other_word, model)}")

Cosine similarity between 'cat' and 'dog': 0.3943599462509155
Cosine similarity between 'cat' and 'table': 0.1546907126903534
Cosine similarity between 'cat' and 'tank': 0.16069401800632477
Cosine similarity between 'cat' and 'cats': 0.8833944797515869
Cosine similarity between 'cat' and 'cat': 1.0000001192092896


Saving and loading model for demo

In [21]:
# Write the model to a local directory
model_save_path = 'output/trained_model'
model.save(model_save_path)
print(f"Model saved to {model_save_path}")

# Bundle the saved directory into trained_model.zip; the archive path is the cell's output
shutil.make_archive(base_name='trained_model', format='zip', root_dir=model_save_path)

Model saved to output/trained_model


'/content/trained_model.zip'

In [7]:
# Loading the saved model.
# shutil.unpack_archive replaces `!unzip`: it overwrites existing files without
# prompting, so the cell can be re-run, and it raises if an archive is missing
# instead of printing a shell error and carrying on.
shutil.unpack_archive('trained_model.zip', 'trained_model')

model_save_path = 'trained_model'
loaded_model = SentenceTransformer(model_save_path)

# NOTE(review): too_trained_model.zip is not produced by any cell visible in this
# notebook; it presumably has to be placed in the working directory beforehand.
shutil.unpack_archive('too_trained_model.zip', 'too_trained_model')
too_loaded_model = SentenceTransformer('too_trained_model')

print("Model loaded successfully.")

Model loaded successfully.


In [10]:
# Example words for comparison
word1 = "cat"
word2 = "dog"
word3 = "table"
word4 = "tank"
word5 = "cats"

# Run the same comparisons through each loaded model, printing a heading per model
models_to_compare = (
    ("Normal model", loaded_model),
    ("Overfitting model", too_loaded_model),
)
for heading, candidate_model in models_to_compare:
    print(heading)
    for other_word in (word2, word3, word4, word5, word1):
        print(f"Cosine similarity between '{word1}' and '{other_word}': {check_similarity(word1, other_word, candidate_model)}")

Cosine similarity between 'cat' and 'dog': 0.9970299601554871
Cosine similarity between 'cat' and 'table': 0.9760968685150146
Cosine similarity between 'cat' and 'tank': 0.9943073391914368
Cosine similarity between 'cat' and 'cats': 0.9988532066345215
Cosine similarity between 'cat' and 'cat': 0.9999999403953552
