<a href="https://colab.research.google.com/github/kor1999/Sentence-Similarity-LLM-course-/blob/main/Sentence_similarity_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing the required libraries and importing them

In [None]:
# Use the %pip magic instead of !pip so the packages are installed into the
# environment of the running kernel rather than whatever `pip` the shell finds.
# TODO: pin the package versions to make the notebook reproducible.
%pip install -q datasets sentence-transformers



In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import shutil

Importing the STS dataset and dividing it into train, validation and test sets

In [None]:
# Fetch the English configuration of the STS Benchmark dataset.
dataset = load_dataset('stsb_multi_mt', 'en')

# Show the available splits together with their columns and row counts.
print(dataset)

# Pull out the three splits; the validation split is stored under the key 'dev'.
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ('train', 'dev', 'test')
)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


In [None]:
# Peek at the first two training rows; slicing yields a dict of column lists.
sample_rows = train_data[:2]
print(sample_rows)

{'sentence1': ['A plane is taking off.', 'A man is playing a large flute.'], 'sentence2': ['An air plane is taking off.', 'A man is playing a flute.'], 'similarity_score': [5.0, 3.799999952316284]}


Transforming the data into the format required for training and batching it

In [None]:
# The gold similarity scores in this dataset are on a 0-5 scale (the sample
# rows contain 5.0 and 3.8), while CosineSimilarityLoss regresses the cosine
# similarity of the two embeddings onto the label. Cosine similarity can never
# exceed 1, so the labels are rescaled to [0, 1] before training.
MAX_SIMILARITY_SCORE = 5.0


def _to_input_examples(rows, max_score=MAX_SIMILARITY_SCORE):
    """Wrap dataset rows into InputExample objects with normalised labels.

    Args:
        rows: Iterable of mappings with the keys 'sentence1', 'sentence2'
            and 'similarity_score'.
        max_score: Upper bound of the raw score scale; every score is divided
            by this value.

    Returns:
        A list with one InputExample per row, holding the sentence pair as
        `texts` and the rescaled score as `label`.
    """
    return [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['similarity_score']) / max_score,
        )
        for row in rows
    ]


# Putting data in InputExample.
# Rescaling the validation/test labels as well keeps all splits consistent; it
# does not affect correlation-based metrics, which are invariant to scaling.
train_examples = _to_input_examples(train_data)
val_examples = _to_input_examples(val_data)
test_examples = _to_input_examples(test_data)

# Putting InputExamples into DataLoader (only the training data is shuffled).
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)
test_dataloader = DataLoader(test_examples, shuffle=False, batch_size=16)

Loading the pretrained model that we will use as a base

In [None]:
# Start from the pretrained 'paraphrase-MiniLM-L6-v2' checkpoint as the base encoder.
model = SentenceTransformer(model_name_or_path='paraphrase-MiniLM-L6-v2')

# Training objective, chosen following the official documentation.
train_loss = losses.CosineSimilarityLoss(model=model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Training (fine-tuning) the model

In [None]:
# Fine-tune the base model on the training pairs for a single epoch.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=None,      # no evaluator during training; evaluation runs in a separate cell
    evaluation_steps=0,
    output_path=None,    # nothing is written here; the model is saved explicitly later
)


Step,Training Loss


Model evaluation on validation data

In [None]:
# Build an evaluator from the validation pairs; its name 'sts-dev' is the
# prefix of the metric key read below.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples,
    name='sts-dev',
)
results = model.evaluate(evaluator)

# Pearson is reported because it is the better choice for checking linear correlation.
pearson_cosine = results['sts-dev_pearson_cosine']
print("Pearson correlation (cosine similarity):", pearson_cosine)


Pearson correlation (cosine similarity): 0.8694754841691497


In [None]:
def check_similarity(word1, word2, model):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        word1: First text to embed.
        word2: Second text to embed.
        model: Object exposing ``encode(text, convert_to_tensor=True)``;
            the notebook passes SentenceTransformer instances.

    Returns:
        The result of ``.item()`` on the similarity tensor, i.e. a plain
        Python number.
    """
    # Embed the two inputs one after the other, first word1 and then word2.
    first_vec, second_vec = (
        model.encode(text, convert_to_tensor=True) for text in (word1, word2)
    )

    # Score the pair and unwrap the resulting tensor into a plain number.
    return util.pytorch_cos_sim(first_vec, second_vec).item()

In [None]:
# Example words for the comparison
word1 = "cat"
word2 = "dog"
word3 = "table"
word4 = "tank"
word5 = "cats"

# Compare word1 with every other word and finally with itself.
for other_word in (word2, word3, word4, word5, word1):
    score = check_similarity(word1, other_word, model)
    print(f"Cosine similarity between '{word1}' and '{other_word}': {score}")

Cosine similarity between 'cat' and 'dog': 0.39435985684394836
Cosine similarity between 'cat' and 'table': 0.1546907126903534
Cosine similarity between 'cat' and 'tank': 0.16069403290748596
Cosine similarity between 'cat' and 'cats': 0.8833943009376526
Cosine similarity between 'cat' and 'cat': 0.9999998807907104


Saving and loading model for demo

In [None]:
# Persist the fine-tuned model to a local directory.
model_save_path = 'output/trained_model'
model.save(model_save_path)
print(f"Model saved to {model_save_path}")

# Pack the saved directory into 'trained_model.zip'; the returned archive path
# is the last expression of the cell and is therefore displayed as its output.
shutil.make_archive(base_name='trained_model', format='zip', root_dir=model_save_path)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Model saved to output/trained_model


In [None]:
# Reload the fine-tuned model from the directory it was saved to.
# The path is repeated here so this cell does not depend on the save cell's variable.
model_save_path = 'output/trained_model'
loaded_model = SentenceTransformer(model_name_or_path=model_save_path)
print("Model loaded successfully.")

Model loaded successfully.


'/content/trained_model.zip'

In [None]:
# Example words for the comparison
word1 = "cat"
word2 = "dog"
word3 = "table"
word4 = "tank"
word5 = "cats"

# Repeat the comparison with the model reloaded from disk:
# word1 against every other word and finally against itself.
for other_word in (word2, word3, word4, word5, word1):
    score = check_similarity(word1, other_word, loaded_model)
    print(f"Cosine similarity between '{word1}' and '{other_word}': {score}")