In [1]:
# references: https://www.sbert.net/examples/training/sts/README.html#
import os
import math
import random
import datasets
import torch

from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
batch_size = 8
num_epochs = 4
model_save_path = './ftmodel'

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# create output directory
if os.path.exists(model_save_path)==False:
    os.mkdir(model_save_path)

In [5]:
# load model
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [6]:
# function to create dataset with binary similarity scores [0,1]

def get_uk_data(data):
    rand_sen1 = data['sourceString']
    random.shuffle(rand_sen1)

    rand_sen2 = data['targetString']
    random.shuffle(rand_sen2)

    score1 = []
    
    # set scores of randomised pairs in dataset to 0
    for i in range(len(rand_sen1)):
        if (rand_sen1[i]==rand_sen2[i]): # if any randomised pairs happen to match
            score1.append(1.0)
        else:
            score1.append(0.0)

    # set scores of orignal similar pairs in dataset to 1
    orig_sen1 = data['sourceString']
    orig_sen2 = data['targetString']
    score2 = [1.0] * len(orig_sen1)

    # append randomised sentence pairs to similar pairings dataset
    sentence1 = orig_sen1 + rand_sen1
    sentence2 = orig_sen2 + rand_sen2
    similarity_score = score2 + score1

    return sentence1, sentence2, similarity_score


In [7]:
# get list of InputExamples (var type required for training sentence_transformers)

def get_samples(sen1, sen2, score):
    samples = []
    
    for i in range(len(sen1)):
        samples.append(InputExample(texts=[sen1[i], sen2[i]], label=score[i]))
    
    return samples

English Dataset

In [8]:
en_dataset = datasets.load_dataset("stsb_multi_mt", "en")

Found cached dataset stsb_multi_mt (/home/krichards/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
100%|██████████| 3/3 [00:00<00:00, 883.94it/s]


In [9]:
en_scores = [x / 5.0 for x in en_dataset['test']['similarity_score']]
en_samples = get_samples(en_dataset['test']['sentence1'], en_dataset['test']['sentence2'], en_scores)

Ukrainian Dataset

In [10]:
uk_dataset = datasets.load_dataset("Helsinki-NLP/tatoeba_mt", "ukr-ukr", split='test')

Found cached dataset tatoeba_mt (/home/krichards/.cache/huggingface/datasets/Helsinki-NLP___tatoeba_mt/ukr-ukr/0.0.0/01e819f3f64a772a2ca70949061d295d3a2dc99d05183fe4776a3be23f75f619)


In [11]:
uk_dataset = uk_dataset.train_test_split(test_size=0.2)

uk_train = uk_dataset['train']

uk_test = uk_dataset['test']
uk_test = uk_test.train_test_split(test_size=0.5)

uk_val = uk_test['train']
uk_test = uk_test['test']

In [12]:
train_sen1, train_sen2, train_score = get_uk_data(uk_train)
val_sen1, val_sen2, val_score = get_uk_data(uk_val)
test_sen1, test_sen2, test_score = get_uk_data(uk_test)

In [13]:
train_samples = get_samples(train_sen1, train_sen2, train_score)
val_samples = get_samples(val_sen1, val_sen2, val_score)
test_samples = get_samples(test_sen1, test_sen2, test_score)

Evaluation of Original Model

In [14]:
uk_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test1', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
uk_eval1 = uk_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 21/21 [00:04<00:00,  5.03it/s]
Batches: 100%|██████████| 21/21 [00:04<00:00,  5.09it/s]


In [15]:
en_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test1', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
en_eval1 = en_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 173/173 [00:48<00:00,  3.55it/s]
Batches: 100%|██████████| 173/173 [00:48<00:00,  3.57it/s]


Fine-Tune on Ukrainian

In [16]:
train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=batch_size)

In [17]:
train_loss = losses.CosineSimilarityLoss(model=model)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='uk-val', main_similarity=SimilarityFunction.COSINE)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

In [18]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          show_progress_bar = True
          )

Iteration: 100%|██████████| 165/165 [06:19<00:00,  2.30s/it]
Iteration: 100%|██████████| 165/165 [06:25<00:00,  2.34s/it]
Iteration: 100%|██████████| 165/165 [06:21<00:00,  2.31s/it]
Iteration: 100%|██████████| 165/165 [06:22<00:00,  2.32s/it]
Epoch: 100%|██████████| 4/4 [26:16<00:00, 394.12s/it]


In [19]:
model = SentenceTransformer(model_save_path)

Evaluation of Fine-Tuned Model

In [20]:
uk_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test2', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
uk_eval2 = uk_test_eval2(model, output_path=model_save_path)

Batches: 100%|██████████| 21/21 [00:04<00:00,  5.06it/s]
Batches: 100%|██████████| 21/21 [00:04<00:00,  5.14it/s]


In [21]:
en_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test2', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
en_eval2 = en_test_eval2(model, output_path=model_save_path)

Batches: 100%|██████████| 173/173 [00:48<00:00,  3.56it/s]
Batches: 100%|██████████| 173/173 [00:48<00:00,  3.54it/s]


Results

In [22]:
print('Initial Ukrainian Evaluation: ', uk_eval1)
print('Final Ukrainian Evaluation: ', uk_eval2)

print('Initial English Evaluation: ', en_eval1)
print('Final English Evaluation: ', en_eval2)

Initial Ukrainian Evaluation:  0.8558588763533628
Final Ukrainian Evaluation:  0.854853166745427
Initial English Evaluation:  0.807547875467642
Final English Evaluation:  0.7848593741970551
