In [22]:
import math
import random
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import datasets
import torch
from torch.utils.data import DataLoader

In [23]:
batch_size = 8
num_epochs = 4
model_save_path = './ftmodel'

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [25]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [26]:
def get_uk_data(data):
    rand_sen1 = data['sourceString']
    random.shuffle(rand_sen1)

    rand_sen2 = data['targetString']
    random.shuffle(rand_sen2)

    score1 = []
    
    for i in range(len(rand_sen1)):
        if (rand_sen1[i]==rand_sen2[i]):
            score1.append(1.0)
        else:
            score1.append(0.0)

    orig_sen1 = data['sourceString']
    orig_sen2 = data['targetString']
    score2 = [1.0] * len(orig_sen1)

    sentence1 = orig_sen1 + rand_sen1
    sentence2 = orig_sen2 + rand_sen2
    similarity_score = score2 + score1

    return sentence1, sentence2, similarity_score


In [27]:
def get_samples(sen1, sen2, score):
    samples = []
    
    for i in range(len(sen1)):
        samples.append(InputExample(texts=[sen1[i], sen2[i]], label=score[i]))
    
    return samples

English Dataset

In [28]:
en_dataset = datasets.load_dataset("stsb_multi_mt", "en")

Found cached dataset stsb_multi_mt (/home/krichards/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
100%|██████████| 3/3 [00:00<00:00, 322.10it/s]


In [29]:
en_scores = [x / 5.0 for x in en_dataset['test']['similarity_score']]
en_samples = get_samples(en_dataset['test']['sentence1'], en_dataset['test']['sentence2'], en_scores)

Ukrainian Dataset

In [30]:
uk_dataset = datasets.load_dataset("Helsinki-NLP/tatoeba_mt", "ukr-ukr", split='test')

Found cached dataset tatoeba_mt (/home/krichards/.cache/huggingface/datasets/Helsinki-NLP___tatoeba_mt/ukr-ukr/0.0.0/01e819f3f64a772a2ca70949061d295d3a2dc99d05183fe4776a3be23f75f619)


In [31]:
uk_dataset = uk_dataset.train_test_split(test_size=0.2)

uk_train = uk_dataset['train']

uk_test = uk_dataset['test']
uk_test = uk_test.train_test_split(test_size=0.5)

uk_val = uk_test['train']
uk_test = uk_test['test']

In [32]:
train_sen1, train_sen2, train_score = get_uk_data(uk_train)
val_sen1, val_sen2, val_score = get_uk_data(uk_val)
test_sen1, test_sen2, test_score = get_uk_data(uk_test)

In [33]:
train_samples = get_samples(train_sen1, train_sen2, train_score)
val_samples = get_samples(val_sen1, val_sen2, val_score)
test_samples = get_samples(test_sen1, test_sen2, test_score)

Initial Evaluation

In [34]:
uk_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test1', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
uk_eval1 = uk_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 21/21 [00:07<00:00,  2.70it/s]
Batches: 100%|██████████| 21/21 [00:07<00:00,  2.79it/s]


In [35]:
en_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test1', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
en_eval1 = en_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 173/173 [01:27<00:00,  1.97it/s]
Batches: 100%|██████████| 173/173 [01:28<00:00,  1.95it/s]


Fine Tune on Ukrainian

In [36]:
train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=batch_size)

In [37]:
train_loss = losses.CosineSimilarityLoss(model=model)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='uk-val', main_similarity=SimilarityFunction.COSINE)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

In [38]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          show_progress_bar = True
          )

Iteration: 100%|██████████| 165/165 [10:34<00:00,  3.85s/it]
Iteration: 100%|██████████| 165/165 [08:22<00:00,  3.05s/it]
Iteration: 100%|██████████| 165/165 [06:33<00:00,  2.38s/it]
Iteration: 100%|██████████| 165/165 [06:30<00:00,  2.37s/it]
Epoch: 100%|██████████| 4/4 [32:51<00:00, 492.88s/it]


In [39]:
model = SentenceTransformer(model_save_path)

Final Evaluation

In [40]:
uk_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test2', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
uk_eval2 = uk_test_eval2(model, output_path=model_save_path)

Batches: 100%|██████████| 21/21 [00:04<00:00,  5.04it/s]
Batches: 100%|██████████| 21/21 [00:04<00:00,  5.19it/s]


In [41]:
en_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test2', show_progress_bar=True, main_similarity=SimilarityFunction.COSINE)
en_eval2 = en_test_eval2(model, output_path=model_save_path)

Batches: 100%|██████████| 173/173 [00:48<00:00,  3.58it/s]
Batches: 100%|██████████| 173/173 [00:48<00:00,  3.56it/s]


Results

In [42]:
print('Initial Ukrainian Evaluation: ', uk_eval1)
print('Final Ukrainian Evaluation: ', uk_eval2)

print('Initial English Evaluation: ', en_eval1)
print('Final English Evaluation: ', en_eval2)

Initial Ukrainian Evaluation:  0.8655382636551608
Final Ukrainian Evaluation:  0.8652868364180787
Initial English Evaluation:  0.807547875467642
Final English Evaluation:  0.7816900731421348
