In [1]:
import math
import random
from datetime import datetime
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from transformers import AdamW
import datasets
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
batch_size = 8
num_epochs = 1
# model_save_path = './output/-distiluse-base-multilingual-cased-v2-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = './output'

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [5]:
def get_samples(sen1, sen2, score):
    samples = []
    
    for i in range(len(sen1)):
        samples.append(InputExample(texts=[sen1[i], sen2[i]], label=score[i]))
    
    return samples

Testing English

In [6]:
en_dataset = datasets.load_dataset("stsb_multi_mt", "en")

Found cached dataset stsb_multi_mt (/home/krichards/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
100%|██████████| 3/3 [00:00<00:00, 876.43it/s]


In [7]:
en_scores = [x / 5.0 for x in en_dataset['test']['similarity_score']]

In [8]:
en_samples = get_samples(en_dataset['test']['sentence1'], en_dataset['test']['sentence2'], en_scores)

In [9]:
en_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test1', show_progress_bar=True)

In [10]:
en_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 173/173 [00:48<00:00,  3.59it/s]
Batches: 100%|██████████| 173/173 [00:48<00:00,  3.59it/s]


0.807547875467642

Preparing Ukrainian Dataset

In [11]:
uk_dataset = datasets.load_dataset("Helsinki-NLP/tatoeba_mt", "ukr-ukr", split=datasets.ReadInstruction('test', to=100, unit='%'))

Found cached dataset tatoeba_mt (/home/krichards/.cache/huggingface/datasets/Helsinki-NLP___tatoeba_mt/ukr-ukr/0.0.0/01e819f3f64a772a2ca70949061d295d3a2dc99d05183fe4776a3be23f75f619)


In [12]:
uk_dataset

Dataset({
    features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
    num_rows: 823
})

In [13]:
uk_dataset = uk_dataset.train_test_split(test_size=0.2)

In [14]:
uk_train = uk_dataset['train']

uk_test = uk_dataset['test']
uk_test = uk_test.train_test_split(test_size=0.5)

uk_val = uk_test['train']
uk_test = uk_test['test']

In [15]:
uk_train

Dataset({
    features: ['sourceLang', 'targetlang', 'sourceString', 'targetString'],
    num_rows: 658
})

In [16]:
def get_uk_data(data):
    rand_sen1 = data['sourceString']
    random.shuffle(rand_sen1)

    rand_sen2 = data['targetString']
    random.shuffle(rand_sen2)

    score1 = []
    
    for i in range(len(rand_sen1)):
        if (rand_sen1[i]==rand_sen2[i]):
            score1.append(1.0)
        else:
            score1.append(0.0)

    orig_sen1 = data['sourceString']
    orig_sen2 = data['targetString']
    score2 = [1.0] * len(orig_sen1)

    sentence1 = orig_sen1 + rand_sen1
    sentence2 = orig_sen2 + rand_sen2
    similarity_score = score2 + score1

    return sentence1, sentence2, similarity_score


In [17]:
train_sen1, train_sen2, train_score = get_uk_data(uk_train)
val_sen1, val_sen2, val_score = get_uk_data(uk_val)
test_sen1, test_sen2, test_score = get_uk_data(uk_test)

In [18]:
train_samples = get_samples(train_sen1, train_sen2, train_score)
val_samples = get_samples(val_sen1, val_sen2, val_score)
test_samples = get_samples(test_sen1, test_sen2, test_score)

Initial Test on Ukrainian

In [19]:
uk_test_eval1 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test1', show_progress_bar=True)

In [20]:
uk_test_eval1(model, output_path=model_save_path)

Batches: 100%|██████████| 11/11 [00:02<00:00,  5.02it/s]
Batches: 100%|██████████| 11/11 [00:02<00:00,  5.26it/s]


nan

Fine-Tune on Ukrainian

In [21]:
# train_samples = SentenceTransformer.tokenize(train_samples)
# test_samples = SentenceTransformer.tokenize(test_samples)
# en_samples = SentenceTransformer.tokenize(en_samples)

TypeError: tokenize() missing 1 required positional argument: 'texts'

In [23]:
train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=batch_size)
test_dataloader = DataLoader(test_samples, shuffle=False, batch_size=batch_size)
en_dataloader = DataLoader(en_samples, shuffle=False, batch_size=batch_size)

In [24]:
train_loss = losses.CosineSimilarityLoss(model=model)
test_loss = losses.CosineSimilarityLoss(model=model)

In [25]:
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-val')

In [26]:
# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

In [27]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          show_progress_bar = True
)

Iteration:   7%|▋         | 6/83 [00:44<09:29,  7.39s/it]
Epoch:   0%|          | 0/1 [00:44<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# optim = AdamW(model.parameters(), lr=1e-5)

In [None]:
# def train(dataloader, model, optimizer):
#     loss = 0
    
#     model.train()

#     loop = tqdm(dataloader, leave=True)
#     for batch in loop:
#         optimizer.zero_grad()

#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

#         loss = outputs.loss

#         loss.backward()
#         optimizer.step()

#         loop.set_description(f'train')
#         loop.set_postfix(loss=loss.item())

#     return loss.item()

In [None]:
# def test(dataloader, model, lang):

#     loop = tqdm(dataloader, leave=True)

#     metric = datasets.load_metric('accuracy')
#     num_batches = len(dataloader)
#     model.eval()
    
#     test_loss = 0

#     with torch.no_grad():
#         for batch in loop:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             # test_loss += outputs.loss.item()
#             test_loss = outputs.loss.item()
#             # loop.set_description(f'Epoch {epoch} ({lang} test)')
#             loop.set_description(f'{lang} test')
#             loop.set_postfix(loss=test_loss)
#     # metric.compute()
    
#     return test_loss

In [None]:
# def loss_curve(train, test, en, epochs, title='Loss Curve'):
#   x = [*range(1,epochs+1,1)]


#   plt.plot(x, train, color='r', label='train loss')
#   plt.plot(x, test, color='g', label='uk test loss')
#   plt.plot(x, en, color='b', label='en test loss')

#   plt.xlabel('Epoch')
#   plt.ylabel('Loss')
#   plt.title(title)

#   plt.legend()

#   plt.show()

In [None]:
# def train_test(epochs):
#     train_losses = []
#     test_losses = []
#     en_losses = []

#     for t in range(epochs):
#         print(f'Epoch {t}:\n')

#         train_loss = train(train_dataloader, model, optim)
#         test_loss = test(test_dataloader, model, 'uk')
#         en_loss = test(en_dataloader, model, 'en')

#         train_losses.append(train_loss)
#         test_losses.append(test_loss)
#         en_losses.append(en_loss)

#         print(f'\n')


#     print("Done!\n")

#     return train_losses, test_losses, en_losses

In [None]:
# epochs = 10
# train_losses, test_losses, en_losses = train_test(epochs)

In [None]:
# loss_curve(train_losses, test_losses, en_losses, epochs, 'Loss Curve')

Test again on Ukrainian

In [None]:
uk_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=batch_size, name='uk-test2', show_progress_bar=True)
uk_fin_acc = uk_test_eval2(model, output_path=model_save_path)

Test again on English

In [None]:
en_test_eval2 = EmbeddingSimilarityEvaluator.from_input_examples(en_samples, batch_size=batch_size, name='en-test2', show_progress_bar=True)
en_fin_acc = en_test_eval2(model, output_path=model_save_path)

Examples

In [None]:
example_1 = [
    "Апельсини завжди оранжеві?", # comparison sentence

    "Помаранчі завжди помаранчеві?", # 1
    "Атлю десять.", # 2
    "Будинок горить." # 3
]

In [None]:
sentence_embeddings = model.encode(example_1)

In [None]:
# first sentence should have similarity of ~1
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)

In [None]:
example_2 = [
    "Він має книжку.", # comparison sentence

    "У нього є книжка.", # 1
    "Він не мій кузен.", # 2
    "У мене багато друзів." # 3
]

In [None]:
sentence_embeddings = model.encode(example_2)

In [None]:
# first sentence should have similarity of ~1
cosine_similarity(
    [sentence_embeddings[0]],
    sentence_embeddings[1:]
)