In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import datasets
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from transformers import DataCollatorWithPadding



In [2]:
data = datasets.load_dataset('merionum/ru_paraphraser')

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 7227
    })
    test: Dataset({
        features: ['id', 'id_1', 'id_2', 'text_1', 'text_2', 'class'],
        num_rows: 1924
    })
})

In [4]:
model_name = "IlyaGusev/xlm_roberta_large_headline_cause_simple"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [5]:
label_list = sorted(set(data['train']['class']))
labels2id = { key:id for id, key in enumerate(label_list)}

def tokenize_and_align_labels(tokenizer, labels2id):    
    def tokenize_and_align_labels_(examples):
#         tokenized_inputs = tokenizer([[text_1, text_2] for (text_1, text_2) in zip(examples['text_1'],examples['text_2'])])
        tokenized_inputs = tokenizer(examples['text_1'],examples['text_2'], truncation=True)
#         tokenized_inputs["labels"] = [[labels2id[label] for _ in range(len(tokenized_inputs["input_ids"][i]))] for i, label in enumerate(examples['class'])]
        tokenized_inputs["labels"] = [labels2id[label] for label in examples['class']]
#         tokenized_inputs["labels"] = [label for label in examples['class']]
        return tokenized_inputs
    return tokenize_and_align_labels_
tokenized_datasets = data.map(tokenize_and_align_labels(tokenizer, labels2id), batched=True)    

In [6]:
metric = load_metric("accuracy")
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    "paraphras",
    evaluation_strategy = "epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.05,
    save_strategy='no',
    report_to='none',
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"].select(range(500)),
    eval_dataset=tokenized_datasets["test"].select(range(500)),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

  0%|          | 0/96 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
def get_similarity(text1, text2):
    """ Predict the probability that two Russian sentences are paraphrases of each other. """
    with torch.inference_mode():
        batch = tokenizer(
            text1, text2, 
            truncation=True, max_length=model.config.max_position_embeddings, return_tensors='pt',
        ).to(model.device)
        proba = torch.softmax(model(**batch).logits, -1)
    return proba[0][1].item()

In [None]:
text1 = "–Ø –Ω–µ –ø–æ–º–Ω—é —Ç–µ–±—è"
text2 = "–Ø –Ω–µ –∑–Ω–∞—é —Ç–µ–±—è"
get_similarity(text1, text2)

0.05976682901382446

In [None]:
text1 = "–≠—Ç–æ –∫—Ä–∞—Å–∏–≤–∞—è –º–∞—à–∏–Ω–∞"
text2 = "–≠—Ç–∞ –º–∞—à–∏–Ω–∞ –≤—ã–≥–ª—è–¥–∏—Ç –∫—Ä–∞—Å–∏–≤–æ"
get_similarity(text1, text2)

0.7301298379898071

In [None]:
text1 = "–¢–≤–æ–π –æ—Ç–µ—Ü –≥–æ–≤–æ—Ä–∏–ª, —á—Ç–æ –Ω–µ –ª—é–±–∏—Ç —Ñ—É—Ç–±–æ–ª"
text2 = "–¢–≤–æ–π –æ—Ç–µ—Ü —Ä–∞—Å—Å–∫–∞–∑–∞–ª –æ —Å–≤–æ–µ–π –Ω–µ–ª—é–±–≤–∏ –∫ —Ñ—É—Ç–±–æ–ª—É"
get_similarity(text1, text2)

0.6921333074569702

In [None]:
text1 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç –∂–∏–≤–æ—Ç–Ω—ã—Ö"
text2 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç —Å–æ–±–∞–∫—É"
get_similarity(text1, text2)

0.49897482991218567

In [None]:
text1 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç –∂–∏–≤–æ—Ç–Ω—ã—Ö"
text2 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞—Ç—å –Ω–∞ Python"
get_similarity(text1, text2)

0.1102612093091011

In [None]:
text1 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç –∂–∏–≤–æ—Ç–Ω—ã—Ö"
text2 = "–ú–æ—è —Å–µ—Å—Ç—Ä–∞ –ª—é–±–∏—Ç –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞—Ç—å"
get_similarity(text1, text2)

0.08786703646183014

In [None]:
id2labels = { id:key for id, key in enumerate(label_list)}
example = tokenized_datasets["test"][0:15]
tokens = tokenizer(example['text_1'], example['text_1'], padding=True, truncation=True, return_tensors='pt')
tokens = tokens.to('cuda:0')
with torch.no_grad():
    outputs = model(**tokens)

predicted = outputs.logits.argmax(dim=-1).cpu().numpy()
classes = [id2labels[id_label] for id_label in predicted]
df_example =pd.DataFrame({'text_1':example['text_1'], 'text_2':example['text_2'], 'class':example['class'], 'predict':classes})
df_example

Unnamed: 0,text_1,text_2,class,predict
0,–¶–µ–Ω—ã –Ω–∞ –Ω–µ—Ñ—Ç—å –≤–æ—Å—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—é—Ç—Å—è,–ü–∞—Ä–ª–∞–º–µ–Ω—Ç –°–ª–æ–≤–∞–∫–∏–∏ –ø–æ–±–ª–∞–≥–æ–¥–∞—Ä–∏–ª –Ω–∞—Ä–æ–¥—ã –±—ã–≤—à–µ–≥–æ...,-1,1
1,"""–ì–æ–≥–æ–ª—å-—Ü–µ–Ω—Ç—Ä"" –ø–æ–∫–∞–∂–µ—Ç –≤–∏–¥–µ–æ–∑–∞–ø–∏—Å—å —Å–∫–∞–Ω–¥–∞–ª—å–Ω–æ–≥...",–ö–µ—Ö–º–∞–Ω –∑–∞–ø—Ä–µ—Ç–∏–ª ¬´–ì–æ–≥–æ–ª—å-—Ü–µ–Ω—Ç—Ä—É¬ª –ø–æ–∫–∞–∑—ã–≤–∞—Ç—å –≤–∏–¥...,-1,1
2,–ê–≥–µ–Ω—Ç: –†–§–° –≤–Ω–æ–≤—å –∑–∞–¥–µ—Ä–∂–∏–≤–∞–µ—Ç –∑–∞—Ä–ø–ª–∞—Ç—É –§–∞–±–∏–æ –ö–∞...,–°–ú–ò: –ê–≥–µ–Ω—Ç –§–∞–±–∏–æ –ö–∞–ø–µ–ª–ª–æ –≥—Ä–æ–∑–∏—Ç—Å—è –ø–æ–¥–∞—Ç—å –≤ —Å—É–¥...,-1,1
3,–î–µ–Ω—å –ü–æ–±–µ–¥—ã –≤ –ú–æ—Å–∫–≤–µ –æ–±–µ—â–∞–µ—Ç –≤—ã–¥–∞—Ç—å—Å—è –æ–±–ª–∞—á–Ω—ã–º,–õ—é–±–ª—è–Ω–∞ –æ—Ç–ø—Ä–∞–∑–¥–Ω—É–µ—Ç –î–µ–Ω—å –ü–æ–±–µ–¥—ã –≤–º–µ—Å—Ç–µ —Å –ú–æ—Å–∫–≤–æ–π,-1,1
4,–ü–æ—Å–æ–ª –†–§ –≤ –°–®–ê: –†–æ—Å—Å–∏—è –±—É–¥–µ—Ç –±–æ—Ä–æ—Ç—å—Å—è —Å –ø–æ–ø—ã—Ç–∫...,–ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –∑–∞–ø–ª–∞–Ω–∏—Ä–æ–≤–∞–ª–æ –∑–∞—Ä–∞–±–æ—Ç–∞—Ç—å –Ω–∞ –ª–æ—Ç–µ...,-1,1
5,–í–µ—Ä—Ç–æ–ª–µ—Ç —Å 11 –∏–Ω–æ—Å—Ç—Ä–∞–Ω—Ü–∞–º–∏ –Ω–∞ –±–æ—Ä—Ç—É —É–ø–∞–ª –≤ –ü–∞–∫...,–í –ü–∞–∫–∏—Å—Ç–∞–Ω–µ —É–ø–∞–ª –≤–µ—Ä—Ç–æ–ª–µ—Ç —Å 11 –∏–Ω–æ—Å—Ç—Ä–∞–Ω—Ü–∞–º–∏,1,1
6,–°–∞–º–æ–ª–µ—Ç –≤–µ—Ä–Ω—É–ª—Å—è –≤ –∞—ç—Ä–æ–ø–æ—Ä—Ç –ù–æ–≤–æ—Å–∏–±–∏—Ä—Å–∫–∞ –∏–∑-–∑–∞...,–°–∞–º–æ–ª–µ—Ç –≤–µ—Ä–Ω—É–ª—Å—è –≤ –Ω–æ–≤–æ—Å–∏–±–∏—Ä—Å–∫–∏–π –∞—ç—Ä–æ–ø–æ—Ä—Ç –∏–∑-–∑...,1,1
7,–í–∞—Å–∏–ª—å–µ–≤–∞ –ø—Ä–∏–∑–Ω–∞–Ω–∞ –≤–∏–Ω–æ–≤–Ω–æ–π –≤ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–µ –∏ ...,–í–∞—Å–∏–ª—å–µ–≤–∞ –ø—Ä–∏–∑–Ω–∞–Ω–∞ –≤–∏–Ω–æ–≤–Ω–æ–π –≤ —Ö–∏—â–µ–Ω–∏—è—Ö –∏ –æ—Ç–º—ã–≤...,0,1
8,–ü—É—Ç–∏–Ω –ø–µ—Ä–µ–¥ –î–Ω–µ–º –ü–æ–±–µ–¥—ã –ø–æ–¥–ø–∏—Å–∞–ª —É–∫–∞–∑ –æ –ø—Ä–∏—Å–≤–æ...,–°–ö –†–§: –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π —Å–∞–π—Ç –°–ª–µ–¥—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ –∫–æ–º–∏—Ç–µ—Ç–∞...,-1,1
9,–°—É–¥ –æ–ø—Ä–∞–≤–¥–∞–ª –í–∞—Å–∏–ª—å–µ–≤—É –≤ —Ö–∏—â–µ–Ω–∏–∏ –∞–∫—Ü–∏–π –Ω–∞ –¥–≤–∞ ...,–°—É–¥ –æ–ø—Ä–∞–≤–¥–∞–ª –í–∞—Å–∏–ª—å–µ–≤—É –≤ —Ö–∏—â–µ–Ω–∏–∏ –∞–∫—Ü–∏–π –Ω–∞ 2 –º–ª...,1,1
