In [1]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForCausalLM
import torch
import torch.nn.functional
from tqdm.auto import tqdm
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#device = 'cpu'

In [3]:
labse_name = 'cointegrated/LaBSE-en-ru'
labse_model = AutoModel.from_pretrained(labse_name)
labse_tokenizer = AutoTokenizer.from_pretrained(labse_name)
if torch.cuda.is_available():
    labse_model.cuda()

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
mname = 'sberbank-ai/rugpt3small_based_on_gpt2'
gpt_tokenizer = AutoTokenizer.from_pretrained(mname)
gpt_model = AutoModelForCausalLM.from_pretrained(mname)
if torch.cuda.is_available():
    gpt_model.cuda()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def encode_labse(texts):
    encoded_input = labse_tokenizer(
        texts, padding=True, truncation=True, max_length=64, return_tensors='pt'
    ).to(labse_model.device)
    with torch.no_grad():
        model_output = labse_model(**encoded_input)
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings.cpu().numpy()


def get_sims(df, batch_size=32):
    sims = []
    for i in range(0, df.shape[0], batch_size):
        batch = df.iloc[i: i+batch_size]
        e1 = encode_labse(batch.text1.tolist())
        e2 = encode_labse(batch.text2.tolist())
        sims.extend((e1 * e2).sum(axis=1))
    return np.array(sims)


def get_random_sims(df, batch_size=32, random_state=1):
    df2 = pd.DataFrame({
        'text1': df.text1.tolist(),
        'text2': df.text2.sample(frac=1.0, random_state=random_state).tolist()
    })
    return get_sims(df2, batch_size=batch_size)


def get_bleu(df):
    return np.array([sentence_bleu([row.text1], row.text2) for i, row in df.iterrows()])


def ngrams(word, n=3):
    return [word[i: i+n] for i in range(len(word)-n+1)]


def common_grams(text1, text2):
    g1 = {g for w in text1.lower().split() for n in range(3, 7) for g in ngrams(f' {w} ', n=n)}
    g2 = {g for w in text2.lower().split() for n in range(3, 7) for g in ngrams(f' {w} ', n=n)}
    return len(g1.intersection(g2)) / len(g1.union(g2))


def get_char_ngram_overlap(df):
    return np.array([common_grams(row.text1, row.text2) for i, row in df.iterrows()])


def calc_gpt2_ppl_corpus(test_sentences, aggregate=False, sep='\n'):
    """ Calculate average perplexity per token and number of tokens in each text."""
    lls = []
    weights = []
    for text in tqdm(test_sentences):
        encodings = gpt_tokenizer(f'{sep}{text}{sep}', return_tensors='pt')
        input_ids = encodings.input_ids.to(gpt_model.device)
        target_ids = input_ids.clone()

        w = max(0, len(input_ids[0]) - 1)
        if w > 0:
            with torch.no_grad():
                outputs = gpt_model(input_ids, labels=target_ids)
                log_likelihood = outputs[0]
                ll = log_likelihood.item()
        else:
            ll = 0
        lls.append(ll)
        weights.append(w)
    likelihoods, weights = np.array(lls), np.array(weights)
    if aggregate:
        return sum(likelihoods * weights) / sum(weights)
    return likelihoods, weights


def analyze_pairs(texts1, texts2):
    df = pd.DataFrame({'text1': texts1, 'text2': texts2})
    b1 = get_bleu(df)
    b2 = get_bleu(pd.DataFrame({'text1': texts2, 'text2': texts1}))
    p1, w1 = calc_gpt2_ppl_corpus(df.text1.tolist())
    p2, w2 = calc_gpt2_ppl_corpus(df.text2.tolist())
    return {
        'sim': get_sims(df).mean(),
        'sim_random': get_random_sims(df).mean(),
        'bleu_1': b1.mean(),
        'bleu_2': b2.mean(),
        'bleu': (b1+b2).mean() / 2,
        'char_ngram_overlap': get_char_ngram_overlap(df).mean(),
        'perp_1': (p1 * w1).sum() / w1.sum(),
        'perp_2': (p2 * w2).sum() / w2.sum(),
        'perp_mean': (p1 * w1 + p2 * w2).sum() / (w1 + w1).sum(),
    }

In [6]:
train = pd.read_csv('data\\train.csv', index_col=0)
train_small = pd.read_csv('data\\train_small.csv', index_col=0)
val = pd.read_csv('data\\eval.csv', index_col=0)
test = pd.read_csv('data\\test.csv', index_col=0)

In [7]:
data = pd.concat([train, train_small, val, test], axis=0)

In [8]:
data = data[data['size'].isin(['small','medium','large'])]

In [9]:
data['type'].value_counts()

para_phraser    715772
ru_xlsum         52010
ru_adapt         41494
gazeta           33894
ru_simp           6432
Name: type, dtype: int64

In [23]:
corpus_list = ['ru_xlsum', 'ru_adapt', 'gazeta', 'ru_simp','para_phraser']
result_list = []
for i in corpus_list:
    subdata = data[data['type']==i]
    result = analyze_pairs(subdata.source.values, subdata.target.values)
    result['text_name'] = i
    result_list.append(result)

ru_xlsum


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 52010/52010 [13:09<00:00, 65.89it/s] 
100%|██████████| 52010/52010 [06:27<00:00, 13

ru_adapt


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 41494/41494 [05:12<00:00, 132.72it/s]
100%|██████████| 41494/41494 [04:56<00:00, 14

gazeta


100%|██████████| 33894/33894 [14:05<00:00, 40.11it/s]
100%|██████████| 33894/33894 [04:10<00:00, 135.40it/s]


ru_simp


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

para_phraser


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [24]:
result = pd.DataFrame(result_list)

In [27]:
result

Unnamed: 0,sim,sim_random,bleu_1,bleu_2,bleu,char_ngram_overlap,perp_1,perp_2,perp_mean,text_name
0,0.537202,0.26643,0.010702,0.08327,0.046986,0.079309,3.031028,3.064053,1.648707,ru_xlsum
1,0.742496,0.221048,0.528971,0.575525,0.552248,0.529097,4.057999,3.70735,3.067881,ru_adapt
2,0.686615,0.25975,0.000195,0.070985,0.03559,0.093601,2.986411,3.13157,1.61995,gazeta
3,0.77109,0.272943,0.365496,0.427363,0.39643,0.341063,4.236945,4.118633,3.33526,ru_simp
4,0.662475,0.192331,0.320993,0.358986,0.33999,0.26877,4.650053,4.097087,3.529901,para_phraser


In [25]:
result.to_csv('data_info.csv')