In [None]:
import joblib
import pandas as pd
import numpy as np

In [None]:
# Load the training pairs from a parquet file resolved relative to the
# working directory. Later cells read the columns variantid_1/2, name_1/2,
# description_1/2 and is_double from this frame.
train = pd.read_parquet('train_texts.parquet')

In [8]:
# Order the rows by the id pair first, then apply a seeded full-length
# shuffle (frac=1 draws every row exactly once, without replacement).
# Presumably the sort makes the shuffle independent of the on-disk row
# order -- confirm that is the intent.
train = (
    train
    .sort_values(by=['variantid_1', 'variantid_2'])
    .sample(frac=1, random_state=42)
)

In [None]:
# Load the test pairs from a parquet file resolved relative to the working
# directory; it is scored with the same feature function as `train`.
test = pd.read_parquet('test_texts.parquet')

In [None]:
from rouge_metric import PyRouge

# Main scorer: n-gram overlap for n = 1..4 plus skip-bigram variants with a
# gap of 4. Its result dict carries the keys rouge-1 .. rouge-4, rouge-s4
# and rouge-su4. ROUGE-L is switched off and ROUGE-W is left to `metric_w`.
metric = PyRouge(
    rouge_n=(1, 2, 3, 4),
    rouge_l=False,
    rouge_s=True,
    rouge_su=True,
    skip_gap=4,
)

# Separate scorer that reports only ROUGE-W. The weight is left at the
# library default, which shows up in the result key as 'rouge-w-1.2'.
metric_w = PyRouge(
    rouge_n=(),
    rouge_l=False,
    rouge_w=True,
)

In [552]:
from razdel import tokenize, sentenize 
from functools import lru_cache

@lru_cache(maxsize=None)
def s(text):
    """Split ``text`` into a list of sentence strings using razdel's ``sentenize``.

    Results are memoised without a size limit, so a repeated text is split
    only once. The list handed back is the cached object itself, so callers
    must treat it as read-only.
    """
    sentences = []
    for span in sentenize(text):
        sentences.append(span.text)
    return sentences

@lru_cache(maxsize=None)
def t(text):
    """Split ``text`` into a list of token strings using razdel's ``tokenize``.

    Results are memoised without a size limit, so a repeated input is
    tokenised only once. The list handed back is the cached object itself,
    so callers must treat it as read-only.
    """
    tokens = []
    for span in tokenize(text):
        tokens.append(span.text)
    return tokens

# Smoke test: score two short two-word strings that share their first word
# with the ROUGE-W scorer, passing the cached razdel-based splitters as the
# sentence and token splitters.
metric_w.evaluate(['Диван лофт'], [['Диван книжка']], sentencizer=s, tokenizer=t)

{'rouge-w-1.2': {'r': 0.4352752816480622, 'p': 0.5, 'f': 0.46539803861923656}}

In [None]:
# Worked example on the third row labelled as a duplicate (is_double == 1).
# The pair is scored twice with the two sides swapped, so the displayed
# tuple shows how recall and precision trade places between directions.
row = train[train['is_double'] == 1].iloc[2]

(
    metric.evaluate(
        [row['name_1'] + '\n' + row['description_1']],
        [[row['name_2'] + '\n' + row['description_2']]],
        sentencizer=s,
        tokenizer=t,
    ),
    metric.evaluate(
        [row['name_2'] + '\n' + row['description_2']],
        [[row['name_1'] + '\n' + row['description_1']]],
        sentencizer=s,
        tokenizer=t,
    ),
)

({'rouge-1': {'r': 0.8095238095238095, 'p': 0.8125, 'f': 0.8110091743119265},
  'rouge-2': {'r': 0.6727941176470589,
   'p': 0.6752767527675276,
   'f': 0.6740331491712708},
  'rouge-3': {'r': 0.5830258302583026,
   'p': 0.5851851851851851,
   'f': 0.5841035120147874},
  'rouge-4': {'r': 0.5185185185185185,
   'p': 0.5204460966542751,
   'f': 0.5194805194805194},
  'rouge-s4': {'r': 0.6570370370370371,
   'p': 0.6594795539033457,
   'f': 0.6582560296846011},
  'rouge-su4': {'r': 0.6824907521578298,
   'p': 0.6850247524752475,
   'f': 0.6837554045707227}},
 {'rouge-1': {'r': 0.8125, 'p': 0.8095238095238095, 'f': 0.8110091743119265},
  'rouge-2': {'r': 0.6752767527675276,
   'p': 0.6727941176470589,
   'f': 0.6740331491712708},
  'rouge-3': {'r': 0.5851851851851851,
   'p': 0.5830258302583026,
   'f': 0.5841035120147874},
  'rouge-4': {'r': 0.5204460966542751,
   'p': 0.5185185185185185,
   'f': 0.5194805194805194},
  'rouge-s4': {'r': 0.6594795539033457,
   'p': 0.6570370370370371,
   '

In [None]:
def get_rogues(row):
    """Compute ROUGE F-scores between the two product texts of one pair.

    Each side's text is its name and description joined by a newline. Side 1
    is passed as the first argument to ``metric.evaluate`` and side 2 as the
    single entry of the nested second argument. Only this one direction is
    scored, and the ROUGE-W scorer is not used here.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'name_1', 'description_1', 'name_2' and 'description_2'.
        The values are assumed to be strings; ``str.join`` raises TypeError
        for anything else.

    Returns
    -------
    pd.Series
        The 'f' value of every entry in the result dict of
        ``metric.evaluate``, in that dict's iteration order, with a default
        positional index.
    """
    text_1 = '\n'.join((row['name_1'], row['description_1']))
    text_2 = '\n'.join((row['name_2'], row['description_2']))

    scores = metric.evaluate(
        [text_1],
        [[text_2]],
        sentencizer=s,
        tokenizer=t,
    )

    return pd.Series([per_metric['f'] for per_metric in scores.values()])

In [555]:
# Sanity check: run the feature function on a single training row and
# inspect the resulting Series of scores.
get_rogues(train.iloc[2])

0    0.200000
1    0.000000
2    0.000000
3    0.000000
4    0.050000
5    0.081081
dtype: float64

In [557]:
# Measure the per-row cost of the feature function on one duplicate pair
# before applying it to the full frames.
a = train[train['is_double'] == 1].iloc[2]
%timeit get_rogues(a)

7.27 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [516]:
from tqdm import tqdm
# Register tqdm with pandas; this is what provides the `progress_apply`
# method used on the DataFrames in the following cells.
tqdm.pandas()

In [558]:
# Row-wise feature extraction over the whole training frame, with a progress
# bar. Long-running: the recorded run took roughly 2 h 21 min.
rogues = train.progress_apply(get_rogues, axis=1)

100%|█████████████████████████████████████████████████████████████████████| 1879555/1879555 [2:21:24<00:00, 221.53it/s]


In [559]:
# Same row-wise feature extraction for the test frame. The recorded run took
# roughly 29 min.
rogues_test = test.progress_apply(get_rogues, axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 500000/500000 [29:27<00:00, 282.94it/s]


In [None]:
# Labels for the six score columns, listed in the order the scores are
# produced (rouge-1 .. rouge-4, then the skip-bigram variants s4 and su4).
colnames = [f'rouge_{suffix}' for suffix in ('1', '2', '3', '4', 's4', 'su4')]

In [566]:
# Replace the positional column labels of both score frames with the
# readable names. The assignment relies on `colnames` being in the same
# order as the scores emitted per row.
rogues.columns = colnames
rogues_test.columns = colnames

In [570]:
# Attach the pair identifiers to the score columns. The column-wise concat
# aligns the two frames on their shared row index.
id_cols = ['variantid_1', 'variantid_2']
train_with_rouge = pd.concat([train[id_cols], rogues], axis=1)
test_with_rouge = pd.concat([test[id_cols], rogues_test], axis=1)

In [575]:
# Display the assembled test features (ids plus the six score columns) for a
# visual check.
test_with_rouge

Unnamed: 0,variantid_1,variantid_2,rouge_1,rouge_2,rouge_3,rouge_4,rouge_s4,rouge_su4
0,00001e4f563a4dc91c63663a8b64068b82b5d046cc7b24...,883cb78f30a3cb90b26c674c4477464c11ac4fca8bf713...,0.402948,0.281481,0.248139,0.219451,0.240399,0.267220
1,00001e4f563a4dc91c63663a8b64068b82b5d046cc7b24...,afd827a7e59d8e855f0079b18d78048495757e2a2a4351...,0.403226,0.308108,0.271739,0.240437,0.260109,0.283636
2,00001e4f563a4dc91c63663a8b64068b82b5d046cc7b24...,d4969ce50032645ba4dfea0d4aeeb0316df579788caa22...,0.347368,0.238095,0.202128,0.176471,0.203209,0.226868
3,0000649b9fb42fec39328949bc74877e98f6687714136d...,ba6045b39544dd904b7be112d0802ee1fe15d279df137e...,0.603604,0.371041,0.268182,0.205479,0.310502,0.359422
4,00010817f43942cd236fc300f0d8d497127cf48bd65245...,04b0e2595bf4fe466418d4158f8acf724549825519676a...,0.186495,0.012903,0.000000,0.000000,0.011688,0.040541
...,...,...,...,...,...,...,...,...
499995,ffff336a8447d1c9f24c47fc85af5edd26671d68171523...,8ee26e594c6ab4ed3f368eee38b14ae7a8fd4712ecfe5a...,0.229508,0.000000,0.000000,0.000000,0.021818,0.053892
499996,ffff6b7f2e635225076944eb333e422d4a19d87d6c5b41...,42814661b581f992113b20e73b7386f5b5bc7be4f44105...,0.964187,0.930748,0.896936,0.885154,0.934454,0.939422
499997,ffff7c60d34773608a9ca373d2cb10d12d15357cbe63c4...,731f44210320229d83dbd906e41f37c2a871e6a6f007fe...,0.340426,0.130435,0.066667,0.045455,0.090909,0.131579
499998,ffff97e250e7e5e9c10c79ad9c9c36c1a2d7d019655ef7...,1e15aa5d08f69e0c6cfb35f228c0ecc5a1d15027ab9665...,0.970732,0.950980,0.931034,0.910891,0.946535,0.950577


In [576]:
# Persist both feature tables as CSV in the working directory. With the
# default `to_csv` settings the row index is written as the first column.
train_with_rouge.to_csv('train_rouge.csv')
test_with_rouge.to_csv('test_rouge.csv')