In [236]:
import ast
import gc
import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Registers `progress_apply` on pandas objects; the column-wise transforms
# further down use it to show a progress bar.
tqdm.pandas()

In [172]:
# Load the per-item text table from the Kaggle input directory.
text_and_bert = pd.read_parquet(
    '/kaggle/input/extracted_data/text_and_bert.parquet',
    engine='pyarrow',
)

# Items without a description get the literal placeholder 'no desc', so the
# string handling further down never receives a missing value for this column.
filled_description = text_and_bert['description'].fillna('no desc')
text_and_bert['description'] = filled_description

In [173]:
# Only the two columns that are used downstream are read from the attributes file.
attrs = pd.read_parquet(
    '/kaggle/input/extracted_data/attributes.parquet',
    columns=['categories', 'characteristic_attributes_mapping'],
    engine='pyarrow',
)

# `categories` is stored as the text of a dict literal; key '2' is the
# second-level category. NOTE(review): this used to be `eval(x)`, which would
# execute arbitrary expressions found in the file. `ast.literal_eval` accepts
# literals only and returns the same value for any plain dict literal.
attrs['category_level_2'] = attrs['categories'].progress_apply(
    lambda raw_categories: ast.literal_eval(raw_categories)['2']
)

100%|██████████| 2252569/2252569 [00:51<00:00, 44054.05it/s]


In [174]:
# `pd.concat(axis=1)` pairs rows by index label and silently outer-joins when
# the labels differ, which would leave half-empty rows. Fail loudly instead.
# NOTE(review): this glue also assumes both parquet files list the items in
# the same order -- `attrs` was read without an id column, so that cannot be
# checked here. TODO confirm against the data source.
assert text_and_bert.index.equals(attrs.index), (
    "text_and_bert and attrs have different indexes; "
    "a column-wise concat would misalign or pad rows"
)
data = pd.concat([text_and_bert, attrs], axis=1)

In [175]:
# Everything needed from the two source frames has been concatenated into
# `data`; drop the extra references and run a collection pass right away.
del text_and_bert
del attrs
gc.collect()

2866

In [176]:
# Both patterns are compiled once here instead of on every call: the function
# is applied row by row to whole DataFrame columns.
_HTML_TAG_RE = re.compile('<.*?>')
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                       "]+", flags=re.UNICODE)


def remove_html_tags_and_emoji(text):
    """Strip HTML-like tags, flatten newlines and drop common emoji.

    Args:
        text: The string to clean, or None.

    Returns:
        None when `text` is None; otherwise `text` with every `<...>` span
        removed, each newline replaced by a single space, and every character
        in the four emoji ranges above removed.

    Note:
        Tags are removed before newlines are flattened and `.` does not match
        a newline, so a tag that itself contains a line break is left in place.
    """
    if text is None:
        return None
    without_tags = _HTML_TAG_RE.sub('', text)
    single_line = without_tags.replace('\n', ' ')
    return _EMOJI_RE.sub(r'', single_line)

# Clean the two free-text columns with the same routine, in the original order
# (description first, then name), one progress bar per column.
for text_column in ('description', 'name'):
    data[text_column] = data[text_column].progress_apply(remove_html_tags_and_emoji)

# Category labels are lower-cased.
data['category_level_2'] = data['category_level_2'].progress_apply(str.lower)

100%|██████████| 2252569/2252569 [01:04<00:00, 35189.63it/s]
100%|██████████| 2252569/2252569 [00:15<00:00, 144817.57it/s]
100%|██████████| 2252569/2252569 [00:03<00:00, 722564.60it/s]


In [177]:
# Table of item pairs to score; its id columns are renamed and joined against
# `data` in the next cell.
test_pairs = pd.read_parquet(
    '/kaggle/input/extracted_data/test.parquet',
    engine='pyarrow',
)

In [178]:
# Bring the pair-id column names in line with the `_1` / `_2` suffixes given
# to the item columns below. Rebinding (rather than `inplace=True`) keeps the
# step a plain expression.
test_pairs = test_pairs.rename(
    columns={
        'variantid1': 'variantid_1',
        'variantid2': 'variantid_2',
    }
)

# Attach the item columns for each side of the pair. `validate` makes pandas
# raise if an id matches several rows of `data`, which would otherwise
# duplicate pairs silently.
test_df = test_pairs.merge(
    data.add_suffix('_1'),
    on='variantid_1',
    validate='many_to_one',
).merge(
    data.add_suffix('_2'),
    on='variantid_2',
    validate='many_to_one',
)

# The merges are inner joins, so a pair whose id is missing from `data` would
# vanish without notice. Later cells line `test_pairs` and `test_df` up by
# position, so a dropped row would shift every score after it.
assert len(test_df) == len(test_pairs), (
    f"{len(test_pairs) - len(test_df)} test pairs have no matching row in `data`"
)

In [179]:
# Keep one category column per pair -- the first item's -- and discard both
# per-item copies.
test_df = (
    test_df
    .assign(category_level_2=test_df['category_level_2_1'])
    .drop(columns=['category_level_2_1', 'category_level_2_2'])
)

In [181]:
def get_dist_and_sim(dict1, dict2):
    """Split the attribute names shared by two items into differing and matching.

    Args:
        dict1: Text of a dict literal (attribute name -> value) for the first item.
        dict2: Text of a dict literal for the second item.

    Returns:
        A `(dist, sim)` tuple of lists. `dist` holds the keys present in both
        dicts whose values differ, `sim` the keys whose values are equal. Keys
        appear in the order they occur in `dict1`, so the result is the same on
        every run (iterating a set of strings is not). Both lists are empty
        when either input cannot be parsed into a dict.
    """
    try:
        # NOTE(review): this used to be `eval`, which would execute arbitrary
        # expressions found in the data. `literal_eval` accepts literals only
        # and yields the same object for a plain dict literal.
        parsed1, parsed2 = ast.literal_eval(dict1), ast.literal_eval(dict2)
        keys1, keys2 = parsed1.keys(), parsed2.keys()
        shared_keys = [key for key in keys1 if key in keys2]
    except Exception:
        # Best effort: unparsable or non-dict input contributes no attributes.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt through.
        return [], []

    dist, sim = [], []
    for key in shared_keys:
        if parsed1[key] != parsed2[key]:
            dist.append(key)
        else:
            sim.append(key)
    return dist, sim

# Columns read for every pair, in the order they are unpacked in the loop.
pair_columns = [
    'category_level_2',
    'name_1',
    'name_2',
    'description_1',
    'description_2',
    'characteristic_attributes_mapping_1',
    'characteristic_attributes_mapping_2',
]
# Placeholder label stored with every record.
target = -1

# One record per pair:
# (category, name1, name2, desc1, desc2, differing attrs, matching attrs, target).
# `itertuples` over just the needed columns avoids building a full Series per
# row, which is what `test_df.iloc[i]` does.
dataset = []
pair_rows = test_df[pair_columns].itertuples(index=False, name=None)
for category, name1, name2, desc1, desc2, mapping1, mapping2 in tqdm(pair_rows, total=len(test_df)):
    res_dist, res_similar = get_dist_and_sim(mapping1, mapping2)
    dataset.append(
        (category,
         name1,
         name2,
         desc1,
         desc2,
         ', '.join(res_dist),
         ', '.join(res_similar),
         target)
    )

100%|██████████| 49620/49620 [00:21<00:00, 2314.95it/s]


In [None]:
MODEL_NAME = "cointegrated/rubert-tiny2"


def _load_classifier(weights_path):
    """Build a 2-label classifier on MODEL_NAME and load weights from `weights_path`.

    Args:
        weights_path: Path to a state dict saved for this architecture.

    Returns:
        The model with the loaded weights, on CPU, in evaluation mode.
    """
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
    # Inference only: make sure dropout is off regardless of how the model was built.
    return model.eval()


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model_attr = _load_classifier('name_attr_bert_full.pth')
model_desc = _load_classifier('name_desc_bert_full.pth')

In [None]:
MAX_INPUT_CHARS = 1600   # characters kept from each assembled input string
MAX_INPUT_TOKENS = 768   # token cap handed to the tokenizer
DESC_WINDOW_CHARS = 300  # characters taken from each end of a description


def _encode(text):
    """Tokenize one assembled input string into unpadded, truncated tensors."""
    return tokenizer.encode_plus(
        text[:MAX_INPUT_CHARS],
        max_length=MAX_INPUT_TOKENS,
        padding=False,  # replaces the deprecated `pad_to_max_length=False`
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )


def _class1_logit(model, encoded):
    """Return the raw logit of label 1 for a single encoded example."""
    with torch.no_grad():
        logits = model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            token_type_ids=encoded['token_type_ids'],
        ).logits
    return logits[0][1].item()


# `dataset` was built row by row from `test_df`, so the ids are read from
# `test_df` too. Taking them from `test_pairs` by position is only correct if
# the merge kept every pair in place.
assert len(dataset) == len(test_df), "dataset is out of sync with test_df; rebuild it"

attr_scores, desc_scores = [], []
for category, name1, name2, desc1, desc2, dist, _sim, _target in tqdm(dataset):
    # ATTR: category, both names and the names of the attributes that differ.
    s_attr = category + '[SEP]' + name1 + '[SEP]' + name2 + '[SEP]' + dist

    # DESC: category, both names, then the head and the tail of each description.
    name = name1 + '[SEP]' + name2
    desc_start = desc1[:DESC_WINDOW_CHARS] + '[SEP]' + desc2[:DESC_WINDOW_CHARS]
    desc_end = desc1[-DESC_WINDOW_CHARS:] + '[SEP]' + desc2[-DESC_WINDOW_CHARS:]
    s_desc = category + '[SEP]' + 'Названия: ' + name + '[SEP]' + 'Описания: ' + desc_start + '[SEP]' + desc_end

    attr_scores.append(_class1_logit(model_attr, _encode(s_attr)))
    desc_scores.append(_class1_logit(model_desc, _encode(s_desc)))

eval_df = pd.DataFrame({
    "variantid_1": test_df['variantid_1'].to_numpy(),
    "variantid_2": test_df['variantid_2'].to_numpy(),
    "name_attr_bert_oof": attr_scores,
    "name_desc_bert_oof": desc_scores,
})
eval_df.to_parquet('test_berts.parquet')