In [52]:
import json
import re
import pandas as pd
import numpy as np
import catboost as cb
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
from nltk import ngrams
import os
import dill

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from collections import Counter
from metric import pr_auc_macro
from utils import process_nan, try_cast_to_float, augment
from distances import cosine_distance, cosine_distances, jaccard
import Levenshtein
import jellyfish

from tqdm.auto import tqdm
# Register `progress_apply` / `progress_map` on pandas objects. `pandas` is
# called on the class rather than on a throwaway `tqdm()` instance, which
# would create (and display) an empty progress bar as a side effect.
tqdm.pandas()

0it [00:00, ?it/s]

In [53]:
# Item-level training data (read here, preprocessed below).
data = pd.read_parquet('../hackathon_files_for_participants_ozon/train_data.parquet')

# Test pairs to score (no target column, per the file name) and the item-level
# test data they refer to.
test_pairs = pd.read_parquet('../hackathon_files_for_participants_ozon/test_pairs_wo_target.parquet')
test_data = pd.read_parquet('../hackathon_files_for_participants_ozon/test_data.parquet')

In [54]:
def _merge_pictures(main_pic, rest_pic):
    """Stack an item's picture embeddings into a single array.

    Args:
        main_pic: sequence of embedding arrays for the main picture.
        rest_pic: sequence of embedding arrays for the other pictures,
            or ``None`` when the item has no additional pictures.

    Returns:
        ``np.stack(main_pic)`` alone when ``rest_pic`` is ``None``; otherwise
        the main stack followed by the stacked ``rest_pic`` rows.
    """
    stacked_main = np.stack(main_pic)
    if rest_pic is not None:
        return np.concatenate((stacked_main, np.stack(rest_pic)))
    return stacked_main


def _tokenize_attributes(attributes):
    """Flatten an attribute mapping into ``key=value`` tokens.

    Args:
        attributes: mapping of attribute name -> iterable of string values,
            or a falsy value (``None`` / empty mapping).

    Returns:
        A list with one ``"<key>=<value>"`` token per (key, value) pair, in
        mapping order, with spaces in both parts replaced by underscores.
        An empty list when ``attributes`` is falsy.
    """
    if not attributes:
        return []

    return [
        name.replace(' ', '_') + '=' + value.replace(' ', '_')
        for name, values in attributes.items()
        for value in values
    ]


import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# Module-level NLP resources shared by `preprocess_text`.
# NOTE: `stopwords.words('russian')` needs the NLTK "stopwords" corpus to be
# available locally.
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer('russian')
# Tokenizer that keeps runs of word characters.
_tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(text):
    """Lower-case, tokenize, drop stop words and stem a piece of text.

    Uses the module-level ``_tokenizer``, ``stop_words`` and ``stemmer``.

    Args:
        text: the string to process.

    Returns:
        List of stemmed tokens, in their original order.
    """
    return [
        stemmer.stem(token)
        for token in _tokenizer.tokenize(text.lower())
        if token not in stop_words
    ]


def preprocess_data(df):
    """Parse the raw item columns and add tokenized-name features.

    The input frame is left untouched: all work happens on a copy, so
    re-running a cell that calls this function on the same raw frame is safe.

    Args:
        df: item-level frame with at least the columns ``categories`` (JSON
            string), ``characteristic_attributes_mapping`` (JSON string or
            ``None``), ``main_pic_embeddings_resnet_v1``,
            ``pic_embeddings_resnet_v1`` and ``name``.

    Returns:
        A new DataFrame in which:
          * ``characteristic_attributes_mapping`` is parsed JSON (or ``None``);
          * ``all_pic_embeddings_resnet_v1`` holds main + other picture
            embeddings merged by ``_merge_pictures``;
          * ``categories`` is replaced by flattened ``category_*`` columns;
          * ``tokenized_name``, ``tokenized_attributes``,
            ``tokenized_name_ebi``, ``tokenized_name_en`` and
            ``tokenized_name_ru`` are added.
    """
    # Copy first so column assignments below never leak into the caller's frame.
    df = df.copy()

    # types
    df['categories'] = df['categories'].apply(json.loads)
    df['characteristic_attributes_mapping'] = (
        df['characteristic_attributes_mapping']
        .apply(lambda row: json.loads(row) if row is not None else None)
    )

    # merge all pictures
    df['all_pic_embeddings_resnet_v1'] = (
        df.apply(
            lambda row: _merge_pictures(row.main_pic_embeddings_resnet_v1, row.pic_embeddings_resnet_v1),
            axis=1,
        )
    )

    # normalize categories
    _categories = pd.json_normalize(df['categories'])
    _categories.columns = [f'category_{c}' for c in _categories]
    # Align by position: `pd.concat(axis=1)` joins on index labels, which would
    # silently produce NaN rows if `df` does not carry a default RangeIndex.
    _categories.index = df.index

    df = pd.concat([df.drop(columns=['categories']), _categories], axis=1)

    # tokenize
    df['tokenized_name'] = df['name'].apply(preprocess_text)
    df['tokenized_attributes'] = df['characteristic_attributes_mapping'].apply(_tokenize_attributes)
    # Tokens that contain a digit (model numbers, sizes, ...).
    # NOTE(review): inside the character class `|` is a literal pipe, not
    # alternation; left unchanged so features stay comparable with training.
    df['tokenized_name_ebi'] = df['name'].apply(lambda x: re.findall(r'\b[\w|/|\-|.]*\d\w*\b', x.lower()))
    # Purely Latin-letter words.
    df['tokenized_name_en'] = df['name'].apply(lambda x: re.findall(r'\b[a-zA-Z]+\b', x.lower()))
    # Purely Cyrillic words (the range а-я does not include 'ё').
    df['tokenized_name_ru'] = df['name'].apply(lambda x: re.findall(r'\b[а-яА-Я]+\b', x.lower()))

    # todo: ngrams of this ebi

    return df

In [55]:
# Parse JSON columns, merge picture embeddings and add tokenized-name columns
# for both the training items and the test items.
data = preprocess_data(data)
test_data = preprocess_data(test_data)

In [56]:
# Attach item-level columns to each pair: every `test_data` column is joined
# twice, suffixed `_1` for `variantid1` and `_2` for `variantid2`.
# `merge` defaults to an inner join, so a pair whose variant id is missing from
# `test_data` is dropped.
test_pairs = (
    test_pairs
    .merge(
        test_data
        .add_suffix('_1'),
        left_on=['variantid1'],
        right_on=['variantid_1'],
    )
    .merge(
        test_data
        .add_suffix('_2'),
        left_on=['variantid2'],
        right_on=['variantid_2'],
    )
# The suffixed id columns duplicate `variantid1` / `variantid2`.
).drop(['variantid_1', 'variantid_2'], axis=1)

In [57]:
# Pair-level feature frame saved to disk earlier; every cell below adds columns to it.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
all_pairs = pd.read_pickle('features_v2.pickle')

In [58]:
%%time

# Load the serialized model for `tokenized_name_ebi` (presumably a TF-IDF
# vectorizer, judging by the file name). A context manager is used so the file
# handle is closed deterministically.
# NOTE: dill/pickle execute arbitrary code on load -- only load trusted files.
with open('tfidf_tokenized_name_ebi.model', 'rb') as model_file:
    model = dill.load(model_file)

# Dense vector per row for each side of the pair, stored in scratch columns.
vectors = np.asarray(model.transform(all_pairs['tokenized_name_ebi_1']).todense())
all_pairs['_emb_1'] = list(vectors)

vectors = np.asarray(model.transform(all_pairs['tokenized_name_ebi_2']).todense())
all_pairs['_emb_2'] = list(vectors)

all_pairs['names_ebi_tfidf_distance'] = (
    all_pairs.progress_apply(lambda row: cosine_distance(row._emb_1, row._emb_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

CPU times: user 2.09 s, sys: 143 ms, total: 2.23 s
Wall time: 2.26 s




In [59]:
# variantid -> `ebi_vector` lookup; when a variantid occurs more than once the
# last occurrence wins.
ebi_vectors = (
    pd.read_pickle('../hackathon_files_for_participants_ozon/test_data_rubert_tiny_2.pickle')
    .drop_duplicates(subset=['variantid'], keep='last')
    .set_index('variantid')['ebi_vector']
    .to_dict()
)

In [60]:
# Look up the `ebi_vector` of each side of the pair. `_emb_1` / `_emb_2` are
# scratch columns that are overwritten by several cells. Variant ids absent
# from `ebi_vectors` map to NaN.
all_pairs['_emb_1'] = all_pairs['variantid1'].map(ebi_vectors)
all_pairs['_emb_2'] = all_pairs['variantid2'].map(ebi_vectors)

# Row-wise distance between the two vectors (`cosine_distance` is a project
# helper imported from `distances`).
all_pairs['ebi_vector_distance'] = (
    all_pairs.progress_apply(lambda row: cosine_distance(row._emb_1, row._emb_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['ebi_vector_distance'] = (


In [61]:
# variantid -> Latin-letter name tokens (last occurrence wins on duplicates).
f = (
    test_data[['variantid', 'tokenized_name_en']]
    .drop_duplicates(subset=['variantid'], keep='last')
    .set_index('variantid')['tokenized_name_en']
    .to_dict()
)

# Attach the tokens of each side of the pair.
for _side in ('1', '2'):
    all_pairs['tokenized_name_en_' + _side] = all_pairs['variantid' + _side].map(f)

  all_pairs['tokenized_name_en_1'] = all_pairs['variantid1'].map(f)
  all_pairs['tokenized_name_en_2'] = all_pairs['variantid2'].map(f)


In [62]:
# `jaccard` (project helper from `distances`) applied row-wise to the
# Latin-letter name tokens of the two items.
all_pairs['names_en_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(row.tokenized_name_en_1, row.tokenized_name_en_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_en_jaccard'] = (


In [63]:
# variantid -> Cyrillic name tokens (last occurrence wins on duplicates).
f = (
    test_data[['variantid', 'tokenized_name_ru']]
    .drop_duplicates(subset=['variantid'], keep='last')
    .set_index('variantid')['tokenized_name_ru']
    .to_dict()
)

# Attach the tokens of each side of the pair.
for _side in ('1', '2'):
    all_pairs['tokenized_name_ru_' + _side] = all_pairs['variantid' + _side].map(f)

  all_pairs['tokenized_name_ru_1'] = all_pairs['variantid1'].map(f)
  all_pairs['tokenized_name_ru_2'] = all_pairs['variantid2'].map(f)


In [64]:
# `jaccard` (project helper from `distances`) applied row-wise to the
# Cyrillic name tokens of the two items.
all_pairs['names_ru_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(row.tokenized_name_ru_1, row.tokenized_name_ru_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_ru_jaccard'] = (


In [65]:
import joblib
# Load the already-fitted vectorizer for the concatenated pair names.
model = joblib.load('./tfidf_joined.model')

# Use `transform`, not `fit_transform`: re-fitting here would replace the
# loaded vocabulary / IDF weights with ones learned from the test pairs, so the
# resulting `tfidf_vectorizer_<i>` columns would no longer mean what they meant
# when the model was saved.
vectors = model.transform(all_pairs.apply(
    lambda x: ' '.join(x.tokenized_name_1) + ' ' + ' '.join(x.tokenized_name_2),
    axis=1,
).values)

# Reuse `all_pairs.index` so the column-wise concat aligns row-for-row even
# when `all_pairs` does not carry a default RangeIndex.
vectors = pd.DataFrame(vectors.todense(), index=all_pairs.index).add_prefix('tfidf_vectorizer_')
all_pairs = pd.concat([all_pairs, vectors], axis=1)



In [66]:
# variantid -> `embedding` lookup; when a variantid occurs more than once the
# last occurrence wins.
embeddings = (
    pd.read_pickle('../hackathon_files_for_participants_ozon/test_bert_embeddings_768.pickle')
    .drop_duplicates(subset=['variantid'], keep='last')
    .set_index('variantid')['embedding']
    .to_dict()
)

In [67]:
# Look up the embedding of each side of the pair into the scratch columns
# `_emb_1` / `_emb_2` (overwriting their previous contents). Variant ids absent
# from `embeddings` map to NaN.
all_pairs['_emb_1'] = all_pairs['variantid1'].map(embeddings)
all_pairs['_emb_2'] = all_pairs['variantid2'].map(embeddings)

# Row-wise distance between the two embeddings (`cosine_distance` is a project
# helper imported from `distances`).
all_pairs['bert_768_vector_distance'] = (
    all_pairs.progress_apply(lambda row: cosine_distance(row._emb_1, row._emb_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['bert_768_vector_distance'] = (


In [68]:
def _jaccard_by_tokens(c1, c2, how='inner'):
    """Jaccard score between the word tokens of two strings.

    Both strings are lower-cased and split into ``\\w+`` tokens before being
    passed to ``jaccard``.

    Args:
        c1: first string.
        c2: second string.
        how: forwarded unchanged to ``jaccard``.

    Returns:
        The value returned by ``jaccard``, or ``None`` when either input is
        one of the placeholders ``'unknown'`` / ``'null'`` or when the inputs
        cannot be tokenized/compared (e.g. a non-string value).
    """
    _placeholders = ('unknown', 'null')
    # Check BOTH inputs for placeholder values.
    if c1 in _placeholders or c2 in _placeholders:
        return None

    _tokenizer = RegexpTokenizer(r'\w+')
    try:
        return jaccard(_tokenizer.tokenize(c1.lower()), _tokenizer.tokenize(c2.lower()), how=how)
    except Exception:
        # Best effort: missing / non-string values yield no feature value.
        # `Exception` (not a bare except) keeps Ctrl-C working during long applies.
        return None


# Token-level jaccard between each item's `complectation` text and its own
# name (computed per item, not across the pair); `how` keeps its default.
all_pairs['complectation_names_jaccard_1'] = (
    all_pairs.progress_apply(lambda row: _jaccard_by_tokens(row.complectation_1, row.name_1), axis=1)
)
all_pairs['complectation_names_jaccard_2'] = (
    all_pairs.progress_apply(lambda row: _jaccard_by_tokens(row.complectation_2, row.name_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['complectation_names_jaccard_1'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['complectation_names_jaccard_2'] = (


In [69]:
# Missing-value indicators for every column named `attribute_*_eq`.
# (`('_eq')` is just the string '_eq', not a one-element tuple.)
all_pairs_eq_isnull = all_pairs[[c for c in all_pairs if c.startswith('attribute_') and c.endswith(('_eq'))]].isnull()
# Name each indicator `<original column>_isnull` and append them to the frame.
all_pairs_eq_isnull = all_pairs_eq_isnull.add_suffix('_isnull')
all_pairs = pd.concat([all_pairs, all_pairs_eq_isnull], axis=1)

In [70]:
%%time


# Picture-distance matrices (`cosine_distances` is a project helper from
# `distances`): item 1 vs item 2, then each item's pictures against themselves.
_pic_1 = 'all_pic_embeddings_resnet_v1_1'
_pic_2 = 'all_pic_embeddings_resnet_v1_2'
_matrix_inputs = (
    ('pic_dists_mat', _pic_1, _pic_2),
    ('pic_dists_mat_1', _pic_1, _pic_1),
    ('pic_dists_mat_2', _pic_2, _pic_2),
)
for _target, _left, _right in _matrix_inputs:
    all_pairs[_target] = all_pairs.progress_apply(
        lambda row: cosine_distances(row[_left], row[_right]),
        axis=1,
    )

# Cross-item summary: median over rows of each row's minimum distance.
all_pairs['pic_dists_min_median'] = all_pairs['pic_dists_mat'].apply(
    lambda mat: np.median(mat.min(axis=1))
)

# Within-item summaries: mean / median / max of each self-distance matrix.
_reducers = (
    ('mean', lambda mat: mat.mean()),
    ('median', lambda mat: np.median(mat)),
    ('max', lambda mat: mat.max()),
)
for _side in ('1', '2'):
    for _stat, _reduce in _reducers:
        all_pairs[f'pic_dists_{_stat}_{_side}'] = all_pairs[f'pic_dists_mat_{_side}'].apply(_reduce)

  0%|          | 0/18084 [00:00<?, ?it/s]



  0%|          | 0/18084 [00:00<?, ?it/s]



  0%|          | 0/18084 [00:00<?, ?it/s]



CPU times: user 8.4 s, sys: 608 ms, total: 9.01 s
Wall time: 9.05 s


In [71]:
# Jaccard of the parsed colours, once with how='left' and once with
# how='right'; `process_nan` (project helper from `utils`) wraps `jaccard`.
for _how in ('left', 'right'):
    all_pairs[f'color_parsed_jaccard_{_how}'] = all_pairs.progress_apply(
        lambda row: process_nan(jaccard)(row.color_parsed_1, row.color_parsed_2, how=_how),
        axis=1,
    )

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['color_parsed_jaccard_left'] = all_pairs.progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['color_parsed_jaccard_right'] = all_pairs.progress_apply(


In [72]:
def euclidean_distance(emb1, emb2):
    """Euclidean distance between two embedding vectors.

    Args:
        emb1: array-like with a ``reshape`` method (e.g. ``numpy.ndarray``).
        emb2: array-like with a ``reshape`` method, same number of elements.

    Returns:
        The single distance value computed by scikit-learn's
        ``pairwise_distances`` on the two vectors, each reshaped to one row.
    """
    first_row = emb1.reshape(1, -1)
    second_row = emb2.reshape(1, -1)
    distance_matrix = pairwise_distances(first_row, second_row, metric='euclidean')
    return distance_matrix[0][0]

In [73]:
# Euclidean distance between the first main-picture embedding of each item
# (`[0]` picks the first element of the stored embedding collection).
all_pairs['main_pic_resnet_v1_euc_distance'] = (
    all_pairs.apply(
        lambda row: euclidean_distance(row.main_pic_embeddings_resnet_v1_1[0], row.main_pic_embeddings_resnet_v1_2[0]),
        axis=1,
    )
)

  all_pairs['main_pic_resnet_v1_euc_distance'] = (


In [74]:
# Euclidean distance between the `name_bert_64` vectors of the two items.
all_pairs['names_bert_64_euc_distance'] = (
    all_pairs.apply(lambda row: euclidean_distance(row.name_bert_64_1, row.name_bert_64_2), axis=1)
)

  all_pairs['names_bert_64_euc_distance'] = (


In [75]:
# Count, over the training items, how many items carry each attribute key.
cnts = Counter()
for c in data['characteristic_attributes_mapping'].values:
    # `c != c` is only true for NaN; falsy values (None, {}) are skipped too.
    if not (c != c) and c:
        cnts.update(set(c.keys()))

# The 100 most frequent attribute keys get their own pair feature.
attribute_keys = [name for name, _count in cnts.most_common(100)]
new_attributes_features = []

unknown_set = {'unknown'}

for key in tqdm(attribute_keys):
    # Value set of this attribute on each side: {'null'} when the item has no
    # mapping at all, {'unknown'} when the mapping lacks this key.
    for _side in ('1', '2'):
        all_pairs[f'_attr_{_side}'] = all_pairs[f'characteristic_attributes_mapping_{_side}'].apply(
            lambda mapping: {'null'} if mapping is None else set(mapping.get(key, ['unknown'])),
        )

    all_pairs[f'attribute_{key}_eq_jaccard'] = all_pairs.progress_apply(
        lambda pair: jaccard(pair['_attr_1'], pair['_attr_2']),
        axis=1,
    )

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attribute_{key}_eq_jaccard'] = (


In [76]:
def get_iphone_version(x):
    """Extract an iPhone-generation token from a product name.

    The name is lower-cased and searched for the first occurrence of one of
    ``7, 8, 11, 12, 13, 14, xr, xs, se, x``. The pattern is not anchored to the
    word "iphone", so it can also match inside unrelated names.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The matched token, or the string ``'None'`` when there is no match or
        ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(7|8|11|12|13|14|xr|xs|se|x)', x.lower())
    return match.group() if match else 'None'

    
def get_iphone_spec(x):
    """Extract an iPhone model qualifier from a product name.

    The name is lower-cased and searched for the first occurrence of one of
    ``mini``, ``pro max``, ``pro``, ``max``, ``plus``.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The matched qualifier, or the string ``'None'`` when there is no match
        or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(mini|pro max|pro|max|plus)', x.lower())
    return match.group() if match else 'None'


# Parse the iPhone version and qualifier out of both names of each pair.
_iphone_extractors = (
    ('iphone_version', get_iphone_version),
    ('iphone_spec', get_iphone_spec),
)
for _feature, _extract in _iphone_extractors:
    for _side in ('1', '2'):
        all_pairs.loc[:, f'{_feature}_{_side}'] = all_pairs[f'name_{_side}'].apply(_extract)

  all_pairs.loc[:, 'iphone_version_1'] = all_pairs['name_1'].apply(get_iphone_version)
  all_pairs.loc[:, 'iphone_version_2'] = all_pairs['name_2'].apply(get_iphone_version)
  all_pairs.loc[:, 'iphone_spec_1'] = all_pairs['name_1'].apply(get_iphone_spec)
  all_pairs.loc[:, 'iphone_spec_2'] = all_pairs['name_2'].apply(get_iphone_spec)


In [77]:
# Equality flags for the parsed tokens. Two names with no match both yield the
# string 'None', so they compare equal here.
all_pairs.loc[:, 'iphone_version_eq'] = all_pairs['iphone_version_1'] == all_pairs['iphone_version_2']
all_pairs.loc[:, 'iphone_spec_eq'] = all_pairs['iphone_spec_1'] == all_pairs['iphone_spec_2']

  all_pairs.loc[:, 'iphone_version_eq'] = all_pairs['iphone_version_1'] == all_pairs['iphone_version_2']
  all_pairs.loc[:, 'iphone_spec_eq'] = all_pairs['iphone_spec_1'] == all_pairs['iphone_spec_2']


In [78]:
def get_watch_size(x):
    """Extract a size in millimetres from a product name.

    Looks, in the lower-cased name, for the first run of digits that is
    followed (with at most one whitespace character in between) by ``mm`` or
    Cyrillic ``мм``.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The digits as a string, or the string ``'None'`` when there is no
        match or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(\d+)\s?(mm|мм)', x.lower())
    return match.group(1).strip(' ') if match else 'None'

    
def get_for_watches(x):
    """Extract the Latin-script target that follows "для" / "для часов".

    In the lower-cased name, finds ``для`` or ``для часов`` followed by one
    whitespace character and captures the following run of Latin letters,
    digits, whitespace and ``/``.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The captured text with surrounding spaces stripped, or the string
        ``'None'`` when there is no match or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(для|для часов)\s([a-zA-Z\s\d\/]+)', x.lower())
    return match.group(2).strip(' ') if match else 'None'

# Parse the size and the "для ..." target out of both names of each pair.
_watch_extractors = (
    ('watches_size', get_watch_size),
    ('watches_for', get_for_watches),
)
for _feature, _extract in _watch_extractors:
    for _side in ('1', '2'):
        all_pairs.loc[:, f'{_feature}_{_side}'] = all_pairs[f'name_{_side}'].apply(_extract)

  all_pairs.loc[:, 'watches_size_1'] = all_pairs['name_1'].apply(get_watch_size)
  all_pairs.loc[:, 'watches_size_2'] = all_pairs['name_2'].apply(get_watch_size)
  all_pairs.loc[:, 'watches_for_1'] = all_pairs['name_1'].apply(get_for_watches)
  all_pairs.loc[:, 'watches_for_2'] = all_pairs['name_2'].apply(get_for_watches)


In [79]:
# Equality flags for the parsed values. Two names with no match both yield the
# string 'None', so they compare equal here.
all_pairs.loc[:, 'watches_size_eq'] = all_pairs['watches_size_1'] == all_pairs['watches_size_2']
all_pairs.loc[:, 'watches_for_eq'] = all_pairs['watches_for_1'] == all_pairs['watches_for_2']

  all_pairs.loc[:, 'watches_size_eq'] = all_pairs['watches_size_1'] == all_pairs['watches_size_2']
  all_pairs.loc[:, 'watches_for_eq'] = all_pairs['watches_for_1'] == all_pairs['watches_for_2']


In [80]:
def get_length(x):
    """Extract a length in metres from a product name.

    Looks, in the lower-cased name, for the first run of digits followed (with
    at most one whitespace character in between) by Latin ``m`` or Cyrillic
    ``м``. Only the digits directly before the unit are captured, and any word
    starting with ``m``/``м`` (``mm``, ``мб``, ...) also satisfies the pattern.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The digits as a string, or the string ``'None'`` when there is no
        match or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(\d+)\s?(m|м)', x.lower())
    return match.group(1).strip(' ') if match else 'None'

# Parse the length out of both names of each pair.
all_pairs.loc[:, 'cable_length_1'] = all_pairs['name_1'].apply(get_length)
all_pairs.loc[:, 'cable_length_2'] = all_pairs['name_2'].apply(get_length)

  all_pairs.loc[:, 'cable_length_1'] = all_pairs['name_1'].apply(get_length)
  all_pairs.loc[:, 'cable_length_2'] = all_pairs['name_2'].apply(get_length)


In [81]:
# Equality flag; two names with no match both yield 'None' and compare equal.
all_pairs.loc[:, 'cable_length_eq'] = all_pairs['cable_length_1'] == all_pairs['cable_length_2']

  all_pairs.loc[:, 'cable_length_eq'] = all_pairs['cable_length_1'] == all_pairs['cable_length_2']


In [82]:
def get_pieces(x):
    """Extract a piece count ("N шт") from a product name.

    Looks, in the lower-cased name, for the first run of digits followed (with
    at most one whitespace character in between) by ``шт``.

    Args:
        x: product name; non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The digits as a string, or the string ``'None'`` when there is no
        match or ``x`` is not a string.
    """
    if not isinstance(x, str):
        return 'None'
    match = re.search(r'(\d+)\s?(шт)', x.lower())
    return match.group(1).strip(' ') if match else 'None'

# Parse the piece count out of both names of each pair.
all_pairs.loc[:, 'pieces_1'] = all_pairs['name_1'].apply(get_pieces)
all_pairs.loc[:, 'pieces_2'] = all_pairs['name_2'].apply(get_pieces)

  all_pairs.loc[:, 'pieces_1'] = all_pairs['name_1'].apply(get_pieces)
  all_pairs.loc[:, 'pieces_2'] = all_pairs['name_2'].apply(get_pieces)


In [83]:
# Equality flag; two names with no match both yield 'None' and compare equal.
all_pairs.loc[:, 'pieces_eq'] = all_pairs['pieces_1'] == all_pairs['pieces_2']

  all_pairs.loc[:, 'pieces_eq'] = all_pairs['pieces_1'] == all_pairs['pieces_2']


In [84]:
# Pattern: a known brand name, whitespace, then the "model" text that follows
# (Latin letters, digits, whitespace, '/', '-', '+').
# It is matched case-sensitively against the raw (not lower-cased) name.
# NOTE(review): the parentheses in "(Foxconn)" are not escaped, so they form a
# capturing group. That is why `re.findall` returns 2-tuples
# `(foxconn_group, model_text)` here, and `process` below depends on that shape
# (`len(x) == 2`, `x[1]`). Escaping them would require changing `process` too.
regex = '(?:Samsung|Apple|Sony|LG|Panasonic|Intel|Toshiba|Dell|HP|Microsoft|IBM|Lenovo|Nokia|Canon|Huawei|ASUS|Xiaomi|Cisco Systems|Hitachi|Philips|Fujitsu|Oracle|Acer|NEC|TCL|Sharp|GoPro|Vizio|Honeywell|Siemens|Western Digital|Hewlett Packard Enterprise|Alibaba|Jabil|Qualcomm|Texas Instruments|Nvidia|LG Display|SK Hynix|Hon Hai Precision Industry (Foxconn)|Ericsson|ZTE|Oppo|Vivo|Lenovo Group|BYD|Wistron|Pegatron|Flex|Wacom|Microchip Technology|Kyocera|OnePlus|Doogee|Palit|JBL|iPhone|IRONSET|Gigabyte|DVD-R|realme|RyzenPC|Honor|MSI|Hi-Black|Hoco|Phaser|M12|BBK|Razer|Redmi|Amazfit|Remax|Cactus|Dahua|Dyson|Baseus)\\s+([a-zA-Z\\s\\d\\/\\-\\+]+)'

def process(x):
    """Turn one `re.findall` match into a list of lower-cased model words.

    Args:
        x: a two-element sequence; the second element is the matched model
            text.

    Returns:
        The non-empty, lower-cased pieces of ``x[1]`` split on single spaces.

    Raises:
        Exception: if ``x`` does not have exactly two elements.
    """
    if len(x) != 2:
        raise Exception
    model_text = x[1]
    return list(filter(None, model_text.lower().split(' ')))

# For each name: one list of lower-cased model words per brand match found by
# `regex` (an empty list when no brand matches).
all_pairs['_models_1'] = all_pairs['name_1'].apply(lambda x: [process(t) for t in re.findall(regex, x)])
all_pairs['_models_2'] = all_pairs['name_2'].apply(lambda x: [process(t) for t in re.findall(regex, x)])

  all_pairs['_models_1'] = all_pairs['name_1'].apply(lambda x: [process(t) for t in re.findall(regex, x)])
  all_pairs['_models_2'] = all_pairs['name_2'].apply(lambda x: [process(t) for t in re.findall(regex, x)])


In [85]:
def jaccard_mat(vs1, vs2):
    """Pairwise ``jaccard`` values between two collections.

    Args:
        vs1: iterable of items for the rows.
        vs2: iterable of items for the columns.

    Returns:
        A list of lists where ``mat[i][j] == jaccard(vs1[i], vs2[j])``.
        An empty list when anything goes wrong (e.g. an input is not
        iterable or ``jaccard`` raises); partial results are discarded.
    """
    try:
        return [[jaccard(v1, v2) for v2 in vs2] for v1 in vs1]
    except Exception:
        # Best effort by design; `Exception` instead of a bare except keeps
        # KeyboardInterrupt working while this runs inside a long apply.
        return []

# Matrix of jaccard values between every model-word list found in name 1 and
# every one found in name 2.
all_pairs['jaccard_mat'] = (
    all_pairs.progress_apply(lambda row: jaccard_mat(row._models_1, row._models_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['jaccard_mat'] = (


In [86]:
# Scalar summaries of `jaccard_mat`; NaN when the matrix is empty or has an
# empty first row.
_jaccard_aggregates = {
    'max_jaccard': lambda mat: np.max(mat),
    'min_jaccard': lambda mat: np.min(mat),
    # Column-wise maxima (best row per column), then averaged / minimised.
    'max_mean_jaccard': lambda mat: np.mean(np.max(mat, axis=0)),
    'max_min_jaccard': lambda mat: np.min(np.max(mat, axis=0)),
}
for _column, _aggregate in _jaccard_aggregates.items():
    all_pairs[_column] = all_pairs['jaccard_mat'].progress_apply(
        lambda x: _aggregate(x) if x and x[0] else np.nan
    )

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_jaccard'] = all_pairs['jaccard_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['min_jaccard'] = all_pairs['jaccard_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_mean_jaccard'] = all_pairs['jaccard_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_min_jaccard'] = all_pairs['jaccard_mat'].progress_apply(


In [87]:
import Levenshtein

def levenstein_mat(vs1, vs2):
    """Pairwise ``Levenshtein.distance`` values between two collections.

    Args:
        vs1: iterable of items for the rows.
        vs2: iterable of items for the columns.

    Returns:
        A list of lists where
        ``mat[i][j] == Levenshtein.distance(vs1[i], vs2[j])``.
        An empty list when anything goes wrong (e.g. an input is not iterable
        or the distance call rejects its arguments); partial results are
        discarded.
    """
    try:
        return [[Levenshtein.distance(v1, v2) for v2 in vs2] for v1 in vs1]
    except Exception:
        # Best effort by design; `Exception` instead of a bare except keeps
        # KeyboardInterrupt working while this runs inside a long apply.
        return []

# Matrix of Levenshtein distances between every model-word list found in
# name 1 and every one found in name 2.
all_pairs['levenstein_mat'] = (
    all_pairs.progress_apply(lambda row: levenstein_mat(row._models_1, row._models_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['levenstein_mat'] = (


In [88]:
# Scalar summaries of `levenstein_mat`; NaN when the matrix is empty or has an
# empty first row.
_levenstein_aggregates = {
    'max_levenstein': lambda mat: np.max(mat),
    'min_levenstein': lambda mat: np.min(mat),
    # Column-wise maxima, then averaged / minimised.
    'max_mean_levenstein': lambda mat: np.mean(np.max(mat, axis=0)),
    'max_min_levenstein': lambda mat: np.min(np.max(mat, axis=0)),
}
for _column, _aggregate in _levenstein_aggregates.items():
    all_pairs[_column] = all_pairs['levenstein_mat'].progress_apply(
        lambda x: _aggregate(x) if x and x[0] else np.nan
    )

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_levenstein'] = all_pairs['levenstein_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['min_levenstein'] = all_pairs['levenstein_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_mean_levenstein'] = all_pairs['levenstein_mat'].progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['max_min_levenstein'] = all_pairs['levenstein_mat'].progress_apply(


In [89]:
# Text fragments enclosed in parentheses in each lower-cased name.
# The pattern is a raw string: the previous f-string had no placeholders and
# relied on the invalid escape sequences `\(` / `\)`, which newer Python
# versions warn about.
all_pairs['tokenized_ebi_silno_name_1'] = (
    all_pairs['name_1'].progress_apply(lambda x: re.findall(r'\((.*?)\)', x.lower()))
)
all_pairs['tokenized_ebi_silno_name_2'] = (
    all_pairs['name_2'].progress_apply(lambda x: re.findall(r'\((.*?)\)', x.lower()))
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['tokenized_ebi_silno_name_1'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['tokenized_ebi_silno_name_2'] = (


In [90]:
def process_empty(func):
    """Wrap a function of two collections so it short-circuits on empty input.

    Args:
        func: callable invoked as ``func(a, b, *args, **kwargs)``.

    Returns:
        A wrapper with the same call signature. It returns ``None`` when
        either ``a`` or ``b`` is falsy (``None``, an empty list, ...) and
        otherwise forwards every argument to ``func`` and returns its result.
    """
    # Local import keeps this cell self-contained.
    import functools

    # `wraps` preserves the wrapped function's name and docstring, which keeps
    # tracebacks and introspection meaningful.
    @functools.wraps(func)
    def wrapper(a, b, *args, **kwargs):
        if not a or not b:
            return None
        return func(a, b, *args, **kwargs)
    return wrapper


# Jaccard-based features over the bracketed-name token lists. The guarded
# variant yields None whenever either list of a pair is empty.
# It is built once here rather than re-created for every row.
jaccard_or_none = process_empty(jaccard)

all_pairs['tokenized_ebi_silno_name_jaccard'] = all_pairs.progress_apply(
    lambda row: jaccard_or_none(row.tokenized_ebi_silno_name_1, row.tokenized_ebi_silno_name_2),
    axis=1,
)

all_pairs['tokenized_name_ebi_silno_jaccard_left'] = all_pairs.progress_apply(
    lambda row: jaccard_or_none(row.tokenized_ebi_silno_name_1, row.tokenized_ebi_silno_name_2, how='left'),
    axis=1,
)

all_pairs['tokenized_name_ebi_silno_jaccard_right'] = all_pairs.progress_apply(
    lambda row: jaccard_or_none(row.tokenized_ebi_silno_name_1, row.tokenized_ebi_silno_name_2, how='right'),
    axis=1,
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['tokenized_ebi_silno_name_jaccard'] = all_pairs.progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['tokenized_name_ebi_silno_jaccard_left'] = all_pairs.progress_apply(


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['tokenized_name_ebi_silno_jaccard_right'] = all_pairs.progress_apply(


In [95]:
# Fragments enclosed in parentheses, taken from the lower-cased names.
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# as written instead of relying on Python's deprecated handling of invalid
# escape sequences in ordinary string literals.
all_pairs['_name_processed_1'] = all_pairs['name_1'].apply(lambda x: re.findall(r'\((.*?)\)', x.lower()))
all_pairs['_name_processed_2'] = all_pairs['name_2'].apply(lambda x: re.findall(r'\((.*?)\)', x.lower()))

# `jaccard` is called directly here, so empty fragment lists are passed
# through to it unchanged.
all_pairs['names_in_brackets_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(row._name_processed_1, row._name_processed_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_in_brackets_jaccard'] = (


In [96]:
# Alias, not a copy: both names refer to the same DataFrame object, so columns
# later assigned on `test_dataset` also appear on `all_pairs`.
test_dataset = all_pairs

In [97]:
# todo: try to add left and right jaccards and so on

# Column names placed ahead of `cat_features` when the model input frame is
# built: the explicit names listed below, followed by every `all_pairs` column
# that matches the `attribute_*` / `tfidf_vectorizer_*` patterns at the end of
# this expression.
# NOTE(review): presumably this order has to match the column order used when
# the saved models were trained — confirm against the training notebook.
features = [
    # basic

    # names 
    'name_num_1',
    'name_num_2',
    'name_num_diff',
    'tokenized_name_num_1',
    'tokenized_name_num_2',
    'tokenized_name_num_diff',
    
    'names_ebi_jaccard',
    'names_ebi_jaccard_max',
    'names_ebi_jaccard_mean',
    '_names_ebi_jaccard_left',
    '_names_ebi_jaccard_right',
    'tokenized_names_ebi_num_1',
    'tokenized_names_ebi_num_2', 
    'tokenized_names_ebi_num_diff',
    
    'tokenized_name_digits_1',
    'tokenized_name_digits_2',
    'tokenized_name_digits_diff',

    'names_ebi_levenstein',
    'names_ebi_hamming',
    'names_ebi_jaro_winkler',
    'names_ebi_jaro',
    'names_ebi_damerau_levenshtein',
    
    'names_jaccard',
    'names_jaccard_max',
    'names_jaccard_mean',
    'names_levenstein',
    'names_hamming',
    'names_jaro_winkler',
    'names_jaro',
    'names_damerau_levenshtein',
    'names_bert_64_distance',
    'names_w2v_distance',
    'names_tfidf_distance',
    
    'names_2grams_jaccard',
    'names_length_of_common_prefix',
    
    # colors
    'color_parsed_jaccard',
    'color_parsed_num_diff',
    
    # pictures
    'main_pic_resnet_v1_distance',
    'all_pic_distances_mean',
    'all_pic_distances_min',
    'all_pic_num_diff',    

    # attributes
    'attributes_jaccard',
    'attributes_jaccard_max',
    'attributes_jaccard_mean',
    'attributes_keys_jaccard',
    'attributes_values_avg_jaccard',
    'attributes_w2v_distance',
    'attributes_values_avg_fully_eq',
    
    'complectation_jaccard',
    'name_of_colors_jaccard',
    
    'sum_of_attr_eq',
    'sum_of_attr_both_unknown',
    'sum_of_attr_any_unknown',
    
    'attributes_num_this_category',
    'attributes_num_1',
    'attributes_num_2',
    'attributes_num_diff',
    'attributes_num_normed_1',
    'attributes_num_normed_2',
    'attributes_num_empty_1',
    'attributes_num_empty_2',
    
    'complectation_ebi_jaccard',
    'fuzzywuzzy_ratio', 
    'fuzzywuzzy_partial_ratio',
    'fuzzywuzzy_token_sort_ratio', 
    'fuzzywuzzy_token_set_ratio',
    
    'all_pic_distances_min_mean',
    'all_pic_distances_min_max',
    'all_pic_num_1',
    'all_pic_num_2',
    
    'attr_color_jaccard',
    'attr_color_jaccard_max',
    'attr_color_jaccard_mean',
    
    'attributes_keys_top_1_for_category_jaccard',
    'attributes_values_top_1_for_category_jaccard',
    'attributes_keys_top_1_for_category_jaccard_left',
    'attributes_values_top_1_for_category_jaccard_left',
    'attributes_keys_top_1_for_category_jaccard_right',
    'attributes_values_top_1_for_category_jaccard_right',
    'attributes_values_top_1_for_category_fully_eq',
    
    'attributes_keys_top_3_for_category_jaccard',
    'attributes_values_top_3_for_category_jaccard',
    'attributes_values_top_3_for_category_fully_eq',
    'attributes_keys_top_3_for_category_jaccard_left',
    'attributes_values_top_3_for_category_jaccard_left',
    'attributes_keys_top_3_for_category_jaccard_right',
    'attributes_values_top_3_for_category_jaccard_right',
    
    'attributes_keys_top_5_for_category_jaccard',
    'attributes_values_top_5_for_category_jaccard',
    'attributes_values_top_5_for_category_fully_eq',
    'attributes_keys_top_5_for_category_jaccard_left',
    'attributes_values_top_5_for_category_jaccard_left',
    'attributes_keys_top_5_for_category_jaccard_right',
    'attributes_values_top_5_for_category_jaccard_right',
    
    'attributes_keys_top_10_for_category_jaccard',
    'attributes_values_top_10_for_category_jaccard',
    'attributes_values_top_10_for_category_fully_eq',
    'attributes_keys_top_10_for_category_jaccard_left',
    'attributes_values_top_10_for_category_jaccard_left',
    'attributes_keys_top_10_for_category_jaccard_right',
    'attributes_values_top_10_for_category_jaccard_right',
    
    'attributes_div',
    
    'names_no_digits_jaccard',
    'names_ebi_levenstein_mean',
    'names_in_brackets_levenstein_mean',
    'names_in_brackets_jaccard',
    'names_by_comma_jaccard',
    
    'names_ebi_tfidf_distance',
    'ebi_vector_distance',
    'names_ru_jaccard',
    'names_en_jaccard',
    'bert_768_vector_distance',
    'complectation_names_jaccard_1',
    'complectation_names_jaccard_2',
    
    'color_parsed_jaccard_left',
    'color_parsed_jaccard_right',
    'pic_dists_min_median',
    'pic_dists_mean_1',
    'pic_dists_median_1',
    'pic_dists_max_1',
    'pic_dists_mean_2',
    'pic_dists_median_2',
    'pic_dists_max_2',
    'main_pic_resnet_v1_euc_distance',
    'names_bert_64_euc_distance',
    
    'cable_length_eq',
    'watches_size_eq',
    'watches_for_eq',
    'iphone_version_eq',
    'iphone_spec_eq',
    'pieces_eq',
    
    'tokenized_name_ebi_silno_jaccard_right',
    'tokenized_name_ebi_silno_jaccard_left',
    'tokenized_ebi_silno_name_jaccard',
    'max_min_levenstein',
    'max_mean_levenstein',
    'max_levenstein',
    'min_levenstein',
    'max_min_jaccard',
    'max_mean_jaccard',
    'max_jaccard',
    'min_jaccard',
] + (
    # every `attribute_*` column whose name ends with one of the listed suffixes,
    # in the order the columns appear in `all_pairs`
    [c for c in all_pairs if c.startswith('attribute_') and c.endswith(('_eq', '_unknown', '_isnull', '_eq_jaccard'))]
) + (
    # every column whose name starts with `tfidf_vectorizer_`, in `all_pairs` order
    [c for c in all_pairs if c.startswith('tfidf_vectorizer_')]
)

# Column names appended after `features` when the model input frame is built.
# NOTE(review): presumably these are the columns the saved CatBoost models
# treat as categorical — confirm against the training notebook.
cat_features = [
    'category_3_1', 
    'category_4_1',
    'category_3_2', 
    'category_4_2',
    'brand_1',
    'brand_2',
    'type_1',
    'type_2',
    'name_1',
    'name_2',
    'variantid1',
    'variantid2',
    'cable_length_1',
    'cable_length_2',
    'watches_for_1',
    'watches_for_2',
    'iphone_version_1',
    'iphone_version_2',
    'iphone_spec_1',
    'iphone_spec_2',
    'pieces_1',
    'pieces_2',
]

In [100]:
# Sanity check: number of entries in `features` and in `cat_features`.
print(len(features), len(cat_features))

2866 22


In [104]:
from sklearn.model_selection import KFold


# Score the test pairs with each of the five saved models
# (`model_0.cb` ... `model_4.cb`) and keep column 1 of `predict_proba`.
# NOTE: despite its name, `oof_predictions` collects one array of test-set
# predictions per saved model.
oof_predictions = []

# The column subset is the same for every model, so select it once instead
# of re-slicing the wide frame on each iteration.
model_inputs = test_dataset[features + cat_features]

for i in tqdm(range(5)):
    model = cb.CatBoostClassifier().load_model(f'model_{i}.cb')
    oof_predictions.append(model.predict_proba(model_inputs)[:, 1])

  0%|          | 0/5 [00:00<?, ?it/s]

In [105]:
# Average the per-model prediction arrays element-wise (axis 0 runs over the
# entries of `oof_predictions`, i.e. over models).
test_dataset['prediction'] = np.mean(oof_predictions, axis=0)

  test_dataset['prediction'] = np.mean(oof_predictions, axis=0)


In [106]:
# Keep the pair identifiers plus the averaged score, expose the score under
# the column name `target`, and write the result without the index.
submission_columns = ['variantid1', 'variantid2', 'prediction']
submission = test_dataset[submission_columns].rename(columns={'prediction': 'target'})
submission.to_csv('submission.csv', index=False)