In [1]:
import json
import re
import pandas as pd
import numpy as np
import catboost as cb
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
from nltk import ngrams
import os
import dill

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from collections import Counter
from metric import pr_auc_macro
from utils import process_nan, try_cast_to_float, augment
from distances import cosine_distance, cosine_distances, jaccard
import Levenshtein
import jellyfish

from tqdm.auto import tqdm
tqdm().pandas()

0it [00:00, ?it/s]

In [2]:
data = pd.read_parquet('../hackathon_files_for_participants_ozon/train_data.parquet')

test_pairs = pd.read_parquet('../hackathon_files_for_participants_ozon/test_pairs_wo_target.parquet')
test_data = pd.read_parquet('../hackathon_files_for_participants_ozon/test_data.parquet')

In [3]:
def _merge_pictures(main_pic, rest_pic):
    if rest_pic is None:
        return np.stack(main_pic)
    
    return np.concatenate((np.stack(main_pic), np.stack(rest_pic)))


def _tokenize_attributes(attributes):
    if not attributes:
        return []

    tokens = []
    
    for k, vs in attributes.items():
        for v in vs:
            token = ''
            token += k.replace(' ', '_')
            token += '='
            token += v.replace(' ', '_')
            tokens.append(token)
    return tokens


import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer('russian')
_tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(text):
    # Tokenize the text into individual words
    tokens = _tokenizer.tokenize(text.lower())

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming using the Snowball stemmer
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    return stemmed_tokens


def preprocess_data(df):
    # types
    df['categories'] = df['categories'].apply(json.loads)
    df['characteristic_attributes_mapping'] = (
        df['characteristic_attributes_mapping']
        .apply(lambda row: json.loads(row) if row is not None else None)
    )
    
    # merge all pictures
    df['all_pic_embeddings_resnet_v1'] = (
        df.apply(
            lambda row: _merge_pictures(row.main_pic_embeddings_resnet_v1, row.pic_embeddings_resnet_v1),
            axis=1,
        )
    )
    
    # normalize categories
    _categories = pd.json_normalize(df['categories'])
    _categories.columns = [f'category_{c}' for c in _categories]

    df = pd.concat([df, _categories], axis=1)
    df.drop(['categories'], axis=1, inplace=True)
    
    # tokenize
    _tokenizer = RegexpTokenizer(r'\w+')

    df['tokenized_name'] = df['name'].apply(preprocess_text)
    df['tokenized_attributes'] = df['characteristic_attributes_mapping'].apply(_tokenize_attributes)
    df['tokenized_name_ebi'] = df['name'].apply(lambda x: re.findall(r'\b[\w|/|-|.]*\d\w*\b', x.lower()))
    
    # todo: ngrams of this ebi
    return df

In [4]:
data = preprocess_data(data)
test_data = preprocess_data(test_data)

In [5]:
test_pairs = (
    test_pairs
    .merge(
        test_data
        .add_suffix('_1'),
        left_on=['variantid1'],
        right_on=['variantid_1'],
    )
    .merge(
        test_data
        .add_suffix('_2'),
        left_on=['variantid2'],
        right_on=['variantid_2'],
    )
).drop(['variantid_1', 'variantid_2'], axis=1)

In [6]:
all_pairs = test_pairs

In [7]:
all_pairs['tokenized_name_num_1'] = all_pairs['tokenized_name_1'].apply(len)
all_pairs['tokenized_name_num_2'] = all_pairs['tokenized_name_2'].apply(len)
all_pairs['tokenized_name_num_diff'] = (all_pairs['tokenized_name_num_2'] - all_pairs['tokenized_name_num_1']).abs()

all_pairs['name_num_1'] = all_pairs['name_1'].apply(len)
all_pairs['name_num_2'] = all_pairs['name_2'].apply(len)
all_pairs['name_num_diff'] = (all_pairs['name_num_2'] - all_pairs['name_num_1']).abs()

In [8]:
all_pairs['tokenized_name_digits_1'] = all_pairs['tokenized_name_1'].apply(lambda s: sum(w.isdigit() for w in s))
all_pairs['tokenized_name_digits_2'] = all_pairs['tokenized_name_2'].apply(lambda s: sum(w.isdigit() for w in s))
all_pairs['tokenized_name_digits_diff'] = (
    all_pairs['tokenized_name_digits_2'] - all_pairs['tokenized_name_digits_1']
).abs()

In [9]:
all_pairs['names_jaccard'] = (
    all_pairs.apply(lambda row: jaccard(row.tokenized_name_1, row.tokenized_name_2), axis=1)
)

for how in ['left', 'right']:
    all_pairs[f'_names_jaccard_{how}'] = (
        all_pairs.apply(lambda row: jaccard(row.tokenized_name_1, row.tokenized_name_2, how=how), axis=1)
    )
    
all_pairs['names_jaccard_max'] = (
    all_pairs.apply(lambda row: max(row._names_jaccard_left, row._names_jaccard_right), axis=1)
)

all_pairs['names_jaccard_mean'] = (
    all_pairs.apply(lambda row: (row._names_jaccard_left + row._names_jaccard_right) / 2, axis=1)
)

In [10]:
all_pairs['names_ebi_jaccard'] = (
    all_pairs.apply(lambda row: jaccard(row.tokenized_name_ebi_1, row.tokenized_name_ebi_2), axis=1)
)

for how in ['left', 'right']:
    all_pairs[f'_names_ebi_jaccard_{how}'] = (
        all_pairs.apply(lambda row: jaccard(row.tokenized_name_ebi_1, row.tokenized_name_ebi_2, how=how), axis=1)
    )
    
all_pairs['names_ebi_jaccard_max'] = (
    all_pairs.apply(lambda row: max(row._names_ebi_jaccard_left, row._names_ebi_jaccard_right), axis=1)
)

all_pairs['names_ebi_jaccard_mean'] = (
    all_pairs.apply(lambda row: (row._names_ebi_jaccard_left + row._names_ebi_jaccard_right) / 2, axis=1)
)

In [11]:
all_pairs['tokenized_names_ebi_num_1'] = all_pairs['tokenized_name_ebi_1'].apply(len)
all_pairs['tokenized_names_ebi_num_2'] = all_pairs['tokenized_name_ebi_2'].apply(len)
all_pairs['tokenized_names_ebi_num_diff'] = (
    all_pairs['tokenized_names_ebi_num_1'] - all_pairs['tokenized_names_ebi_num_2']
).abs()

In [12]:
f = lambda s: list(ngrams(s, 2))

all_pairs['names_2grams_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(f(row.tokenized_name_1), f(row.tokenized_name_2)), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

In [13]:
all_pairs['names_length_of_common_prefix'] = (
    all_pairs.progress_apply(lambda row: len(os.path.commonprefix([row.name_1, row.name_2])), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

In [14]:
all_pairs['names_levenstein'] = (
    all_pairs.apply(lambda row: Levenshtein.distance(row.name_1, row.name_2), axis=1)
)
all_pairs['names_hamming'] = (
    all_pairs.apply(lambda row: Levenshtein.hamming(row.name_1, row.name_2), axis=1)
)
all_pairs['names_jaro_winkler'] = (
    all_pairs.apply(lambda row: jellyfish.jaro_winkler_similarity(row.name_1, row.name_2), axis=1)
)
all_pairs['names_jaro'] = (
    all_pairs.apply(lambda row: jellyfish.jaro_similarity(row.name_1, row.name_2), axis=1)
)
all_pairs['names_damerau_levenshtein'] = (
    all_pairs.apply(lambda row: jellyfish.damerau_levenshtein_distance(row.name_1, row.name_2), axis=1)
)

In [15]:
all_pairs['names_ebi_levenstein'] = (
    all_pairs.apply(lambda row: Levenshtein.distance(
        ' '.join(row.tokenized_name_ebi_1), ' '.join(row.tokenized_name_ebi_2)
    ), axis=1)
)
all_pairs['names_ebi_hamming'] = (
    all_pairs.apply(lambda row: Levenshtein.hamming(
        ' '.join(row.tokenized_name_ebi_1), ' '.join(row.tokenized_name_ebi_2)
    ), axis=1)
)
all_pairs['names_ebi_jaro_winkler'] = (
    all_pairs.apply(lambda row: jellyfish.jaro_winkler_similarity(
        ' '.join(row.tokenized_name_ebi_1), ' '.join(row.tokenized_name_ebi_2)
    ), axis=1)
)
all_pairs['names_ebi_jaro'] = (
    all_pairs.apply(lambda row: jellyfish.jaro_similarity(
         ' '.join(row.tokenized_name_ebi_1), ' '.join(row.tokenized_name_ebi_2)
    ), axis=1)
)
all_pairs['names_ebi_damerau_levenshtein'] = (
    all_pairs.apply(lambda row: jellyfish.damerau_levenshtein_distance(
        ' '.join(row.tokenized_name_ebi_1), ' '.join(row.tokenized_name_ebi_2)
    ), axis=1)
)

In [16]:
all_pairs['names_bert_64_distance'] = (
    all_pairs.apply(lambda row: cosine_distance(row.name_bert_64_1, row.name_bert_64_2), axis=1)
)

In [17]:
model = dill.load(open('./w2v_tokenized_name.model', 'rb'))

all_pairs['_emb_1'] = all_pairs['tokenized_name_1'].apply(model.wv.get_mean_vector)
all_pairs['_emb_2'] = all_pairs['tokenized_name_2'].apply(model.wv.get_mean_vector)

all_pairs['names_w2v_distance'] = (
    all_pairs.apply(lambda row: cosine_distance(row._emb_1, row._emb_2), axis=1)
)

In [18]:
import joblib

model = joblib.load('./tfidf_name.model')

vectors = np.asarray(model.transform(all_pairs['name_1']).todense())
all_pairs['_emb_1'] = list(vectors)

vectors = np.asarray(model.transform(all_pairs['name_2']).todense())
all_pairs['_emb_2'] = list(vectors)

all_pairs['names_tfidf_distance'] = (
    all_pairs.apply(lambda row: cosine_distance(row._emb_1, row._emb_2), axis=1)
)

In [19]:
all_pairs['color_parsed_jaccard'] = (
    all_pairs.apply(lambda row: process_nan(jaccard)(row.color_parsed_1, row.color_parsed_2), axis=1)
)

for how in ['left', 'right']:
    all_pairs[f'_color_parsed_jaccard_{how}'] = (
        all_pairs.apply(lambda row: process_nan(jaccard)(row.color_parsed_1, row.color_parsed_2, how=how), axis=1)
    )
    
all_pairs['color_parsed_jaccard_max'] = (
    all_pairs.apply(lambda row: max(row._color_parsed_jaccard_left, row._color_parsed_jaccard_right), axis=1)
)

all_pairs['color_parsed_jaccard_mean'] = (
    all_pairs.apply(lambda row: (row._color_parsed_jaccard_left + row._color_parsed_jaccard_right) / 2, axis=1)
)

In [20]:
all_pairs['_color_parsed_num_1'] = (
    all_pairs['color_parsed_1'].apply(lambda x: len(x) if x is not None else None)
)

all_pairs['_color_parsed_num_2'] = (
    all_pairs['color_parsed_2'].apply(lambda x: len(x) if x is not None else None)
)

all_pairs['color_parsed_num_diff'] = (all_pairs['_color_parsed_num_2'] - all_pairs['_color_parsed_num_1']).abs()

In [21]:
all_pairs['main_pic_resnet_v1_distance'] = (
    all_pairs.apply(
        lambda row: cosine_distance(row.main_pic_embeddings_resnet_v1_1[0], row.main_pic_embeddings_resnet_v1_2[0]),
        axis=1,
    )
)

In [22]:
all_pairs['_mat'] = (
    all_pairs.apply(
        lambda row: cosine_distances(row.all_pic_embeddings_resnet_v1_1, row.all_pic_embeddings_resnet_v1_2),
        axis=1,
    )
)

all_pairs['all_pic_distances_mean'] = all_pairs['_mat'].apply(np.mean)
all_pairs['all_pic_distances_min'] = all_pairs['_mat'].apply(np.min)
all_pairs['all_pic_distances_min_mean'] = all_pairs['_mat'].apply(lambda x: np.mean(np.min(x, axis=0)))
all_pairs['all_pic_distances_min_max'] = all_pairs['_mat'].apply(lambda x: np.max(np.min(x, axis=0)))

In [23]:
all_pairs['all_pic_num_1'] = all_pairs['all_pic_embeddings_resnet_v1_1'].apply(len)
all_pairs['all_pic_num_2'] = all_pairs['all_pic_embeddings_resnet_v1_2'].apply(len)

all_pairs['all_pic_num_diff'] = (all_pairs['all_pic_num_2'] - all_pairs['all_pic_num_1']).abs()

In [24]:
all_pairs['attributes_jaccard'] = (
    all_pairs.apply(lambda row: jaccard(row.tokenized_attributes_1, row.tokenized_attributes_2), axis=1)
)

for how in ['left', 'right']:
    all_pairs[f'_attributes_jaccard_{how}'] = (
        all_pairs.apply(lambda row: jaccard(row.tokenized_attributes_1, row.tokenized_attributes_2, how=how), axis=1)
    )
    
all_pairs['attributes_jaccard_max'] = (
    all_pairs.apply(lambda row: max(row._attributes_jaccard_left, row._attributes_jaccard_right), axis=1)
)

all_pairs['attributes_jaccard_mean'] = (
    all_pairs.apply(lambda row: (row._attributes_jaccard_left + row._attributes_jaccard_right) / 2, axis=1)
)

In [25]:
all_pairs['attributes_keys_jaccard'] = (
    all_pairs.apply(
        lambda row: process_nan(jaccard)(
            row.characteristic_attributes_mapping_1,
            row.characteristic_attributes_mapping_2,
        ),
        axis=1,
    )
)

In [26]:
def _avg_jaccard_between_attributes(d1, d2):
    keys = set(d1) & set(d2)
    metrics = []
    for key in keys:
        metrics.append(jaccard(d1[key], d2[key]))
    return np.mean(metrics)

all_pairs['attributes_values_avg_jaccard'] = (
    all_pairs.apply(
        lambda row: process_nan(_avg_jaccard_between_attributes)(
            row.characteristic_attributes_mapping_1,
            row.characteristic_attributes_mapping_2,
        ),
        axis=1,
    )
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [27]:
def _avg_fully_eq_attributes(d1, d2):
    keys = set(d1) & set(d2)
    metrics = []
    for key in keys:
        metrics.append(set(d1[key]) == set(d2[key]))
    return np.mean(metrics)

all_pairs['attributes_values_avg_fully_eq'] = (
    all_pairs.apply(
        lambda row: process_nan(_avg_fully_eq_attributes)(
            row.characteristic_attributes_mapping_1,
            row.characteristic_attributes_mapping_2,
        ),
        axis=1,
    )
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [28]:
model = dill.load(open('./w2v_attributes.model', 'rb'))

_get_w2v_vector = lambda x: model.wv.get_mean_vector(x) if x else None

all_pairs['_emb_1'] = all_pairs['tokenized_attributes_1'].apply(_get_w2v_vector)
all_pairs['_emb_2'] = all_pairs['tokenized_attributes_2'].apply(_get_w2v_vector)

all_pairs['attributes_w2v_distance'] = (
    all_pairs.apply(lambda row: process_nan(cosine_distance)(row._emb_1, row._emb_2), axis=1)
)

In [29]:
all_pairs['brand_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
    lambda x: x.get('Бренд', ['Нет бренда'])[0] if x is not None else 'null',
)

all_pairs['brand_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
    lambda x: x.get('Бренд', ['Нет бренда'])[0] if x is not None else 'null',
)

In [30]:
all_pairs['type_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
    lambda x: x.get('Тип', ['unknown'])[0] if x is not None else 'null',
)

all_pairs['type_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
    lambda x: x.get('Тип', ['unknown'])[0] if x is not None else 'null',
)

In [31]:
all_pairs['_attr_colors_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
    lambda x: set(x.get('Цвет товара', ['unknown'])) if x is not None else set({'null'}),
)

all_pairs['_attr_colors_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
    lambda x: set(x.get('Цвет товара', ['unknown'])) if x is not None else set({'null'}),
)

In [32]:
all_pairs['attr_color_jaccard'] = (
    all_pairs.apply(lambda row: process_nan(jaccard)(row._attr_colors_1, row._attr_colors_2), axis=1)
)

for how in ['left', 'right']:
    all_pairs[f'_attr_color_jaccard_{how}'] = (
        all_pairs.apply(lambda row: process_nan(jaccard)(row._attr_colors_1, row._attr_colors_2, how=how), axis=1)
    )
    
all_pairs['attr_color_jaccard_max'] = (
    all_pairs.apply(lambda row: max(row._attr_color_jaccard_left, row._attr_color_jaccard_right), axis=1)
)

all_pairs['attr_color_jaccard_mean'] = (
    all_pairs.apply(lambda row: (row._attr_color_jaccard_left + row._attr_color_jaccard_right) / 2, axis=1)
)

In [33]:
def _jaccard_by_tokens(c1, c2, how='inner'):
    _tokenizer = RegexpTokenizer(r'\w+')
    if c1 in ('unknown', 'null') or c1 in ('unknown', 'null'):
        return
    
    try:
        return jaccard(_tokenizer.tokenize(c1.lower()), _tokenizer.tokenize(c2.lower()), how=how)
    except:
        return


all_pairs['complectation_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
    lambda x: x.get('Комплектация', ['unknown'])[0] if x is not None else 'null',
)

all_pairs['complectation_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
    lambda x: x.get('Комплектация', ['unknown'])[0] if x is not None else 'null',
)

all_pairs['complectation_jaccard'] = (
    all_pairs.apply(lambda row: _jaccard_by_tokens(row.complectation_1, row.complectation_2), axis=1)
)

In [34]:
all_pairs['_tokenized_complectation_1'] = all_pairs['complectation_1'].apply(
    lambda x: re.findall(r'\b[\w|/|-|.]*\d\w*\b', x.lower()) if x not in ('unknown', 'null', ) else None
)

all_pairs['_tokenized_complectation_2'] = all_pairs['complectation_2'].apply(
    lambda x: re.findall(r'\b[\w|/|-|.]*\d\w*\b', x.lower()) if x not in ('unknown', 'null', ) else None
)

all_pairs['complectation_ebi_jaccard'] = (
    all_pairs.apply(
        lambda row: process_nan(jaccard)(row._tokenized_complectation_1, row._tokenized_complectation_2), 
        axis=1,
    )
)

In [35]:
all_pairs['name_of_colors_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
    lambda x: x.get('Название цвета', ['unknown'])[0] if x is not None else 'null',
)

all_pairs['name_of_colors_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
    lambda x: x.get('Название цвета', ['unknown'])[0] if x is not None else 'null',
)

all_pairs['name_of_colors_jaccard'] = (
    all_pairs.apply(lambda row: _jaccard_by_tokens(row.name_of_colors_1, row.name_of_colors_2), axis=1)
)

In [36]:
category_to_cnts = {}

for category, df in tqdm(pd.concat([data, test_data], axis=0).groupby('category_3')):
    cnts = Counter()
    for c in df['characteristic_attributes_mapping'].values:
        if c != c or not c:
            continue
    
        cnts.update(set(c.keys()))
    
    category_to_cnts[category] = cnts

  0%|          | 0/127 [00:00<?, ?it/s]

In [37]:
cnts = Counter()
for c in data['characteristic_attributes_mapping'].values:
    if c != c or not c:
        continue
    
    cnts.update(set(c.keys()))
    
attribute_keys = [key for key, _ in cnts.most_common(400)]
attributes_features = []

unknown_set = set({'unknown'})

for key in tqdm(attribute_keys):
    all_pairs['_attr_1'] = all_pairs['characteristic_attributes_mapping_1'].apply(
        lambda x: set(x.get(key, ['unknown'])) if x is not None else set({'null'}),
    )
    
    all_pairs['_attr_2'] = all_pairs['characteristic_attributes_mapping_2'].apply(
        lambda x: set(x.get(key, ['unknown'])) if x is not None else set({'null'}),
    )

    all_pairs['_key_in'] = (
        all_pairs['category_3_1'].map(category_to_cnts).apply(lambda x: key in x)
    )

    all_pairs['_attr_1_unknown'] = all_pairs['_attr_1'] == unknown_set
    all_pairs['_attr_2_unknown'] = all_pairs['_attr_2'] == unknown_set

    feature_name = f'attribute_{key}_eq'
    all_pairs[feature_name] = (
        all_pairs['_attr_1'] == all_pairs['_attr_2']
    ).where(
        all_pairs['_key_in'] & ~all_pairs['_attr_1_unknown'] & ~all_pairs['_attr_2_unknown'],
        np.nan,
    )
    attributes_features.append(feature_name)

    feature_name = f'attribute_{key}_any_unknown'
    all_pairs[feature_name] = (
        all_pairs['_attr_1_unknown'] | all_pairs['_attr_2_unknown']
    ).where(
        all_pairs['_key_in'],
        np.nan,
    )
    attributes_features.append(feature_name)

    feature_name = f'attribute_{key}_both_unknown'
    all_pairs[feature_name] = (
        all_pairs['_attr_1_unknown'] & all_pairs['_attr_2_unknown']
    ).where(
        all_pairs['_key_in'],
        np.nan,
    )
    attributes_features.append(feature_name)

  0%|          | 0/400 [00:00<?, ?it/s]

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pair

  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (
  all_pairs[feature_name] = (


In [38]:
all_pairs['sum_of_attr_eq'] = all_pairs[
    [col for col in attributes_features if col.endswith('_eq')]
].sum(axis=1)

all_pairs['sum_of_attr_any_unknown'] = all_pairs[
    [col for col in attributes_features if col.endswith('_any_unknown')]
].sum(axis=1)

all_pairs['sum_of_attr_both_unknown'] = all_pairs[
    [col for col in attributes_features if col.endswith('_both_unknown')]
].sum(axis=1)

  all_pairs['sum_of_attr_eq'] = all_pairs[
  all_pairs['sum_of_attr_any_unknown'] = all_pairs[
  all_pairs['sum_of_attr_both_unknown'] = all_pairs[


In [39]:
all_pairs['tokenized_names_ebi_num_1'] = all_pairs['tokenized_name_ebi_1'].apply(len)
all_pairs['tokenized_names_ebi_num_2'] = all_pairs['tokenized_name_ebi_2'].apply(len)
all_pairs['tokenized_names_ebi_num_diff'] = (
    all_pairs['tokenized_names_ebi_num_1'] - all_pairs['tokenized_names_ebi_num_2']
).abs()

In [40]:
all_pairs['attributes_num_this_category'] = (
    all_pairs['category_3_1'].map(category_to_cnts).apply(len)
)

  all_pairs['attributes_num_this_category'] = (


In [41]:
all_pairs['attributes_num_1'] = (
    all_pairs['characteristic_attributes_mapping_1'].apply(lambda x: len(x) if x is not None else None)
)
all_pairs['attributes_num_2'] = (
    all_pairs['characteristic_attributes_mapping_2'].apply(lambda x: len(x) if x is not None else None)
)
all_pairs['attributes_num_diff'] = (all_pairs['attributes_num_1'] - all_pairs['attributes_num_2']).abs()

  all_pairs['attributes_num_1'] = (
  all_pairs['attributes_num_2'] = (
  all_pairs['attributes_num_diff'] = (all_pairs['attributes_num_1'] - all_pairs['attributes_num_2']).abs()


In [42]:
all_pairs['attributes_num_normed_1'] = all_pairs['attributes_num_1'] / all_pairs['attributes_num_this_category']
all_pairs['attributes_num_normed_2'] = all_pairs['attributes_num_2'] / all_pairs['attributes_num_this_category']
all_pairs['attributes_num_empty_1'] = (
    (all_pairs['attributes_num_this_category'] - all_pairs['attributes_num_1']) / all_pairs['attributes_num_this_category']
)
all_pairs['attributes_num_empty_2'] = (
    (all_pairs['attributes_num_this_category'] - all_pairs['attributes_num_2']) / all_pairs['attributes_num_this_category']
)


  all_pairs['attributes_num_normed_1'] = all_pairs['attributes_num_1'] / all_pairs['attributes_num_this_category']
  all_pairs['attributes_num_normed_2'] = all_pairs['attributes_num_2'] / all_pairs['attributes_num_this_category']
  all_pairs['attributes_num_empty_1'] = (
  all_pairs['attributes_num_empty_2'] = (


In [43]:
from fuzzywuzzy import fuzz


for n, f in [
    ('ratio', fuzz.ratio),
    ('partial_ratio', fuzz.partial_ratio),
    ('token_sort_ratio', fuzz.token_sort_ratio),
    ('token_set_ratio', fuzz.token_set_ratio),
]:
    all_pairs[f'fuzzywuzzy_{n}'] = all_pairs.progress_apply(lambda x: f(x.name_1, x.name_2), axis=1)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'fuzzywuzzy_{n}'] = all_pairs.progress_apply(lambda x: f(x.name_1, x.name_2), axis=1)


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'fuzzywuzzy_{n}'] = all_pairs.progress_apply(lambda x: f(x.name_1, x.name_2), axis=1)


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'fuzzywuzzy_{n}'] = all_pairs.progress_apply(lambda x: f(x.name_1, x.name_2), axis=1)


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'fuzzywuzzy_{n}'] = all_pairs.progress_apply(lambda x: f(x.name_1, x.name_2), axis=1)


In [44]:
def _avg_fully_eq_attributes(d1, d2, popular_keys):
    keys = set(d1) & set(d2) & popular_keys
    metrics = []
    for key in keys:
        metrics.append(set(d1[key]) == set(d2[key]))
        
    return np.mean(metrics)


def _avg_jaccard_between_attributes(d1, d2, popular_keys, how='inner'):
    keys = set(d1) & set(d2) & popular_keys
    metrics = []
    for key in keys:
        metrics.append(jaccard(d1[key], d2[key], how=how))
    return np.mean(metrics)

In [45]:
submission = pd.read_csv('../hackathon_files_for_participants_ozon/submission_example.csv')

mapping = pd.merge(
    submission,
    test_data,
    left_on=['variantid1'],
    right_on=['variantid'],
).set_index('category_3')['cat3_grouped'].to_dict()

all_pairs['cat3_grouped'] = all_pairs['category_3_1'].map(mapping).fillna('rest')

  all_pairs['cat3_grouped'] = all_pairs['category_3_1'].map(mapping).fillna('rest')


In [46]:
top_attrs = {'Сетевые фильтры, разветвители и удлинители': ['Длина кабеля питания, м',
  'Страна-изготовитель',
  'Длина кабеля питания, м',
  'Электробезопасность',
  'Длина кабеля питания, м',
  'Страна-изготовитель',
  'Цвет товара',
  'Комплектация',
  'Стандарт защиты',
  'Комплектация'],
 'Кабели и переходники': ['Длина, м',
  'Тип',
  'Цвет товара',
  'Назначение',
  'Длина, м',
  'Длина, м',
  'Гарантийный срок',
  'Вес товара, г',
  'Назначение',
  'Вес товара, г'],
 'Оптические приборы': ['Макс. увеличение, крат',
  'Цвет товара',
  'Фокусное расстояние, мм',
  'Цвет товара',
  'Страна-изготовитель',
  'Тип насадки микроскопа',
  'Конструктивные особенности',
  'Страна-изготовитель',
  'Конструктивные особенности',
  'Функциональные особенности оптического прибора'],
 'Смартфоны, планшеты, мобильные телефоны': ['Цвет товара',
  'Оперативная память',
  'Встроенная память',
  'Диагональ экрана, дюймы',
  'Емкость аккумулятора, мАч',
  'Гарантийный срок',
  'Навигация',
  'Функции камеры',
  'Название цвета',
  'Стандарты связи'],
 'rest': ['Количество разъемов USB 3.1',
  'Доп. комплектация',
  'Бренд',
  'Цвет товара',
  'Игры и подписки',
  'Возможности приставки',
  'Гарантийный срок',
  'Гарантийный срок',
  'Кол-во встроенных игр',
  'Цвет товара'],
 'Устройство ручного ввода': ['Комплектация',
  'Цвет товара',
  'Комплектация',
  'Гарантийный срок',
  'Питание',
  'Цвет товара',
  'Гарантийный срок',
  'Тип',
  'Страна-изготовитель',
  'Бренд'],
 'Смарт-часы': ['Бренд',
  'Цвет товара',
  'Модель браслета/умных часов',
  'Название цвета',
  'Модель браслета/умных часов',
  'Размер циферблата',
  'Встроенная память',
  'Размеры, мм',
  'Название цвета',
  'Длина ремешка, мм'],
 'Сетевое оборудование': ['Цвет товара',
  'Тип',
  'Цвет товара',
  'Страна-изготовитель',
  'Гарантийный срок',
  'Страна-изготовитель',
  'Комплектация',
  'Цвет товара',
  'Гарантийный срок',
  'Страна-изготовитель'],
 'Запчасти для ноутбуков': ['Бренд',
  'Партномер',
  'Гарантийный срок',
  'Рекомендовано для',
  'Рекомендовано для',
  'Партномер',
  'Цвет товара',
  'Страна-изготовитель',
  'Гарантийный срок',
  'Комплектация'],
 'Компьютер': ['Оперативная память',
  'Общий объем SSD, ГБ',
  'Процессор',
  'Операционная система',
  'Видеокарта',
  'Интерфейсы и разъемы',
  'Интерфейсы и разъемы',
  'Цвет товара',
  'Версия Windows',
  'Частота процессора, ГГц'],
 'Карты памяти и флешки': ['Объем',
  'Вес товара, г',
  'Цвет товара',
  'Объем',
  'Объем',
  'Цвет товара',
  'Цвет товара',
  'Размеры, мм',
  'Страна-изготовитель',
  'Страна-изготовитель'],
 'Расходник для печати': ['Цвет тонера/чернил',
  'Совместимые модели принтеров',
  'Количество в упаковке, шт',
  'Количество в упаковке, шт',
  'Совместимые модели принтеров',
  'Тип',
  'Количество в упаковке, шт',
  'Комплектация',
  'Ресурс',
  'Страна-изготовитель'],
 'Чехол': ['Модель устройства',
  'Бренд',
  'Название цвета',
  'Модель устройства',
  'Цвет товара',
  'Модель устройства',
  'Материал',
  'Рекомендовано для',
  'Цвет товара',
  'Цвет товара'],
 'Телевизоры': ['Диагональ экрана, дюймы',
  'Разрешение',
  'Интерфейсы',
  'Интерфейсы',
  'Частота обновления',
  'Декодеры звука',
  'ТВ-тюнер',
  'Запись эфира',
  'ТВ-тюнер',
  'Беспроводные интерфейсы'],
 'Защитные пленки и стекла': ['Модель устройства',
  'Количество в упаковке, шт',
  'Цвет товара',
  'Покрытие',
  'Дополнительные свойства покрытия',
  'Бренд',
  'Дополнительные свойства покрытия',
  'Модель устройства',
  'Назначение',
  'Модель устройства'],
 'Рюкзаки, чехлы, сумки': ['Страна-изготовитель',
  'Гарантийный срок',
  'Страна-изготовитель',
  'Цвет товара',
  'Гарантийный срок',
  'Внешние размеры, мм',
  'Количество внешних карманов',
  'Пол',
  'Пол',
  'Материал'],
 'Принтеры и МФУ': ['Поддерживаемые материалы',
  'Интерфейсы',
  'Сетевые интерфейсы',
  'Совместимость',
  'Взаимодействие с устройствами',
  'Сетевые интерфейсы',
  'Поддерживаемые материалы',
  'Облачные технологии',
  'Цвет товара',
  'Тип дисплея'],
 'Батарейки и аккумуляторы': ['Количество в упаковке, шт',
  'Емкость, мА•ч',
  'Количество в упаковке, шт',
  'Количество в упаковке, шт',
  'Емкость, мА•ч',
  'Бренд',
  'Форм-фактор батареи',
  'Цвет товара',
  'Емкость, мА•ч',
  'Партномер'],
 'Мониторы и запчасти': ['Диагональ экрана, дюймы',
  'Макс. частота обновления, Гц',
  'Разрешение',
  'Разъёмы монитора',
  'Особенности',
  'Установка монитора',
  'Назначение монитора',
  'Разъёмы монитора',
  'Назначение монитора',
  'Матрица монитора'],
 'Наушники и гарнитуры': ['Цвет товара',
  'Управление',
  'Конструкция наушников',
  'Управление',
  'Гарантийный срок',
  'Складные',
  'True Wireless',
  'Разъем',
  'Вес товара, г',
  'Страна-изготовитель'],
 'Жесткие диски, SSD и сетевые накопители': ['Объем',
  'Объем',
  'Объем',
  'Комплектация',
  'Цвет товара',
  'Комплектация',
  'Страна-изготовитель',
  'Гарантийный срок',
  'Материал корпуса',
  'Форм-фактор'],
 'Видеонаблюдение': ['Форматы файлов видео',
  'Качество видео',
  'Сетевые протоколы',
  'Цвет товара',
  'Цвет товара',
  'Гарантийный срок',
  'Качество видео',
  'Комплектация',
  'Общее количество пикселей',
  'Цвет товара'],
 'Процессор': ['Базовая частота, ГГц',
  'Комплектация процессора',
  'Особенности',
  'Особенности',
  'Встроенная графика',
  'Турбо-частота, ГГц',
  'Наличие встроенной графики',
  'Вес товара, г',
  'Встроенная графика',
  'Страна-изготовитель'],
 'Материнская плата': ['RAID',
  'RAID',
  'Разъемы USB на задней панели',
  'Видеовыходы на задней панели',
  'Разъемы USB на задней панели',
  'Особенности',
  'Форм-фактор материнской платы',
  'Особенности',
  'Страна-изготовитель',
  'Видеовыходы на задней панели'],
 'Оперативная память': ['Суммарный объем памяти',
  'Емкость одного модуля',
  'Количество модулей в комплекте',
  'Комплектация',
  'Гарантийный срок',
  'Тип поставки',
  'CAS Latency (CL)',
  'Размеры, мм',
  'Напряжение питания, В',
  'Размеры, мм'],
 'Запчасти для смартфонов': ['Бренд',
  'Тип',
  'Цвет товара',
  'Совместимые модели',
  'Гарантийный срок',
  'Цвет товара',
  'Совместимые устройства',
  'Гарантийный срок',
  'Совместимые устройства',
  'Комплектация'],
 'Видеорегистратор': ['Датчики видеорегистратора',
  'Тип',
  'Способ крепления',
  'Навигационные системы',
  'Встроенные датчики',
  'Навигационные системы',
  'Запись',
  'Функционал устройства',
  'Беспроводные интерфейсы',
  'Принцип детекции'],
 'ИБП': ['Цвет товара',
  'Цвет товара',
  'Емкость, А•ч',
  'Тип аккумулятора',
  'Страна-изготовитель',
  'Вес товара, г',
  'Емкость, А•ч',
  'Гарантийный срок',
  'Тип аккумулятора',
  'Страна-изготовитель'],
 'Видеокарты и графические ускорители': ['Серия графического процессора',
  'Поддерживаемые API',
  'Радиатор',
  'Технологии',
  'Поддерживаемые API',
  'Интерфейсы',
  'Интерфейсы',
  'Ревизия',
  'Количество вентиляторов',
  'Бренд'],
 'Зарядные устройства и док-станции': ['Гарантийный срок',
  'Макс. выходной ток, А',
  'Страна-изготовитель',
  'Комплектация',
  'Цвет товара',
  'Страна-изготовитель',
  'Особенности',
  'Бренд',
  'Цвет товара',
  'Назначение'],
 'Кронштейн': ['Комплектация',
  'Цвет товара',
  'Гарантийный срок',
  'Материал',
  'Материал',
  'Настенное крепление (VESA)',
  'Гарантийный срок',
  'Страна-изготовитель',
  'Размеры, мм',
  'Гарантийный срок'],
 'Акустика и колонки': ['Цвет товара',
  'Конструктивные особенности',
  'Материал корпуса',
  'Звуковая схема',
  'Входные интерфейсы',
  'Максимальная мощность, Вт',
  'Максимальная мощность, Вт',
  'Тип',
  'Входные интерфейсы',
  'Размеры, мм'],
 'Аксессуары для фото и видеотехники': ['Бренд',
  'Гарантийный срок',
  'Цвет товара',
  'Совместимость с фотокамерами',
  'Совместимость с фотокамерами',
  'Страна-изготовитель',
  'Тип',
  'Комплектация',
  'Страна-изготовитель',
  'Гарантийный срок'],
 'Микрофоны и аксессуары': ['Интерфейсы и разъемы',
  'Диаграмма направленности',
  'Цвет товара',
  'Интерфейсы и разъемы',
  'Подключение',
  'Диаграмма направленности',
  'Цвет товара',
  'Гарантийный срок',
  'Вес товара, г',
  'Особенности'],
 'Коврик для мыши': ['Название цвета',
  'Размер коврика',
  'Цвет товара',
  'Материал поверхности',
  'Страна-изготовитель',
  'Материал подложки',
  'Материал подложки',
  'Особенности коврика',
  'Материал поверхности',
  'Тип'],
 'Видеокамеры': ['Гарантийный срок',
  'Цвет товара',
  'Угол обзора, градусов',
  'Бренд',
  'Тип матрицы',
  'Интерфейсы',
  'Страна-изготовитель',
  'Общее количество пикселей',
  'Страна-изготовитель',
  'Комплектация'],
 'Системы охлаждения для компьютеров': ['Цвет товара',
  'Цвет товара',
  'Гарантийный срок',
  'Размеры, мм',
  'Комплектация',
  'Гарантийный срок',
  'Размеры, мм',
  'Гарантийный срок',
  'Сокет процессора',
  'Сокет процессора'],
 'Запчасти для аудио/видеотехники': ['Гарантийный срок',
  'Бренд',
  'Размеры, мм',
  'Вес товара, г',
  'Размеры, мм',
  'Гарантийный срок',
  'Гарантийный срок',
  'Вес товара, г',
  'Комплектация',
  'Количество, шт'],
 'Электронные модули': ['Количество в упаковке, шт',
  'Количество в упаковке, шт',
  'Страна-изготовитель',
  'Гарантийный срок',
  'Страна-изготовитель',
  'Гарантийный срок',
  'Страна-изготовитель',
  'Комплектация',
  'Вид активного компонента',
  'Диапазон рабочих температур, °С'],
 'Расходные материалы': ['Диаметр, мм',
  'Технология 3D печати',
  'Гарантийный срок',
  'Масса, кг',
  'Технология 3D печати',
  'Размеры, мм',
  'Гарантийный срок',
  'Диаметр, мм',
  'Цвет товара',
  'Комплектация'],
 'Корпуса для компьютеров': ['Поддерживаемые формфакторы материнской платы',
  'Страна-изготовитель',
  'Цвет товара',
  'Материал корпуса',
  'Гарантийный срок',
  'Материал корпуса',
  'Поддерживаемые формфакторы материнской платы',
  'Гарантийный срок',
  'Особенности',
  'Интерфейсы на передней панели'],
 'Умный дом': ['Фотоэлементы в комплекте',
  'Диапазон рабочих температур, °С',
  'Страна-изготовитель',
  'Для управления',
  'Цвет товара',
  'Фотоэлементы в комплекте',
  'Беспроводные интерфейсы',
  'Фотоэлементы в комплекте',
  'Монтаж',
  'Страна-изготовитель'],
 'Блоки питания': ['Страна-изготовитель',
  'Цвет товара',
  'Электробезопасность',
  'Электробезопасность',
  'Разъемы питания видеокарт',
  'Цвет товара',
  'Гарантийный срок',
  'Мощность блока питания, Вт',
  'Страна-изготовитель',
  'Мощность блока питания, Вт'],
 'Гаджет': ['Цвет товара',
  'Вес с упаковкой, г',
  'Назначение',
  'Страна-изготовитель',
  'Количество в упаковке, шт',
  'Бренд',
  'Вес товара, г',
  'Вес товара, г',
  'Вес с упаковкой, г',
  'Совместимость с ОС']}

In [47]:
for i in [1, 3, 5, 10]:
    all_pairs['_top_attrs'] = all_pairs['cat3_grouped'].map(top_attrs).apply(lambda x: set(x[:i]))

    all_pairs[f'attributes_values_top_{i}_for_category_fully_eq'] = (
        all_pairs.progress_apply(
            lambda row: process_nan(_avg_fully_eq_attributes)(
                row.characteristic_attributes_mapping_1,
                row.characteristic_attributes_mapping_2,
                row._top_attrs,
            ),
            axis=1,
        )
    )


    all_pairs[f'attributes_values_top_{i}_for_category_jaccard'] = (
        all_pairs.progress_apply(
            lambda row: process_nan(_avg_jaccard_between_attributes)(
                row.characteristic_attributes_mapping_1,
                row.characteristic_attributes_mapping_2,
                row._top_attrs,
            ),
            axis=1,
        )
    )

    all_pairs[f'attributes_keys_top_{i}_for_category_jaccard'] = (
        all_pairs.progress_apply(
            lambda row: jaccard(
                set(row.characteristic_attributes_mapping_1) & row._top_attrs if row.characteristic_attributes_mapping_1 is not None else set(),
                set(row.characteristic_attributes_mapping_2) & row._top_attrs if row.characteristic_attributes_mapping_2 is not None else set(),
            ),
            axis=1,
        )
    )

    for how in ['left', 'right']:
        all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (
            all_pairs.progress_apply(
                lambda row: process_nan(_avg_jaccard_between_attributes)(
                    row.characteristic_attributes_mapping_1,
                    row.characteristic_attributes_mapping_2,
                    row._top_attrs,
                    how=how,
                ),
                axis=1,
            )
        )

        all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (
            all_pairs.progress_apply(
                lambda row: jaccard(
                    set(row.characteristic_attributes_mapping_1) & row._top_attrs if row.characteristic_attributes_mapping_1 is not None else set(),
                    set(row.characteristic_attributes_mapping_2) & row._top_attrs if row.characteristic_attributes_mapping_2 is not None else set(),
                    how=how,
                ),
                axis=1,
            )
        )



  all_pairs['_top_attrs'] = all_pairs['cat3_grouped'].map(top_attrs).apply(lambda x: set(x[:i]))


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_fully_eq'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_fully_eq'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_fully_eq'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_fully_eq'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  all_pairs[f'attributes_values_top_{i}_for_category_jaccard_{how}'] = (


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs[f'attributes_keys_top_{i}_for_category_jaccard_{how}'] = (


In [48]:
all_pairs['attributes_div'] = all_pairs['attributes_num_1'] / all_pairs['attributes_num_2']

  all_pairs['attributes_div'] = all_pairs['attributes_num_1'] / all_pairs['attributes_num_2']


In [49]:
all_pairs['_name_processed_1'] = all_pairs['name_1'].apply(lambda x: [w.strip(' ') for w in x.lower().split(',')])
all_pairs['_name_processed_2'] = all_pairs['name_2'].apply(lambda x: [w.strip(' ') for w in x.lower().split(',')])

all_pairs['names_by_comma_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(row._name_processed_1, row._name_processed_2), axis=1)
)

  all_pairs['_name_processed_1'] = all_pairs['name_1'].apply(lambda x: [w.strip(' ') for w in x.lower().split(',')])
  all_pairs['_name_processed_2'] = all_pairs['name_2'].apply(lambda x: [w.strip(' ') for w in x.lower().split(',')])


  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_by_comma_jaccard'] = (


In [50]:
def levenstein_between_arrays(a1, a2):
    distances = []
    
    for e1 in a1:
        min_distance = 1e10
        for e2 in a2:
            d = Levenshtein.distance(e1, e2)
            if d < min_distance:
                min_distance = d
                
        distances.append(min_distance)
        
    return distances

In [51]:
all_pairs['names_in_brackets_levenstein_mean'] = (
    all_pairs
    .progress_apply(lambda row: levenstein_between_arrays(row._name_processed_1, row._name_processed_2), axis=1)
    .apply(np.mean)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_in_brackets_levenstein_mean'] = (


In [52]:
all_pairs['names_ebi_levenstein_mean'] = (
    all_pairs
    .progress_apply(lambda row: levenstein_between_arrays(row.tokenized_name_ebi_1, row.tokenized_name_ebi_2), axis=1)
    .apply(np.mean)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  all_pairs['names_ebi_levenstein_mean'] = (


In [53]:
all_pairs['_name_processed_1'] = all_pairs['name_1'].apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[\d|,|-|"|-]', ' ', data['name'].iloc[0].lower())).split())
all_pairs['_name_processed_2'] = all_pairs['name_2'].apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[\d|,|-|"|-]', ' ', data['name'].iloc[0].lower())).split())

all_pairs['names_no_digits_jaccard'] = (
    all_pairs.progress_apply(lambda row: jaccard(row._name_processed_1, row._name_processed_2), axis=1)
)

  0%|          | 0/18084 [00:00<?, ?it/s]

  all_pairs['names_no_digits_jaccard'] = (


In [54]:
all_pairs.to_pickle('features_v2.pickle')