In [None]:
!pip install rapidfuzz jellyfish textdistance

Collecting rapidfuzz
  Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting jellyfish
  Downloading jellyfish-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting textdistance
  Downloading textdistance-4.6.3-py3-none-any.whl.metadata (18 kB)
Downloading rapidfuzz-3.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jellyfish-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (335 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.0/336.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textdistance-4.6.3-py3-none-any.whl (31 kB)


In [None]:
import numpy as np
import pandas as pd
import json
import ast
import pickle
import gc
import re

from rapidfuzz import fuzz
import jellyfish
import textdistance

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer

import scipy.sparse as sp

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the three per-product tables (attributes, resnet, text_and_bert) in one pass.
attrs, resnet, text_and_bert = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        '/kaggle/input/extracted_data/attributes.parquet',
        '/kaggle/input/extracted_data/resnet.parquet',
        '/kaggle/input/extracted_data/text_and_bert.parquet',
    )
)

In [None]:
# Pairs of product ids to score (renamed to variantid_1 / variantid_2 further below).
test_pairs = pd.read_parquet(
    '/kaggle/input/extracted_data/test.parquet',
    engine='pyarrow',
)

In [None]:
# Join the three tables on the product key rather than gluing them together
# column-wise: pd.concat(axis=1) aligns on the index, so it is only correct
# when every parquet file stores its rows in exactly the same order.
# how='left' keeps every row of `attrs`; validate='one_to_one' fails loudly
# if any table contains duplicated variantid values.
data = (
    attrs
    .merge(resnet, on='variantid', how='left', validate='one_to_one')
    .merge(text_and_bert, on='variantid', how='left', validate='one_to_one')
)

In [None]:
# Store the product key as an unsigned 32-bit integer.
data['variantid'] = data['variantid'].astype(np.uint32)

In [None]:
# The source tables are no longer referenced after `data` is built; drop the
# names and ask the garbage collector to reclaim the memory.
del attrs
del resnet
del text_and_bert
gc.collect()

In [None]:
# Products without a description get the placeholder text 'no desc'.
data['description'] = data['description'].mask(data['description'].isna(), 'no desc')

In [None]:
# Text normalization
def normalize(text):
    """Lower-case `text` and join its alphanumeric runs with underscores.

    Every character for which `str.isalnum()` is false acts as a separator.
    Returns None when `text` is None.
    """
    if text is None:
        return None
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
    return '_'.join(spaced.split())

In [None]:
# Strip HTML tags and emoji from a string
def remove_html_tags_and_emoji(text):
    """Return `text` without `<...>` tags, with newlines turned into spaces and
    with characters from the four code-point ranges below removed.

    Returns None when `text` is None. Tags are removed before newlines are
    replaced, so a tag that spans several lines is left untouched.
    """
    if text is None:
        return None
    without_tags = re.sub(re.compile('<.*?>'), '', text)
    single_line = without_tags.replace('\n', ' ')
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', single_line)

In [None]:
# Product name normalization
def normalize_names(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the `name` column in place and add `name_norm` / `name_tokens`.

    - `name_norm`: `normalize()` of the tag/emoji-free name.
    - `name_tokens`: the tag/emoji-free name, stripped and lower-cased
      (still a single string, not a list of tokens).
    - `name`: `name_tokens` with runs of whitespace collapsed to one space.

    Relies on `tqdm.pandas()` having registered `progress_apply`.
    """
    print('Нормализую названия товаров...')
    cleaned = df['name'].progress_apply(remove_html_tags_and_emoji)
    df['name'] = cleaned
    df['name_norm'] = cleaned.progress_apply(normalize)
    lowered = cleaned.str.strip().str.lower()
    df['name_tokens'] = lowered
    df['name'] = lowered.progress_apply(lambda text: ' '.join(text.split()))
    return df

In [None]:
# Product description normalization
def normalize_desc(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the `description` column in place and add its derived columns.

    - `description_norm`: `normalize()` of the tag/emoji-free description.
    - `description_tokens`: the tag/emoji-free description, stripped and
      lower-cased (still a single string, not a list of tokens).
    - `description`: `description_tokens` with whitespace runs collapsed.

    Relies on `tqdm.pandas()` having registered `progress_apply`.
    """
    print('Нормализую описания товаров...')
    cleaned = df['description'].progress_apply(remove_html_tags_and_emoji)
    df['description'] = cleaned
    df['description_norm'] = cleaned.progress_apply(normalize)
    lowered = cleaned.str.strip().str.lower()
    df['description_tokens'] = lowered
    df['description'] = lowered.progress_apply(lambda text: ' '.join(text.split()))
    return df

In [None]:
# Extract the brand as a separate feature
def extract_brand(df: pd.DataFrame) -> pd.DataFrame:
    """Add a `brand` column: the first value stored under the 'Бренд' key of
    the JSON string in `characteristic_attributes_mapping`.

    Rows whose mapping is missing, is not valid JSON, has no 'Бренд' key or
    has an empty value list get None.
    """
    print('Извлекаю названия брендов...')
    brand_arr = []
    # Iterate over the values themselves: a `df[col][i]` lookup is label-based
    # and silently produces only None when the index is not 0..n-1.
    for raw_mapping in tqdm(df['characteristic_attributes_mapping']):
        try:
            brand_arr.append(json.loads(raw_mapping)['Бренд'][0])
        # Only the failures expected from bad/missing data are treated as
        # "no brand"; a bare `except:` would also hide KeyboardInterrupt.
        except (TypeError, ValueError, KeyError, IndexError):
            brand_arr.append(None)

    df['brand'] = brand_arr

    return df

In [None]:
# Extract the country of manufacture as a separate feature
def extract_country(df: pd.DataFrame) -> pd.DataFrame:
    """Add a `country` column: the first value stored under the
    'Страна-изготовитель' key of the JSON string in
    `characteristic_attributes_mapping`.

    Rows whose mapping is missing, is not valid JSON, lacks the key or has an
    empty value list get None.
    """
    print('Извлекаю страны-изготовители...')
    country_arr = []
    # Iterate over the values themselves: a `df[col][i]` lookup is label-based
    # and silently produces only None when the index is not 0..n-1.
    for raw_mapping in tqdm(df['characteristic_attributes_mapping']):
        try:
            country_arr.append(json.loads(raw_mapping)['Страна-изготовитель'][0])
        # Only the failures expected from bad/missing data are treated as
        # "no country"; a bare `except:` would also hide KeyboardInterrupt.
        except (TypeError, ValueError, KeyError, IndexError):
            country_arr.append(None)

    df['country'] = country_arr

    return df

In [None]:
# Extract the part number [POST-PROCESSING ONLY, DO NOT USE AS A FEATURE]
def extract_partnomer(df: pd.DataFrame) -> pd.DataFrame:
    """Add a `partnomer` column: the first value stored under the 'Партномер'
    key of the JSON string in `characteristic_attributes_mapping`.

    Rows whose mapping is missing, is not valid JSON, lacks the key or has an
    empty value list get None.
    """
    print('Извлекаю партномера...')
    partnomer_arr = []
    # Iterate over the values themselves: a `df[col][i]` lookup is label-based
    # and silently produces only None when the index is not 0..n-1.
    for raw_mapping in tqdm(df['characteristic_attributes_mapping']):
        try:
            partnomer_arr.append(json.loads(raw_mapping)['Партномер'][0])
        # Only the failures expected from bad/missing data are treated as
        # "no part number"; a bare `except:` would also hide KeyboardInterrupt.
        except (TypeError, ValueError, KeyError, IndexError):
            partnomer_arr.append(None)

    df['partnomer'] = partnomer_arr

    return df

In [None]:
# Category extraction
def extract_categories(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the `categories` column with `category_level_1..N` columns.

    Each `categories` value is parsed with `ast.literal_eval` and flattened by
    `pd.json_normalize`; the resulting columns are renamed by position. The
    new columns are attached with an index join.
    """
    print('Извлекаю категории...')
    parsed = df['categories'].progress_apply(ast.literal_eval)
    levels = pd.json_normalize(parsed)
    levels.columns = [f'category_level_{pos}' for pos in range(1, levels.shape[1] + 1)]
    without_raw = df.drop(columns=['categories'])
    return without_raw.join(levels)

In [None]:
# [[emb]] -> [emb]
def squeeze_main_pic_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each ndarray in `main_pic_embeddings_resnet_v1` with its first
    element; values that are not ndarrays are left as they are.
    """
    print('Распаковываю эмбеддинги...')

    def _first_if_array(value):
        if isinstance(value, np.ndarray):
            return value[0]
        return value

    df['main_pic_embeddings_resnet_v1'] = (
        df['main_pic_embeddings_resnet_v1'].progress_apply(_first_if_array)
    )
    return df

In [None]:
# Attribute normalization
def normalize_characteristic_attributes(df: pd.DataFrame) -> pd.DataFrame:
    """Derive three text columns from `characteristic_attributes_mapping`.

    The mapping is a string parsed with `ast.literal_eval` into
    {attribute name: list of values}. A None mapping yields 'none' everywhere.

    - `attr_vals`: normalized values, space-joined per attribute,
      attributes joined with '; '.
    - `attr_keys`: normalized attribute names joined with '; '.
    - `characteristics_attributes`: raw 'name:values' pairs joined with '; '.
    """
    def _values_text(raw_mapping):
        if raw_mapping is None:
            return 'none'
        mapping = ast.literal_eval(raw_mapping)
        per_attribute = [
            ' '.join(normalize(value) for value in values)
            for values in mapping.values()
        ]
        return '; '.join(per_attribute)

    def _keys_text(raw_mapping):
        if raw_mapping is None:
            return 'none'
        mapping = ast.literal_eval(raw_mapping)
        # ''.join over an already-normalized key string returns it unchanged
        return '; '.join(''.join(normalize(key)) for key in mapping.keys())

    def _pairs_text(raw_mapping):
        if raw_mapping is None:
            return 'none'
        mapping = ast.literal_eval(raw_mapping)
        return '; '.join(f'{k}:{v}' for k, v in mapping.items())

    source = 'characteristic_attributes_mapping'

    print('Нормализую значения атрибутов...')
    df['attr_vals'] = df[source].progress_apply(_values_text)

    print('Нормализую атрибуты...')
    df['attr_keys'] = df[source].progress_apply(_keys_text)

    print('Собираю атрибуты и значения...')
    df['characteristics_attributes'] = df[source].progress_apply(_pairs_text)

    return df

In [None]:
# Number of pictures, of tokens in the name and in the description
def get_lengths(df: pd.DataFrame) -> pd.DataFrame:
    """Add count features: length of `pic_embeddings_resnet_v1`, number of
    whitespace-separated tokens in `name_tokens` / `description_tokens`, and
    number of '; '-separated entries in `characteristics_attributes`.

    A None value produces None instead of a length.
    """
    def _len_or_none(value):
        return None if value is None else len(value)

    print('Создаю количественные фичи...')
    df['pic_embeddings_resnet_v1_len'] = df['pic_embeddings_resnet_v1'].progress_apply(_len_or_none)

    name_parts = df['name_tokens'].apply(lambda text: text.split())
    df['name_tokens_len'] = name_parts.progress_apply(_len_or_none)

    description_parts = df['description_tokens'].apply(lambda text: text.split())
    df['description_tokens_len'] = description_parts.progress_apply(_len_or_none)

    attribute_parts = df['characteristics_attributes'].apply(lambda text: text.split('; '))
    df['characteristics_attributes_len'] = attribute_parts.progress_apply(_len_or_none)

    return df

In [None]:
# Extract digit-heavy pieces from strings
def get_digits_elements(df: pd.DataFrame) -> pd.DataFrame:
    """For `attr_vals`, `name_tokens` and `description_tokens`, add a
    `<col>_w_digits` column holding, space-joined, only the pieces that
    contain more than two digits.

    `attr_vals` is split on '; '; the other two columns on whitespace.
    """
    def _has_three_plus_digits(piece):
        return len(re.findall(r'\d', piece)) > 2

    print('Нахожу числа в названиях, описаниях и атрибутах...')
    for col in ('attr_vals', 'name_tokens', 'description_tokens'):
        # None makes str.split fall back to whitespace splitting
        separator = '; ' if 'attr' in col else None
        df[f'{col}_w_digits'] = df[col].progress_apply(
            lambda text: ' '.join(filter(_has_three_plus_digits, text.split(separator)))
        )
    return df

In [None]:
# Concatenated bert + resnet embedding
def concat_embs(df: pd.DataFrame) -> pd.DataFrame:
    """Add `concat_emb`: `main_pic_embeddings_resnet_v1` followed by
    `name_bert_64`, each divided by its own L2 norm first (a zero-norm vector
    is kept unchanged).
    """
    def _to_unit_length(vector):
        length = np.linalg.norm(vector)
        return vector if length == 0 else vector / length

    def _joined_embedding(row):
        parts = (row['main_pic_embeddings_resnet_v1'], row['name_bert_64'])
        return np.concatenate([_to_unit_length(part) for part in parts])

    print('Конкатенирую эмбеддинги...')
    df['concat_emb'] = df.progress_apply(_joined_embedding, axis=1)
    return df

In [None]:
def load_tfidf_vectorizer(main_path, columns):
    """Unpickle one object per column from
    `<main_path>/<column>_tfidf_vectorizer.pkl` and return them as
    {column: object}, in the order of `columns`.

    NOTE(review): `pickle.load` executes code embedded in the file - only
    point this at files you produced yourself.
    """
    loaded = {}
    for name in columns:
        pickle_path = f'{main_path}/{name}_tfidf_vectorizer.pkl'
        with open(pickle_path, 'rb') as handle:
            loaded[name] = pickle.load(handle)
    return loaded

# Text columns for which a pickled TF-IDF vectorizer is loaded.
columns = ['name', 'description', 'attr_keys', 'attr_vals']
# SET THE PATH TO THE TRAINED VECTORIZERS
tfidf_vectorizers = load_tfidf_vectorizer(
    main_path='/kaggle/working',
    columns=columns,
)

In [None]:
# TF-IDF features
def tfidf_emb_gen(data, tfidf_vectorizers, columns, batch_size=5000):
    """For every column in `columns`, transform its values (cast to str) with
    `tfidf_vectorizers[col]` in batches of `batch_size` rows and store the
    per-row sparse results in `<col>_tfidf`.

    Returns the same `data` frame with the new columns added.
    """
    for col in columns:
        batches = []
        for start in tqdm(range(0, len(data), batch_size)):
            stop = min(start + batch_size, len(data))
            texts = data[col].iloc[start:stop].astype(str).tolist()
            batches.append(tfidf_vectorizers[col].transform(texts))
        stacked = sp.vstack(batches)
        # Iterating the stacked sparse matrix yields one single-row matrix per input row
        data[f'{col}_tfidf'] = [sparse_row for sparse_row in stacked]
    return data

In [None]:
def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    """Run every per-product preprocessing step in order and return the frame.

    The order matters: later steps read columns created by earlier ones
    (e.g. `get_lengths` reads `name_tokens`, `get_digits_elements` reads
    `attr_vals`, the TF-IDF step reads the cleaned text columns).

    The TF-IDF step uses the already-fitted, module-level `tfidf_vectorizers`
    loaded from disk together with the module-level `columns` list. The
    original call to `fit_tfidf_vectorizer` referenced a function that is not
    defined in this notebook and raised NameError.
    """
    data = extract_categories(data)
    data = normalize_names(data)
    data = normalize_desc(data)
    data = extract_brand(data)
    data = extract_country(data)
    data = extract_partnomer(data)
    data = squeeze_main_pic_embeddings(data)
    data = normalize_characteristic_attributes(data)
    data = get_lengths(data)
    data = get_digits_elements(data)
    data = concat_embs(data)
    data = tfidf_emb_gen(data, tfidf_vectorizers=tfidf_vectorizers, columns=columns)
    return data

In [None]:
# Run the whole preprocessing pipeline; `data` is rebound to the returned frame,
# so this cell is not meant to be executed twice on the same kernel state.
data = preprocess(data)

In [None]:
# Rename the pair keys to the '_1' / '_2' suffix form used by the merges below.
test_pairs = test_pairs.rename(
    columns={'variantid1': 'variantid_1', 'variantid2': 'variantid_2'}
)

In [None]:
# Attach the product columns to each side of the pair: first the '_1' product,
# then the '_2' product (default inner joins on the suffixed key).
df = pd.merge(test_pairs, data.add_suffix('_1'), on='variantid_1')
df = pd.merge(df, data.add_suffix('_2'), on='variantid_2')

In [None]:
# Category match (exact match per level + fuzzy match on the last level)
for i in range(1, 5):
    first_level, second_level = f'category_level_{i}_1', f'category_level_{i}_2'
    df[f'category_level_{i}_match'] = df.progress_apply(
        lambda row: row[first_level].lower() == row[second_level].lower(),
        axis=1,
    )

# Level 4 is the last loop iteration, so adding its fuzzy score after the loop
# keeps the same column order.
df['category_level_4_token_sort_ratio_match'] = df.progress_apply(
    lambda row: fuzz.token_sort_ratio(row['category_level_4_1'], row['category_level_4_2']) / 100,
    axis=1,
)

In [None]:
# Description match: exact equality of the two cleaned descriptions
df['description_match'] = df.progress_apply(
    lambda pair: pair['description_1'] == pair['description_2'],
    axis=1,
)

In [None]:
def match_brends(row):
    """Case-insensitive equality of `brand_1` and `brand_2`.

    Returns None when either brand is None.
    """
    if any(row[key] is None for key in ('brand_1', 'brand_2')):
        return None
    return row['brand_1'].lower() == row['brand_2'].lower()

# Brand match; the raw brand columns are dropped once compared
df['brand_match'] = df.progress_apply(match_brends, axis=1)
df = df.drop(columns=['brand_1', 'brand_2'])

In [None]:
def match_countries(row):
    """Case-insensitive equality of `country_1` and `country_2`.

    Returns None when either country is None.
    """
    if any(row[key] is None for key in ('country_1', 'country_2')):
        return None
    return row['country_1'].lower() == row['country_2'].lower()

# Country match; the raw country columns are dropped once compared
df['country_match'] = df.progress_apply(match_countries, axis=1)
df = df.drop(columns=['country_1', 'country_2'])

In [None]:
def match_partnomers(row):
    """Case-insensitive equality of `partnomer_1` and `partnomer_2`.

    Returns None when either part number is None.
    """
    if any(row[key] is None for key in ('partnomer_1', 'partnomer_2')):
        return None
    return row['partnomer_1'].lower() == row['partnomer_2'].lower()

# Part number match [ONLY FOR POST-PROCESSING AFTER MODEL INFERENCE]
df['partnomer_match'] = df.progress_apply(match_partnomers, axis=1)
df = df.drop(columns=['partnomer_1', 'partnomer_2'])

In [None]:
# Description match: `description_match` is already computed in an earlier cell
# from the same `description_1` / `description_2` columns, which have not been
# modified since. Recomputing it here was a redundant full row-wise pass, so
# this cell now only checks that the column is present.
assert 'description_match' in df.columns, 'description_match must be computed before this cell'

In [None]:
# Length ratios
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0 or None."""
    if denominator in (0, None):
        return 0
    return numerator / denominator


def _token_count(text):
    """Number of whitespace-separated pieces in `text`."""
    return len(text.split())


# Ratios of the precomputed count columns, in both directions
for col in ('pic_embeddings_resnet_v1_len', 'name_tokens_len', 'description_tokens_len', 'characteristics_attributes_len'):
    first, second = f'{col}_1', f'{col}_2'
    df[f'{col}_ratio_left'] = df.progress_apply(
        lambda row: _ratio_or_zero(row[first], row[second]), axis=1
    )
    df[f'{col}_ratio_right'] = df.progress_apply(
        lambda row: _ratio_or_zero(row[second], row[first]), axis=1
    )

# Ratios of whitespace-token counts of text columns, in both directions
for col in ('attr_vals', 'attr_vals_w_digits', 'attr_keys', 'name_tokens_w_digits'):
    first, second = f'{col}_1', f'{col}_2'
    df[f'{col}_ratio_left'] = df.progress_apply(
        lambda row: _ratio_or_zero(_token_count(row[first]), _token_count(row[second])), axis=1
    )
    df[f'{col}_ratio_right'] = df.progress_apply(
        lambda row: _ratio_or_zero(_token_count(row[second]), _token_count(row[first])), axis=1
    )

In [None]:
def cosine_sim(vec1, vec2):
    """Cosine similarity of two vectors, each reshaped to a single row and
    passed to `cosine_similarity`; returns the [0][0] entry of the result.
    """
    row1, row2 = vec1.reshape(1, -1), vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row1, row2)
    return similarity_matrix[0][0]

def euc_dist(vec1, vec2):
    """Euclidean distance of two vectors, each reshaped to a single row and
    passed to `euclidean_distances`; returns the [0][0] entry of the result.
    """
    row1, row2 = vec1.reshape(1, -1), vec2.reshape(1, -1)
    distance_matrix = euclidean_distances(row1, row2)
    return distance_matrix[0][0]

In [None]:
# Match main_pic_embeddings_resnet_v1 against main_pic_embeddings_resnet_v1
for metric_name, metric_fn in (('cos_sim', cosine_sim), ('euc_dist', euc_dist)):
    df[f'main_pic_embeddings_resnet_v1_{metric_name}'] = df.progress_apply(
        lambda row: metric_fn(
            row['main_pic_embeddings_resnet_v1_1'],
            row['main_pic_embeddings_resnet_v1_2'],
        ),
        axis=1,
    )

In [None]:
def pair_cos_sim(vecs1, vecs2):
    """Summary statistics of `cosine_sim` over every (vec1, vec2) combination.

    Returns a dict with keys 'mean', 'median', 'min', 'max', 'std'; every
    value is None when either input is None.
    """
    stat_names = ('mean', 'median', 'min', 'max', 'std')
    if vecs1 is None or vecs2 is None:
        return dict.fromkeys(stat_names)

    similarities = np.array([cosine_sim(a, b) for a in vecs1 for b in vecs2])
    reducers = (np.mean, np.median, np.min, np.max, np.std)
    return {name: reduce_fn(similarities) for name, reduce_fn in zip(stat_names, reducers)}

# Match pic_embeddings_resnet_v1 against pic_embeddings_resnet_v1
results = df.progress_apply(
    lambda row: pair_cos_sim(row['pic_embeddings_resnet_v1_1'], row['pic_embeddings_resnet_v1_2']),
    axis=1,
)
# One column per statistic returned by pair_cos_sim
for stat_name in ('mean', 'median', 'min', 'max', 'std'):
    df[f'pic_embeddings_resnet_v1_{stat_name}_cos_sim'] = results.apply(lambda stats: stats[stat_name])

In [None]:
def cross_cos_sim(vec1, vecs2):
    """Summary statistics of `cosine_sim` between `vec1` and each of `vecs2`.

    Returns a dict with keys 'mean', 'median', 'min', 'max', 'std'; every
    value is None when either input is None.
    """
    stat_names = ('mean', 'median', 'min', 'max', 'std')
    if vec1 is None or vecs2 is None:
        return dict.fromkeys(stat_names)

    similarities = np.array([cosine_sim(vec1, other) for other in vecs2])
    reducers = (np.mean, np.median, np.min, np.max, np.std)
    return {name: reduce_fn(similarities) for name, reduce_fn in zip(stat_names, reducers)}

# Match main_pic_embeddings_resnet_v1 against pic_embeddings_resnet_v1:
# 'cross1' compares the main picture of product 1 with all pictures of product 2,
# 'cross2' does the reverse.
for prefix, main_col, pics_col in (
    ('cross1', 'main_pic_embeddings_resnet_v1_1', 'pic_embeddings_resnet_v1_2'),
    ('cross2', 'main_pic_embeddings_resnet_v1_2', 'pic_embeddings_resnet_v1_1'),
):
    results = df.progress_apply(
        lambda row: cross_cos_sim(row[main_col], row[pics_col]), axis=1
    )
    for stat_name in ('mean', 'median', 'min', 'max', 'std'):
        df[f'{prefix}_{stat_name}_cos_sim'] = results.apply(lambda stats: stats[stat_name])

In [None]:
# BERT name-embedding match
for metric_name, metric_fn in (('cos_sim', cosine_sim), ('euc_dist', euc_dist)):
    df[f'name_bert_64_{metric_name}'] = df.progress_apply(
        lambda row: metric_fn(row['name_bert_64_1'], row['name_bert_64_2']),
        axis=1,
    )

In [None]:
# concat_emb match
for metric_name, metric_fn in (('cos_sim', cosine_sim), ('euc_dist', euc_dist)):
    df[f'concat_emb_{metric_name}'] = df.progress_apply(
        lambda row: metric_fn(row['concat_emb_1'], row['concat_emb_2']),
        axis=1,
    )

In [None]:
# TF-IDF match
for col in ('name', 'description', 'attr_keys', 'attr_vals'):
    first, second = f'{col}_tfidf_1', f'{col}_tfidf_2'
    for metric_name, metric_fn in (('cos_sim', cosine_sim), ('euc_dist', euc_dist)):
        df[f'{col}_tfidf_{metric_name}'] = df.progress_apply(
            lambda row: metric_fn(row[first], row[second]), axis=1
        )

In [None]:
def fillness(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Add `<col_name>_fillness` describing which side of the pair has a value.

    The label is 'both' when `<col_name>_1` and `<col_name>_2` are both
    non-null, 'none' when both are null, and 'only one' otherwise.
    """
    first_present = df[f'{col_name}_1'].notna()
    second_present = df[f'{col_name}_2'].notna()

    df[f'{col_name}_fillness'] = np.select(
        [
            (first_present & second_present).to_numpy(),
            (~first_present & ~second_present).to_numpy(),
        ],
        ['both', 'none'],
        default='only one',
    )

    return df

# Which products of the pair have a main picture embedding
df = fillness(df, col_name='main_pic_embeddings_resnet_v1')

In [None]:
def avg_fully_eq_attributes(d1, d2):
    """Share of attributes present in both mappings whose value sets are equal.

    `d1` and `d2` are strings parsed with `ast.literal_eval`. Returns None
    when either is None; with no shared attribute the result is the mean of an
    empty list (NaN).
    """
    if d1 is None or d2 is None:
        return None
    first, second = ast.literal_eval(d1), ast.literal_eval(d2)
    shared_keys = set(first) & set(second)
    return np.mean([set(first[key]) == set(second[key]) for key in shared_keys])

# Matches within the attribute dictionaries
df['attributes_values_avg_fully_eq'] = df.progress_apply(
    lambda pair: avg_fully_eq_attributes(
        pair['characteristic_attributes_mapping_1'],
        pair['characteristic_attributes_mapping_2'],
    ),
    axis=1,
)

In [None]:
# Name and attribute matching
for col in ('name', 'name_norm', 'attr_vals', 'attr_keys', 'characteristics_attributes'):
    first, second = f'{col}_1', f'{col}_2'
    # (column suffix, scorer) pairs, in the order the columns are created
    scorers = [
        ('token_sort_ratio', lambda a, b: fuzz.token_sort_ratio(a, b) / 100),
        ('token_set_ratio', lambda a, b: fuzz.token_set_ratio(a, b) / 100),
        ('jaro_winkler_similarity', jellyfish.jaro_winkler_similarity),
        ('dice', textdistance.dice),
        ('tanimoto', textdistance.tanimoto),
        ('sorensen', textdistance.sorensen),
    ]
    # Columns whose name contains 'attr' skip the two extra scores
    if 'attr' not in col:
        scorers.append(('damerau_levenshtein_distance', jellyfish.damerau_levenshtein_distance))
        scorers.append(('WRatio', lambda a, b: fuzz.WRatio(a, b) / 100))
    for suffix, scorer in scorers:
        df[f'{col}_{suffix}'] = df.progress_apply(
            lambda row: scorer(row[first], row[second]), axis=1
        )

# Description matching
for col in ('description',):
    first, second = f'{col}_1', f'{col}_2'
    df[f'{col}_jaro_winkler_similarity'] = df.progress_apply(
        lambda row: jellyfish.jaro_winkler_similarity(row[first], row[second]), axis=1
    )

In [None]:
def longest_common_prefix(str1, str2):
    """Length of the common prefix divided by the length of the shorter string.

    Returns None when either input is None and 0 when the shorter string is
    empty.
    """
    if str1 is None or str2 is None:
        return None

    shortest = min(len(str1), len(str2))
    if shortest == 0:
        return 0

    matched = 0
    for left_char, right_char in zip(str1, str2):
        if left_char != right_char:
            break
        matched += 1

    return matched / shortest

def longest_common_subsequence(str1, str2):
    """Length of the longest common subsequence divided by the length of the
    longer string.

    Returns None when either input is None and 0 when both strings are empty.
    """
    if str1 is None or str2 is None:
        return None

    longest = max(len(str1), len(str2))
    if longest == 0:
        return 0

    # Same recurrence as the full table, keeping only the previous row
    previous = [0] * (len(str2) + 1)
    for left_char in str1:
        current = [0]
        for j, right_char in enumerate(str2, start=1):
            if left_char == right_char:
                current.append(previous[j - 1] + 1)
            else:
                current.append(max(previous[j], current[j - 1]))
        previous = current

    return previous[-1] / longest

In [None]:
# LCP + LCS for names
for col in ('name_norm',):
    first, second = f'{col}_1', f'{col}_2'
    for suffix, scorer in (('lcp', longest_common_prefix), ('lcs', longest_common_subsequence)):
        df[f'{col}_{suffix}'] = df.progress_apply(
            lambda row: scorer(row[first], row[second]), axis=1
        )

In [None]:
def jaccard_similarity(list1, list2):
    """Size of the intersection over size of the union of the two inputs
    taken as sets.

    Returns None when either input is None and 0 when both sets are empty.
    """
    if list1 is None or list2 is None:
        return None
    first, second = set(list1), set(list2)
    combined = first | second
    if not combined:
        return 0
    return len(first & second) / len(combined)

def overlap_coefficient(list1, list2):
    """Size of the intersection over size of the smaller of the two inputs
    taken as sets.

    Returns None when either input is None and 0 when the smaller set is
    empty.
    """
    if list1 is None or list2 is None:
        return None
    first, second = set(list1), set(list2)
    smaller_size = min(len(first), len(second))
    if smaller_size == 0:
        return 0
    return len(first & second) / smaller_size

In [None]:
# Set similarity of the whitespace-separated pieces of each text column
for col in (
    'attr_keys',
    'attr_vals',
    'description_tokens',
    'name_tokens',
    'attr_vals_w_digits',
    'description_tokens_w_digits',
    'name_tokens_w_digits',
):
    first, second = f'{col}_1', f'{col}_2'
    for suffix, scorer in (('jaccard_score', jaccard_similarity), ('overlap_score', overlap_coefficient)):
        df[f'{col}_{suffix}'] = df.progress_apply(
            lambda row: scorer(row[first].split(), row[second].split()), axis=1
        )

In [None]:
# Drop the features that are no longer needed: the raw text, token and
# embedding columns of both products (each base name exists as '<name>_1' and
# '<name>_2').
intermediate_columns = [
    'name',
    'description',
    'name_norm',
    'description_norm',
    'attr_vals',
    'attr_keys',
    'characteristics_attributes',
    'description_tokens',
    'name_tokens',
    'attr_vals_w_digits',
    'description_tokens_w_digits',
    'name_tokens_w_digits',
    'name_tfidf',
    'description_tfidf',
    'attr_keys_tfidf',
    'attr_vals_tfidf',
    'name_bert_64',
    'main_pic_embeddings_resnet_v1',
    'pic_embeddings_resnet_v1',
    'concat_emb',
]
df = df.drop(
    columns=[f'{base}_{side}' for base in intermediate_columns for side in (1, 2)]
)

In [None]:
# Save the pair-level feature table to the working directory, without the index.
df.to_parquet(path='test.parquet', index=False)