In [13]:
import gc
import os
from itertools import combinations
from pathlib import Path

import networkx as nx
import numpy as np
import polars as pl
from tqdm.notebook import tqdm

In [14]:
# Directory holding the original parquet shards.
DATA_PATH = Path('../data/avito-orig-data')

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which the shards are later concatenated reproducible between runs.
train_path_list = sorted(
    DATA_PATH / name for name in os.listdir(DATA_PATH) if 'train' in name
)

In [15]:
# Suffix of each raw pair column (the part after the `base_` / `cand_` prefix)
# mapped to the stem of its renamed counterpart.
_RENAMED_STEMS = {
    'item_id': 'variantid',
    'title': 'name',
    'description': 'description',
    'category_name': 'category_level_1',
    'subcategory_name': 'category_level_2',
    'param1': 'category_level_3',
    'param2': 'category_level_4',
    'json_params': 'characteristic_attributes_mapping',
    'count_images': 'n_images',
    'price': 'price',
}

# `base_*` columns receive the `_1` suffix and `cand_*` columns the `_2` suffix.
RENAME_MAPPING = {
    f'{side}_{raw_suffix}': f'{stem}_{position}'
    for raw_suffix, stem in _RENAMED_STEMS.items()
    for position, side in enumerate(('base', 'cand'), start=1)
}

# Column groups, expressed with the post-rename names where a rename applies.
IDS = ['variantid_1', 'variantid_2']
FOR_SPLIT = ['group_id', 'action_date']
# Not part of RENAME_MAPPING, so these keep their raw names.
IMAGE_PATHS = ['base_title_image', 'cand_title_image']
BINARY_FEATURES = ['is_same_location', 'is_same_region']
# Label column: 1 marks a duplicate pair, 0 a non-duplicate pair.
TARGET = 'is_double'

In [16]:
# Load every training shard, then stack them with a single concat instead of
# growing the frame inside the loop (which also relied on concatenating a
# zero-column placeholder frame with real data).
if not train_path_list:
    raise FileNotFoundError(f'No train files found in {DATA_PATH}')

chunks = []
n_rows = 0

for file in tqdm(train_path_list):

    chunk = pl.read_parquet(file)
    chunks.append(chunk)
    n_rows += chunk.shape[0]
    # Same progress line as before: cumulative shape of the frame being built.
    print(f'train_df.shape={(n_rows, chunk.shape[1])}')

train_df = pl.concat(chunks)

# Drop the references to the individual shards.
del chunk, chunks
gc.collect()
train_df = train_df.rename(mapping=RENAME_MAPPING)

  0%|          | 0/4 [00:00<?, ?it/s]

train_df.shape=(500000, 28)
train_df.shape=(1000000, 28)
train_df.shape=(1500000, 28)
train_df.shape=(1879555, 28)


In [42]:
# Keep only the two ID columns and the label; that is all the graph code needs.
idx_df = train_df.select([*IDS, TARGET])
print(f'{idx_df.shape=}')

idx_df.shape=(1879555, 3)


In [None]:
def apply_k_level_transitivity_with_networkx(
    df: pl.DataFrame,
    K: int = 1,
    verbose: bool = True
) -> pl.DataFrame:
    """Add pairs implied by transitivity over the pair graph, labelled ``is_double = 1``.

    Every row of ``df`` becomes an undirected edge ``variantid_1 -- variantid_2``.
    For each ``level`` in ``1..K`` and each node, all nodes at shortest-path
    distance exactly ``level`` from that node are paired with one another. A
    pair is kept when it is neither an input edge (in either orientation) nor
    already generated.

    The function does not inspect the ``is_double`` values of ``df``: every row
    is treated as an edge, so callers should pass only the rows they consider
    duplicates.

    Args:
        df: Frame with ``variantid_1``, ``variantid_2`` and ``is_double``
            columns; new rows are built with exactly these three columns.
        K: Highest distance level to expand. ``K < 1`` adds nothing.
        verbose: Print per-level and total counts.

    Returns:
        ``df`` plus the new rows, de-duplicated on the ID pair, or a clone of
        ``df`` when no pair was added.
    """

    def get_nodes_at_distance(start_node, distance):
        # The bounded BFS already reports the shortest-path length of every
        # reachable node, so there is no need to recompute it per node.
        reachable = nx.single_source_shortest_path_length(G, start_node, cutoff=distance)
        return sorted(node for node, dist in reachable.items() if dist == distance)

    G = nx.Graph()
    edges = df.select(['variantid_1', 'variantid_2']).rows()
    G.add_edges_from(edges)

    # frozenset makes the membership test independent of pair orientation.
    existing_edges = set(frozenset(edge) for edge in edges)
    new_edges = set()

    for level in range(1, K + 1):
        current_level_edges = set()

        for node in G.nodes():
            nodes_at_level = get_nodes_at_distance(node, level)

            # combinations over the sorted list yields each unordered pair once,
            # which avoids duplicates without scanning the full cross product.
            for u, v in combinations(nodes_at_level, 2):
                edge = frozenset((u, v))
                if edge not in existing_edges and edge not in new_edges:
                    current_level_edges.add(edge)

        new_edges.update(current_level_edges)

        if verbose and current_level_edges:
            print(f"Уровень {level}: добавлено {len(current_level_edges)} новых пар")

    if new_edges:
        # Orient every pair as (smaller id, larger id) so the output columns do
        # not depend on set iteration order.
        pairs = [sorted(edge) for edge in new_edges]
        new_rows = pl.DataFrame(
            {
                'variantid_1': [left for left, _ in pairs],
                'variantid_2': [right for _, right in pairs],
                'is_double': 1
            },
            # Reuse the input dtypes so the vertical concat below lines up.
            schema={
                name: df.schema[name]
                for name in ('variantid_1', 'variantid_2', 'is_double')
            }
        )
        df_expanded = pl.concat([df, new_rows]).unique(subset=['variantid_1', 'variantid_2'])
    else:
        df_expanded = df.clone()

    if verbose:
        print(f"\nВсего добавлено новых пар: {len(new_edges)}")
        print(f"Общее количество пар после расширения: {len(df_expanded)}")

    return df_expanded

In [46]:
# Expand only the pairs labelled as duplicates; K and verbose use their defaults.
apply_k_level_transitivity_with_networkx(
    df=idx_df.filter(pl.col('is_double') == 1),
    K=1,
    verbose=True,
)

Уровень 1: добавлено 68803 новых пар

Всего добавлено новых пар: 68803
Общее количество пар после расширения: 173861


variantid_1,variantid_2,is_double
str,str,i64
"""8a76827c486b623d811ed02e49d384…","""82720486240f5667f1a4fac4f54ab5…",1
"""410231d0a5921f617e8c3135232fe4…","""e492e963e1108d1ea5d6d954bc0e95…",1
"""51a87e951352dcd8b1a0254d14e2d1…","""b270beb837d109c4c71c2952b01b74…",1
"""79e1c8f9dcfd1cda8610d47450722d…","""6f4ffd0ebab5a74441ab57a30af832…",1
"""4b89ba324aebbc776e035db0e2baa0…","""55f3091e597a43a39ec1b0e635e44d…",1
…,…,…
"""b15b9413fbdb49fe783de72c9a9b0b…","""1208d850c2ba05f3fa692f7d27573f…",1
"""fc09e448583122d3fb7a7ffb57034c…","""eb440f3b456f7cef168b00538e78d9…",1
"""becb3e9f0a359540ba3bf0e79dd6cf…","""6d084c8cfee8e22be53c146515063c…",1
"""4400e8fda58f006974242014ff8ca2…","""3320e24e3c1ed59ab9b3d924fae354…",1


In [47]:
def apply_k_level_transitivity_with_networkx_save_disbalance(
    df: pl.DataFrame,
    K: int = 1,
    verbose: bool = True
) -> pl.DataFrame:
    """Expand positive pairs by transitivity and add an equal number of negative pairs.

    Positive side (rows with ``is_double == 1``): the rows form an undirected
    graph. For each ``level`` in ``1..K`` and each node, the nodes at
    shortest-path distance exactly ``level`` are paired with one another, and
    pairs that are not already positive edges are added with ``is_double = 1``.

    Negative side (rows with ``is_double == 0``): pairs of neighbours of a
    common node in the negative graph are added with ``is_double = 0`` until as
    many negatives as new positives have been generated, or candidates run out.
    A candidate is skipped when it is already a negative edge or when it is an
    existing or newly derived positive pair, so no pair ends up with both labels.

    Args:
        df: Frame whose schema is exactly ``variantid_1``, ``variantid_2``,
            ``is_double`` (new rows are built with ``df.schema``).
        K: Highest distance level for the positive expansion.
        verbose: Print per-level and summary counts.

    Returns:
        ``df`` plus the new rows, de-duplicated on the ID pair, or a clone of
        ``df`` when nothing was added.
    """

    def build_rows(pair_set, label):
        # Orient every pair as (smaller id, larger id) so the output columns do
        # not depend on set iteration order.
        pairs = [sorted(edge) for edge in pair_set]
        return pl.DataFrame(
            {
                'variantid_1': [left for left, _ in pairs],
                'variantid_2': [right for _, right in pairs],
                'is_double': label
            },
            schema=df.schema
        )

    # Positive pairs: transitive expansion.
    df_1 = df.filter(pl.col('is_double') == 1)
    G_1 = nx.Graph()
    edges_1 = df_1.select(['variantid_1', 'variantid_2']).rows()
    G_1.add_edges_from(edges_1)

    # frozenset makes the membership tests independent of pair orientation.
    existing_edges = set(frozenset(edge) for edge in edges_1)
    new_edges_1 = set()

    for level in range(1, K + 1):
        current_level_edges = set()

        for node in G_1.nodes():
            # The bounded BFS already returns each node's shortest-path length,
            # so filter on it instead of recomputing the distance per node.
            reachable = nx.single_source_shortest_path_length(G_1, node, cutoff=level)
            nodes_at_level = sorted(n for n, dist in reachable.items() if dist == level)

            for u, v in combinations(nodes_at_level, 2):
                edge = frozenset((u, v))
                if edge not in existing_edges and edge not in new_edges_1:
                    current_level_edges.add(edge)

        new_edges_1.update(current_level_edges)

        if verbose and current_level_edges:
            print(f"is_double=1, уровень {level}: добавлено {len(current_level_edges)} новых пар")

    # Negative pairs: neighbours of a common node, capped at the positive count.
    df_0 = df.filter(pl.col('is_double') == 0)
    G_0 = nx.Graph()
    edges_0 = df_0.select(['variantid_1', 'variantid_2']).rows()
    G_0.add_edges_from(edges_0)

    existing_edges_0 = set(frozenset(edge) for edge in edges_0)
    new_edges_0 = set()

    # Generate at most as many negatives as positives were added.
    max_new_pairs = len(new_edges_1)

    for node in G_0.nodes():
        if len(new_edges_0) >= max_new_pairs:
            break

        for u, v in combinations(list(G_0.neighbors(node)), 2):
            if len(new_edges_0) >= max_new_pairs:
                break

            edge = frozenset((u, v))
            if edge in existing_edges_0 or edge in new_edges_0:
                continue
            # Never label a pair 0 when it is a known or derived duplicate.
            if edge in existing_edges or edge in new_edges_1:
                continue
            new_edges_0.add(edge)

    if verbose:
        print(f"\nДля is_double=1 добавлено {len(new_edges_1)} новых пар")
        print(f"Для is_double=0 добавлено {len(new_edges_0)} новых пар")

    new_rows = []

    if new_edges_1:
        new_rows.append(build_rows(new_edges_1, 1))

    if new_edges_0:
        new_rows.append(build_rows(new_edges_0, 0))

    if new_rows:
        df_expanded = pl.concat([df] + new_rows).unique(subset=['variantid_1', 'variantid_2'])
    else:
        df_expanded = df.clone()

    if verbose:
        print(f"\nОбщее количество пар после расширения: {len(df_expanded)}")

    return df_expanded

In [48]:
# Run on the full labelled pair table: the function splits it by is_double itself.
apply_k_level_transitivity_with_networkx_save_disbalance(
    df=idx_df,
    K=1,
    verbose=True,
)

is_double=1, уровень 1: добавлено 68803 новых пар

Для is_double=1 добавлено 68803 новых пар
Для is_double=0 добавлено 68803 новых пар

Общее количество пар после расширения: 2017103


variantid_1,variantid_2,is_double
str,str,i64
"""382624dfbd10bddbfa5c98512f29ca…","""0a97cd09420f21b09d984cc656af63…",0
"""280deded198898f7b219ba18efde1c…","""94e7f806698ac69fff996eca93e067…",0
"""a198b1f0b3fd7ab434dd6bb1371040…","""159eb9ffc752d554c05b5bfd6f8234…",0
"""a4ca91db8be510c0f53aba2cb46e0c…","""78c28e6a4efbb4c1eac8218f169426…",0
"""caa7574ebc153fe594576560390b35…","""353f30c31a8a8b8cedd5d1061e4383…",0
…,…,…
"""e418adbd1bdafe9faa816c107f3ddd…","""d7aed4d5a45f97f0279cb44b85a26c…",0
"""6b13ca674929df840877a66a6c048d…","""2b59857c70037af96db1fa5b9c6ab7…",0
"""574a9ae3af5a6333668fe92d2620a1…","""982d87e5c00a78e7ad92c75589bf9d…",0
"""5427e7f70d7d45cecf2fe16d6a03ce…","""bf5322b86c64881dc2ef87e00e6b24…",0
