In [1]:
from itertools import combinations

import networkx as nx
import pandas as pd

In [2]:
# Read the training pair table from the Kaggle input directory.
train_pairs = pd.read_parquet(path='/kaggle/input/extracted_data/train.parquet')

In [3]:
def apply_transitivity_with_networkx(df: pd.DataFrame) -> pd.DataFrame:
    """Augment a pair table with the matches implied by transitivity.

    Rows with ``target == 1`` are treated as undirected edges between
    ``variantid1`` and ``variantid2``. Any two distinct ids that end up in the
    same connected component are considered a match, and each such pair that
    is not already a positive row of ``df`` (in either id order) is appended
    with ``target = 1``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``variantid1``, ``variantid2`` and
        ``target``. The ids must be hashable and mutually orderable.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` followed by the new positive pairs, with exact
        duplicates on ``(variantid1, variantid2, target)`` removed (first
        occurrence kept). The index is reset before de-duplication, so it may
        contain gaps. Appended rows hold the smaller id in ``variantid1``;
        any other column of ``df`` is missing (NaN) for them. ``df`` itself is
        not modified.

    Notes
    -----
    A pair labelled ``target == 0`` in ``df`` that is also implied by the
    closure is left in place next to the new positive row; conflicting labels
    are not resolved here.
    """
    pair_cols = ['variantid1', 'variantid2']
    dedup_cols = pair_cols + ['target']

    positive_edges = df.loc[df['target'] == 1, pair_cols].values

    G = nx.Graph()
    G.add_edges_from(positive_edges)

    # Canonical (smaller, larger) form of every pair already labelled positive,
    # so a closure pair is recognised as known even when the input row stores
    # the two ids in the opposite order.
    known_positive = {(min(u, v), max(u, v)) for u, v in positive_edges}

    # Components are disjoint, so each unordered pair is produced exactly once
    # and no set is needed. Sorting the members first makes every emitted pair
    # (smaller, larger) without a per-row sort afterwards, and makes the row
    # order independent of hash ordering.
    implied_pairs = [
        pair
        for component in nx.connected_components(G)
        for pair in combinations(sorted(component), 2)
        if pair not in known_positive
    ]

    if not implied_pairs:
        # A frame built from an empty list has object-dtype columns; skip the
        # concat so that, whatever the pandas version, it cannot affect the
        # dtypes of the id columns of ``df``.
        return df.reset_index(drop=True).drop_duplicates(subset=dedup_cols, keep='first')

    new_rows = pd.DataFrame(implied_pairs, columns=pair_cols)
    new_rows['target'] = 1

    df_expanded = pd.concat([df, new_rows], ignore_index=True)
    # Still required: ``df`` itself may contain repeated rows.
    return df_expanded.drop_duplicates(subset=dedup_cols, keep='first')

In [4]:
# Extend the training pairs with the positive pairs implied by transitivity.
df_expanded = apply_transitivity_with_networkx(df=train_pairs)

In [5]:
# Persist the expanded pair table to the working directory.
df_expanded.to_parquet(path='train_with_chains.parquet')