In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

from tqdm.notebook import tqdm
import time

from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity
from gensim.similarities import Similarity

## Two functions for building positive and negative pairs

In [2]:
def build_positive_pairs(corpus, clusters, attribute, num_pos):
    """
    Builds positive pairs for all offers in each cluster in 'clusters'
    which can be found in 'corpus' using 'attribute' for calculating
    BOW cosine similarity to select hard pairs.
    Selects an equal amount of hard and random pairs depending on 'num_pos'
    per offer (for an odd amount the extra pair is a hard one). If it is not
    possible to build 'num_pos' pairs, the heuristic will build as many pairs
    as possible for that offer.

    Hard positives are the least similar offers of the same cluster among the
    (at most 80) matches returned by the similarity index; random positives
    are drawn with the global 'random' module from the remaining matches.

    Parameters:
    corpus (pandas.DataFrame): Corpus containing product offers. The columns
        'id', 'cluster_id' and 'attribute' are read.
    clusters (List): List of cluster_ids for which Positive pairs should be built.
    attribute (str): Attribute of 'corpus' to use for similarity calculations.
        Every value has to be a list of tokens.
    num_pos (int): Number of positive examples to build per offer. Values
        <= 0 yield empty pair lists for every offer.

    Returns:
    List(Tuple(int, List(List,List))): a list of tuples, each tuple containing
    the offer id and a list of two lists containing the offer ids of the hard
    and random pairs.
    """
    pos_pairs = []
    for current_cluster in tqdm(clusters):
        cluster_data = corpus[corpus['cluster_id'] == current_cluster]

        # read the ids once per cluster; list position == row position, which is
        # also the document number used by the similarity index below
        cluster_ids = cluster_data['id'].tolist()

        # nothing requested: without this guard a slice like offer_sim[-0:]
        # would hand back *all* similar offers as hard pairs
        if num_pos <= 0:
            pos_pairs.extend((offer_id, [[], []]) for offer_id in cluster_ids)
            continue

        # build gensim dictionary, corpus and search index for selected cluster
        dct = Dictionary(cluster_data[attribute], prune_at=5000000)
        dct.filter_extremes(no_below=2, no_above=1.0, keep_n=None)

        # no token survived the pruning (e.g. cluster with a single offer):
        # all BOW vectors are empty, so no offer can have a similarity > 0
        # and there is no point in building an index with zero features
        if len(dct) == 0:
            pos_pairs.extend((offer_id, [[], []]) for offer_id in cluster_ids)
            continue

        gensim_corpus = [dct.doc2bow(text) for text in cluster_data[attribute]]
        index = SparseMatrixSimilarity(gensim_corpus, num_features=len(dct), num_best=80)

        # query up to 80 most similar offers, only offers with similarity > 0 will be returned
        query = index[gensim_corpus]

        for i, offer_sim_dup in enumerate(query):
            current_id = cluster_ids[i]

            # remove self
            offer_sim = [x for x in offer_sim_dup if x[0] != i]

            # check if any pairs > 0 similarity remain
            if not offer_sim:
                pos_pairs.append((current_id, [[], []]))
                continue

            # adapt number of selectable pairs if too few available
            current_num_pos = min(num_pos, len(offer_sim))

            # split into halves, an odd amount gives the extra pair to the hard side
            hard_pos = (current_num_pos + 1) // 2
            random_pos = current_num_pos // 2

            # get hard offers from bottom of list (results are expected to be
            # ordered by decreasing similarity, so the tail is the least similar)
            hard_ids = [cluster_ids[x[0]] for x in offer_sim[-hard_pos:]]

            if random_pos == 0:
                pos_pairs.append((current_id, [hard_ids, []]))
                continue

            # remaining offers
            rest = offer_sim[:-hard_pos]

            # randomly select from remaining
            random_select = random.sample(range(len(rest)), random_pos)
            random_ids = [cluster_ids[rest[idx][0]] for idx in random_select]

            pos_pairs.append((current_id, [hard_ids, random_ids]))
    return pos_pairs

def build_neg_pairs_for_cat(corpus, category, offers, attribute, num_neg):
    """
    Builds negative pairs for all offers in 'offers' which are of category
    'category' which can be found in 'corpus' using 'attribute' for calculating
    BOW cosine similarity to select hard pairs.
    Selects an equal amount of hard and random pairs depending on 'num_neg'
    per offer (for an odd amount the extra pair is a hard one). Each hard
    negative will originate from a different cluster to avoid building hard
    negatives with only a small amount of different products. If offers in
    'offers' originate from multiple categories, this function should be
    called multiple times while iterating over the different categories.

    Random negatives are drawn with the global 'random' module from all offers
    of the category; a draw is repeated until it contains neither one of the
    already chosen hard negatives nor an offer of the current offer's cluster.
    Offers without any usable similar offer get two empty lists.

    Parameters:
    corpus (pandas.DataFrame): Corpus containing product offers. The columns
        'id', 'cluster_id', 'category' and 'attribute' are read.
    category (str): Category for which to build negatives
    offers (List): List of offer_ids for which to build negatives
    attribute (str): Attribute of 'corpus' to use for similarity calculations.
        Every value has to be a list of tokens.
    num_neg (int): Number of negative examples to build per offer

    Returns:
    List(Tuple(int, List(List,List))): a list of tuples, each tuple containing
    the offer id and a list of two lists containing the offer ids of the hard
    and random pairs. Empty if none of 'offers' belongs to 'category'.
    """
    # select data from relevant category; after the reset the row label equals
    # the row position, which is also the document number in the search index
    cat_data = corpus[corpus['category'] == category].copy()
    cat_data = cat_data.reset_index(drop=True)
    cat_data['subindex'] = list(cat_data.index)

    # corpus to select negatives against
    corpus_neg_all = cat_data

    # corpus containing only offers for which negatives should be built
    corpus_neg = corpus_neg_all[corpus_neg_all['id'].isin(offers)]

    # nothing to query: skip building the (expensive) index altogether
    if corpus_neg.empty:
        return []

    # build gensim dictionary, corpus and search index for selected category
    dct = Dictionary(cat_data[attribute], prune_at=5000000)
    dct.filter_extremes(no_below=2, no_above=0.8, keep_n=None)

    gensim_corpus = [dct.doc2bow(text) for text in cat_data[attribute]]

    index = Similarity(None, gensim_corpus, num_features=len(dct), num_best=200)

    neg_pairs_cat = []

    # query for 200 most similar offers across whole category
    query_corpus = [gensim_corpus[i] for i in list(corpus_neg['subindex'])]
    start = time.time()
    query = index[query_corpus]
    end = time.time()
    print(f'Category {category} query took {end-start} seconds')

    for i, offer_sim in enumerate(tqdm(query)):

        # one row lookup per offer instead of one per field
        current_offer = corpus_neg.iloc[i]
        current_id = current_offer['id']
        current_cluster_id = current_offer['cluster_id']

        # remove any offers with similarity 1.0 (this also drops the offer itself)
        sim_indices = [x[0] for x in offer_sim if x[1] < 1.0]

        possible_pairs = corpus_neg_all.loc[sim_indices]

        # filter by cluster_id, i.e. only 1 offer per cluster remains to allow for
        # product diversity; sorting the first-occurrence positions keeps the
        # similarity order of the query result
        idx = sorted(np.unique(possible_pairs['cluster_id'], return_index=True)[1])
        possible_pairs = possible_pairs.iloc[idx]

        # remove any offer from same cluster
        possible_pairs = possible_pairs[possible_pairs['cluster_id'] != current_cluster_id]

        possible_pairs_len = len(possible_pairs)

        # check if any pairs > 0 similarity remain
        if possible_pairs_len == 0:
            neg_pairs_cat.append((current_id, [[], []]))
            continue

        # adapt number of selectable pairs if too few available
        current_num_neg = max(0, min(num_neg, possible_pairs_len))

        # split into halves, an odd amount gives the extra pair to the hard side
        hard_neg = (current_num_neg + 1) // 2
        random_neg = current_num_neg // 2

        # select hard pairs from top of list
        hard_pairs = possible_pairs.iloc[:hard_neg]['id'].tolist()

        if random_neg == 0:
            neg_pairs_cat.append((current_id, [hard_pairs, []]))
            continue

        # Offer ids that must not show up among the random negatives. These are
        # *ids* because they are compared with the 'id' column below; comparing
        # row labels with ids would silently exclude nothing.
        excluded_ids = hard_pairs + [current_id]

        # randomly select from all offers among same category; redraw until the
        # sample is clean. Enough eligible offers always exist: the entries of
        # possible_pairs that were not used as hard negatives all qualify.
        while True:
            random_select = random.sample(range(len(corpus_neg_all)), random_neg)
            random_pairs = corpus_neg_all.iloc[random_select]
            clashes = (random_pairs['id'].isin(excluded_ids)
                       | (random_pairs['cluster_id'] == current_cluster_id))
            if not clashes.any():
                break

        neg_pairs_cat.append((current_id, [hard_pairs, random_pairs['id'].tolist()]))

    return neg_pairs_cat

# Pair generation example

### Load the corpus data

In [3]:
# Read the gzipped JSON-lines offer corpus and use the offer id as row index,
# keeping 'id' available as a regular column as well.
corpus = (
    pd.read_json('offers_corpus_english_v2_swc.json.gz', lines=True)
    .set_index('id', drop=False)
)
corpus.head()

Unnamed: 0_level_0,id,cluster_id,category,title,description,brand,price,keyValuePairs,specTableContent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11920489,11920489,2533918,Musical_Instruments,"""Ritter RGP5-D/BSG Performance 5 Series Bag: ...","""\nThe Ritter RGP5-D padded gigbag offers styl...",,,,
12648455,12648455,11167803,Tools_and_Home_Improvement,"""Krowne - 14-802L 8 in Royal Series Wall Moun...","""The 14-802L Royal Series Wall Mount Faucet w/...","""Krowne""@en",,,
7634831,7634831,11621476,Jewelry,"""A. Jaffe Art Deco ME2105Q-163"" ""Shop A. MES6...","""<p> An everlasting symbol of love, model num...",,,,
16519583,16519583,8824768,Sports_and_Outdoors,"""Gore bike wear Element Lady 2in1 Shorts"" Sho...","""\n\t\tProduktbeskrivning Gore bike wear Eleme...","""Gore bike wear""",,,
3362858,3362858,7523117,Shoes,,,,"""USD""",,


### Select a category and some clusters to build positives for

We also need to select (or engineer) an attribute to use for the similarity comparisons. It must be provided as a list of tokens for every offer.

In [4]:
category = 'Computers_and_Accessories'
corpus_computers = corpus[corpus['category'] == category].copy()

# tokenize title for use in similarity computations; an offer without a title
# gets an empty token list so that every row holds a list of tokens
tokenized_title = corpus_computers['title'].fillna('').str.split()
corpus_computers['title_tokenized'] = tokenized_title

# select clusters with size > 1
gt1_bool = corpus_computers['cluster_id'].value_counts() > 1
clusters_gt1 = list(gt1_bool[gt1_bool].index)

# sample 100 clusters, or all of them if the category has fewer than that
num_clusters = 100
random_clusters = random.sample(clusters_gt1, min(num_clusters, len(clusters_gt1)))

### Build up to 10 positive pairs (5 hard / 5 random) for each offer in the selected clusters

In [5]:
# Positives for every offer of the sampled clusters, compared on the tokenized title.
pos_pairs = build_positive_pairs(
    corpus=corpus_computers,
    clusters=random_clusters,
    attribute='title_tokenized',
    num_pos=10,
)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




We want to build negatives for every offer that went through the positive-pair step, so we first collect the relevant offer ids:

In [6]:
# Each entry of pos_pairs is (offer id, [hard ids, random ids]); keep only the offer id.
offers_for_negatives = [offer_id for offer_id, _ in pos_pairs]

### Build up to 10 negative pairs (5 hard / 5 random) for each of the collected offers

In [7]:
# Negatives for the collected offers, searched within the same category.
neg_pairs = build_neg_pairs_for_cat(
    corpus=corpus_computers,
    category=category,
    offers=offers_for_negatives,
    attribute='title_tokenized',
    num_neg=10,
)

Category Computers_and_Accessories query took 10.135547637939453 seconds


HBox(children=(FloatProgress(value=0.0, max=466.0), HTML(value='')))




In [8]:
# Preview the first five result tuples of each list; every tuple is
# (offer id, [hard pair ids, random pair ids]).
print(f'{pos_pairs[:5]}\n\n\n{neg_pairs[:5]}')

[(3640469, [[204768, 6661367], [15759274]]), (204768, [[3640469, 15759274], [6661367]]), (6661367, [[3640469, 15759274], [204768]]), (15759274, [[204768, 6661367], [3640469]]), (8546404, [[15814565, 15341084, 2657937, 3286938, 15603980], [15758163, 11136228, 17037368, 4483355, 9708144]])]


[(16920343, [[13951302, 15545416, 5326737, 1702266, 12649993], [897942, 13906644, 11054593, 6430902, 5275235]]), (4416171, [[14677750, 9601821, 5889786, 14204738, 3881443], [15375264, 1199529, 9594862, 2606521, 529774]]), (13118591, [[6392863, 9187550, 449100, 1777815, 5282676], [6722365, 15017674, 2992783, 5035593, 13875802]]), (2312409, [[7211157, 1141987, 13222955, 10695553, 172265], [9026944, 11979577, 668607, 5741898, 13209193]]), (11253837, [[10827809, 1813737, 9116117, 17003291, 9226955], [10647168, 3991454, 3744991, 14971212, 12426002]])]


Further processing steps, such as deduplicating the generated pairs, can follow from here.