In [1]:
import pandas as pd

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api

In [2]:
# Let pandas render up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

In [18]:
# Pretrained word vectors loaded through gensim's downloader API.
# Catalogue of available models: https://github.com/RaRe-Technologies/gensim-data#models
# The "glove-wiki-gigaword" family is published in 50, 100, 200 and 300 variants; the 300 one is used here.
# NOTE(review): api.load presumably downloads the model on first use and caches it locally -- confirm
# before running in an offline environment.
word_vectors = api.load("glove-wiki-gigaword-300")



In [20]:
# Input location and the three CSV files that are compared against each other.
DATA_PATH = '../Data/'
FILES = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']
#COLS = [0, 1, 14, 5]  # Index, ID, Basket, Item
COLS = [0, 1, 2, 5]  # Index, ID, Session, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Item': str}

# One frame per file, restricted to the selected columns; the first column becomes the index.
dfs = [
    pd.read_csv(f'{DATA_PATH}{name}', index_col=0, usecols=COLS, dtype=DTYPES)
    for name in FILES
]

# Keep only the IDs that occur in every file, blank out missing values and renumber the rows.
ids_shared = set.intersection(*(set(frame.ID.unique()) for frame in dfs))
dfs = [
    frame.loc[frame.ID.isin(ids_shared)].fillna('').reset_index(drop=True)
    for frame in dfs
]

In [5]:
def equalize_length(df1, df2):
    """Pad the shorter of the two inputs with empty strings so both have equal length.

    The shorter input is reindexed onto ``0 .. n-1`` (``n`` being the longer
    length) and the positions that did not exist before are filled with ``''``.
    The longer input, or both when the lengths already agree, is returned
    untouched.

    Returns:
        The pair ``(df1, df2)``, now of equal length.
    """
    target_length = max(len(df1), len(df2))
    positions = list(range(target_length))

    if len(df2) < target_length:
        df2 = df2.reindex(positions).fillna('')
    elif len(df1) < target_length:
        df1 = df1.reindex(positions).fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [74]:
# TODO: unpack big functions in order to reduce permutations
def compare_tmp(df_row):
    """Earlier draft of the phrase-pair scorer: adds 'WordVec' and 'Distance' to a row.

    NOTE(review): none of the visible cells call this function (the visible call
    sites use `compare`) -- confirm it is unused before deleting it.

    Args:
        df_row: pandas Series holding two item phrases; after splitting, the
            phrases are read back under the labels 0 and 1.

    Returns:
        The same Series with two entries written to it:
        'WordVec'  -- space-joined words describing the overlap, and
        'Distance' -- 0 for identical phrases, 0.1234 when one phrase contains
        the other (or only substring matches remain), 0.666 when exactly one
        word is left over, 0.271828 when nothing usable is found, otherwise the
        value returned by `word_vectors.wmdistance`.
    """
    result = []
    df_split = df_row.str.split()
    df_split = df_split.fillna('') # hack: somehow a NaN value can sneak in

    # words present in both phrases / in exactly one of them
    shared_words = set.intersection(*map(set, df_split))
    unshared_words = set.symmetric_difference(*map(set, df_split))

    # pair up unshared words where one is a substring of another; the shorter
    # word goes straight into the result, both words count as "matched"
    shared_subwords = []
    for word1 in unshared_words:
        for word2 in unshared_words:
            if word1 in word2 and word1 != word2:
                result.append(word1)
                shared_subwords.extend([word1, word2])
    remaining_words = unshared_words.difference(set(shared_subwords))


    # cases: identical match or one contained in other
    if shared_words:
        result.extend(shared_words)
        # if both phrases are exhausted return
        if not unshared_words:  # complete identical match
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0
            return df_row
        elif set(df_split[0]) == shared_words or set(df_split[1]) == shared_words:  # one contains the other
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0.1234  # small identifiable number (not magic)
            return df_row

    # case: exactly one word is left without an exact or substring partner
    if len(remaining_words) == 1:
        result.append(*remaining_words)
        df_row['WordVec'] = ' '.join(result)
        df_row['Distance'] = 0.666  # small identifiable number (not magic)
        return df_row

    # discard unshared words outside vocabulary
    # NOTE(review): `.vocab` is the gensim 3.x attribute; gensim 4 removed it -- confirm the installed version
    remaining_words_in_voc = [word for word in remaining_words if word in word_vectors.vocab]
    #print(f'remaining_words_in_voc = {remaining_words_in_voc}')
    if not remaining_words_in_voc:
        if result: # substring match
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0.1234  # small identifiable number (not magic)
            return df_row
        else:  # nothing collected so far: fall back to the original phrases joined together
            df_row['WordVec'] = ' '.join([*df_row])
            df_row['Distance'] = 0.271828  # small identifiable number (not magic)
            return df_row

    # use word vectors to average unshared words
    shared_words_in_voc = [word for word in shared_words if word in word_vectors.vocab]
    unshared_words_in_voc = [word for word in unshared_words if word in word_vectors.vocab]
    #print(f'shared_words_in_voc = {shared_words_in_voc}')
    #print(f'unshared_words_in_voc = {unshared_words_in_voc}')

    words_in_voc = shared_words_in_voc + unshared_words_in_voc
    #print(f'words_in_voc = {words_in_voc}')

    most_similar_key, _ = word_vectors.most_similar(positive=words_in_voc)[0]  # take top result

    # don't append word vector if it repeats a word
    for res in result:
        if most_similar_key in res or res in most_similar_key:
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = word_vectors.wmdistance(df_split[0], df_split[1]) # wmdistance handles oov
            return df_row

    # the suggested word is new: report it alongside the matched words
    result.append(most_similar_key)
    df_row['WordVec'] = ' '.join(result)
    df_row['Distance'] = word_vectors.wmdistance(df_split[0], df_split[1]) # wmdistance handles oov
    return df_row

In [82]:
# TODO: unpack big functions in order to reduce permutations
def compare(df_row):
    """Score how closely the two item phrases of a row match and summarise their overlap.

    Meant to be used with ``DataFrame.apply(compare, axis=1)`` on a frame whose
    columns 0 and 1 hold the two phrases.

    Matching case flow. Apart from case 5 the distances are small identifiable
    flag values; their ordering matters because callers sum and minimise them:

    1. complete identical match                                   -> 0
    2. one phrase contains all the words of the other             -> 0.123456
    3. ... contains all the words of the other as substrings      -> 0.271828
    4. exactly one unmatched word                                 -> 0.314159
    5. unmatched words in vocabulary: the top ``most_similar`` word is looked up;
       if it repeats an already reported word                     -> 0.666
       otherwise it is appended and the distance is ``word_vectors.wmdistance``
    6. unmatched words, none of them in the vocabulary            -> 0.999

    Args:
        df_row: pandas Series with the two phrases under the labels 0 and 1.
            A NaN phrase is treated as empty.

    Returns:
        ``df_row`` itself, with 'WordVec' (space-joined overlap words; the words
        come from sets, so their order is not stable) and 'Distance' written to it.
    """
    def _finish(words, distance):
        # Single exit point: write both result columns and hand the row back.
        df_row['WordVec'] = ' '.join(words)
        df_row['Distance'] = distance
        return df_row

    df_split = df_row.str.split()
    df_split = df_split.fillna('')  # hack: somehow a NaN value can sneak in

    shared_words = set.intersection(*map(set, df_split))
    unshared_words = set.symmetric_difference(*map(set, df_split))

    # Unshared words where one is a substring of another count as matched;
    # the shorter word is the one that gets reported.
    shared_subwords = []
    subword_matches = set()
    for word1 in unshared_words:
        for word2 in unshared_words:
            if word1 != word2 and word1 in word2:
                shared_subwords.append(word1)
                subword_matches.update((word1, word2))
    unmatched_words = unshared_words - subword_matches

    # always return identical matches
    result = list(shared_words)

    # 1. complete identical match
    if not unshared_words:
        return _finish(result, 0)

    # 2. one contains all the words of the other
    if set(df_split[0]) == shared_words or set(df_split[1]) == shared_words:
        return _finish(result, 0.123456)

    # 3. one contains all the words of the other as substrings
    result.extend(shared_subwords)
    if not unmatched_words:
        return _finish(result, 0.271828)

    # 4. one unmatched word
    if len(unmatched_words) == 1:
        result.extend(unmatched_words)
        return _finish(result, 0.314159)

    # Vocabulary lookups are only needed from here on. `word in word_vectors`
    # works on both gensim 3.x and 4.x, whereas `.vocab` was removed in 4.0.
    unmatched_words_in_voc = [word for word in unmatched_words if word in word_vectors]

    # 6. unmatched words but out of vocabulary
    # (this branch used to fall off the end of the function and return None)
    if not unmatched_words_in_voc:
        return _finish(result, 0.999)

    # 5. unmatched words in vocabulary
    shared_words_in_voc = [word for word in shared_words if word in word_vectors]
    unshared_words_in_voc = [word for word in unshared_words if word in word_vectors]
    words_in_voc = shared_words_in_voc + unshared_words_in_voc
    most_similar_key, _ = word_vectors.most_similar(positive=words_in_voc)[0]  # take top result

    # don't append word vector if it repeats a word or subword
    if any(most_similar_key in res or res in most_similar_key for res in result):
        return _finish(result, 0.666)

    result.append(most_similar_key)
    return _finish(result, word_vectors.wmdistance(df_split[0], df_split[1]))  # wmdistance handles oov

In [7]:
# TODO: unpack big functions in order to reduce permutations
def align(df1, df2):
    """Pair up the items of two phrase lists and return them side by side.

    Matching is greedy and runs in stages; each stage only looks at items that
    the earlier stages left unpaired:

    1. identical phrases (plain string equality),
    2. a word of a `df1` phrase occurring as a substring of a `df2` phrase,
    3. the same test in the other direction,
    4. the leftovers: every permutation of the remaining `df2` items is scored
       with `compare` and the one with the smallest summed 'Distance' wins
       (the first one on ties). This step is factorial in the number of leftovers.

    Args:
        df1, df2: pandas Series of item strings. They are padded to equal length
            with `equalize_length`; both are addressed by index label, so they are
            expected to carry a 0..n-1 index (callers pass `reset_index(drop=True)`).

    Returns:
        DataFrame with column 0 (items of `df1`) and column 1 (items of `df2`):
        first the pairs found in stages 1-3 in the order they were found, then the
        leftover rows in the order chosen by stage 4.
    """
    df1, df2 = equalize_length(df1, df2)

    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index  # labels still waiting for a partner

    # 1. remove identical matches
    # Plain equality on purpose: a regex match would misread items containing
    # characters such as '(', '+' or '.'.
    for df1_idx, df1_word in df1.items():
        matches = df2.loc[df2_dropped] == df1_word
        if matches.any():
            match_index = matches.idxmax()  # return index of first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # 2. remove substring matches
    df1_split = df1.loc[df1_dropped].str.split()
    for df1_idx, df1_words in df1_split.items():
        for word in df1_words:
            matches = df2.loc[df2_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((df1_idx, match_index))
                df1_dropped = df1_dropped.drop(df1_idx)
                df2_dropped = df2_dropped.drop(match_index)
                break

    # 3. remove substring matches in the other direction
    df2_split = df2.loc[df2_dropped].str.split()
    for df2_idx, df2_words in df2_split.items():
        for word in df2_words:
            matches = df1.loc[df1_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((match_index, df2_idx))
                df1_dropped = df1_dropped.drop(match_index)
                df2_dropped = df2_dropped.drop(df2_idx)
                break

    # remove additional unmatched empty items
    leftovers = df2.loc[df2_dropped]
    df2_dropped = df2_dropped.drop(leftovers[leftovers == ''].index)

    ## short circut for debugging permutation counts
    #print(len(df2_dropped), end=' ')
    #return

    print(f'Permutations: {len(df2_dropped)}!', end=' ')

    # 4. generate word vectors and similarity for every ordering of the leftovers
    if len(df2_dropped) > 1:
        df1_reindexed = df1.loc[df1_dropped].reset_index(drop=True)
        best_perm, best_total = None, None
        for p in tqdm(itertools.permutations(df2_dropped), desc="Permutations", leave=False):
            candidate = pd.concat(
                [df1_reindexed, df2.loc[pd.Index(p)].reset_index(drop=True)], axis=1)
            total = sum(candidate.apply(compare, axis=1).Distance)
            # strict '<' keeps the first of several equally good permutations
            if best_total is None or total < best_total:
                best_perm, best_total = p, total
        result_index = pd.Index(best_perm)
    else:
        # zero or one leftover: the only permutation is the answer
        result_index = pd.Index(next(itertools.permutations(df2_dropped)))

    # return concatenated dataframe: greedy pairs on top, permutation result below
    if result_pairs:
        top_index_left, top_index_right = map(pd.Index, zip(*result_pairs))
    else:
        top_index_left, top_index_right = pd.Index([]), pd.Index([])

    def _side_by_side(left_labels, right_labels):
        # Two-column frame (0 = df1 items, 1 = df2 items) for the given labels.
        return pd.concat([df1.loc[left_labels].reset_index(drop=True),
                          df2.loc[right_labels].reset_index(drop=True)],
                         axis=1, ignore_index=True)

    df_combined = pd.concat([_side_by_side(top_index_left, top_index_right),
                             _side_by_side(df1_dropped, result_index)],
                            ignore_index=True)

    return df_combined

In [83]:
### Test hand alignment against algorithm
# Runtime warning: in the dfs[0]<->dfs[1] comparison one session leaves 12 unmatched items,
# i.e. 12! = 479,001,600 permutations. 12! / 6 / 60 / 60 / 24 = 924, so at (presumably) about
# 6 permutations per second that is ~924 days, roughly 2.5 years.
def align_by_algo(ids=(121,), sessions=(5,), left=0, right=2):
    """Align and score the item lists of two data frames for the given IDs and sessions.

    For every (ID, session) combination the 'Item' columns of ``dfs[left]`` and
    ``dfs[right]`` are aligned with `align`, each aligned pair is scored with
    `compare`, and the resulting frame is displayed.

    The defaults reproduce the single case under investigation (ID 121, session 5,
    dfs[0] against dfs[2]). IDs listed in the original cell for a wider run:
    130, 153, 135, 137, 141, 114, 121, 127.

    Args:
        ids: iterable of ID values to process.
        sessions: iterable of session numbers, or None to use every session that
            ``dfs[left]`` contains for the respective ID.
        left, right: positions in ``dfs`` of the two frames to compare.
    """
    def _items(frame, pid, session):
        # 'Item' column of one ID/session, renumbered from 0 as `align` expects.
        mask = (frame.ID == pid) & (frame.Session == session)
        return frame.loc[mask, 'Item'].reset_index(drop=True)

    for pid in ids:
        if sessions is None:
            pid_sessions = dfs[left].loc[dfs[left].ID == pid, 'Session'].unique()
        else:
            pid_sessions = sessions
        for session in pid_sessions:
            print(f'ID: {pid}, Session: {session}')
            aligned = align(_items(dfs[left], pid, session),
                            _items(dfs[right], pid, session))
            display(aligned.apply(compare, axis=1))

align_by_algo()

ID: 121, Session: 5
Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,ice cream,ice cream,ice cream,0.0
1,gelato,gelato,gelato,0.0
2,sushi,sushi,sushi,0.0
3,apple pie,apple pie,pie apple,0.0
4,cashews,cashews,cashews,0.0
5,potato salad,potato salad,potato salad,0.0
6,gelato,gelato,gelato,0.0
7,garlic bread,garlic bread,bread garlic,0.0
8,eggs,eggs,eggs,0.0
9,tortillas,tortillas,tortillas,0.0


In [67]:
# TODO: Collect basket outliers for inspection
# Raw view: the 'Item' columns of dfs[0] and dfs[2] for ID 121, session 5, placed side by side
# in their original order (no alignment applied).
display(pd.concat([dfs[0].loc[(dfs[0].ID == 121) & (dfs[0].Session == 5), 'Item'].reset_index(drop=True),
                   dfs[2].loc[(dfs[2].ID == 121) & (dfs[2].Session == 5), 'Item'].reset_index(drop=True)],
                      axis=1, ignore_index=True))
# Disabled: export of the same raw side-by-side table to 'sample_raw.csv'.
#pd.concat([dfs[0].loc[(dfs[0].ID == 121) & (dfs[0].Session == 5), 'Item'].reset_index(drop=True),
#                   dfs[2].loc[(dfs[2].ID == 121) & (dfs[2].Session == 5), 'Item'].reset_index(drop=True)],
#                      axis=1, ignore_index=True).to_csv('sample_raw.csv')
# Aligned view: pair the two item lists with `align`, score every pair with `compare`, and write
# the result to 'sample_aligned.csv' (relative to the current working directory).
align(dfs[0].loc[(dfs[0].ID == 121) & (dfs[0].Session == 5), 'Item'].reset_index(drop=True),
      dfs[2].loc[(dfs[2].ID == 121) & (dfs[2].Session == 5), 'Item'].reset_index(drop=True)).apply(compare, axis=1).to_csv('sample_aligned.csv')

Unnamed: 0,0,1
0,ice cream,ice cream
1,sparkling ice beverage,sparkling drink
2,bratwurst sausage,bratwurst
3,gelato,gelato
4,sushi,sushi
5,apple pie,apple pie
6,chicken potpie,pot pie
7,cashews,cashews
8,wings,cauliflower wings
9,potato salad,potato salad


Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

In [76]:
# Spot check of one pair: 'chicken potpie' vs 'pot pie'
# (the row pasted here originally read: chicken potpie | pot pie | pie pot soup).
item1 = ['chicken', 'potpie']
item2 = ['pot', 'pie']
# cosine similarity between the two word sets
print(word_vectors.n_similarity(item1, item2))
# five nearest vocabulary entries to all four words combined
print(word_vectors.most_similar(positive=item1 + item2)[:5])
# word mover's distance between the two phrases
print(word_vectors.wmdistance(item1, item2))

0.47083616
[('soup', 0.6386921405792236), ('cooked', 0.6072036027908325), ('pies', 0.6055544018745422), ('roast', 0.5986239314079285), ('dish', 0.5927294492721558)]
8.038801811697006
