In [1]:
import pandas as pd

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api

In [2]:
# Raise pandas' row-display limit to 2000 so long result tables are shown in
# full instead of being truncated.
pd.set_option('display.max_rows', 2000)

In [3]:
# Model catalogue: https://github.com/RaRe-Technologies/gensim-data#models
# 50, 100, 200, 300 -- presumably the embedding sizes this model family is
# offered in (the trailing number of the model name); confirm in the catalogue.
# The loaded vectors are used by compare_v1 for vocabulary checks, most-similar
# lookups and Word Mover's Distance.
word_vectors = api.load("glove-wiki-gigaword-300")

In [4]:
# Folder and file names of the three cleaned input tables.
DATA_PATH = '../Data/'
FILES = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']
# Columns are selected by position; position 0 is then used as the index.
COLS = [0, 1, 2, 5]  # Index, ID, Session, Item
# NOTE(review): uint8 only holds values 0-255 -- confirm no ID/Session exceeds that.
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Item': str}

# One DataFrame per file, in the same order as FILES.
dfs = [pd.read_csv(DATA_PATH + file, index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

# Keep only the IDs that occur in every file, replace missing values with ''
# and renumber the rows of each frame from 0.
ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
dfs = [df[df.ID.isin(ids_shared)].fillna('').reset_index(drop=True) for df in dfs]

In [5]:
def equalize_length(df1, df2):
    """Pad the shorter of two pandas objects so both have the same length.

    The shorter input is reindexed onto the labels ``0 .. n-1`` (``n`` being the
    length of the longer input) and every missing value in it is replaced by an
    empty string. The longer input -- or both, when the lengths already agree --
    is returned untouched.

    Args:
        df1: first Series/DataFrame.
        df2: second Series/DataFrame.

    Returns:
        Tuple ``(df1, df2)`` of equal length.

    Raises:
        AssertionError: if the lengths still differ after padding.
    """
    target_length = max(len(df1), len(df2))
    padded_labels = list(range(target_length))

    if len(df2) < target_length:
        df2 = df2.reindex(padded_labels).fillna('')
    elif len(df1) < target_length:
        df1 = df1.reindex(padded_labels).fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [15]:
# TODO: unpack big functions in order to reduce permutations
def compare_v1(df_row):
    """Classify how well the two item strings in ``df_row`` match and annotate the row.

    The row must hold exactly two values (the two items to compare). Two entries
    are added to it:

    * ``WordVec``  -- space-joined words describing the match: the shared words,
      followed (depending on the case) by matched sub-words, the single
      leftover word, or the most similar vocabulary word.
    * ``Distance`` -- a flag constant for the rule-based cases, or the Word
      Mover's Distance between the two items when word vectors are used.

    Matching case flow. The flag values are small identifiable numbers,
    primarily used as flags, but their ordering is important:

    1. ``0``         complete identical match
    2. ``0.123456``  one contains all the words of the other
    3. ``0.271828``  one contains all the words of the other as substrings
    4. ``0.314159``  exactly one unmatched word
    5. unmatched words in vocabulary: ``0.666`` when the most similar word only
       repeats a word/sub-word already in the result, otherwise the Word
       Mover's Distance of the two items
    6. ``0.999``     unmatched words, none of them in the vocabulary

    Args:
        df_row: pandas Series whose two values are the item strings. It is
            modified in place.

    Returns:
        ``df_row`` with ``WordVec`` and ``Distance`` set (for every case).
    """
    df_split = df_row.str.split()
    df_split = df_split.fillna('')  # hack: somehow a NaN value can sneak in

    # Unpack by position rather than by the labels 0/1: the labels of the row
    # need not be 0 and 1 (e.g. when both columns carry the same name).
    first_tokens, second_tokens = df_split
    first_words, second_words = set(first_tokens), set(second_tokens)

    shared_words = first_words & second_words
    unshared_words = first_words ^ second_words

    # A word that is a proper substring of another unshared word counts as a
    # sub-word match; both partners are then considered matched.
    shared_subwords = []
    subword_matches = set()
    for word1 in unshared_words:
        for word2 in unshared_words:
            if word1 != word2 and word1 in word2:
                shared_subwords.append(word1)
                subword_matches.update((word1, word2))
    unmatched_words = unshared_words - subword_matches

    # always return identical matches
    result = list(shared_words)

    def _annotate(distance):
        """Write the collected words and the given distance into the row."""
        df_row['WordVec'] = ' '.join(result)
        df_row['Distance'] = distance
        return df_row

    # 1. complete identical match
    if not unshared_words:
        return _annotate(0)

    # 2. one contains all the words of the other
    if first_words == shared_words or second_words == shared_words:
        return _annotate(0.123456)

    # 3. one contains all the words of the other as substrings
    result.extend(shared_subwords)
    if not unmatched_words:
        return _annotate(0.271828)

    # 4. one unmatched word
    if len(unmatched_words) == 1:
        result.extend(unmatched_words)
        return _annotate(0.314159)

    # Vocabulary lookups are only needed from here on, so they are done late.
    # `word in word_vectors` works on gensim 3.x and 4.x alike; the `.vocab`
    # attribute used previously was removed in gensim 4.
    unmatched_words_in_voc = [word for word in unmatched_words if word in word_vectors]

    # 6. unmatched words but out of vocabulary
    if not unmatched_words_in_voc:
        return _annotate(0.999)

    # 5. unmatched words in vocabulary
    shared_words_in_voc = [word for word in shared_words if word in word_vectors]
    most_similar_key, _ = word_vectors.most_similar(
        positive=[*shared_words_in_voc, *unmatched_words_in_voc])[0]  # take top result

    # don't append word vector if it repeats a word or subword
    for res in result:
        if most_similar_key in res or res in most_similar_key:
            return _annotate(0.666)

    result.append(most_similar_key)
    return _annotate(word_vectors.wmdistance(first_tokens, second_tokens))  # wmdistance handles oov

In [39]:
# TODO: unpack big functions in order to reduce permutations
def align(df1, df2, count_only=None):
    """Pair up the items of two Series and return them side by side.

    After padding both inputs to the same length, items are paired in stages;
    every paired item is removed from the pool of candidates:

    1. identical strings,
    2. a word of a ``df1`` item occurs as substring in a ``df2`` item,
    3. a word of a ``df2`` item occurs as substring in a ``df1`` item,
    4. the remaining non-empty ``df2`` items are assigned to the remaining
       ``df1`` items by trying every permutation and keeping the first one
       with the smallest summed ``compare_v1`` distance.

    Stage 4 is factorial in the number of leftover items. The pairwise
    distances are computed once up front, so each permutation only costs a few
    look-ups instead of a fresh ``compare_v1`` call per row.

    Args:
        df1: Series of item strings (left side).
        df2: Series of item strings (right side).
        count_only: when truthy, only print how many ``df2`` items would enter
            the permutation search (and their labels) and return ``None``.

    Returns:
        DataFrame with columns ``0`` (items of ``df1``) and ``1`` (items of
        ``df2``): first the pairs found in stages 1-3, then the stage 4
        assignment. ``None`` when ``count_only`` is truthy.
    """
    df1, df2 = equalize_length(df1, df2)

    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index

    # remove identical matches
    # Plain equality instead of a regex match, so items containing characters
    # such as '(' or '+' are compared literally.
    for df1_idx, df1_word in df1.items():
        matches = df2.loc[df2_dropped] == df1_word
        if matches.any():
            match_index = matches.idxmax()  # return index of first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # remove substring matches
    df1_split = df1.loc[df1_dropped].str.split()
    for df1_idx, df1_words in df1_split.items():
        for word in df1_words:
            matches = df2.loc[df2_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((df1_idx, match_index))
                df1_dropped = df1_dropped.drop(df1_idx)
                df2_dropped = df2_dropped.drop(match_index)
                break

    # remove substring matches in the other direction
    df2_split = df2.loc[df2_dropped].str.split()
    for df2_idx, df2_words in df2_split.items():
        for word in df2_words:
            matches = df1.loc[df1_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((match_index, df2_idx))
                df1_dropped = df1_dropped.drop(match_index)
                df2_dropped = df2_dropped.drop(df2_idx)
                break

    # remove additional unmatched empty items
    df2_remaining = df2.loc[df2_dropped]
    df2_dropped = df2_dropped.drop(df2_remaining[df2_remaining == ''].index)

    ## short circuit for debugging permutation counts
    if count_only:
        print(f'{len(df2_dropped)}! {list(df2_dropped)}')
        return

    # find the cheapest assignment of the remaining df2 items
    candidates = list(df2_dropped)
    if len(candidates) > 1:
        right_items = df2.loc[df2_dropped].tolist()
        # Only the first len(candidates) left items get a partner; any further
        # left item adds the same cost to every permutation and is left out.
        left_items = df1.loc[df1_dropped].tolist()[:len(candidates)]

        # distance[i][j]: cost of pairing the i-th left item with the j-th candidate
        distance = [[compare_v1(pd.Series([left, right], dtype=object))['Distance']
                     for right in right_items]
                    for left in left_items]

        # min() keeps the first of several equally cheap permutations
        best_order = min(
            tqdm(itertools.permutations(range(len(candidates))), desc="Permutations", leave=False),
            key=lambda order: sum(costs[col] for costs, col in zip(distance, order)))
        result_index = pd.Index([candidates[col] for col in best_order])
    else:
        # zero or one candidate: nothing to permute
        result_index = pd.Index(candidates)

    # return concatenated dataframe: rule-based pairs on top, permutation result below
    if result_pairs:
        top_index_left, top_index_right = map(pd.Index, zip(*result_pairs))
    else:
        top_index_left, top_index_right = pd.Index([]), pd.Index([])
    bot_index_left, bot_index_right = df1_dropped, result_index

    def _side_by_side(left_labels, right_labels):
        """Two-column frame of the selected df1/df2 items, renumbered from 0."""
        return pd.concat([df1.loc[left_labels].reset_index(drop=True),
                          df2.loc[right_labels].reset_index(drop=True)],
                         axis=1, ignore_index=True)

    df_combined = pd.concat([_side_by_side(top_index_left, top_index_right),
                             _side_by_side(bot_index_left, bot_index_right)],
                            ignore_index=True)

    return df_combined

In [31]:
def align_by_algo(ids=None, left=0, right=1, count_only=True):
    """Run ``align`` for every session of the selected participants.

    Called without arguments this is the debugging run: for each participant
    and session it prints the header line and lets ``align`` print how many
    items would enter the permutation search; nothing is returned.

    Args:
        ids: participant IDs to process. Defaults to
            ``[130, 153, 135, 137, 141, 114, 121, 127]``.
        left: position in ``dfs`` of the frame used as left side; its sessions
            drive the loop.
        right: position in ``dfs`` of the frame used as right side.
        count_only: when true (default) only print permutation counts. When
            false, align the items, annotate every pair with ``compare_v1`` and
            return the combined result.

    Returns:
        ``None`` when ``count_only`` is true; otherwise one DataFrame holding
        the annotated pairs of all processed sessions (empty if there were none).
    """
    if ids is None:
        ids = [130, 153, 135, 137, 141, 114, 121, 127]
    df_left, df_right = dfs[left], dfs[right]

    aligned_sessions = []
    for pid in tqdm(ids, desc="IDs"):
        sessions = df_left.loc[df_left.ID == pid, 'Session'].unique()
        for session in tqdm(sessions, desc="Sessions"):
            print(f'ID: {pid}, Session: {session}')
            items_left = df_left.loc[(df_left.ID == pid) & (df_left.Session == session),
                                     'Item'].reset_index(drop=True)
            items_right = df_right.loc[(df_right.ID == pid) & (df_right.Session == session),
                                       'Item'].reset_index(drop=True)
            if count_only:
                align(items_left, items_right, count_only=True)
            else:
                aligned_sessions.append(align(items_left, items_right).apply(compare_v1, axis=1))

    if count_only:
        return None
    # concatenate once at the end instead of growing a frame inside the loop
    return pd.concat(aligned_sessions, ignore_index=True) if aligned_sessions else pd.DataFrame()

In [40]:
# Debug run: for every participant/session, print how many items are left for
# the permutation search ("<count>! [<labels>]").
align_by_algo()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 130, Session: 2
4! [1, 6, 28, 45]
ID: 130, Session: 3
5! [5, 6, 17, 27, 46]
ID: 130, Session: 4
2! [12, 37]
ID: 130, Session: 5
5! [9, 22, 24, 25, 50]


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 153, Session: 1
0! []
ID: 153, Session: 2
0! []
ID: 153, Session: 3
0! []
ID: 153, Session: 4
4! [15, 20, 21, 27]
ID: 153, Session: 5
0! []


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 135, Session: 1
12! [5, 9, 14, 18, 20, 23, 29, 36, 40, 42, 43, 44]
ID: 135, Session: 2
10! [5, 10, 32, 33, 34, 39, 42, 43, 58, 59]
ID: 135, Session: 4
5! [4, 9, 17, 18, 19]
ID: 135, Session: 5
10! [3, 7, 9, 10, 11, 27, 28, 45, 51, 54]
ID: 135, Session: 6
7! [0, 22, 24, 29, 30, 36, 45]


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

ID: 137, Session: 1
4! [0, 2, 4, 10]
ID: 137, Session: 2
1! [12]
ID: 137, Session: 3
7! [2, 5, 13, 17, 19, 23, 25]
ID: 137, Session: 4
3! [3, 13, 14]
ID: 137, Session: 5
11! [0, 1, 3, 5, 7, 8, 12, 13, 23, 29, 32]
ID: 137, Session: 6
2! [0, 1]


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 141, Session: 1
10! [0, 2, 14, 20, 22, 23, 24, 27, 29, 31]
ID: 141, Session: 2
3! [1, 5, 17]
ID: 141, Session: 3
1! [4]
ID: 141, Session: 4
1! [9]
ID: 141, Session: 5
12! [2, 14, 17, 18, 19, 22, 23, 24, 25, 33, 35, 36]


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

ID: 114, Session: 1
1! [14]
ID: 114, Session: 2
1! [20]
ID: 114, Session: 3
1! [7]
ID: 114, Session: 4
1! [7]
ID: 114, Session: 5
0! []
ID: 114, Session: 6
2! [3, 21]


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

ID: 121, Session: 1
7! [9, 10, 20, 21, 27, 36, 41]
ID: 121, Session: 2
7! [1, 5, 6, 9, 19, 35, 39]
ID: 121, Session: 3
3! [23, 26, 28]
ID: 121, Session: 4
5! [0, 2, 9, 10, 18]
ID: 121, Session: 5
19! [2, 9, 17, 18, 19, 23, 29, 30, 31, 32, 33, 37, 38, 42, 49, 53, 54, 65, 66]
ID: 121, Session: 6
4! [0, 10, 22, 35]


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 127, Session: 1
8! [0, 8, 10, 25, 26, 34, 41, 42]
ID: 127, Session: 2
4! [0, 13, 14, 21]
ID: 127, Session: 4
4! [4, 17, 22, 30]
ID: 127, Session: 5
7! [6, 12, 14, 23, 27, 35, 53]
ID: 127, Session: 6
1! [10]


In [36]:
# Items of one participant/session from the three input frames, one column per
# frame, to inspect by eye how the item lists differ.
pid, session = 135, 1
display(pd.concat(
    [df.loc[(df.ID == pid) & (df.Session == session), 'Item'].reset_index(drop=True)
     for df in dfs[:3]],
    ignore_index=True, axis=1))

Unnamed: 0,0,1,2
0,almond milk,almond milk,almond milk
1,half and half,half and half,half and half
2,orange juice,juice,orange juice
3,parmesan cheese,parmesan shredded cheese,shredded parmesan cheese
4,cheese,cheese sargento,block cheese
5,mashed potatoes,cheese bar,mashed potato
6,bacon,mashed potatoes,bacon
7,coffee,bacon,cups coffee
8,pickles,coffee,pickles
9,curry sauce,gherkin,curry cooking sauce


In [10]:
# NOTE(review): the cells below use df_merged as a DataFrame, but a no-argument
# call to align_by_algo as defined in this notebook only prints permutation
# counts and yields None. The output shown here presumably stems from an
# earlier version of the function -- re-run on a fresh kernel to confirm.
df_merged = align_by_algo()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Permutations: 1! Permutations: 1! Permutations: 0! Permutations: 1! 

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Permutations: 0! Permutations: 0! Permutations: 0! Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Permutations: 0! 

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Permutations: 0! Permutations: 4! 

Permutations: 0it [00:00, ?it/s]

Permutations: 0! Permutations: 4! 

Permutations: 0it [00:00, ?it/s]

Permutations: 0! 

Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Permutations: 1! Permutations: 0! Permutations: 0! Permutations: 0! Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Permutations: 1! 

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Permutations: 0! Permutations: 1! Permutations: 1! Permutations: 0! Permutations: 5! 

Permutations: 0it [00:00, ?it/s]

Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Permutations: 0! Permutations: 0! Permutations: 1! Permutations: 1! Permutations: 0! Permutations: 1! 

Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Permutations: 0! Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Permutations: 0! Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Permutations: 0! 

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Permutations: 1! Permutations: 0! Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Permutations: 1! Permutations: 1! 

In [11]:
# Show only the rows whose distance is not one of the flag values that
# compare_v1 assigns in its first four matching cases (0, 0.123456, 0.271828,
# 0.314159).
display(df_merged[~df_merged.Distance.isin([0, 0.123456, 0.271828, 0.314159])])

Unnamed: 0,0,1,WordVec,Distance
50,pepsi soda,cola diet soda,soda coke,4.099474
52,veggie mix,mixed vegetables,mix vegetable,7.239677
53,york candy,york peppermint patties,york chocolate,5.708718
57,black beans,baked beans,beans potatoes,4.336012
59,whipped cream,whipped topping,whipped chocolate,3.583635
60,gillette shave item,shaving product,products,6.753878
105,hot dog buns,hotdog rolls,dog hot dumplings,8.015231
118,chocolate mint candy,mint chocolate patties,chocolate mint cream,2.941059
120,hot dogs,beef franks,meat,8.942181
168,haddock fish,haddock fillets,haddock fillet,4.24766


In [12]:
# Share of rows per distance value, most frequent first.
align_case_counts = df_merged.Distance.value_counts(normalize=True)
print(align_case_counts.iloc[0:6])
# Every distance that is not one of compare_v1's flag constants is a Word
# Mover's Distance. Testing flag membership explicitly avoids relying on the
# flag values happening to be the most frequent entries of value_counts().
print(f'{1 - df_merged.Distance.isin([0, 0.123456, 0.271828, 0.314159, 0.666, 0.999]).mean():%} using word vectors')

0.000000    0.664938
0.123456    0.211277
0.271828    0.023979
0.314159    0.014258
0.666000    0.009073
3.006759    0.003889
Name: Distance, dtype: float64
7.647440% using word vectors


In [14]:
# Spot check for the result row "hot dogs" / "beef franks" -> WordVec "meat",
# Distance 8.942181: look at the single words 'dog' and 'meat' directly.
item1 = 'dog'.split()
item2 = 'meat'.split()
# similarity score between the two word lists
print(word_vectors.n_similarity(item1, item2))
# first five entries of the most-similar list for both words together
print(word_vectors.most_similar(positive=[*item1, *item2])[0:5])
# Word Mover's Distance between the two word lists
print(word_vectors.wmdistance(item1, item2))

0.37929165
[('dogs', 0.7001258134841919), ('animal', 0.637894868850708), ('chicken', 0.6358696818351746), ('beef', 0.6147623062133789), ('eating', 0.60530686378479)]
7.839144706726074
