In [1]:
import pandas as pd

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api
# Load the pre-trained "glove-wiki-gigaword-50" embeddings through gensim's
# downloader API. `word_vectors` is the shared model used further down for
# vocabulary checks, `most_similar`, `n_similarity` and `wmdistance`.
word_vectors = api.load("glove-wiki-gigaword-50")

In [2]:
# Where the cleaned CSV files live and which ones to read.
DATA_PATH = '../Data/'
FILES = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']
#COLS = [0, 1, 14, 5]  # Index, ID, Basket, Item
COLS = [0, 1, 2, 5]  # Index, ID, Session, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Item': str}

# One frame per file, in the same order as FILES; the first selected column
# becomes the index.
dfs = [
    pd.read_csv(f'{DATA_PATH}{name}', index_col=0, usecols=COLS, dtype=DTYPES)
    for name in FILES
]

# Keep only the IDs that occur in every file, blank out missing values and
# renumber the rows from zero.
ids_shared = set.intersection(*(set(frame.ID.unique()) for frame in dfs))
dfs = [
    frame.loc[frame.ID.isin(ids_shared)].fillna('').reset_index(drop=True)
    for frame in dfs
]

In [3]:
def equalize_length(df1, df2):
    """Pad the shorter of two pandas objects with '' so both have equal length.

    The shorter object is reindexed onto the labels ``0 .. n-1`` (``n`` being
    the length of the longer one) and the resulting gaps are filled with empty
    strings. When both already have the same length they are returned as-is.

    Returns:
        tuple: ``(df1, df2)`` with ``len(df1) == len(df2)``.
    """
    target_length = max(len(df1), len(df2))

    if len(df2) < target_length:
        # df1 is the longer one: stretch df2 onto 0..n-1 and blank the new rows
        df2 = df2.reindex(list(range(target_length))).fillna('')
    elif len(df1) < target_length:
        # df2 is the longer one: same treatment for df1
        df1 = df1.reindex(list(range(target_length))).fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [4]:
# TODO: unpack big functions in order to reduce permutations
def compare(df_row):
    """Describe how close the two phrases held in ``df_row`` are.

    ``df_row`` is a Series whose two entries are whitespace-separated phrases
    (typically one row of a two-column frame passed through
    ``DataFrame.apply(compare, axis=1)``). Two entries are added to the row:

    * ``WordVec``  - the words both phrases share (sorted, so the result is
      reproducible), followed by the top ``most_similar`` key for all
      in-vocabulary words of both phrases. If no word of either phrase is in
      the vocabulary, the two original phrases joined by a space instead.
    * ``Distance`` - ``0`` when both phrases consist of exactly the same
      words, the word mover's distance between the phrases otherwise, and
      ``999`` whenever that distance cannot be computed (no usable words on
      at least one side).

    Relies on the module-level ``word_vectors`` model.

    Returns:
        The same row object, with ``WordVec`` and ``Distance`` set.
    """
    unmatched_distance = 999  # large number smaller than inf
    result = []
    df_split = df_row.str.split()

    # words that appear in both phrases / in exactly one of them
    shared_words = set.intersection(*map(set, df_split))
    unshared_words = set.symmetric_difference(*map(set, df_split))
    if shared_words:
        # sorted: plain set order changes between interpreter runs
        result.extend(sorted(shared_words))
        # if both phrases are exhausted return
        if not unshared_words:
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0
            return df_row

    # discard words outside vocabulary
    # (`word in word_vectors` works on gensim 3 and 4; `.vocab` is gone in 4)
    df_in_voc = [[word for word in phrase if word in word_vectors] for phrase in df_split]
    if not any(df_in_voc):
        df_row['WordVec'] = ' '.join([*df_row])
        df_row['Distance'] = unmatched_distance
        return df_row

    # use word vectors to average remaining words
    most_similar_key, _ = word_vectors.most_similar(positive=[*df_in_voc[0], *df_in_voc[1]])[0]  # take top result
    result.append(most_similar_key)
    df_row['WordVec'] = ' '.join(result)

    # wmdistance yields inf when one phrase has no in-vocabulary words; map
    # that (and NaN) to the finite sentinel so summed distances stay comparable
    distance = word_vectors.wmdistance(df_split[0], df_split[1])
    df_row['Distance'] = distance if distance < float('inf') else unmatched_distance
    return df_row

In [5]:
def _first_substring_match(words, candidates):
    """Return the label of the first entry in ``candidates`` containing one of ``words``.

    Words are tried in order and compared as plain substrings (no regex); the
    first word with at least one hit decides. Returns ``None`` without a hit.
    """
    for word in words:
        hits = candidates.str.contains(word, regex=False)
        if hits.any():
            return hits.idxmax()  # label of the first True
    return None


def _side_by_side(left, right):
    """Place two Series next to each other as columns 0 and 1, ignoring their labels."""
    return pd.concat([left.reset_index(drop=True), right.reset_index(drop=True)],
                     axis=1, ignore_index=True)


# TODO: unpack big functions in order to reduce permutations
def align(df1, df2, count_only=True):
    """Greedily pair up the items of two Series and deal with the leftovers.

    After padding both inputs to the same length with ``equalize_length``,
    items are paired in three passes, each pass removing what it matched:

    1. identical strings,
    2. a word of a ``df1`` item occurring as a substring of a ``df2`` item,
    3. a word of a ``df2`` item occurring as a substring of a ``df1`` item.

    Unmatched empty strings are then removed from the ``df2`` leftovers and
    the number of remaining ``df2`` items is printed (followed by a space).

    Args:
        df1, df2: Series of item strings. They are expected to carry a
            default ``0..n-1`` index, as produced by ``reset_index(drop=True)``.
        count_only: When true (the default, matching how the notebook
            currently uses this function) stop after printing the leftover
            count and return ``None``. When false, continue with the
            permutation search described below.

    Returns:
        ``None`` if ``count_only`` is true. Otherwise a two-column DataFrame:
        first the greedily matched pairs, then the ``df1`` leftovers next to
        the ``df2`` leftovers in the order that minimises the summed
        ``compare`` distance.

    NOTE(review): the permutation search enumerates every ordering of the
    ``df2`` leftovers (factorial growth) and the number of orderings is
    printed as well; only use ``count_only=False`` for small leftover counts.
    Leftover sides of different length are padded with '' while scoring.
    """
    df1, df2 = equalize_length(df1, df2)

    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index

    # remove identical matches
    # (plain equality: item text must not be interpreted as a regex)
    for df1_idx, df1_word in df1.items():
        matches = df2[df2_dropped] == df1_word
        if matches.any():
            match_index = matches.idxmax()  # return index of first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # remove substring matches
    for df1_idx, df1_words in df1[df1_dropped].str.split().items():
        match_index = _first_substring_match(df1_words, df2[df2_dropped])
        if match_index is not None:
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # remove substring matches in the other direction
    for df2_idx, df2_words in df2[df2_dropped].str.split().items():
        match_index = _first_substring_match(df2_words, df1[df1_dropped])
        if match_index is not None:
            result_pairs.append((match_index, df2_idx))
            df1_dropped = df1_dropped.drop(match_index)
            df2_dropped = df2_dropped.drop(df2_idx)

    # remove additional unmatched empty items
    df2_left = df2[df2_dropped]
    df2_dropped = df2_dropped.drop(df2_left[df2_left == ''].index)

    print(len(df2_dropped), end=' ')
    if count_only:
        return None

    # all permutations of remaining indices
    perms = list(itertools.permutations(df2_dropped))
    print(len(perms), end=' ')

    # generate word vectors and similarity
    if len(perms) > 1:
        total_distance = []
        df1_reindexed = df1[df1_dropped].reset_index(drop=True)
        for p in tqdm(perms, desc="Permutations", leave=False):
            candidate = pd.concat(
                [df1_reindexed, df2[pd.Index(p)].reset_index(drop=True)], axis=1)
            # sides of unequal length leave NaN cells; compare() needs strings
            candidate = candidate.fillna('')
            total_distance.append(sum(candidate.apply(compare, axis=1).Distance))
        # find the permutation with the smallest total distance
        best = perms[total_distance.index(min(total_distance))]
    else:
        best = perms[0]
    result_index = pd.Index(list(best), dtype=df2.index.dtype)

    # return concatenated dataframe: greedy matches on top, leftovers below
    # (explicit dtype keeps label lookup working when a list is empty)
    top_index_left = pd.Index([left for left, _ in result_pairs], dtype=df1.index.dtype)
    top_index_right = pd.Index([right for _, right in result_pairs], dtype=df2.index.dtype)

    df_combined = pd.concat([_side_by_side(df1[top_index_left], df2[top_index_right]),
                             _side_by_side(df1[df1_dropped], df2[result_index])],
                            ignore_index=True)

    return df_combined

In [19]:
### Test hand alignment against algorithm
# Probably deprecated: it filters on a `Basket` column and baskets no longer exist.
def align_by_algo():
    """Run ``align`` on the items of ID 137 from ``dfs[0]`` and ``dfs[2]``, basket by basket.

    Only basket 1 is processed at the moment (there are 12 baskets in total).
    The result of ``align`` is discarded: collecting the aligned frames into
    ``df_final`` and displaying ``df_final.apply(compare, axis=1)`` is
    currently switched off.
    """
    df_final = pd.DataFrame()
    left_frame, right_frame = dfs[0], dfs[2]
    for basket in tqdm(range(1, 2), desc="Basket"):
        left_rows = (left_frame.ID == 137) & (left_frame.Basket == basket)
        right_rows = (right_frame.ID == 137) & (right_frame.Basket == basket)
        align(left_frame.loc[left_rows, 'Item'].reset_index(drop=True),
              right_frame.loc[right_rows, 'Item'].reset_index(drop=True))

#align_by_algo()
    
def align_count_free_rows():
    """Call ``align`` for every shared ID and sessions 1-6, one output line per ID.

    For each ID the line starts with ``PID=<id>:``; ``align`` is then called
    once per session with the ``dfs[1]`` items as first and the ``dfs[0]``
    items as second argument, and the line is closed after the last session.
    """
    first_frame, second_frame = dfs[1], dfs[0]
    for pid in tqdm(ids_shared, desc="IDs"):
        print(f'PID={pid}:', end=' ')
        for session in range(1, 7):
            first_rows = (first_frame.ID == pid) & (first_frame.Session == session)
            second_rows = (second_frame.ID == pid) & (second_frame.Session == session)
            align(first_frame.loc[first_rows, 'Item'].reset_index(drop=True),
                  second_frame.loc[second_rows, 'Item'].reset_index(drop=True))
        print()
        
#align_count_free_rows()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

PID=130: 0 4 4 3 3 0 
PID=153: 4 9 0 3 5 0 
PID=135: 10 11 0 2 11 7 
PID=137: 7 1 6 2 6 2 
PID=141: 10 3 3 3 19 0 
PID=114: 1 1 1 1 1 2 
PID=121: 9 3 4 3 9 4 
PID=127: 3 8 0 5 9 3 


In [7]:
### Align Data Sets by hand
def align_by_hand():
    """Display a hand-aligned comparison of ID 137's items in ``dfs[0]`` and ``dfs[2]``.

    The ``dfs[2]`` items are aligned by dropping row 102 (found by inspection,
    as a proof of concept); both columns are then put side by side and every
    row is run through ``compare``.
    """
    # ID 137 and 114 have low variation
    reference_items = dfs[0].loc[dfs[0].ID == 137, 'Item'].reset_index(drop=True)
    other_items = dfs[2].loc[dfs[2].ID == 137, 'Item'].reset_index(drop=True)

    # align by inspecting for proof of concept
    other_aligned = other_items.drop([102]).reset_index(drop=True)

    paired = pd.concat([reference_items, other_aligned], axis=1)
    display(paired.apply(compare, axis=1))
    
#align_by_hand()

In [18]:
# TODO: Collect basket outliers for inspection
# 0-1 MANY and 1-2 MANY
pd.set_option('display.max_rows', 500)

# Items of ID 135 in session 6, dfs[0] (column 0) next to dfs[1] (column 1).
display(pd.concat(
    [frame.loc[(frame.ID == 135) & (frame.Session == 6), 'Item'].reset_index(drop=True)
     for frame in (dfs[0], dfs[1])],
    axis=1, ignore_index=True))

Unnamed: 0,0,1
0,hair product,shampoo
1,romaine lettuce,romaine hearts
2,red grapes,red grapes
3,white grapes,grapes
4,gouda cheese,gouda cheese
5,manchego cheese,manchego cheese
6,gouda with bacon cheese,blackberries
7,orange juice,tomato
8,santa fe salad,eggs
9,blackberries,noodles


In [9]:
# Quick look at what the word-vector model reports for two example items:
# phrase similarity, the closest key to all words combined, and the
# word mover's distance.
item1, item2 = 'cleaning spray'.split(), 'glass wipes'.split()
print(word_vectors.n_similarity(item1, item2))
print(word_vectors.most_similar(positive=item1 + item2)[0])
print(word_vectors.wmdistance(item1, item2))

0.70241946
('plastic', 0.8458467721939087)
4.836613549687386
