In [1]:
import pandas as pd
import numpy as np

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api

In [2]:
# Allow up to 1000 rows to be rendered when a frame is displayed.
pd.options.display.max_rows = 1000

In [3]:
%%time
# Load pretrained word vectors through gensim's downloader; the model catalogue is at
# https://github.com/RaRe-Technologies/gensim-data#models
# Sizes noted for this model family: 50, 100, 200, 300 (presumably vector dimensions); the 300 variant is loaded.
word_vectors = api.load("glove-wiki-gigaword-300")

Wall time: 1min 34s


In [15]:
%%time
DATA_PATH = '../Data/'
FILES = ['receipts_clean_max.csv', 'receipts_clean_maria.csv', 'receipts_clean_samantha.csv']
COLS = [0, 1, 2, 3, 4]  # Index, ID, Session, Receipt, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Receipt': 'uint8', 'Item': 'string'}

dfs = [pd.read_csv(DATA_PATH + file, index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
print(f'IDs for merging: {ids_shared}')
dfs = [df[df.ID.isin(ids_shared)].reset_index(drop=True) for df in dfs] # .fillna('')

Wall time: 118 ms


In [44]:
# Print a summary (entries, columns, dtypes, memory) of every loaded frame,
# with a blank line between frames.
for df in dfs:
    df.info()
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       895 non-null    uint8 
 1   Session  895 non-null    uint8 
 2   Receipt  895 non-null    uint8 
 3   Item     895 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.7 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       864 non-null    uint8 
 1   Session  864 non-null    uint8 
 2   Receipt  864 non-null    uint8 
 3   Item     864 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       898 non-null    uint8 
 1   Session  898 non-null    uin

In [6]:
def equalize_length(df1, df2):
    """Pad the shorter of two Series with empty strings so both have equal length.

    The shorter input is reindexed onto the labels ``0 .. n-1`` (``n`` being
    the longer length) and every missing entry is filled with ``''``. The
    longer input -- or both inputs when the lengths already agree -- is
    returned unchanged.

    Returns:
        tuple: ``(df1, df2)`` with ``len(df1) == len(df2)``.
    """
    target_length = max(len(df1), len(df2))
    padded_labels = list(range(target_length))

    # At most one of the two branches can apply.
    if len(df2) < target_length:
        df2 = df2.reindex(padded_labels).fillna('')
    elif len(df1) < target_length:
        df1 = df1.reindex(padded_labels).fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [7]:
def compare_naive(df_row):
    """Compare the two item strings of one aligned row and annotate the row.

    The first two entries of ``df_row`` are the item descriptions to compare
    (a missing value is treated as an item without words). Two entries are
    written onto the row, which is then returned in every case:

    * ``WordVec``: space-joined words found for the pair -- the shared words
      first, then (from case 3 on) shared sub-words and possibly one more word.
    * ``Distance``: ``0`` for identical items, a fixed flag value for cases
      2-4, for a redundant suggestion in case 5 and for case 6, or the Word
      Mover's Distance otherwise.

    Uses the notebook-level ``word_vectors`` model for vocabulary checks,
    ``most_similar`` and ``wmdistance`` (only reached in cases 5 and 6).

    Args:
        df_row (pd.Series): row whose first two positions hold the two items.

    Returns:
        pd.Series: the same row object with ``WordVec`` and ``Distance`` set.
    """
    df_split = df_row.str.split()
    df_split = df_split.fillna('')  # a missing item becomes '' -> empty word set

    # Positional access: callers label the two items either 0/1 or with the
    # same column name twice, so label lookups are not reliable here.
    first_item, second_item = df_split.iloc[0], df_split.iloc[1]
    first_words, second_words = set(first_item), set(second_item)

    shared_words = first_words & second_words
    unshared_words = first_words ^ second_words

    # An unshared word contained in another unshared word is a sub-word match.
    shared_subwords = []
    subword_matches = []
    for word1 in unshared_words:
        for word2 in unshared_words:
            if word1 in word2 and word1 != word2:
                shared_subwords.append(word1)
                subword_matches.extend([word1, word2])
    unmatched_words = unshared_words.difference(subword_matches)

    # Matching Case Flow
    # Return values are small identifiable numbers, primarily used as flags, but their ordering is important
    # 1. complete identical match
    # 2. one contains all the words of the other
    # 3. one contains all the words of the other as substrings
    # 4. one unmatched word
    # 5. unmatched words in vocabulary
    # 6. unmatched words but out of vocabulary

    # always return identical matches
    result = list(shared_words)

    def _finish(distance):
        """Write the collected words and the given distance onto the row."""
        df_row['WordVec'] = ' '.join(result)
        df_row['Distance'] = distance
        return df_row

    # 1. complete identical match
    if not unshared_words:
        return _finish(0)

    # 2. one contains all the words of the other
    if first_words == shared_words or second_words == shared_words:
        return _finish(0.123456)

    # 3. one contains all the words of the other as substrings
    result.extend(shared_subwords)
    if not unmatched_words:
        return _finish(0.271828)

    # 4. one unmatched word
    if len(unmatched_words) == 1:
        result.extend(unmatched_words)
        return _finish(0.314159)

    # Vocabulary lookups are only needed from here on. `word in word_vectors`
    # is used instead of `.vocab`, which newer gensim releases no longer expose.
    unmatched_words_in_voc = [word for word in unmatched_words if word in word_vectors]
    shared_words_in_voc = [word for word in shared_words if word in word_vectors]

    # 5. unmatched words in vocabulary
    if unmatched_words_in_voc:
        most_similar_key, _ = word_vectors.most_similar(
            positive=[*shared_words_in_voc, *unmatched_words_in_voc], topn=1)[0]  # take top result

        # don't append word vector if it repeats a word or subword
        for res in result:
            if most_similar_key in res or res in most_similar_key:
                return _finish(0.666)

        result.append(most_similar_key)
        return _finish(word_vectors.wmdistance(first_item, second_item))  # wmdistance handles oov

    # 6. unmatched words but out of vocabulary
    # (previously this branch fell through without returning the row)
    return _finish(0.999)

In [8]:
def align(df1, df2, count_only=False):
    """Pair the items of two Series that describe the same group of items.

    Matching runs in stages, each stage removing the pairs it finds:

    1. identical item strings,
    2. any word of a left item occurring as a substring of a right item,
    3. any word of a right item occurring as a substring of a left item,
    4. the remaining right items are tried in every order; the order with the
       smallest summed ``compare_naive`` distance is kept (first one on ties).

    Stage 4 evaluates ``n!`` orderings for ``n`` remaining right items, so it
    is only practical for small ``n``; ``count_only=True`` reports ``n``
    without running it.

    Note: empty padding strings left on the right side are dropped before
    stage 4, so the left side can keep more unmatched items than the right;
    surplus left items end up next to a missing value in the result.

    Args:
        df1 (pd.Series): left items, indexed ``0 .. len-1``.
        df2 (pd.Series): right items, indexed ``0 .. len-1``.
        count_only (bool): if True, print the number and labels of the right
            items that would enter stage 4 and return ``None``.

    Returns:
        pd.DataFrame | None: two columns (``0`` = left, ``1`` = right) with
        the stage 1-3 pairs first and the stage 4 pairs after them.
    """
    df1, df2 = equalize_length(df1, df2)

    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index  # labels still unmatched

    # remove identical matches
    # (plain equality: item text must not be interpreted as a regex pattern)
    for df1_idx, df1_item in df1.items():
        matches = df2.loc[df2_dropped] == df1_item
        if matches.any():
            match_index = matches.idxmax()  # return index of first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # remove substring matches
    df1_split = df1.loc[df1_dropped].str.split()
    for df1_idx, df1_words in df1_split.items():
        for word in df1_words:
            matches = df2.loc[df2_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((df1_idx, match_index))
                df1_dropped = df1_dropped.drop(df1_idx)
                df2_dropped = df2_dropped.drop(match_index)
                break

    # remove substring matches in the other direction
    df2_split = df2.loc[df2_dropped].str.split()
    for df2_idx, df2_words in df2_split.items():
        for word in df2_words:
            matches = df1.loc[df1_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # return index of first match
                result_pairs.append((match_index, df2_idx))
                df1_dropped = df1_dropped.drop(match_index)
                df2_dropped = df2_dropped.drop(df2_idx)
                break

    # remove additional unmatched empty items
    df2_remaining = df2.loc[df2_dropped]
    df2_dropped = df2_dropped.drop(df2_remaining[df2_remaining == ''].index)

    ## short circuit for debugging permutation counts
    if count_only:
        print(f'{len(df2_dropped)}! {list(df2_dropped)}')
        return

    # try every ordering of the remaining right items and keep the cheapest
    if len(df2_dropped) > 1:
        df1_reindexed = df1.loc[df1_dropped].reset_index(drop=True)
        best_perm, best_distance = None, None
        for perm in tqdm(itertools.permutations(df2_dropped), desc="Permutations", leave=False):
            # ignore_index labels the two columns 0 and 1, matching the final frame
            candidate = pd.concat(
                [df1_reindexed, df2.loc[pd.Index(perm)].reset_index(drop=True)],
                axis=1, ignore_index=True)
            distance = sum(candidate.apply(compare_naive, axis=1).Distance)
            # strict '<' keeps the first ordering among equally good ones
            if best_perm is None or distance < best_distance:
                best_perm, best_distance = perm, distance
        result_index = pd.Index(best_perm)
    else:
        # zero or one item left: the only possible ordering is the current one
        result_index = df2_dropped

    # return concatenated dataframe: staged matches on top, permutation result below
    if result_pairs:
        top_index_left, top_index_right = map(pd.Index, zip(*result_pairs))
    else:
        top_index_left, top_index_right = df1.index[:0], df2.index[:0]
    bot_index_left, bot_index_right = df1_dropped, result_index

    df_top = pd.concat([df1.loc[top_index_left].reset_index(drop=True),
                        df2.loc[top_index_right].reset_index(drop=True)],
                       axis=1, ignore_index=True)
    df_bottom = pd.concat([df1.loc[bot_index_left].reset_index(drop=True),
                           df2.loc[bot_index_right].reset_index(drop=True)],
                          axis=1, ignore_index=True)
    df_combined = pd.concat([df_top, df_bottom], ignore_index=True)

    return df_combined

In [9]:
def align_df(count_only=False, left=0, right=2, ids=None):
    """Align two of the loaded frames session by session.

    For every ID and every session that the left frame has for that ID, the
    ``Item`` columns of both frames are passed to ``align`` and each aligned
    row is scored with ``compare_naive``.

    Both modes now look at the same pair of frames; previously the
    ``count_only`` preview inspected a different frame than the real run.

    Args:
        count_only (bool): only let ``align`` print how many items would go
            into the permutation search for each session; nothing is aligned.
        left (int): position in the notebook-level ``dfs`` of the left frame.
        right (int): position in ``dfs`` of the right frame.
        ids (iterable | None): IDs to process; defaults to the fixed list
            used by this notebook.

    Returns:
        pd.DataFrame | None: all scored rows stacked in processing order
        (empty frame if no session was found), or ``None`` for ``count_only``.
    """
    if ids is None:
        ids = [130, 153, 135, 137, 141, 114, 121, 127]
    df_left, df_right = dfs[left], dfs[right]

    aligned_parts = []
    for pid in tqdm(ids, desc="IDs"):
        sessions = df_left.loc[df_left.ID == pid, 'Session'].unique()
        for session in tqdm(sessions, desc="Sessions"):
            print(f'ID: {pid}, Session: {session}')
            items_left = df_left.loc[(df_left.ID == pid) & (df_left.Session == session),
                                     'Item'].reset_index(drop=True)
            items_right = df_right.loc[(df_right.ID == pid) & (df_right.Session == session),
                                       'Item'].reset_index(drop=True)
            aligned = align(items_left, items_right, count_only)
            if not count_only:
                aligned_parts.append(aligned.apply(compare_naive, axis=1))

    if count_only:
        return None
    if not aligned_parts:
        return pd.DataFrame()
    # single concat instead of growing the frame inside the loop
    return pd.concat(aligned_parts, ignore_index=True)


In [37]:
def align_by_receipt(count_only=False, left=0, right=1, ids=None):
    """Align two of the loaded frames receipt by receipt.

    For every ID, every session of that ID and every receipt of that session
    (all taken from the left frame), the ``Item`` columns of both frames are
    passed to ``align`` and each aligned row is scored with ``compare_naive``.

    Both modes now look at the same pair of frames; previously the
    ``count_only`` preview inspected a different frame than the real run.

    Args:
        count_only (bool): only let ``align`` print how many items would go
            into the permutation search for each receipt; nothing is aligned.
        left (int): position in the notebook-level ``dfs`` of the left frame.
        right (int): position in ``dfs`` of the right frame.
        ids (iterable | None): IDs to process; defaults to the fixed list
            used by this notebook.

    Returns:
        pd.DataFrame | None: all scored rows stacked in processing order
        (empty frame if no receipt was found), or ``None`` for ``count_only``.
    """
    if ids is None:
        ids = [130, 153, 135, 137, 141, 114, 121, 127]
    df_left, df_right = dfs[left], dfs[right]

    aligned_parts = []
    for pid in tqdm(ids, desc="IDs"):
        sessions = df_left.loc[df_left.ID == pid, 'Session'].unique()
        for session in tqdm(sessions, desc="Sessions"):
            receipts = df_left.loc[(df_left.ID == pid) & (df_left.Session == session),
                                   'Receipt'].unique()
            for receipt in tqdm(receipts, desc="Receipts"):
                print(f'ID: {pid}, Session: {session}, Receipt: {receipt}')
                items_left = df_left.loc[(df_left.ID == pid)
                                         & (df_left.Session == session)
                                         & (df_left.Receipt == receipt),
                                         'Item'].reset_index(drop=True)
                items_right = df_right.loc[(df_right.ID == pid)
                                           & (df_right.Session == session)
                                           & (df_right.Receipt == receipt),
                                           'Item'].reset_index(drop=True)
                aligned = align(items_left, items_right, count_only)
                if not count_only:
                    aligned_parts.append(aligned.apply(compare_naive, axis=1))

    if count_only:
        return None
    if not aligned_parts:
        return pd.DataFrame()
    # single concat instead of growing the frame inside the loop
    return pd.concat(aligned_parts, ignore_index=True)


In [38]:
# Run the receipt-level alignment with its default arguments (count_only=False).
df_merged = align_by_receipt()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Receipts:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 130, Session: 2, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 2, Receipt: 2
ID: 130, Session: 2, Receipt: 3


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 2, Receipt: 4
ID: 130, Session: 2, Receipt: 5


Permutations: 0it [00:00, ?it/s]

Receipts:   0%|          | 0/7 [00:00<?, ?it/s]

ID: 130, Session: 3, Receipt: 1
ID: 130, Session: 3, Receipt: 2


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 3, Receipt: 3


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 3, Receipt: 4
ID: 130, Session: 3, Receipt: 5
ID: 130, Session: 3, Receipt: 6
ID: 130, Session: 3, Receipt: 7


Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 130, Session: 4, Receipt: 1
ID: 130, Session: 4, Receipt: 2
ID: 130, Session: 4, Receipt: 3


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 4, Receipt: 4


Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 130, Session: 5, Receipt: 1
ID: 130, Session: 5, Receipt: 2


Permutations: 0it [00:00, ?it/s]

ID: 130, Session: 5, Receipt: 3
ID: 130, Session: 5, Receipt: 4


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 153, Session: 1, Receipt: 1
ID: 153, Session: 1, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 153, Session: 5, Receipt: 1
ID: 153, Session: 5, Receipt: 2


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 135, Session: 1, Receipt: 1
ID: 135, Session: 1, Receipt: 2


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 1, Receipt: 3


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 1, Receipt: 4


Receipts:   0%|          | 0/8 [00:00<?, ?it/s]

ID: 135, Session: 2, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 2, Receipt: 2
ID: 135, Session: 2, Receipt: 3
ID: 135, Session: 2, Receipt: 4


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 2, Receipt: 5
ID: 135, Session: 2, Receipt: 6


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 2, Receipt: 7
ID: 135, Session: 2, Receipt: 8


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 137, Session: 1, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 137, Session: 1, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 137, Session: 4, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 137, Session: 4, Receipt: 2


Sessions: 0it [00:00, ?it/s]

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 114, Session: 1, Receipt: 1
ID: 114, Session: 1, Receipt: 2
ID: 114, Session: 1, Receipt: 3


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 114, Session: 2, Receipt: 1
ID: 114, Session: 2, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 114, Session: 3, Receipt: 1
ID: 114, Session: 3, Receipt: 2


Receipts:   0%|          | 0/1 [00:00<?, ?it/s]

ID: 114, Session: 5, Receipt: 1


Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 114, Session: 6, Receipt: 1
ID: 114, Session: 6, Receipt: 2
ID: 114, Session: 6, Receipt: 3


Sessions: 0it [00:00, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 127, Session: 1, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 127, Session: 1, Receipt: 2


Permutations: 0it [00:00, ?it/s]

ID: 127, Session: 1, Receipt: 3


Permutations: 0it [00:00, ?it/s]

Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 127, Session: 2, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 127, Session: 2, Receipt: 2


Permutations: 0it [00:00, ?it/s]

ID: 127, Session: 2, Receipt: 3


Permutations: 0it [00:00, ?it/s]

Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 127, Session: 5, Receipt: 1


Permutations: 0it [00:00, ?it/s]

ID: 127, Session: 5, Receipt: 2
ID: 127, Session: 5, Receipt: 3
ID: 127, Session: 5, Receipt: 4


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 127, Session: 6, Receipt: 1
ID: 127, Session: 6, Receipt: 2


In [39]:
# Display the merged frame produced by align_by_receipt().
df_merged

Unnamed: 0,0,1,WordVec,Distance
0,chicken broth,chicken broth,chicken broth,0.0
1,italian bread,italian bread,bread italian,0.0
2,apple fritter,apple fritter,fritter apple,0.0
3,eggs,eggs,eggs,0.0
4,ham,ham,ham,0.0
5,peaches,peaches,peaches,0.0
6,grapes,grapes,grapes,0.0
7,carrots,carrots,carrots,0.0
8,bananas,bananas,bananas,0.0
9,cabbage,cabbage,cabbage,0.0


In [31]:
# Inspect one receipt side by side: ID 135, Session 2, Receipt 6.
# The count_only run reported `8! [0, 1, 2, 10, 14, 15, 16, 17]` here,
# i.e. eight right-hand items would enter the permutation search.
pd.concat(
    [
        frame.loc[(frame.ID == 135) & (frame.Session == 2) & (frame.Receipt == 6), 'Item']
             .reset_index(drop=True)
        for frame in (dfs[0], dfs[1], dfs[2])
    ],
    axis=1,
    ignore_index=True,
)

Unnamed: 0,0,1,2
0,laundry detergent,fabric softener,laundry detergent softener
1,potato chips,wavy crisps,potato chips
2,potato chips,wavy crisps,potato chips
3,caramel,quinoa brown bread,caramels
4,basmiti rice,basmiti rice,basmati rice
5,quinoa,white grain bread,quiona
6,whole grain rice,tomato soup,whole grain rice
7,tomato soup,tomato soup,tomato soup
8,tomato soup,tomato soup,tomato soup
9,tomato soup,tomato soup,tomato soup


In [27]:
# Manual spelling corrections for ID 135, Session 2, Receipt 6 in dfs[1].
# These assignments modify dfs[1] in place.
dfs[1].loc[(dfs[1].ID == 135) & (dfs[1].Session == 2) & (dfs[1].Receipt == 6) & (dfs[1].Item == 'quino brown bread'), 'Item'] = 'quinoa brown bread'
dfs[1].loc[(dfs[1].ID == 135) & (dfs[1].Session == 2) & (dfs[1].Receipt == 6) & (dfs[1].Item == 'sweetner'), 'Item'] = 'sweetener'