In [1]:
import pandas as pd

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api

In [11]:
pd.set_option('display.max_rows', 500)

In [26]:
# https://github.com/RaRe-Technologies/gensim-data#models
# 50, 100, 200, 300
word_vectors = api.load("glove-wiki-gigaword-100")

In [2]:
DATA_PATH = '../Data/'
FILES = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']
#COLS = [0, 1, 14, 5]  # Index, ID, Basket, Item
COLS = [0, 1, 2, 5]  # Index, ID, Session, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Item': str}

dfs = [pd.read_csv(DATA_PATH + file, index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
dfs = [df[df.ID.isin(ids_shared)].fillna('').reset_index(drop=True) for df in dfs]

In [3]:
def equalize_length(df1, df2):
    df1_length, df2_length = len(df1), len(df2)
    
    if df1_length > df2_length:    
        df2 = df2.reindex(list(range(df1_length)))
        df2 = df2.fillna('')
    elif df2_length > df1_length:
        df1 = df1.reindex(list(range(df2_length)))
        df1 = df1.fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [62]:
# TODO: unpack big functions in order to reduce permutations
def compare(df_row):
    result = []
    df_split = df_row.str.split()
    df_split = df_split.fillna('') # hack: somehow a NaN value can 
  
    # return shared words
    shared_words = set.intersection(*map(set, df_split))
    unshared_words = set.symmetric_difference(*map(set, df_split))
    if shared_words:
        result.extend(shared_words)
        # if both phrases are exhausted return
        if not unshared_words:
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0
            return df_row
        elif set(df_split[0]) == shared_words or set(df_split[1]) == shared_words:
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0.31415  # small identifiable number (not magic)
            return df_row
    
    # return matching substrings
    shared_subwords = []
    for word1 in unshared_words:
        for word2 in unshared_words:
            if word1 in word2 and word1 != word2:
                result.append(word1)
                shared_subwords.extend([word1, word2])
    unshared_words = unshared_words.difference(set(shared_subwords))
    
    # discard words outside vocabulary
    #df_in_voc = [[word for word in phrase if word in word_vectors.vocab] for phrase in df_split]
    #words = [word for text in df_in_voc for word in text if word not in result]
    words = [word for word in unshared_words if word in word_vectors.vocab]
    if not any(words):
        if result:
            df_row['WordVec'] = ' '.join(result)
            df_row['Distance'] = 0.1234  # small identifiable number (not magic)
            return df_row            
        else:
            # print(f'{df_row} is fully OOV') TODO: this is catching seemingly empty strings
            df_row['WordVec'] = ' '.join([*df_row])
            df_row['Distance'] = 0.271828  # small identifiable number (not magic)
            return df_row
    
    # use word vectors to average unshared words
    #words = [word for text in df_in_voc for word in text if word not in shared_words]
    #words = [word for text in df_in_voc for word in text if word not in result]
    most_similar_key, _ = word_vectors.most_similar(positive=[*words])[0]  # take top result
    #most_similar_key, _ = word_vectors.most_similar(positive=[*df_in_voc[0], *df_in_voc[1]])[0]  # take top result
    result.append(most_similar_key)
    df_row['WordVec'] = ' '.join(result)
    df_row['Distance'] = word_vectors.wmdistance(df_split[0], df_split[1])
    return df_row

In [54]:
# TODO: unpack big functions in order to reduce permutations
def align(df1, df2):
    df1, df2 = equalize_length(df1, df2)
    
    # remove identical matches
    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index
    for df1_idx, df1_word in df1.iteritems():
        matches = df2[df2_dropped].str.fullmatch(df1_word)
        if any(matches):
            match_index = matches.idxmax() # return index of first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)
    
    # remove substring matches
    df1_split = df1[df1_dropped].str.split()
    for df1_idx, df1_words in df1_split.iteritems():
        for word in df1_words:
            matches = df2[df2_dropped].str.contains(word, regex=False)
            if any(matches):
                match_index = matches.idxmax() # return index of first match
                result_pairs.append((df1_idx, match_index))
                df1_dropped = df1_dropped.drop(df1_idx)
                df2_dropped = df2_dropped.drop(match_index)
                break
    
    # remove substring matches in the other direction
    df2_split = df2[df2_dropped].str.split()
    for df2_idx, df2_words in df2_split.iteritems():
        for word in df2_words:
            matches = df1[df1_dropped].str.contains(word, regex=False)
            if any(matches):
                match_index = matches.idxmax() # return index of first match
                result_pairs.append((match_index, df2_idx))
                df1_dropped = df1_dropped.drop(match_index)
                df2_dropped = df2_dropped.drop(df2_idx)
                break
    
    # remove additional unmatched empty items
    df2_dropped = df2_dropped.drop(df2[df2_dropped][df2[df2_dropped] == ''].index)    

    ## short circut for debugging permutation counts
    #print(len(df2_dropped), end=' ')
    #return
    
    # all permutations of remaining indices
    perms = itertools.permutations(df2_dropped)
    print(f'Permutations: {len(df2_dropped)}!', end=' ')

    # generate word vectors and similarity
    if len(df2_dropped) > 1:
        total_distance = []
        df1_reindexed = df1[df1_dropped].reset_index(drop=True)
        for p in tqdm(perms, desc="Permutations", leave=False):
            p = pd.Index(p)
            total_distance.append(
                sum(pd.concat(
                    [df1_reindexed, df2[p].reset_index(drop=True)], axis=1).apply(compare, axis=1).Distance))        
        # find max permutation
        perms_reset = itertools.permutations(df2_dropped) # reset generator
        result_index = pd.Index(next(itertools.islice(perms_reset, total_distance.index(min(total_distance)), None)))
    else:
        perms_reset = itertools.permutations(df2_dropped) # reset generator
        result_index = pd.Index(next(itertools.islice(perms_reset, 0, None)))
    
    # return concatendated dataframe with word vectors
    if result_pairs:
        top_index_left, top_index_right = map(pd.Index, zip(*result_pairs))
    else:
        top_index_left, top_index_right = pd.Index([]), pd.Index([])
    bot_index_left, bot_index_right = df1_dropped, result_index

    df_combined = pd.concat([pd.concat([df1[top_index_left].reset_index(drop=True), 
                                        df2[top_index_right].reset_index(drop=True)], axis=1, ignore_index=True), 
                             pd.concat([df1[bot_index_left].reset_index(drop=True), 
                                        df2[bot_index_right].reset_index(drop=True)], axis=1, ignore_index=True)], 
                            ignore_index=True)
    
    return df_combined

In [63]:
### Test hand alignment against algorithm
# in dfs[0]<->dfs[1] comparison we encounter 12! / 6 / 60 / 60 / 24 = 924 = 3 years run time
def align_by_algo():
    df_final = pd.DataFrame()
    IDs = [130, 153, 135, 137, 141, 114, 121, 127]
    for pid in tqdm(IDs, desc="IDs"):
        print(f'ID: {pid}')
        for session in tqdm(dfs[0].loc[dfs[0].ID == pid, 'Session'].unique(), desc="Sessions"):
            print(f'Session: {session}')
            display(align(dfs[0].loc[(dfs[0].ID == pid) & (dfs[0].Session == session), 'Item'].reset_index(drop=True),
                          dfs[2].loc[(dfs[2].ID == pid) & (dfs[2].Session == session), 'Item'].reset_index(drop=True)).apply(compare, axis=1))

align_by_algo()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

ID: 130


Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Session: 2
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,grilled cheese,grilled cheese,grilled cheese,0.0
1,strawberry preserves,strawberry preserves,strawberry preserves,0.0
2,chicken broth,chicken broth,broth chicken,0.0
3,barley soup,barley soup,soup barley,0.0
4,maggi seasoning,maggi seasoning,maggi seasoning,0.0
5,ice cream,ice cream,cream ice,0.0
6,italian bread,italian bread,italian bread,0.0
7,apple fritter,apple fritter,apple fritter,0.0
8,eggs,eggs,eggs,0.0
9,ham,ham,ham,0.0


Session: 3
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,apple fritter,apple fritter,apple fritter,0.0
1,sourdough bread,sourdough bread,sourdough bread,0.0
2,cole slaw,cole slaw,slaw cole,0.0
3,macaroni salad,macaroni salad,salad macaroni,0.0
4,tomatoes,tomatoes,tomatoes,0.0
5,peaches,peaches,peaches,0.0
6,lettuce,lettuce,lettuce,0.0
7,celery,celery,celery,0.0
8,cucumbers,cucumbers,cucumbers,0.0
9,plums,plums,plums,0.0


Session: 4
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,sourdough bread,sourdough bread,sourdough bread,0.0
1,cajun seasoning,cajun seasoning,cajun seasoning,0.0
2,baas,baas,baas,0.0
3,iceberg lettuce,iceberg lettuce,lettuce iceberg,0.0
4,ginger,ginger,ginger,0.0
5,tomatoes,tomatoes,tomatoes,0.0
6,radishes,radishes,radishes,0.0
7,plums,plums,plums,0.0
8,apricots,apricots,apricots,0.0
9,cucumbers,cucumbers,cucumbers,0.0


Session: 5
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,radishes,radishes,radishes,0.0
1,oranges,oranges,oranges,0.0
2,iceberg lettuce,iceberg lettuce,lettuce iceberg,0.0
3,plums,plums,plums,0.0
4,baas,baas,baas,0.0
5,carrots,carrots,carrots,0.0
6,peaches,peaches,peaches,0.0
7,blueberries,blueberries,blueberries,0.0
8,,,,0.271828
9,green beans,green beans,beans green,0.0


ID: 153


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,brussel sprouts,brussel sprouts,brussel sprouts,0.0
1,green beans,green beans,beans green,0.0
2,margarine,margarine,margarine,0.0
3,nacho cheese tortilla chips,nacho cheese tortilla chips,chips cheese tortilla nacho,0.0
4,tissues,tissues,tissues,0.0
5,orange sparkling water,orange sparkling water,orange sparkling water,0.0
6,lemon sparkling water,lemon sparkling water,sparkling lemon water,0.0
7,diced tomatoes,diced tomatoes,diced tomatoes,0.0
8,sharp cheddar cheese,sharp cheddar cheese,cheddar cheese sharp,0.0
9,caramel ice cream,caramel ice cream,caramel cream ice,0.0


Session: 2
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,brussel sprouts,brussel sprouts,brussel sprouts,0.0
1,green beans,green beans,beans green,0.0
2,margarine,margarine,margarine,0.0
3,nacho cheese tortilla chips,nacho cheese tortilla chips,chips cheese tortilla nacho,0.0
4,tissues,tissues,tissues,0.0
5,orange sparkling water,orange sparkling water,orange sparkling water,0.0
6,lemon sparkling water,lemon sparkling water,sparkling lemon water,0.0
7,diced tomatoes,diced tomatoes,diced tomatoes,0.0
8,sharp cheddar cheese,sharp cheddar cheese,cheddar cheese sharp,0.0
9,caramel ice cream,caramel ice cream,caramel cream ice,0.0


Session: 3
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,chicken broth,chicken broth,broth chicken,0.0
1,soap,soap,soap,0.0
2,garbanzo beans,garbanzo beans,garbanzo beans,0.0
3,albacore tuna,albacore tuna,albacore tuna,0.0
4,peanut butter,peanut butter,peanut butter,0.0
5,eggs,eggs,eggs,0.0
6,oatmeal flakes cereal,oatmeal flakes cereal,oatmeal flakes cereal,0.0
7,sharp cheddar cheese,sharp cheddar cheese,cheddar cheese sharp,0.0
8,black olvies,black olives,black figs,3.763252
9,coffee beans,beans coffee,coffee beans,0.0


Session: 4
Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,ranch dressing,ranch dressing,ranch dressing,0.0
1,sharp cheddar cheese,sharp cheddar cheese,cheddar cheese sharp,0.0
2,shiraz wine,shiraz wine,wine shiraz,0.0
3,charo wine,charo wine,charo wine,0.0
4,gin,gin,gin,0.0
5,honeycrisp apples,honeycrisp apples,honeycrisp apples,0.0
6,yellow onion,yellow onion,onion yellow,0.0
7,kanzi apples,kanzi apples,apples kanzi,0.0
8,blueberries,blueberries,blueberries,0.0
9,baas,baas,baas,0.0


Session: 5
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,arugula,arugula,arugula,0.0
1,dish soap,dish soap,dish soap,0.0
2,tomatoes,tomatoes,tomatoes,0.0
3,lime sparking water,lime sparking water,sparking lime water,0.0
4,grapefruit sparkling water,grapefruit sparkling water,sparkling grapefruit water,0.0
5,sparkling water,sparkling water,sparkling water,0.0
6,peanut butter cereal,peanut butter cereal,peanut butter cereal,0.0
7,tomatoes,tomatoes,tomatoes,0.0
8,honeycrisp apples,honeycrisp apples,honeycrisp apples,0.0
9,jazz apples,jazz apples,jazz apples,0.0


ID: 135


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,almond milk,almond milk,almond milk,0.0
1,half and half,half and half,and half,0.0
2,orange juice,orange juice,juice orange,0.0
3,bacon,bacon,bacon,0.0
4,pickles,pickles,pickles,0.0
5,olives,olives,olives,0.0
6,coconut milk,coconut milk,coconut milk,0.0
7,crackers,crackers,crackers,0.0
8,lettuce,lettuce,lettuce,0.0
9,baas,baas,baas,0.0


Session: 2
Permutations: 4! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,orange juice,orange juice,juice orange,0.0
1,hummus,hummus,hummus,0.0
2,ricotta cheese,ricotta cheese,ricotta cheese,0.0
3,crackers,crackers,crackers,0.0
4,cheese,cheese,cheese,0.0
5,tomatoes,tomatoes,tomatoes,0.0
6,red potatoes,red potatoes,potatoes red,0.0
7,potatoes,potatoes,potatoes,0.0
8,onions,onions,onions,0.0
9,cherub tomatoes,cherub tomatoes,cherub tomatoes,0.0


Session: 4
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,toilet bowl cleaner,toilet bowl cleaner,toilet bowl cleaner,0.0
1,cottage cheese,cottage cheese,cheese cottage,0.0
2,eggs,eggs,eggs,0.0
3,bacon,bacon,bacon,0.0
4,sausage,sausage,sausage,0.0
5,tomato sauce,tomato sauce,sauce tomato,0.0
6,baas,baas,baas,0.0
7,iceberg lettuce,iceberg lettuce,lettuce iceberg,0.0
8,watermelon,watermelon,watermelon,0.0
9,green onions,green onions,onions green,0.0


Session: 5
Permutations: 4! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,half and half,half and half,and half,0.0
1,milk,milk,milk,0.0
2,sour cream,sour cream,sour cream,0.0
3,vanilla yogurt,vanilla yogurt,vanilla yogurt,0.0
4,blue cheese,blue cheese,blue cheese,0.0
5,feta cheese,feta cheese,cheese feta,0.0
6,guacamole,guacamole,guacamole,0.0
7,sausage,sausage,sausage,0.0
8,blackberries,blackberries,blackberries,0.0
9,tomatoes,tomatoes,tomatoes,0.0


Session: 6
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,hair product,hair product,product hair,0.0
1,red grapes,red grapes,red grapes,0.0
2,white grapes,white grapes,white grapes,0.0
3,manchego cheese,manchego cheese,manchego cheese,0.0
4,orange juice,orange juice,juice orange,0.0
5,blackberries,blackberries,blackberries,0.0
6,eggs,eggs,eggs,0.0
7,havarti cheese,havarti cheese,havarti cheese,0.0
8,egg noodles,egg noodles,noodles egg,0.0
9,croutons,croutons,croutons,0.0


ID: 137


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,,,,0.271828
1,,,,0.271828
2,slippers,slippers,slippers,0.0
3,tissues,tissues,tissues,0.0
4,tape,tape,tape,0.0
5,dill pickles,dill pickles,dill pickles,0.0
6,carrots,carrots,carrots,0.0
7,apple chips,apple chips,apple chips,0.0
8,peanuts,peanuts,peanuts,0.0
9,italian salad dressing,italian salad dressing,salad italian dressing,0.0


Session: 2
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,milk,milk,milk,0.0
1,strawberries,strawberries,strawberries,0.0
2,hydrogen peroxide,hydrogen peroxide,hydrogen peroxide,0.0
3,paper towels,paper towels,towels paper,0.0
4,english muffins,english muffins,muffins english,0.0
5,brussel sprouts,brussel sprouts,brussel sprouts,0.0
6,corn chips,corn chips,chips corn,0.0
7,eggs,eggs,eggs,0.0
8,thighs chicken,chicken thighs,thighs chicken,0.0
9,yoplait yogurt,pack yogurt,yogurt packs,3.339074


Session: 3
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,potato chips,potato chips,potato chips,0.0
1,italian salad dressing,italian salad dressing,salad italian dressing,0.0
2,salad dressing,salad dressing,salad dressing,0.0
3,dill pickles,dill pickles,dill pickles,0.0
4,dill kosher pickles,dill kosher pickles,kosher dill pickles,0.0
5,cranberry mango juice,cranberry mango juice,juice cranberry mango,0.0
6,pinto beans,pinto beans,beans pinto,0.0
7,kidney beans,kidney beans,kidney beans,0.0
8,red beans,red beans,beans red,0.0
9,granola bar,granola bar,bar granola,0.0


Session: 4
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,club soda,club soda,soda club,0.0
1,potatoes,potatoes,potatoes,0.0
2,wheat bread,wheat bread,wheat bread,0.0
3,tonic water,tonic water,tonic water,0.0
4,eggs,eggs,eggs,0.0
5,corn,corn,corn,0.0
6,water,water,water,0.0
7,,,,0.271828
8,green bell peppers,green peppers,green peppers,0.31415
9,chicken alfredo meal,chicken alfredo with chicken broccoli,alfredo chicken vegetables,3.163416


Session: 5
Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,,,,0.271828
1,peanuts,peanuts,peanuts,0.0
2,,,,0.271828
3,,,,0.271828
4,,,,0.271828
5,,,,0.271828
6,canned tuna,canned tuna,canned tuna,0.0
7,,,,0.271828
8,gala apples,gala apples,apples gala,0.0
9,milk,milk,milk,0.0


Session: 6
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,crackers,crackers,crackers,0.0
1,tomato product,tomato product,tomato product,0.0
2,prepackaged meat,prepackaged meat,meat prepackaged,0.0
3,fresh meat,fresh meat,meat fresh,0.0
4,milk,milk,milk,0.0
5,dairy,dairy,dairy,0.0
6,paper product,paper product,paper product,0.0
7,clothes,children's clothes,clothes,0.31415
8,salted snacks,snacks,snacks,0.31415
9,fish,canned fish meat,fish,0.31415


ID: 141


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,fabric softener,fabric softener,fabric softener,0.0
1,red wine,red wine,red wine,0.0
2,garden salad,garden salad,salad garden,0.0
3,crowns broccoli,crowns broccoli,broccoli crowns,0.0
4,portabella mushrooms,portabella mushrooms,mushrooms portabella,0.0
5,monterey pepper jack cheese,monterey pepper jack cheese,jack cheese pepper monterey,0.0
6,white cheddar cheese,white cheddar cheese,cheddar cheese white,0.0
7,brie cheese,brie cheese,cheese brie,0.0
8,cream cheese,cream cheese,cheese cream,0.0
9,eggs,eggs,eggs,0.0


Session: 2
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,,,,0.271828
1,chipotle aioli,chipotle aioli,aioli chipotle,0.0
2,,,,0.271828
3,gouda cheese,gouda cheese,gouda cheese,0.0
4,dog food,dog food,dog food,0.0
5,plums,plums,plums,0.0
6,olive spread,olive spread,spread olive,0.0
7,cream cheese,cream cheese,cheese cream,0.0
8,pepper,pepper,pepper,0.0
9,onions,chopped onions,onions,0.31415


Session: 3
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,toilet paper,toilet paper,toilet paper,0.0
1,pita melts,pita melts,melts pita,0.0
2,red wine,red wine,red wine,0.0
3,garden salad,garden salad,salad garden,0.0
4,cheddar pepper jack cheese,cheddar pepper jack cheese,jack cheese cheddar pepper,0.0
5,pork rinds,pork rinds,rinds pork,0.0
6,monterey pepper jack cheese,monterey pepper jack cheese,jack cheese pepper monterey,0.0
7,quart sandwich bags,quart sandwich bags,sandwich quart bags,0.0
8,gallon sandwich bags,gallon sandwich bags,sandwich gallon bags,0.0
9,cream cheese,cream cheese,cheese cream,0.0


Session: 4
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,cheese puffs,cheese puffs,cheese puffs,0.0
1,toilet paper,toilet paper,toilet paper,0.0
2,peas,peas,peas,0.0
3,eggs,eggs,eggs,0.0
4,cream cheese,cream cheese,cheese cream,0.0
5,egg thins,egg thins,thins egg,0.0
6,monterey pepper jack cheese,monterey pepper jack cheese,jack cheese pepper monterey,0.0
7,crackers cauliflower,cauliflower crackers,cauliflower crackers,0.0
8,prosecco wine,prosecco,prosecco,0.31415
9,colby jack cheese,sliced colby jack cheese,jack cheese colby,0.31415


Session: 5
Permutations: 5! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,,,,0.271828
1,,,,0.271828
2,,,,0.271828
3,quesadilla,quesadilla,quesadilla,0.0
4,pineapple,pineapple,pineapple,0.0
5,,,,0.271828
6,borax,borax,borax,0.0
7,chipotle aioli,chipotle aioli,aioli chipotle,0.0
8,,,,0.271828
9,,,,0.271828


ID: 114


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,chicken enchilada,chicken enchilada,enchilada chicken,0.0
1,dog food,dog food,dog food,0.0
2,tea bags,tea bags,tea bags,0.0
3,colby jack cheese,colby jack cheese,jack cheese colby,0.0
4,asian salad,asian salad,salad asian,0.0
5,turkey bacon,turkey bacon,turkey bacon,0.0
6,baas,baas,baas,0.0
7,newspaper,newspaper,newspaper,0.0
8,sandwich bags,sandwich bags,sandwich bags,0.0
9,cobb salad,cobb salad,salad cobb,0.0


Session: 2
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,thai salad,thai salad,salad thai,0.0
1,country spread,country spread,spread country,0.0
2,1% milk,1% milk,milk 1%,0.0
3,lemon sparkling water,lemon sparkling water,sparkling lemon water,0.0
4,ketchup,ketchup,ketchup,0.0
5,corn,corn,corn,0.0
6,baas,baas,baas,0.0
7,tortilla chips,tortilla chips,chips tortilla,0.0
8,syrup,syrup,syrup,0.0
9,pretzels,pretzels,pretzels,0.0


Session: 3
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,chicken enchilada,chicken enchilada,enchilada chicken,0.0
1,syrup,syrup,syrup,0.0
2,turkey bologna,turkey bologna,turkey bologna,0.0
3,oat squares cereal,oat squares cereal,oat cereal squares,0.0
4,chicken caesar salad,chicken caesar salad,salad caesar chicken,0.0
5,spinach salad,spinach salad,spinach salad,0.0
6,baas,baas,baas,0.0
7,newspaper,newspaper,newspaper,0.0
8,fat free milk,fat free milk,free milk fat,0.0
9,potato salad,potato salad,potato salad,0.0


Session: 4
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,chicken enchilada,chicken enchilada,enchilada chicken,0.0
1,syrup,syrup,syrup,0.0
2,turkey bologna,turkey bologna,turkey bologna,0.0
3,oat squares cereal,oat squares cereal,oat cereal squares,0.0
4,chicken caesar salad,chicken caesar salad,salad caesar chicken,0.0
5,spinach salad,spinach salad,spinach salad,0.0
6,baas,baas,baas,0.0
7,newspaper,newspaper,newspaper,0.0
8,harvest cheddar sunchips,harvest cheddar sunchips,harvest cheddar sunchips,0.0
9,ice cream,ice cream,cream ice,0.0


Session: 5
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,lime sparkling water,lime sparkling water,lime sparkling water,0.0
1,tea bags,tea bags,tea bags,0.0
2,fat free milk,fat free milk,free milk fat,0.0
3,baked beans,baked beans,beans baked,0.0
4,cheddar cheese,cheddar cheese,cheddar cheese,0.0
5,cheese puffs,cheese puffs,cheese puffs,0.0
6,tortilla chips,tortilla chips,chips tortilla,0.0
7,newspaper,newspaper,newspaper,0.0
8,garden salad,garden salad,salad garden,0.0
9,baas,baas,baas,0.0


Session: 6
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,key lime pie,key lime pie,lime pie key,0.0
1,salad,salad,salad,0.0
2,blt salad,blt salad,salad blt,0.0
3,vanilla ice cream sandwiches,vanilla ice cream sandwiches,vanilla sandwiches cream ice,0.0
4,1% milk,1% milk,milk 1%,0.0
5,mustard,mustard,mustard,0.0
6,plum jam,plum jam,plum jam,0.0
7,whole wheat spaghetti,whole wheat spaghetti,whole wheat spaghetti,0.0
8,pepperoni lasagna,pepperoni lasagna,lasagna pepperoni,0.0
9,tortilla chips,tortilla chips,chips tortilla,0.0


ID: 121


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,blt salad,blt salad,salad blt,0.0
1,bacon,bacon,bacon,0.0
2,sugar cookie,sugar cookie,cookie sugar,0.0
3,coconut cake,coconut cake,coconut cake,0.0
4,bread,bread,bread,0.0
5,garlic bread,garlic bread,garlic bread,0.0
6,raisin bran cereal,raisin bran cereal,bran raisin cereal,0.0
7,shrimp,shrimp,shrimp,0.0
8,ground beef,ground beef,ground beef,0.0
9,guacamole,guacamole,guacamole,0.0


Session: 2
Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,chocolate chip cookies,chocolate chip cookies,cookies chocolate chip,0.0
1,biscuits and gravy,biscuits and gravy,and biscuits gravy,0.0
2,tarp,tarp,tarp,0.0
3,index cards,index cards,cards index,0.0
4,,,,0.271828
5,eyeglasses chain,eyeglasses chain,chain eyeglasses,0.0
6,almond butter,almond butter,almond butter,0.0
7,cole slaw,cole slaw,slaw cole,0.0
8,lemons,lemons,lemons,0.0
9,butter pecan gelato,butter pecan gelato,butter pecan gelato,0.0


Session: 3
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,santa fe salad kit,santa fe salad kit,salad kit fe santa,0.0
1,ice cream,ice cream,cream ice,0.0
2,watermelon,watermelon,watermelon,0.0
3,almond milk,almond milk,almond milk,0.0
4,orange juice,orange juice,juice orange,0.0
5,onions,onions,onions,0.0
6,wonton strips,wonton strips,strips wonton,0.0
7,rice,rice,rice,0.0
8,green bell peppers,green bell peppers,green peppers bell,0.0
9,ice cream,ice cream,cream ice,0.0


Session: 4
Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,,,,0.271828
1,crispy onions,crispy onions,crispy onions,0.0
2,wonton strips,wonton strips,strips wonton,0.0
3,watermelon,watermelon,watermelon,0.0
4,salad bowl,salad bowl,salad bowl,0.0
5,eggs,eggs,eggs,0.0
6,apple pie,apple pie,apple pie,0.0
7,cherry pie,cherry pie,cherry pie,0.0
8,bacon,bacon,bacon,0.0
9,butter pecan gelato,butter pecan gelato,butter pecan gelato,0.0


Session: 5
Permutations: 3! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,ice cream,ice cream,cream ice,0.0
1,gelato,gelato,gelato,0.0
2,sushi,sushi,sushi,0.0
3,apple pie,apple pie,apple pie,0.0
4,cashews,cashews,cashews,0.0
5,potato salad,potato salad,potato salad,0.0
6,gelato,gelato,gelato,0.0
7,garlic bread,garlic bread,garlic bread,0.0
8,eggs,eggs,eggs,0.0
9,tortillas,tortillas,tortillas,0.0


Session: 6
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,cauliflower rice,cauliflower rice,cauliflower rice,0.0
1,ice cream,ice cream,cream ice,0.0
2,marbled cheese,marbled cheese,cheese marbled,0.0
3,almond milk,almond milk,almond milk,0.0
4,yogurt,yogurt,yogurt,0.0
5,heavy cream,heavy cream,cream heavy,0.0
6,roses,roses,roses,0.0
7,bacon,bacon,bacon,0.0
8,eggs,eggs,eggs,0.0
9,cashews,cashews,cashews,0.0


ID: 127


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,strawberries,strawberries,strawberries,0.0
1,blueberries,blueberries,blueberries,0.0
2,hummus,hummus,hummus,0.0
3,guacamole,guacamole,guacamole,0.0
4,,,,0.271828
5,salmon shrimp cat food,salmon shrimp cat food,salmon cat food shrimp,0.0
6,chicken cat food,chicken cat food,cat food chicken,0.0
7,caramel rice cakes,caramel rice cakes,caramel cakes rice,0.0
8,water,water,water,0.0
9,pita bread,pita bread,pita bread,0.0


Session: 2
Permutations: 0! 

Unnamed: 0,0,1,WordVec,Distance
0,potato salad,potato salad,potato salad,0.0
1,popsicles,popsicles,popsicles,0.0
2,2% milk,2% milk,2% milk,0.0
3,water,water,water,0.0
4,chicken strips,chicken strips,strips chicken,0.0
5,stuffed grape leaves,stuffed grape leaves,grape stuffed leaves,0.0
6,paper towels,paper towels,towels paper,0.0
7,,,,0.271828
8,strawberry sparkling water,strawberry sparkling water,sparkling strawberry water,0.0
9,water,water,water,0.0


Session: 4
Permutations: 2! 

Permutations: 0it [00:00, ?it/s]

Unnamed: 0,0,1,WordVec,Distance
0,chocolate hummus,chocolate hummus,hummus chocolate,0.0
1,dairy free half and half,dairy free half and half,free and dairy half,0.0
2,2% milk,2% milk,2% milk,0.0
3,eggs,eggs,eggs,0.0
4,ice cream,ice cream,cream ice,0.0
5,tea bags,tea bags,tea bags,0.0
6,graham crackers,graham crackers,graham crackers,0.0
7,ginger thins cookies,ginger thins cookies,ginger cookies thins,0.0
8,raspberry drink mix,raspberry drink mix,drink raspberry mix,0.0
9,empire apples,empire apples,empire apples,0.0


Session: 5
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,ground beef,ground beef,ground beef,0.0
1,sausage,sausage,sausage,0.0
2,popcorn,popcorn,popcorn,0.0
3,mangos,mangos,mangos,0.0
4,baas,baas,baas,0.0
5,taboule salad,taboule salad,salad taboule,0.0
6,black bean salad,black bean salad,salad black bean,0.0
7,stuffed grape leaves,stuffed grape leaves,grape stuffed leaves,0.0
8,string cheese,string cheese,string cheese,0.0
9,goat cheese,goat cheese,goat cheese,0.0


Session: 6
Permutations: 1! 

Unnamed: 0,0,1,WordVec,Distance
0,2% milk,2% milk,2% milk,0.0
1,paper bowls,paper bowls,bowls paper,0.0
2,chocolate chip cookies,chocolate chip cookies,cookies chocolate chip,0.0
3,batteries,batteries,batteries,0.0
4,baas,baas,baas,0.0
5,potato chips,potato chips,potato chips,0.0
6,butter,butter,butter,0.0
7,orange juice,orange juice,juice orange,0.0
8,chocolate ice cream,chocolate ice cream,chocolate cream ice,0.0
9,baas,baas,baas,0.0


In [7]:
### Align Data Sets by hand
def align_by_hand():
    # ID 137 and 114 have low variation
    df3 = dfs[2].loc[dfs[2].ID == 137, 'Item'].copy().reset_index(drop=True)

    # align by inspecting for proof of concept
    df3_aligned = df3.drop([102]).reset_index(drop=True)
    
    df_hand_aligned = pd.concat([dfs[0].loc[dfs[0].ID == 137, 'Item'].reset_index(drop=True), df3_aligned], axis=1)
    df_hand_aligned = df_hand_aligned.apply(compare, axis=1)
    display(df_hand_aligned)
    
#align_by_hand()

In [8]:
dfs[0].loc[dfs[0].ID == 127, 'Session'].unique()

array([1, 2, 4, 5, 6], dtype=uint8)

In [9]:
# TODO: Collect basket outliers for inspection
# 0-1 MANY and 1-2 MANY
pd.set_option('display.max_rows', 500)

display(pd.concat([dfs[0].loc[(dfs[0].ID == 127) & (dfs[0].Session == 4), 'Item'].reset_index(drop=True),
                   dfs[2].loc[(dfs[2].ID == 127) & (dfs[2].Session == 4), 'Item'].reset_index(drop=True)],
                      axis=1, ignore_index=True))

Unnamed: 0,0,1
0,bakery item,oatmeal
1,blue cheese stuffed olives,
2,goat cheese,crumble goat cheese
3,chocolate hummus,chocolate hummus
4,dairy free half and half,dairy free half and half
5,2% milk,2% milk
6,eggs,eggs
7,ice cream,ice cream
8,tongs,salad tongs
9,tea bags,tea bags


In [21]:
item1 = 'veggie'.split()
item2 = 'vegetables'.split()
print(word_vectors.n_similarity(item1, item2))
print(word_vectors.most_similar(positive=[*item1, *item2])[0:5])
print(word_vectors.wmdistance(item1, item2))

0.29503438
[('burgers', 0.8257980346679688), ('chicken', 0.8151949644088745), ('soup', 0.7996612191200256), ('hamburgers', 0.7982652187347412), ('salads', 0.7971976399421692)]
6.401086807250976
