In [1]:
import pandas as pd

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-50")

In [2]:
# Where the input CSV files live and which ones to read.
DATA_PATH = '../Data/'
FILES = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']

# Columns to read (by position) and the dtypes to parse them with.
#COLS = [0, 1, 14, 5]  # Index, ID, Basket, Item
COLS = [0, 1, 2, 5]  # Index, ID, Session, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Item': str}

# One DataFrame per file, in the same order as FILES.
dfs = list(
    pd.read_csv(DATA_PATH + name, index_col=0, usecols=COLS, dtype=DTYPES)
    for name in FILES
)

# Restrict every frame to the IDs present in all files, replace missing
# values with '' and renumber the rows from zero.
ids_shared = set.intersection(*(set(frame.ID.unique()) for frame in dfs))
dfs = [
    frame.loc[frame.ID.isin(ids_shared)].fillna('').reset_index(drop=True)
    for frame in dfs
]

In [3]:
def equalize_length(df1, df2):
    """Pad the shorter of two Series with '' so both have the same length.

    The shorter input is reindexed onto ``0 .. len(longer) - 1`` and the
    positions that did not exist before are filled with the empty string.
    The longer input (or both, when the lengths already agree) is returned
    unchanged.

    Returns
    -------
    tuple
        ``(df1, df2)`` with equal lengths.
    """
    n_left, n_right = len(df1), len(df2)

    if n_left < n_right:
        df1 = df1.reindex(list(range(n_right))).fillna('')
    elif n_right < n_left:
        df2 = df2.reindex(list(range(n_left))).fillna('')

    assert len(df1) == len(df2)
    return df1, df2

In [4]:
# TODO: unpack big functions in order to reduce permutations
def compare(df_row, oov_distance=999):
    """Describe how similar the two phrases held in one aligned row are.

    Parameters
    ----------
    df_row : pandas.Series
        Row whose two cells hold the phrases to compare. Cells that are not
        strings (e.g. the NaN that appears when two columns of different
        length are concatenated) are treated as empty phrases.
    oov_distance : float, optional
        Finite stand-in distance used whenever the word vectors cannot
        produce a usable distance. Defaults to 999, a large number that is
        deliberately smaller than inf so that sums of distances stay
        comparable.

    Returns
    -------
    pandas.Series
        A copy of ``df_row`` with two extra entries:

        * ``'WordVec'``: the words both phrases share (sorted), followed by
          the vocabulary word most similar to all in-vocabulary words; or the
          raw phrases joined by a space when no word is in the vocabulary.
        * ``'Distance'``: 0 when both phrases consist of the same words,
          otherwise the word mover's distance, or ``oov_distance`` when that
          distance is unavailable.
    """
    # Work on plain Python lists: this avoids positional indexing into a
    # label-indexed row and makes non-string cells harmless.
    texts = [cell if isinstance(cell, str) else '' for cell in df_row]
    phrases = [text.split() for text in texts]
    row = df_row.copy()  # do not mutate the caller's row

    word_sets = [set(phrase) for phrase in phrases]
    shared_words = set.intersection(*word_sets)
    unshared_words = set.symmetric_difference(*word_sets)

    # Sorted so the resulting string does not depend on set iteration order.
    result = sorted(shared_words)

    # Both phrases are made of exactly the same words: nothing left to compare.
    if shared_words and not unshared_words:
        row['WordVec'] = ' '.join(result)
        row['Distance'] = 0
        return row

    # Discard words outside the vocabulary. The membership test on the
    # vectors object is used instead of the `.vocab` attribute, which newer
    # gensim releases no longer provide.
    in_vocab = [[word for word in phrase if word in word_vectors] for phrase in phrases]
    if not any(in_vocab):
        row['WordVec'] = ' '.join(texts)
        row['Distance'] = oov_distance
        return row

    # Use the word vectors to average the remaining words; take the top result.
    most_similar_key, _ = word_vectors.most_similar(positive=[*in_vocab[0], *in_vocab[1]])[0]
    result.append(most_similar_key)

    distance = word_vectors.wmdistance(phrases[0], phrases[1])
    if distance == float('inf'):
        # Happens when one side has no in-vocabulary words; keep it finite so
        # totals over several rows can still be ranked.
        distance = oov_distance

    row['WordVec'] = ' '.join(result)
    row['Distance'] = distance
    return row

In [5]:
# TODO: unpack big functions in order to reduce permutations
def align(df1, df2):
    """Pair the items of ``df1`` with the items of ``df2``.

    The pairing is built in stages; every matched item is removed from the
    pool before the next stage runs:

    1. items whose text is identical,
    2. items of ``df1`` with a word that occurs as a substring of a ``df2`` item,
    3. the same in the other direction,
    4. the remaining items, by trying every permutation of the remaining
       ``df2`` items and keeping the one with the smallest summed
       ``compare`` distance.

    Stage 4 is factorial in the number of leftover items.

    Parameters
    ----------
    df1, df2 : pandas.Series
        Item texts. The index labels ``0 .. n-1`` are used to address items
        (``equalize_length`` reindexes the shorter input onto that range).

    Returns
    -------
    pandas.DataFrame
        Two columns (``0`` for ``df1``, ``1`` for ``df2``); the pairs from
        stages 1-3 come first, followed by the stage 4 pairs.
    """
    df1, df2 = equalize_length(df1, df2)

    # remove identical matches
    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index
    for df1_idx, df1_word in df1.items():
        # Plain equality: the item text is data, not a regular expression.
        matches = df2[df2_dropped] == df1_word
        if matches.any():
            match_index = matches.idxmax()  # index of the first match
            result_pairs.append((df1_idx, match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(match_index)

    # remove substring matches
    df1_split = df1[df1_dropped].str.split()
    for df1_idx, df1_words in df1_split.items():
        for word in df1_words:
            matches = df2[df2_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # index of the first match
                result_pairs.append((df1_idx, match_index))
                df1_dropped = df1_dropped.drop(df1_idx)
                df2_dropped = df2_dropped.drop(match_index)
                break

    # remove substring matches in the other direction
    df2_split = df2[df2_dropped].str.split()
    for df2_idx, df2_words in df2_split.items():
        for word in df2_words:
            matches = df1[df1_dropped].str.contains(word, regex=False)
            if matches.any():
                match_index = matches.idxmax()  # index of the first match
                result_pairs.append((match_index, df2_idx))
                df1_dropped = df1_dropped.drop(match_index)
                df2_dropped = df2_dropped.drop(df2_idx)
                break

    # remove additional unmatched empty items
    # NOTE(review): only the df2 side is pruned, so the two remainders can
    # differ in length from here on; the concatenations below then pad the
    # shorter side with NaN.
    df2_remaining = df2[df2_dropped]
    df2_dropped = df2_dropped.drop(df2_remaining[df2_remaining == ''].index)

    ## short circuit for debugging permutation counts
    #print(len(df2_dropped), end=' ')
    #return

    # all permutations of remaining indices
    perms = list(itertools.permutations(df2_dropped))
    #print(len(perms), end=' ')

    # TODO: this might be needed
    # df1['WordVec'], df2['WordVec'] = None, None
    # df1['Distance'], df2['Distance'] = 99, 99
    # generate word vectors and similarity
    if len(perms) > 1:
        total_distance = []
        df1_reindexed = df1[df1_dropped].reset_index(drop=True)
        for p in tqdm(perms, desc="Permutations", leave=False):
            candidate = pd.concat(
                [df1_reindexed, df2[pd.Index(p)].reset_index(drop=True)], axis=1)
            total_distance.append(sum(candidate.apply(compare, axis=1).Distance))
        # keep the permutation with the smallest total distance
        result_index = pd.Index(perms[total_distance.index(min(total_distance))])
    else:
        # zero or one leftover item: the single permutation is the answer
        result_index = pd.Index(perms[0])

    # return concatenated dataframe: direct matches on top, permuted rest below
    if result_pairs:
        top_index_left, top_index_right = map(pd.Index, zip(*result_pairs))
    else:
        top_index_left, top_index_right = pd.Index([]), pd.Index([])
    bot_index_left, bot_index_right = df1_dropped, result_index

    top = pd.concat([df1[top_index_left].reset_index(drop=True),
                     df2[top_index_right].reset_index(drop=True)], axis=1, ignore_index=True)
    bottom = pd.concat([df1[bot_index_left].reset_index(drop=True),
                        df2[bot_index_right].reset_index(drop=True)], axis=1, ignore_index=True)
    df_combined = pd.concat([top, bottom], ignore_index=True)

    return df_combined

In [6]:
### Test hand alignment against algorithm
def align_by_algo(pids=(130, 153, 135, 137, 141, 114, 121, 127)):
    """Display the algorithmic alignment of ``dfs[0]`` against ``dfs[2]``.

    For every ID in ``pids`` and every session that ID has in ``dfs[0]``,
    the items of both data sets are aligned with ``align`` and the resulting
    table is displayed.

    Parameters
    ----------
    pids : iterable of int, optional
        IDs to process; defaults to the IDs originally inspected by hand.
    """
    def items_for(frame, pid, session):
        # Items of one ID/session, renumbered from zero as align() expects.
        mask = (frame.ID == pid) & (frame.Session == session)
        return frame.loc[mask, 'Item'].reset_index(drop=True)

    for pid in tqdm(pids, desc="IDs"):
        print(f'ID: {pid}')
        sessions = dfs[0].loc[dfs[0].ID == pid, 'Session'].unique()
        for session in tqdm(sessions, desc="Sessions"):
            print(f'Session: {session}')
            display(align(items_for(dfs[0], pid, session),
                          items_for(dfs[2], pid, session)))

align_by_algo()
    
def align_count_free_rows():
    """Run ``align`` for sessions 1-6 of every shared ID, discarding the result.

    One line is printed per ID, starting with ``PID=<id>:``. The return value
    of ``align`` is ignored, so this is presumably meant to be used together
    with the debugging print inside ``align`` -- confirm before relying on it.
    """
    first, second = dfs[0], dfs[2]
    for pid in tqdm(ids_shared, desc="IDs"):
        print(f'PID={pid}:', end=' ')
        first_rows = first.ID == pid
        second_rows = second.ID == pid
        for session in range(1, 7):
            left = first.loc[first_rows & (first.Session == session), 'Item']
            right = second.loc[second_rows & (second.Session == session), 'Item']
            align(left.reset_index(drop=True), right.reset_index(drop=True))
        print()

#align_count_free_rows()

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

ID: 130


Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Session: 2


Unnamed: 0,0,1
0,grilled cheese,grilled cheese
1,strawberry preserves,strawberry preserves
2,chicken broth,chicken broth
3,barley soup,barley soup
4,maggi seasoning,maggi seasoning
...,...,...
56,canola spray,canola oil spray
57,black beans,baked beans
58,angel hair,angel hair pasta
59,whipped cream,whipped topping


Session: 3


Unnamed: 0,0,1
0,apple fritter,apple fritter
1,sourdough bread,sourdough bread
2,cole slaw,cole slaw
3,macaroni salad,macaroni salad
4,tomatoes,tomatoes
5,peaches,peaches
6,lettuce,lettuce
7,celery,celery
8,cucumbers,cucumbers
9,plums,plums


Session: 4


Unnamed: 0,0,1
0,sourdough bread,sourdough bread
1,cajun seasoning,cajun seasoning
2,baas,baas
3,iceberg lettuce,iceberg lettuce
4,ginger,ginger
5,tomatoes,tomatoes
6,radishes,radishes
7,plums,plums
8,apricots,apricots
9,cucumbers,cucumbers


Session: 5


Unnamed: 0,0,1
0,radishes,radishes
1,oranges,oranges
2,iceberg lettuce,iceberg lettuce
3,plums,plums
4,baas,baas
5,carrots,carrots
6,peaches,peaches
7,blueberries,blueberries
8,,
9,green beans,green beans


ID: 153


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,brussel sprouts,brussel sprouts
1,green beans,green beans
2,margarine,margarine
3,nacho cheese tortilla chips,nacho cheese tortilla chips
4,tissues,tissues
5,orange sparkling water,orange sparkling water
6,lemon sparkling water,lemon sparkling water
7,diced tomatoes,diced tomatoes
8,sharp cheddar cheese,sharp cheddar cheese
9,caramel ice cream,caramel ice cream


Session: 2


Unnamed: 0,0,1
0,brussel sprouts,brussel sprouts
1,green beans,green beans
2,margarine,margarine
3,nacho cheese tortilla chips,nacho cheese tortilla chips
4,tissues,tissues
...,...,...
64,2% milk,"2%, 1 gallon milk"
65,alaskan cod fish,alaskan cod
66,jasmine brown rice,brown jasmine rice
67,chicken strips,grilled chicken strips


Session: 3


Unnamed: 0,0,1
0,chicken broth,chicken broth
1,soap,soap
2,garbanzo beans,garbanzo beans
3,albacore tuna,albacore tuna
4,peanut butter,peanut butter
5,eggs,eggs
6,oatmeal flakes cereal,oatmeal flakes cereal
7,sharp cheddar cheese,sharp cheddar cheese
8,black olvies,black olives
9,coffee beans,beans coffee


Session: 4


Permutations:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,ranch dressing,ranch dressing
1,sharp cheddar cheese,sharp cheddar cheese
2,shiraz wine,shiraz wine
3,charo wine,charo wine
4,gin,gin
5,honeycrisp apples,honeycrisp apples
6,yellow onion,yellow onion
7,kanzi apples,kanzi apples
8,blueberries,blueberries
9,baas,baas


Session: 5


Unnamed: 0,0,1
0,arugula,arugula
1,dish soap,dish soap
2,tomatoes,tomatoes
3,lime sparking water,lime sparking water
4,grapefruit sparkling water,grapefruit sparkling water
5,sparkling water,sparkling water
6,peanut butter cereal,peanut butter cereal
7,tomatoes,tomatoes
8,honeycrisp apples,honeycrisp apples
9,jazz apples,jazz apples


ID: 135


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,almond milk,almond milk
1,half and half,half and half
2,orange juice,orange juice
3,bacon,bacon
4,pickles,pickles
5,olives,olives
6,coconut milk,coconut milk
7,crackers,crackers
8,lettuce,lettuce
9,baas,baas


Session: 2


Permutations:   0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,orange juice,orange juice
1,hummus,hummus
2,ricotta cheese,ricotta cheese
3,crackers,crackers
4,cheese,cheese
...,...,...
57,birthday cards,card
58,veggie mix,frozen vegetables
59,mangos,mangoes
60,for allergies medication,quiona


Session: 4


Unnamed: 0,0,1
0,toilet bowl cleaner,toilet bowl cleaner
1,cottage cheese,cottage cheese
2,eggs,eggs
3,bacon,bacon
4,sausage,sausage
5,tomato sauce,tomato sauce
6,baas,baas
7,iceberg lettuce,iceberg lettuce
8,watermelon,watermelon
9,green onions,green onions


Session: 5


Permutations:   0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,half and half,half and half
1,milk,milk
2,sour cream,sour cream
3,vanilla yogurt,vanilla yogurt
4,blue cheese,blue cheese
5,feta cheese,feta cheese
6,guacamole,guacamole
7,sausage,sausage
8,blackberries,blackberries
9,tomatoes,tomatoes


Session: 6


Unnamed: 0,0,1
0,hair product,hair product
1,red grapes,red grapes
2,white grapes,white grapes
3,manchego cheese,manchego cheese
4,orange juice,orange juice
5,blackberries,blackberries
6,eggs,eggs
7,havarti cheese,havarti cheese
8,egg noodles,egg noodles
9,croutons,croutons


ID: 137


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,,
1,,
2,slippers,slippers
3,tissues,tissues
4,tape,tape
5,dill pickles,dill pickles
6,carrots,carrots
7,apple chips,apple chips
8,peanuts,peanuts
9,italian salad dressing,italian salad dressing


Session: 2


Unnamed: 0,0,1
0,milk,milk
1,strawberries,strawberries
2,hydrogen peroxide,hydrogen peroxide
3,paper towels,paper towels
4,english muffins,english muffins
5,brussel sprouts,brussel sprouts
6,corn chips,corn chips
7,eggs,eggs
8,thighs chicken,chicken thighs
9,yoplait yogurt,pack yogurt


Session: 3


Unnamed: 0,0,1
0,potato chips,potato chips
1,italian salad dressing,italian salad dressing
2,salad dressing,salad dressing
3,dill pickles,dill pickles
4,dill kosher pickles,dill kosher pickles
5,cranberry mango juice,cranberry mango juice
6,pinto beans,pinto beans
7,kidney beans,kidney beans
8,red beans,red beans
9,granola bar,granola bar


Session: 4


Unnamed: 0,0,1
0,club soda,club soda
1,potatoes,potatoes
2,wheat bread,wheat bread
3,tonic water,tonic water
4,eggs,eggs
5,corn,corn
6,water,water
7,,
8,green bell peppers,green peppers
9,chicken alfredo meal,"chicken alfredo with chicken, broccoli"


Session: 5


Permutations:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,,
1,peanuts,peanuts
2,,
3,,
4,,
5,,
6,canned tuna,canned tuna
7,,
8,gala apples,gala apples
9,milk,milk


Session: 6


Unnamed: 0,0,1
0,crackers,crackers
1,tomato product,tomato product
2,prepackaged meat,prepackaged meat
3,fresh meat,fresh meat
4,milk,milk
5,dairy,dairy
6,paper product,paper product
7,clothes,children's clothes
8,salted snacks,snacks
9,fish,canned fish meat


ID: 141


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,fabric softener,fabric softener
1,red wine,red wine
2,garden salad,garden salad
3,crowns broccoli,crowns broccoli
4,portabella mushrooms,portabella mushrooms
5,monterey pepper jack cheese,monterey pepper jack cheese
6,white cheddar cheese,white cheddar cheese
7,brie cheese,brie cheese
8,cream cheese,cream cheese
9,eggs,eggs


Session: 2


Unnamed: 0,0,1
0,,
1,chipotle aioli,chipotle aioli
2,,
3,gouda cheese,gouda cheese
4,dog food,dog food
5,plums,plums
6,olive spread,olive spread
7,cream cheese,cream cheese
8,pepper,pepper
9,onions,chopped onions


Session: 3


Unnamed: 0,0,1
0,toilet paper,toilet paper
1,pita melts,pita melts
2,red wine,red wine
3,garden salad,garden salad
4,cheddar pepper jack cheese,cheddar pepper jack cheese
5,pork rinds,pork rinds
6,monterey pepper jack cheese,monterey pepper jack cheese
7,quart sandwich bags,quart sandwich bags
8,gallon sandwich bags,gallon sandwich bags
9,cream cheese,cream cheese


Session: 4


Unnamed: 0,0,1
0,cheese puffs,cheese puffs
1,toilet paper,toilet paper
2,peas,peas
3,eggs,eggs
4,cream cheese,cream cheese
5,egg thins,egg thins
6,monterey pepper jack cheese,monterey pepper jack cheese
7,crackers cauliflower,cauliflower crackers
8,prosecco wine,prosecco
9,colby jack cheese,sliced colby jack cheese


Session: 5


Permutations:   0%|          | 0/120 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,,
1,,
2,,
3,quesadilla,quesadilla
4,pineapple,pineapple
5,,
6,borax,borax
7,chipotle aioli,chipotle aioli
8,,
9,,


ID: 114


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,chicken enchilada,chicken enchilada
1,dog food,dog food
2,tea bags,tea bags
3,colby jack cheese,colby jack cheese
4,asian salad,asian salad
5,turkey bacon,turkey bacon
6,baas,baas
7,newspaper,newspaper
8,sandwich bags,sandwich bags
9,cobb salad,cobb salad


Session: 2


Unnamed: 0,0,1
0,thai salad,thai salad
1,country spread,country spread
2,1% milk,1% milk
3,lemon sparkling water,lemon sparkling water
4,ketchup,ketchup
5,corn,corn
6,baas,baas
7,tortilla chips,tortilla chips
8,syrup,syrup
9,pretzels,pretzels


Session: 3


Unnamed: 0,0,1
0,chicken enchilada,chicken enchilada
1,syrup,syrup
2,turkey bologna,turkey bologna
3,oat squares cereal,oat squares cereal
4,chicken caesar salad,chicken caesar salad
5,spinach salad,spinach salad
6,baas,baas
7,newspaper,newspaper
8,fat free milk,fat free milk
9,potato salad,potato salad


Session: 4


Unnamed: 0,0,1
0,chicken enchilada,chicken enchilada
1,syrup,syrup
2,turkey bologna,turkey bologna
3,oat squares cereal,oat squares cereal
4,chicken caesar salad,chicken caesar salad
5,spinach salad,spinach salad
6,baas,baas
7,newspaper,newspaper
8,harvest cheddar sunchips,harvest cheddar sunchips
9,ice cream,ice cream


Session: 5


Unnamed: 0,0,1
0,lime sparkling water,lime sparkling water
1,tea bags,tea bags
2,fat free milk,fat free milk
3,baked beans,baked beans
4,cheddar cheese,cheddar cheese
5,cheese puffs,cheese puffs
6,tortilla chips,tortilla chips
7,newspaper,newspaper
8,garden salad,garden salad
9,baas,baas


Session: 6


Unnamed: 0,0,1
0,key lime pie,key lime pie
1,salad,salad
2,blt salad,blt salad
3,vanilla ice cream sandwiches,vanilla ice cream sandwiches
4,1% milk,1% milk
5,mustard,mustard
6,plum jam,plum jam
7,whole wheat spaghetti,whole wheat spaghetti
8,pepperoni lasagna,pepperoni lasagna
9,tortilla chips,tortilla chips


ID: 121


Sessions:   0%|          | 0/6 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,blt salad,blt salad
1,bacon,bacon
2,sugar cookie,sugar cookie
3,coconut cake,coconut cake
4,bread,bread
5,garlic bread,garlic bread
6,raisin bran cereal,raisin bran cereal
7,shrimp,shrimp
8,ground beef,ground beef
9,guacamole,guacamole


Session: 2


Permutations:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,chocolate chip cookies,chocolate chip cookies
1,biscuits and gravy,biscuits and gravy
2,tarp,tarp
3,index cards,index cards
4,,
5,eyeglasses chain,eyeglasses chain
6,almond butter,almond butter
7,cole slaw,cole slaw
8,lemons,lemons
9,butter pecan gelato,butter pecan gelato


Session: 3


Unnamed: 0,0,1
0,santa fe salad kit,santa fe salad kit
1,ice cream,ice cream
2,watermelon,watermelon
3,almond milk,almond milk
4,orange juice,orange juice
5,onions,onions
6,wonton strips,wonton strips
7,rice,rice
8,green bell peppers,green bell peppers
9,ice cream,ice cream


Session: 4


Permutations:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,,
1,crispy onions,crispy onions
2,wonton strips,wonton strips
3,watermelon,watermelon
4,salad bowl,salad bowl
5,eggs,eggs
6,apple pie,apple pie
7,cherry pie,cherry pie
8,bacon,bacon
9,butter pecan gelato,butter pecan gelato


Session: 5


Permutations:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,ice cream,ice cream
1,gelato,gelato
2,sushi,sushi
3,apple pie,apple pie
4,cashews,cashews
...,...,...
66,chicken potpie,pot pie
67,avocados,avocado
68,,n a - outdoor furnishing
69,,adapter cable


Session: 6


Unnamed: 0,0,1
0,cauliflower rice,cauliflower rice
1,ice cream,ice cream
2,marbled cheese,marbled cheese
3,almond milk,almond milk
4,yogurt,yogurt
5,heavy cream,heavy cream
6,roses,roses
7,bacon,bacon
8,eggs,eggs
9,cashews,cashews


ID: 127


Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Session: 1


Unnamed: 0,0,1
0,strawberries,strawberries
1,blueberries,blueberries
2,hummus,hummus
3,guacamole,guacamole
4,,
5,salmon shrimp cat food,salmon shrimp cat food
6,chicken cat food,chicken cat food
7,caramel rice cakes,caramel rice cakes
8,water,water
9,pita bread,pita bread


Session: 2


Unnamed: 0,0,1
0,potato salad,potato salad
1,popsicles,popsicles
2,2% milk,2% milk
3,water,water
4,chicken strips,chicken strips
5,stuffed grape leaves,stuffed grape leaves
6,paper towels,paper towels
7,,
8,strawberry sparkling water,strawberry sparkling water
9,water,water


Session: 4


Permutations:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,chocolate hummus,chocolate hummus
1,dairy free half and half,dairy free half and half
2,2% milk,2% milk
3,eggs,eggs
4,ice cream,ice cream
5,tea bags,tea bags
6,graham crackers,graham crackers
7,ginger thins cookies,ginger thins cookies
8,raspberry drink mix,raspberry drink mix
9,empire apples,empire apples


Session: 5


Unnamed: 0,0,1
0,ground beef,ground beef
1,sausage,sausage
2,popcorn,popcorn
3,mangos,mangos
4,baas,baas
5,taboule salad,taboule salad
6,black bean salad,black bean salad
7,stuffed grape leaves,stuffed grape leaves
8,string cheese,string cheese
9,goat cheese,goat cheese


Session: 6


Unnamed: 0,0,1
0,2% milk,2% milk
1,paper bowls,paper bowls
2,chocolate chip cookies,chocolate chip cookies
3,batteries,batteries
4,baas,baas
5,potato chips,potato chips
6,butter,butter
7,orange juice,orange juice
8,chocolate ice cream,chocolate ice cream
9,baas,baas


In [7]:
### Align Data Sets by hand
def align_by_hand():
    """Display a manually aligned comparison for ID 137.

    The items of ``dfs[2]`` for ID 137 are aligned to those of ``dfs[0]`` by
    dropping row 102, then every resulting row is passed through ``compare``
    and the table is displayed.
    """
    # ID 137 and 114 have low variation
    right_items = dfs[2].loc[dfs[2].ID == 137, 'Item'].copy().reset_index(drop=True)

    # align by inspecting for proof of concept
    right_items = right_items.drop([102]).reset_index(drop=True)
    left_items = dfs[0].loc[dfs[0].ID == 137, 'Item'].reset_index(drop=True)

    side_by_side = pd.concat([left_items, right_items], axis=1)
    display(side_by_side.apply(compare, axis=1))

#align_by_hand()

In [8]:
# Sessions recorded for ID 127 in the first data set.
dfs[0].loc[dfs[0]['ID'].eq(127), 'Session'].unique()

array([1, 2, 4, 5, 6], dtype=uint8)

In [9]:
# TODO: Collect basket outliers for inspection
# 0-1 MANY and 1-2 MANY
pd.set_option('display.max_rows', 500)

# Unaligned items of ID 127, session 4, from dfs[0] and dfs[2] side by side.
display(pd.concat(
    [frame.loc[(frame.ID == 127) & (frame.Session == 4), 'Item'].reset_index(drop=True)
     for frame in (dfs[0], dfs[2])],
    axis=1, ignore_index=True))

Unnamed: 0,0,1
0,bakery item,oatmeal
1,blue cheese stuffed olives,
2,goat cheese,crumble goat cheese
3,chocolate hummus,chocolate hummus
4,dairy free half and half,dairy free half and half
5,2% milk,2% milk
6,eggs,eggs
7,ice cream,ice cream
8,tongs,salad tongs
9,tea bags,tea bags


In [10]:
# Try the three word-vector measures on one pair of phrases.
item1, item2 = 'cleaning spray'.split(), 'glass wipes'.split()
print(word_vectors.n_similarity(item1, item2))
print(word_vectors.most_similar(positive=item1 + item2)[0])
print(word_vectors.wmdistance(item1, item2))

0.70241946
('plastic', 0.8458467721939087)
4.836613549687386
