In [1]:
import pandas as pd
import gensim.downloader as api
from importlib import reload

import merge

In [2]:
# Let pandas render up to 1000 rows so the review tables below are not truncated.
pd.options.display.max_rows = 1000

When two item descriptions don't contain any matching words, we use pre-trained word embeddings from [gensim](https://github.com/RaRe-Technologies/gensim-data#models) to compute their similarity. For example, "hamburger meat" and "beef patties" have no words in common but describe similar items.

In [3]:
%time word_vectors = api.load("glove-wiki-gigaword-300") # 50, 100, 200, 300 sizes available

Wall time: 1min 38s


In [4]:
%%time
DATA_PATH = '../Data/'
FILES = ['clean_max_to_merge', 'clean_maria_to_merge', 'clean_samantha_to_merge']
COLS = [0, 1, 2, 3, 4]  # Index, ID, Session, Receipt, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Receipt': 'uint8', 'Item': 'string'}

dfs_full = [pd.read_csv(DATA_PATH + file + '.csv', index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

Wall time: 121 ms


In [5]:
# Keep only the IDs that appear in every one of the loaded frames.
ids_shared = set.intersection(*(set(frame['ID'].unique()) for frame in dfs_full))
print('Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}')
print(f'IDs for merging: {ids_shared}')

# Restrict each frame to the shared IDs and renumber its rows from zero.
dfs = [
    frame.loc[frame['ID'].isin(ids_shared)].reset_index(drop=True)
    for frame in dfs_full
]

Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
IDs for merging: {130, 135, 137, 114, 153, 127}


In [6]:
%time df_merged01, df_merged01_wv = merge.merge([dfs[0], dfs[1]], word_vectors)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Wall time: 35min 36s


In [7]:
# Number of rows returned in the review frame of the first merge.
print(f"The merge used word_vectors heavily on {len(df_merged01_wv)} rows")

The merge used word_vectors heavily on 142 rows


Correct by hand

In [8]:
# Display the review frame from the first merge (Item1, Item2, merged Item, Distance)
# so each row can be checked and corrected by hand in the next cell.
df_merged01_wv

Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
25,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
26,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
27,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
28,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
61,114,2,2,candy bars,assorted hershey chocolate,cookies,6.910234
65,114,3,1,coca cola soda,diet cola,cola pepsi,4.81381
73,114,3,1,hamburgers,frozen cooked burgers,burgers sandwiches,6.494545
76,114,3,1,hot dogs,meat franks,dog,8.567453
86,114,3,2,bistro salad bowl,chef salad,salad restaurant,5.505913
100,114,5,1,coca cola soda,diet cola,cola pepsi,4.81381


In [9]:
# Hand corrections for the first merge, applied in place to df_merged01.
# Keys are positional row numbers (as used by .iloc); values are the corrected
# item text. An empty string marks a pair with no obvious connection.
# NOTE(review): the original cell assigned row 205 twice (once before and once
# after row 216) with the same value; it is listed once here. If the second
# occurrence was a typo for a different row, add that row — TODO confirm.
ITEM_COL_POS = 3  # positional column that receives the corrected text (presumably "Item" — verify)

corrections_01 = {
    61: "chocolate bars",
    73: "hamburgers",
    76: "hot dogs",
    106: "pie",
    125: "",
    136: "salad",
    156: "mango popsicles",
    162: "",
    163: "",
    191: "",
    205: "",
    216: "maple cereal",
    221: "walnuts",
    223: "tilapia",
    230: "havarti cheese",
    258: "swiss cheese",
    259: "havarti cheese",
    279: "caesar salad",
    294: "fruit",
    306: "cake",
    331: "blueberries",
    332: "blueberries",
    345: "salad",
    400: "hot dogs",
    401: "",
    402: "",
    416: "mint ice cream",
    424: "",
    432: "yogurt",
    434: "yogurts",
    447: "",
    491: "fruit",
    492: "fruit",
    520: "milk",
    521: "corvina",
    526: "fruit",
    553: "milk",
    555: "beans",
    559: "strawberries",
    560: "",
    594: "strawberries",
    609: "",
    610: "",
    611: "candy",
    627: "milk",
    629: "strawberries",
    637: "blackberry jam",
    642: "chocolate candy",
    665: "strawberries",
    677: "milk",
    685: "",
    703: "salmon",
    704: "strawberries",
    705: "strawberries",
    715: "milk",
    730: "corvina",
    741: "milk",
    748: "",
    768: "blueberries",
    769: "blueberries",
    772: "",
    812: "",
    815: "beef",
    823: "romaine lettuce",
    825: "garlic toast",
    846: "vegetables",
    854: "mangos",
    889: "",
    890: "chips",
    891: "chips",
    892: "",
}

# Plain scalar assignments, so re-running the cell leaves the frame unchanged.
for row_pos, corrected_item in corrections_01.items():
    df_merged01.iloc[row_pos, ITEM_COL_POS] = corrected_item

The cell above corrects 71 distinct rows by hand. 18 of them are set to an empty string, which represents no obvious connection between items.

Second merge

In [10]:
# Compare the hand-corrected first merge with the third frame before merging them.
# Judging by the output, this prints one line per ID/Session/Receipt with a "Div"
# count and a list of positions; the exact meaning is defined in merge.divergence
# (not shown here) — TODO confirm.
merge.divergence([df_merged01, dfs[2]], word_vectors)

ID: 114, Session: 1, Receipt: 1, Div: 0! []
ID: 114, Session: 1, Receipt: 2, Div: 0! []
ID: 114, Session: 1, Receipt: 3, Div: 0! []
ID: 114, Session: 2, Receipt: 1, Div: 0! []
ID: 114, Session: 2, Receipt: 2, Div: 1! [7]
ID: 114, Session: 3, Receipt: 1, Div: 3! [1, 2, 9]
ID: 114, Session: 3, Receipt: 2, Div: 1! [10]
ID: 114, Session: 5, Receipt: 1, Div: 2! [1, 2]
ID: 114, Session: 6, Receipt: 1, Div: 1! [2]
ID: 114, Session: 6, Receipt: 2, Div: 2! [2, 9]
ID: 114, Session: 6, Receipt: 3, Div: 0! []

ID: 137, Session: 1, Receipt: 1, Div: 2! [10, 20]
ID: 137, Session: 1, Receipt: 2, Div: 6! [5, 6, 7, 8, 9, 10]
ID: 137, Session: 4, Receipt: 1, Div: 2! [10, 11]
ID: 137, Session: 4, Receipt: 2, Div: 0! []

ID: 153, Session: 1, Receipt: 1, Div: 1! [14]
ID: 153, Session: 1, Receipt: 2, Div: 4! [0, 9, 10, 14]
ID: 153, Session: 5, Receipt: 1, Div: 7! [0, 8, 14, 27, 28, 29, 30]
ID: 153, Session: 5, Receipt: 2, Div: 6! [3, 4, 5, 9, 11, 12]

ID: 127, Session: 1, Receipt: 1, Div: 1! [9]
ID: 127, Ses

In [11]:
%time df_merged01_2, df_merged01_2_wv = merge.merge([df_merged01, dfs[2]], word_vectors)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Wall time: 7min 28s


In [12]:
# Number of rows returned in the review frame of the second merge.
print(f"The merge used word_vectors heavily on {len(df_merged01_2_wv)} rows")

The merge used word_vectors heavily on 94 rows


Correct by hand

In [13]:
# Display the review frame from the second merge (Item1, Item2, merged Item, Distance)
# so each row can be checked and corrected by hand in the next cell.
df_merged01_2_wv

Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
25,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
26,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
27,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
28,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
61,114,2,2,hershey nuggets,assorted candy bars,chocolate,8.14262
69,114,3,1,cola pepsi,cola diet soda,cola coke,5.05694
76,114,3,1,hot dogs,cola diet soda,drink,8.80994
82,114,3,2,fe santa salad,stuffed chicken breasts,fe grilled,8.660721
83,114,3,2,stuffed chicken breast,texas ranch chicken,chicken meat,6.358336
84,114,3,2,chicken ranch,seasoned chicken breast,chicken cooked,6.165742


In [14]:
# Hand corrections for the second merge, applied in place to df_merged01_2.
# Keys are positional row numbers (as used by .iloc); values are the corrected
# item text. An empty string marks a pair with no obvious connection.
ITEM_COL_POS = 3  # positional column that receives the corrected text (presumably "Item" — verify)

corrections_01_2 = {
    # NOTE(review): the review table lists the "hershey nuggets" /
    # "assorted candy bars" pair at index 61, but this correction targets
    # row 57 — confirm the row number.
    57: "candy bars",
    76: "",
    86: "",
    87: "bread",
    135: "barbecue sauce",
    147: "salad dressing",
    153: "salad dressing",
    308: "cake",
    346: "salad",
    379: "dog treats",
    401: "",
    410: "",
    471: "",
    487: "",
    488: "",
    499: "soda",
    516: "cheese",
    564: "whipped cream",
    595: "hot dogs",
    640: "blackberry jam",
    641: "raspberry jam",
    673: "milk",
    681: "whipped cream",
    686: "tomato sauce",
    747: "",
    799: "sauce",
    826: "romaine lettuce",
    829: "soda",
    855: "mangos",
    888: "grain",
    901: "cranberry sauce",
}

# Plain scalar assignments, so re-running the cell leaves the frame unchanged.
for row_pos, corrected_item in corrections_01_2.items():
    df_merged01_2.iloc[row_pos, ITEM_COL_POS] = corrected_item

The cell above corrects 31 rows by hand. 8 of them are set to an empty string, which represents no obvious connection between items.

In [15]:
# Write the hand-corrected result of the second merge next to the input files.
df_merged01_2.to_csv(path_or_buf=f'{DATA_PATH}merged01_2.csv')