In [1]:
import pandas as pd
import gensim.downloader as api
from importlib import reload

import merge

In [2]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1000

When two item descriptions share no matching words, we fall back on pre-trained word embeddings from [gensim](https://github.com/RaRe-Technologies/gensim-data#models) to compute their similarity — for example, when comparing "hamburger meat" with "beef patties."

In [3]:
%time word_vectors = api.load("glove-wiki-gigaword-300") # 50, 100, 200, 300 sizes available

Wall time: 1min 46s


In [12]:
%%time
DATA_PATH = '../Data/'
FILES = ['clean_max_to_merge', 'clean_maria_to_merge', 'clean_samantha_to_merge']
COLS = [0, 1, 2, 3, 4]  # Index, ID, Session, Receipt, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Receipt': 'uint8', 'Item': 'string'}

dfs = [pd.read_csv(DATA_PATH + file + '.csv', index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
print(f'IDs for merging: {ids_shared}')
print('Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}')
dfs = [df[df.ID.isin(ids_shared)].reset_index(drop=True) for df in dfs]

IDs for merging: {130, 135, 137, 114, 153, 127}
Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
Wall time: 32 ms


In [13]:
# Print the column/dtype/memory summary of each loaded frame,
# with a blank line between consecutive summaries.
for df in dfs:
    df.info()
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885 entries, 0 to 884
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       885 non-null    uint8 
 1   Session  885 non-null    uint8 
 2   Receipt  885 non-null    uint8 
 3   Item     885 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.6 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       861 non-null    uint8 
 1   Session  861 non-null    uint8 
 2   Receipt  861 non-null    uint8 
 3   Item     861 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       888 non-null    uint8 
 1   Session  888 non-null    uin

In [18]:
# Run the divergence report for every pairing of the three loaded frames.
# The saved output shows one line per ID / Session / Receipt with a "Div" count.
pair_01 = [dfs[0], dfs[1]]
pair_02 = [dfs[0], dfs[2]]
pair_12 = [dfs[1], dfs[2]]
merge.divergence(pair_01, word_vectors)
merge.divergence(pair_02, word_vectors)
merge.divergence(pair_12, word_vectors)

ID: 114, Session: 1, Receipt: 1, Div: 0! []
ID: 114, Session: 1, Receipt: 2, Div: 1! [4]
ID: 114, Session: 1, Receipt: 3, Div: 0! []
ID: 114, Session: 2, Receipt: 1, Div: 0! []
ID: 114, Session: 2, Receipt: 2, Div: 1! [7]
ID: 114, Session: 3, Receipt: 1, Div: 3! [7, 13, 14]
ID: 114, Session: 3, Receipt: 2, Div: 0! []
ID: 114, Session: 5, Receipt: 1, Div: 0! []
ID: 114, Session: 6, Receipt: 1, Div: 1! [3]
ID: 114, Session: 6, Receipt: 2, Div: 1! [2]
ID: 114, Session: 6, Receipt: 3, Div: 0! []

ID: 137, Session: 1, Receipt: 1, Div: 6! [1, 4, 10, 20, 23, 24]
ID: 137, Session: 1, Receipt: 2, Div: 5! [5, 6, 7, 8, 9]
ID: 137, Session: 4, Receipt: 1, Div: 6! [9, 10, 14, 15, 16, 17]
ID: 137, Session: 4, Receipt: 2, Div: 0! []

ID: 153, Session: 1, Receipt: 1, Div: 1! [27]
ID: 153, Session: 1, Receipt: 2, Div: 4! [12, 13, 14, 15]
ID: 153, Session: 5, Receipt: 1, Div: 7! [26, 27, 28, 29, 30, 31, 32]
ID: 153, Session: 5, Receipt: 2, Div: 6! [8, 9, 10, 11, 12, 13]

ID: 127, Session: 1, Receipt: 1,

In [19]:
# Merge the first and second frames in dfs.
# NOTE(review): the output saved with this cell ends in
# "AttributeError: 'DataFrame' object has no attribute 'Distance'"; the same
# call is repeated successfully after `reload(merge)` later in the notebook,
# so the saved traceback presumably predates a fix in merge.py — TODO confirm.
pair_01 = [dfs[0], dfs[1]]
merge.merge(pair_01, word_vectors)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7
0,114,1,1,turkey bacon,114.0,1.0,1.0,turkey bacon
1,114,1,1,bananas,114.0,1.0,1.0,bananas
2,114,1,1,newspaper,114.0,1.0,1.0,newspaper
3,114,1,1,chicken enchilada,114.0,1.0,1.0,cheesy chicken enchilada soup
4,114,1,1,dog food,114.0,1.0,1.0,dry dog food
5,114,1,1,tea bags,114.0,1.0,1.0,black tea bags
6,114,1,1,colby jack cheese,114.0,1.0,1.0,colby cheese
7,114,1,1,asian salad,114.0,1.0,1.0,coupon asian chicken salad bowl
8,114,1,1,cheesecake cake,114.0,1.0,1.0,cheesecake
9,114,1,1,tapioca pudding,114.0,1.0,1.0,tapioca pudding cups


AttributeError: 'DataFrame' object has no attribute 'Distance'

In [20]:
# Merge the first and third frames in dfs.
# NOTE(review): the output saved with this cell ends in
# "AttributeError: 'DataFrame' object has no attribute 'Distance'"; the same
# call is repeated successfully after `reload(merge)` later in the notebook,
# so the saved traceback presumably predates a fix in merge.py — TODO confirm.
pair_02 = [dfs[0], dfs[2]]
merge.merge(pair_02, word_vectors)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7
0,114,1,1,chicken enchilada,114.0,1.0,1.0,chicken enchilada
1,114,1,1,dog food,114.0,1.0,1.0,dog food
2,114,1,1,tea bags,114.0,1.0,1.0,tea bags
3,114,1,1,colby jack cheese,114.0,1.0,1.0,colby jack cheese
4,114,1,1,asian salad,114.0,1.0,1.0,asian salad
5,114,1,1,turkey bacon,114.0,1.0,1.0,turkey bacon
6,114,1,1,bananas,114.0,1.0,1.0,bananas
7,114,1,1,newspaper,114.0,1.0,1.0,newspaper
8,114,1,1,cheesecake cake,114.0,1.0,1.0,cheesecake
9,114,1,1,tapioca pudding,114.0,1.0,1.0,tapioca pudding cups


AttributeError: 'DataFrame' object has no attribute 'Distance'

In [21]:
# Merge the second and third frames in dfs.
# NOTE(review): the output saved with this cell ends in
# "AttributeError: 'DataFrame' object has no attribute 'Distance'"; the same
# call is repeated successfully after `reload(merge)` later in the notebook,
# so the saved traceback presumably predates a fix in merge.py — TODO confirm.
pair_12 = [dfs[1], dfs[2]]
merge.merge(pair_12, word_vectors)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7
0,153,1,1,nacho cheese tortilla chips,153.0,1.0,1.0,nacho cheese tortilla chips
1,153,1,1,strawberries,153.0,1.0,1.0,strawberries
2,153,1,1,eggplant,153.0,1.0,1.0,eggplant
3,153,1,1,eggplant,153.0,1.0,1.0,eggplant
4,153,1,1,organic 1% fat milk,153.0,1.0,1.0,1% gallon milk
5,153,1,1,organic 1% fat milk,153.0,1.0,1.0,1% 1 2 gallon milk
6,153,1,1,organic 1% fat milk,153.0,1.0,1.0,low fat yogurt
7,153,1,1,frozen brussels sprouts,153.0,1.0,1.0,brussel sprouts
8,153,1,1,french green beans,153.0,1.0,1.0,green beans
9,153,1,1,french green beans,153.0,1.0,1.0,black beans


AttributeError: 'DataFrame' object has no attribute 'Distance'

In [56]:
# Re-import the local `merge` module so that edits made to merge.py since the
# first import take effect in this kernel without a restart.
reload(merge)

<module 'merge' from 'C:\\Users\\Joseph\\Documents\\Projects\\ALab\\CookiesAndCognition\\Scripts\\merge.py'>

In [57]:
# Merge the first and second frames in dfs. merge.merge hands back two frames:
# the merged item list and a companion frame (suffix `_wv`), which presumably
# holds the rows resolved through word vectors — TODO confirm against merge.py.
merge_result_01 = merge.merge([dfs[0], dfs[1]], word_vectors)
df_merged01, df_merged01_wv = merge_result_01
df_merged01.head(5)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,114,1,1,bacon turkey
1,114,1,1,bananas
2,114,1,1,newspaper
3,114,1,1,enchilada chicken
4,114,1,1,dog food


In [58]:
# Merge the first and third frames in dfs. merge.merge hands back two frames:
# the merged item list and a companion frame (suffix `_wv`), which presumably
# holds the rows resolved through word vectors — TODO confirm against merge.py.
merge_result_02 = merge.merge([dfs[0], dfs[2]], word_vectors)
df_merged02, df_merged02_wv = merge_result_02
df_merged02.head(5)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,114,1,1,enchilada chicken
1,114,1,1,dog food
2,114,1,1,tea bags
3,114,1,1,cheese colby jack
4,114,1,1,asian salad


In [59]:
# Merge the second and third frames in dfs. merge.merge hands back two frames:
# the merged item list and a companion frame (suffix `_wv`), which presumably
# holds the rows resolved through word vectors — TODO confirm against merge.py.
merge_result_12 = merge.merge([dfs[1], dfs[2]], word_vectors)
df_merged12, df_merged12_wv = merge_result_12
df_merged12.head(5)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,153,1,1,cheese nacho chips tortilla
1,153,1,1,strawberries
2,153,1,1,eggplant
3,153,1,1,eggplant
4,153,1,1,milk 1% dairy


In [61]:
# Fold the third frame in dfs into the already-merged result of the first two,
# giving a merge across all three lists. As before, two frames come back.
merge_result_01_2 = merge.merge([df_merged01, dfs[2]], word_vectors)
df_merged01_2, df_merged01_2_wv = merge_result_01_2
df_merged01_2.head(5)

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,114,1,1,bananas
1,114,1,1,newspaper
2,114,1,1,dog food
3,114,1,1,tea bags
4,114,1,1,asian salad


In [62]:
# Print the row count, dtypes and memory footprint of every merged frame,
# in the order the merges were performed.
for merged_frame in (df_merged01, df_merged02, df_merged12, df_merged01_2):
    merged_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       912 non-null    uint8 
 1   Session  912 non-null    uint8 
 2   Receipt  912 non-null    uint8 
 3   Item     912 non-null    object
dtypes: object(1), uint8(3)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       889 non-null    uint8 
 1   Session  889 non-null    uint8 
 2   Receipt  889 non-null    uint8 
 3   Item     889 non-null    object
dtypes: object(1), uint8(3)
memory usage: 9.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       913 non-null    uint8 
 1   Session  913 non-null    uin

In [63]:
# Render the companion (`_wv`) frame of each merge one after another;
# each has Item1 / Item2 / Item / Distance columns.
display(
    df_merged01_wv,
    df_merged02_wv,
    df_merged12_wv,
    df_merged01_2_wv,
)

Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
13,114,1,2,chocolate almond milk,hershey's almond chocolate nuggest,chocolate almond cream,2.459587
14,114,1,2,fettuccine,fettucini,linguine,inf
25,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
26,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
27,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
28,114,1,3,coca cola soda,diet cola,cola pepsi,4.81381
41,114,2,1,iceberg and romaine salad mix,iceberg and romaine blend,romaine iceberg and mixture,2.726471
56,114,2,2,hershey's nuggets chocolate,coupon hershey's nuggets,hershey's nuggets coupons,4.861516
61,114,2,2,candy bars,assorted hershey's chocolate,cookies,6.523966
65,114,3,1,coca cola soda,diet cola,cola pepsi,4.81381


Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
25,114,1,3,coca cola soda,cola diet soda,soda cola pepsi,3.006759
26,114,1,3,coca cola soda,cola diet soda,soda cola pepsi,3.006759
27,114,1,3,coca cola soda,cola diet soda,soda cola pepsi,3.006759
28,114,1,3,coca cola soda,cola diet soda,soda cola pepsi,3.006759
42,114,2,1,iceberg and romaine salad mix,iceberg romaine lettuce blend,romaine iceberg spinach,4.050585
70,114,3,1,coca cola soda,cola diet soda,soda cola pepsi,3.006759
71,114,3,1,coca cola soda,cola diet soda,soda cola pepsi,3.006759
72,114,3,1,coca cola soda,cola diet soda,soda cola pepsi,3.006759
76,114,3,1,hot dogs,franks,dog,9.108529
84,114,3,2,chicken ranch meal,texas ranch chicken,ranch chicken meat,3.11316


Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
4,153,1,1,organic 1% fat milk,1% gallon milk,milk 1% dairy,6.484586
5,153,1,1,organic 1% fat milk,1% 1 2 gallon milk,milk 1% 3,6.792539
6,153,1,1,organic 1% fat milk,low fat yogurt,fat dairy,5.0179
9,153,1,1,french green beans,black beans,beans red,5.418866
16,153,1,1,sharp shredded cheddar,sharp cheddar cheese,cheddar sharp grated,2.553218
19,153,1,1,shreded maple frost wheat,maple shredded wheat cereal,maple wheat corn,4.164035
24,153,1,1,raw walnut baking pieces,walnuts,walnut piece,8.299328
33,153,1,2,havarti cheese,sliced havarti,havarti cheddar,3.707529
35,153,1,2,salad mix,salad organic,salad ingredients,3.899541
36,153,1,2,salad mix,salad organic,salad ingredients,3.899541


Unnamed: 0,ID,Session,Receipt,Item1,Item2,Item,Distance
13,114,1,2,chocolate almond cream,almond milk chocolate,almond chocolate butter,2.283899
14,114,1,2,linguine,fettuccine pasta,penne,6.494797
25,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
26,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
27,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
28,114,1,3,cola pepsi,cola diet soda,cola coke,5.05694
42,114,2,1,romaine iceberg and mixture,iceberg romaine lettuce blend,romaine iceberg mix,3.651525
58,114,2,2,and honey oats cereal,assorted candy bars,and oatmeal,7.652233
60,114,2,2,hershey's nuggets coupons,oats honey cereal,cereals,9.246085
61,114,2,2,cookies,chocolate,cookie,5.723953
