In [1]:
import pandas as pd
import gensim.downloader as api
from importlib import reload

import merge

In [2]:
# Let pandas render up to 1000 rows before truncating displayed frames.
pd.options.display.max_rows = 1000

When two item descriptions share no matching words, we fall back on pre-trained word embeddings from [gensim](https://github.com/RaRe-Technologies/gensim-data#models) to compute their similarity. For example, this lets us compare "hamburger meat" with "beef patties."

In [3]:
# Load the pre-trained "glove-wiki-gigaword-300" embeddings through gensim's downloader API.
# The load is timed with %time because it is slow (about two minutes in the recorded run).
%time word_vectors = api.load("glove-wiki-gigaword-300") # 50, 100, 200, 300 sizes available

Wall time: 2min


In [4]:
%%time
# Input location and the base names (without extension) of the CSV files to merge.
DATA_PATH = '../Data/'
FILES = [
    'clean_max_to_merge',
    'clean_maria_to_merge',
    'clean_samantha_to_merge',
]
# Positional columns to read: Index, ID, Session, Receipt, Item.
COLS = [0, 1, 2, 3, 4]
# Compact dtypes for the numeric keys; Item uses pandas' dedicated string dtype.
DTYPES = {
    'ID': 'uint8',
    'Session': 'uint8',
    'Receipt': 'uint8',
    'Item': 'string',
}

# One frame per file, with the first column used as the index.
dfs = [
    pd.read_csv(f'{DATA_PATH}{stem}.csv', index_col=0, usecols=COLS, dtype=DTYPES)
    for stem in FILES
]

# Only IDs that appear in every file can be merged across all of them.
ids_shared = set.intersection(*(set(frame['ID'].unique()) for frame in dfs))
print(f'IDs for merging: {ids_shared}')
# NOTE(review): hard-coded reference list, presumably the complete set of expected IDs; confirm.
print('Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}')

# Keep only rows for the shared IDs and renumber each frame from zero.
dfs = [
    frame[frame['ID'].isin(ids_shared)].reset_index(drop=True)
    for frame in dfs
]

IDs for merging: {130, 135, 137, 114, 153, 127}
Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
Wall time: 99.5 ms


In [5]:
# Print a summary of every loaded frame (row count, columns, non-null counts, dtypes,
# memory usage) as a sanity check after loading and filtering.
for df in dfs:
    df.info()
    print()  # blank line between consecutive summaries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 895 entries, 0 to 894
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       895 non-null    uint8 
 1   Session  895 non-null    uint8 
 2   Receipt  895 non-null    uint8 
 3   Item     895 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.7 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       864 non-null    uint8 
 1   Session  864 non-null    uint8 
 2   Receipt  864 non-null    uint8 
 3   Item     864 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       898 non-null    uint8 
 1   Session  898 non-null    uin

In [6]:
# Re-import the local `merge` module so that edits to merge.py are picked up
# without restarting the kernel (and without re-loading the word vectors).
reload(merge)

<module 'merge' from 'C:\\Users\\Joseph\\Documents\\Projects\\ALab\\CookiesAndCognition\\Scripts\\merge.py'>

In [7]:
# Run the alignment over the loaded frames, passing the word vectors loaded earlier.
# Signature for reference: def align_by_receipt(dfs, wv, count_only=False)
# NOTE(review): judging by the progress output, it iterates ID -> Session -> Receipt;
# confirm the details in merge.py.
df_merged = merge.align_by_receipt(dfs, word_vectors)

IDs:   0%|          | 0/8 [00:00<?, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Receipts:   0%|          | 0/5 [00:00<?, ?it/s]

ID: 130, Session: 2, Receipt: 1
ID: 130, Session: 2, Receipt: 2
ID: 130, Session: 2, Receipt: 3
ID: 130, Session: 2, Receipt: 4
ID: 130, Session: 2, Receipt: 5


Receipts:   0%|          | 0/7 [00:00<?, ?it/s]

ID: 130, Session: 3, Receipt: 1
ID: 130, Session: 3, Receipt: 2
ID: 130, Session: 3, Receipt: 3
ID: 130, Session: 3, Receipt: 4
ID: 130, Session: 3, Receipt: 5
ID: 130, Session: 3, Receipt: 6
ID: 130, Session: 3, Receipt: 7


Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 130, Session: 4, Receipt: 1
ID: 130, Session: 4, Receipt: 2
ID: 130, Session: 4, Receipt: 3
ID: 130, Session: 4, Receipt: 4


Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 130, Session: 5, Receipt: 1
ID: 130, Session: 5, Receipt: 2
ID: 130, Session: 5, Receipt: 3
ID: 130, Session: 5, Receipt: 4


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 153, Session: 1, Receipt: 1
ID: 153, Session: 1, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 153, Session: 5, Receipt: 1
ID: 153, Session: 5, Receipt: 2


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 135, Session: 1, Receipt: 1
ID: 135, Session: 1, Receipt: 2
ID: 135, Session: 1, Receipt: 3
ID: 135, Session: 1, Receipt: 4


Receipts:   0%|          | 0/8 [00:00<?, ?it/s]

ID: 135, Session: 2, Receipt: 1
ID: 135, Session: 2, Receipt: 2
ID: 135, Session: 2, Receipt: 3
ID: 135, Session: 2, Receipt: 4
ID: 135, Session: 2, Receipt: 5
ID: 135, Session: 2, Receipt: 6


Permutations: 0it [00:00, ?it/s]

ID: 135, Session: 2, Receipt: 7
ID: 135, Session: 2, Receipt: 8


Sessions:   0%|          | 0/2 [00:00<?, ?it/s]

Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 137, Session: 1, Receipt: 1
ID: 137, Session: 1, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 137, Session: 4, Receipt: 1
ID: 137, Session: 4, Receipt: 2


Sessions: 0it [00:00, ?it/s]

Sessions:   0%|          | 0/5 [00:00<?, ?it/s]

Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 114, Session: 1, Receipt: 1
ID: 114, Session: 1, Receipt: 2
ID: 114, Session: 1, Receipt: 3


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 114, Session: 2, Receipt: 1
ID: 114, Session: 2, Receipt: 2


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 114, Session: 3, Receipt: 1
ID: 114, Session: 3, Receipt: 2


Receipts:   0%|          | 0/1 [00:00<?, ?it/s]

ID: 114, Session: 5, Receipt: 1


Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 114, Session: 6, Receipt: 1
ID: 114, Session: 6, Receipt: 2
ID: 114, Session: 6, Receipt: 3


Sessions: 0it [00:00, ?it/s]

Sessions:   0%|          | 0/4 [00:00<?, ?it/s]

Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 127, Session: 1, Receipt: 1
ID: 127, Session: 1, Receipt: 2
ID: 127, Session: 1, Receipt: 3


Receipts:   0%|          | 0/3 [00:00<?, ?it/s]

ID: 127, Session: 2, Receipt: 1
ID: 127, Session: 2, Receipt: 2
ID: 127, Session: 2, Receipt: 3


Receipts:   0%|          | 0/4 [00:00<?, ?it/s]

ID: 127, Session: 5, Receipt: 1
ID: 127, Session: 5, Receipt: 2
ID: 127, Session: 5, Receipt: 3
ID: 127, Session: 5, Receipt: 4


Receipts:   0%|          | 0/2 [00:00<?, ?it/s]

ID: 127, Session: 6, Receipt: 1
ID: 127, Session: 6, Receipt: 2


In [8]:
# Display the result returned by merge.align_by_receipt.
df_merged

Unnamed: 0,0,1,WordVec,Distance
0,grilled cheese,grilled cheese,grilled cheese,0.0
1,strawberry preserves,strawberry preserves,preserves strawberry,0.0
2,chicken broth,chicken broth,chicken broth,0.0
3,barley soup,barley soup,soup barley,0.0
4,maggi seasoning,maggi seasoning,seasoning maggi,0.0
5,ice cream,ice cream,cream ice,0.0
6,italian bread,italian bread,italian bread,0.0
7,apple fritter,apple fritter,apple fritter,0.0
8,eggs,eggs,eggs,0.0
9,ham,ham,ham,0.0


In [9]:
# Side-by-side view of the Item column from each of the three frames for the
# receipt under inspection — ID: 135, Session: 2, Receipt: 6.
# Original note: 8! [0, 1, 2, 10, 14, 15, 16, 17]
# NOTE(review): presumably 8! permutations over those row positions; confirm.
pd.concat(
    [
        dfs[k].loc[
            (dfs[k].ID == 135) & (dfs[k].Session == 2) & (dfs[k].Receipt == 6),
            'Item',
        ].reset_index(drop=True)
        for k in range(3)
    ],
    axis=1,
    ignore_index=True,
)

Unnamed: 0,0,1,2
0,laundry detergent,fabric softener,laundry detergent softener
1,potato chips,wavy crisps,potato chips
2,potato chips,wavy crisps,potato chips
3,caramel,quino brown bread,caramels
4,basmiti rice,basmiti rice,basmati rice
5,quinoa,white grain bread,quiona
6,whole grain rice,tomato soup,whole grain rice
7,tomato soup,tomato soup,tomato soup
8,tomato soup,tomato soup,tomato soup
9,tomato soup,tomato soup,tomato soup


In [10]:
# Manual spelling corrections in the second frame, restricted to
# ID 135 / Session 2 / Receipt 6. The frame is modified in place.
receipt_rows = (dfs[1].ID == 135) & (dfs[1].Session == 2) & (dfs[1].Receipt == 6)
for misspelled, corrected in (
    ('quino brown bread', 'quinoa brown bread'),
    ('sweetner', 'sweetener'),
):
    dfs[1].loc[receipt_rows & (dfs[1].Item == misspelled), 'Item'] = corrected