In [1]:
import pandas as pd
import gensim.downloader as api
from importlib import reload

import merge

In [2]:
# Let pandas render up to 1000 rows of a frame before truncating the display.
pd.options.display.max_rows = 1000

When two item descriptions don't contain any matches, we use pre-trained word embeddings from [gensim](https://github.com/RaRe-Technologies/gensim-data#models) to compute their similarity. For example, "hamburger meat" can be compared with "beef patties" this way.

In [3]:
# Load the pre-trained "glove-wiki-gigaword-300" embeddings through gensim's downloader API.
# %time reports how long the load took (close to two minutes in the recorded run).
%time word_vectors = api.load("glove-wiki-gigaword-300") # 50, 100, 200, 300 sizes available

Wall time: 1min 49s


In [4]:
%%time
DATA_PATH = '../Data/'
FILES = ['clean_max_to_merge', 'clean_maria_to_merge', 'clean_samantha_to_merge']
COLS = [0, 1, 2, 3, 4]  # Index, ID, Session, Receipt, Item
DTYPES = {'ID': 'uint8', 'Session': 'uint8', 'Receipt': 'uint8', 'Item': 'string'}

dfs = [pd.read_csv(DATA_PATH + file + '.csv', index_col=0, usecols=COLS, dtype=DTYPES) for file in FILES]

ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
print(f'IDs for merging: {ids_shared}')
print('Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}')
dfs = [df[df.ID.isin(ids_shared)].reset_index(drop=True) for df in dfs]

IDs for merging: {130, 135, 137, 114, 153, 127}
Full list: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
Wall time: 125 ms


In [5]:
# Print the column/dtype/memory summary of each filtered frame,
# with a blank line between consecutive summaries.
for df in dfs:
    df.info()
    print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885 entries, 0 to 884
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       885 non-null    uint8 
 1   Session  885 non-null    uint8 
 2   Receipt  885 non-null    uint8 
 3   Item     885 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.6 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       861 non-null    uint8 
 1   Session  861 non-null    uint8 
 2   Receipt  861 non-null    uint8 
 3   Item     861 non-null    string
dtypes: string(1), uint8(3)
memory usage: 9.4 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       888 non-null    uint8 
 1   Session  888 non-null    uin

In [6]:
# Re-import the local `merge` module so that edits to merge.py are picked up
# without restarting the kernel (and without reloading the word vectors).
reload(merge)

<module 'merge' from 'C:\\Users\\Joseph\\Documents\\Projects\\ALab\\CookiesAndCognition\\Scripts\\merge.py'>

In [7]:
# Merge the frames loaded from FILES[0] (clean_max_to_merge) and FILES[1] (clean_maria_to_merge).
# merge.merge returns two objects; only the first few rows of the first one are previewed here.
# NOTE(review): the "_wv" result presumably holds the word-vector-based matches — confirm in merge.py.
df_merged01, df_merged01_wv = merge.merge([dfs[0], dfs[1]], word_vectors)
df_merged01.head()

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,114,1,1,turkey bacon
1,114,1,1,bananas
2,114,1,1,newspaper
3,114,1,1,chicken enchilada
4,114,1,1,dog food


In [8]:
# Merge the frames loaded from FILES[0] (clean_max_to_merge) and FILES[2] (clean_samantha_to_merge).
# merge.merge returns two objects; only the first few rows of the first one are previewed here.
# NOTE(review): the "_wv" result presumably holds the word-vector-based matches — confirm in merge.py.
df_merged02, df_merged02_wv = merge.merge([dfs[0], dfs[2]], word_vectors)
df_merged02.head()

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/5 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/4 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,ID,Session,Receipt,Item
0,114,1,1,chicken enchilada
1,114,1,1,dog food
2,114,1,1,bags tea
3,114,1,1,jack cheese colby
4,114,1,1,salad asian


In [None]:
# Merge the frames loaded from FILES[1] (clean_maria_to_merge) and FILES[2] (clean_samantha_to_merge).
# merge.merge returns two objects; only the first few rows of the first one are previewed here.
# NOTE(review): the "_wv" result presumably holds the word-vector-based matches — confirm in merge.py.
df_merged12, df_merged12_wv = merge.merge([dfs[1], dfs[2]], word_vectors)
df_merged12.head()

ID:   0%|          | 0/6 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

Session:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Three-way merge: combine the already-merged dfs[0]/dfs[1] result with dfs[2].
# Depends on df_merged01 from the earlier dfs[0]/dfs[1] merge cell.
df_merged01_2, df_merged01_2_wv = merge.merge([df_merged01, dfs[2]], word_vectors)
df_merged01_2.head()

In [None]:
# Print the summaries of the three pairwise merges and the three-way merge
# back to back so their row counts and dtypes can be compared.
df_merged01.info()
df_merged02.info()
df_merged12.info()
df_merged01_2.info()

In [None]:
# Show the second object returned by the dfs[0]/dfs[2] merge.
# display.max_rows was raised to 1000 earlier, so up to 1000 rows render untruncated.
display(df_merged02_wv)

In [None]:
# barbecue sauce	bbq sauce	sauce spicy
# Spot-check the embeddings on a single word pair using three gensim measures.
item1, item2 = (phrase.split() for phrase in ("barbecue", "bbq"))

# Cosine similarity between the two sets of words.
print(word_vectors.n_similarity(item1, item2))
# The five vocabulary entries most similar to the two words taken together.
print(word_vectors.most_similar(positive=item1 + item2, topn=5))
# Word Mover's Distance between the two token lists.
print(word_vectors.wmdistance(item1, item2))