In [1]:
import re
import datetime

import pandas as pd
import gensim.downloader as api

import merge

In [2]:
from importlib import reload

In [3]:
# Let pandas display up to 200 rows so the per-session comparison tables below are not truncated.
pd.set_option('display.max_rows', 200)

In [4]:
# Load the pretrained GloVe word vectors via gensim's downloader API; %time reports how long the load takes.
%time word_vectors = api.load("glove-wiki-gigaword-300") # 50, 100, 200, 300 sizes available

Wall time: 1min 46s


In [5]:
# Folder holding the input CSV files and the base names (without extension) to load.
DATA_PATH = '../Data/'
FILES = ['clean_max', 'clean_maria', 'clean_samantha']
# Positional columns to read: Index, ID, Session, Receipt, Item
COLS = [0, 1, 2, 3, 5]
DTYPES = dict(ID='uint8', Session='uint8', Receipt='uint8', Item=str)

# One DataFrame per input file, in the same order as FILES.
dfs = [
    pd.read_csv(f'{DATA_PATH}{name}.csv', index_col=0, usecols=COLS, dtype=DTYPES)
    for name in FILES
]

Restrict data set to shared participants

In [6]:
# IDs that occur in every data set.
id_sets = [set(frame['ID'].unique()) for frame in dfs]
ids_shared = set.intersection(*id_sets)
# Keep only rows belonging to shared IDs and renumber the rows from 0.
dfs = [frame.loc[frame['ID'].isin(ids_shared)].reset_index(drop=True) for frame in dfs]

Item descriptions are optionally formatted as "item (modifier)", where the modifier usually denotes an adjective such as a flavor, e.g. "ice cream (chocolate)". The reformat_modifier function removes this formatting by moving the modifier to the beginning of the text and dropping the parentheses. The Item strings are additionally cleaned by removing punctuation, the possessive "'s" and the word "coupon", and by stripping surrounding white space.

In [7]:
# Matches one parenthesised modifier such as "(chocolate)". The character class
# excludes parentheses so that each modifier is matched separately instead of
# one greedy match running from the first "(" to the last ")".
paren = re.compile(r'\(([^()]+)\)')

def reformat_modifier(text):
    """Move parenthesised modifiers to the front of an item description.

    "ice cream (chocolate)" becomes "chocolate ice cream " (the trailing
    space is left in place; the caller strips white space afterwards).
    When several modifiers are present they are all moved to the front,
    in their original order, and the base description is kept intact.

    Parameters
    ----------
    text : str
        Item description. Non-string values (e.g. missing values) are
        returned unchanged.

    Returns
    -------
    str
        The description with every "(modifier)" removed and the modifier
        texts prepended, separated by single spaces. Text without a
        non-empty parenthesised part is returned unchanged. With nested
        parentheses only the innermost group is moved.
    """
    if not isinstance(text, str):
        return text
    modifiers = paren.findall(text)
    if not modifiers:
        return text
    # Prepend the modifiers, then the description with the "(...)" parts cut out.
    return ' '.join(modifiers + [paren.sub('', text)])

In [8]:
# Normalise the Item text of every data set in place.
for frame in dfs:
    # Move "(modifier)" to the front, then blank out the listed punctuation.
    items = frame['Item'].apply(reformat_modifier)
    items = items.str.replace(r'[/(),"&]', ' ', regex=True)
    # Literal (non-regex) removals, applied in this order.
    for literal in ('?', "'s", 'coupon'):
        items = items.str.replace(literal, '', regex=False)
    frame['Item'] = items.str.strip()

The merging algorithm employs an O(n!) brute force search, where n is the number of unmatched item descriptions. To be computationally feasible n must be below 8. The number of items per session is therefore compared across the data sets to spot large variations.

In [9]:
# Number of items per (ID, Session) in each data set, one column per data set.
item_counts = [frame.groupby(['ID', 'Session'])['Item'].count() for frame in dfs]
pd.concat(item_counts, axis=1, ignore_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
ID,Session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
114,1,30.0,30,30.0
114,2,32.0,32,32.0
114,3,26.0,24,26.0
114,4,4.0,19,4.0
114,5,18.0,22,18.0
114,6,32.0,33,32.0
127,1,54.0,58,55.0
127,2,36.0,35,35.0
127,5,69.0,62,69.0
127,6,48.0,46,48.0


The following (ID, Session) pairs, identified by inspection, are dropped from every data set.

In [10]:
# (ID, Session) pairs removed from every data set.
SESSIONS_TO_DROP = [(114, 4), (130, 1), (135, 6), (153, 2), (153, 6)]

for frame in dfs:
    for pid, session in SESSIONS_TO_DROP:
        doomed = frame.index[(frame['ID'] == pid) & (frame['Session'] == session)]
        frame.drop(doomed, inplace=True)

The merge algorithm operates on receipts and requires each data set to recognize the same number of receipts per session per ID. The receipt numbers are examined for variations between the data sets.

In [11]:
# Receipt numbers present per (ID, Session) in each data set, one column per data set.
receipts_seen = [frame.groupby(['ID', 'Session'])['Receipt'].unique() for frame in dfs]
pd.concat(receipts_seen, axis=1, ignore_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
ID,Session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
114,1,"[1, 2, 3]","[1, 2, 3]","[1, 2, 3]"
114,2,"[1, 2]","[1, 2]","[1, 2]"
114,3,"[1, 2]","[1, 2]","[1, 2]"
114,5,[1],"[1, 2]",[1]
114,6,"[1, 2, 3]","[1, 2, 3]","[1, 2, 3]"
127,1,"[1, 2, 3]","[1, 2, 3]","[1, 2, 3]"
127,2,"[1, 2, 3]","[1, 2]","[1, 2, 3]"
127,5,"[1, 2, 3, 4]","[1, 2, 3]","[1, 2, 3, 4]"
127,6,"[1, 2]","[1, 2]","[1, 2]"
130,2,"[1, 2, 3, 4, 5]","[1, 5, 2, 3, 4]","[1, 2, 3, 4, 5]"


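Discrepancies are corrected by inspection. In some cases a receipt stub remained as an artifact of previous data cleaning and is dropped. In other cases, for the same reason, rows had ended up under a neighbouring receipt number; they are reassigned to their own receipt so that the receipt counts agree across data sets.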

In [12]:
# ID: 114, Session: 5, Receipts: [1]	[1, 2]	[1]
# Receipt and Item of this session from each data set, aligned by row position.
session_views = [
    frame.loc[(frame['ID'] == 114) & (frame['Session'] == 5), ['Receipt', 'Item']].reset_index(drop=True)
    for frame in dfs
]
pd.concat(session_views, axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,1.0,lime sparkling water,1,unsweetned lime sparkling water,1.0,lime sparkling water
1,1.0,arnold palmer drink mix,1,lemonade iced tea drink,1.0,lemonade iced tea mix
2,1.0,arnold palmer drink mix,1,lemonade iced tea drink,1.0,lemonade iced tea mix
3,1.0,tea bags,1,black tea bags,1.0,tea bags
4,1.0,fat free milk,1,free fat milk,1.0,fat free milk
5,1.0,coca cola soda,1,diet cola,1.0,cola diet soda
6,1.0,queso,1,dips queso,1.0,queso dip
7,1.0,baked beans,1,baked beans,1.0,baked beans
8,1.0,cheddar cheese,1,cheddar cheese,1.0,cheddar cheese
9,1.0,cheese puffs,1,s cheese puffs,1.0,cheese puffs


In [13]:
# Remove the Receipt 2 rows that only the second data set records for ID 114, Session 5.
second = dfs[1]
stub = (second['ID'] == 114) & (second['Session'] == 5) & (second['Receipt'] == 2)
second.drop(second.index[stub], inplace=True)

In [14]:
# ID: 127, Session: 2, Receipts: [1, 2, 3]	[1, 2]	[1, 2, 3]
# Receipt and Item of this session from each data set, aligned by row position.
session_views = [
    frame.loc[(frame['ID'] == 127) & (frame['Session'] == 2), ['Receipt', 'Item']].reset_index(drop=True)
    for frame in dfs
]
pd.concat(session_views, axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,1,potato salad,1.0,coke diet,1.0,potato salad
1,1,coca cola soda,1.0,coke diet,1.0,cola diet soda
2,1,coca cola soda,1.0,ginger ale,1.0,cola diet soda
3,1,ginger ale soda,1.0,frozen greek yogurt bars,1.0,diet ginger ale
4,1,peanut butter yogurt bars,1.0,frozen greek yogurt bars,1.0,peanut butter green yogurt frozen bars
5,1,peanut butter chocolate yogurt bars,1.0,frozen greek yogurt bars,1.0,peanut butter chocolate green yogurt frozen bars
6,1,brownie yogurt bars,1.0,frozen greek yogurt bars,1.0,brownie greek yogurt frozen bars
7,1,yogurt bars,1.0,pops,1.0,greek yogurt frozen bars
8,1,popsicles,1.0,riced cauliflower,1.0,popsicles
9,1,cauliflower + broccoli rice,1.0,lemon riced,1.0,riced cauliflower broccoli


In [15]:
# Same session in the second data set only, keeping the original row labels.
second = dfs[1]
second.loc[(second['ID'] == 127) & (second['Session'] == 2), ['Receipt', 'Item']]

Unnamed: 0,Receipt,Item
805,1,coke diet
806,1,coke diet
807,1,ginger ale
808,1,frozen greek yogurt bars
809,1,frozen greek yogurt bars
810,1,frozen greek yogurt bars
811,1,frozen greek yogurt bars
812,1,pops
813,1,riced cauliflower
814,1,lemon riced


In [16]:
# Move rows 831-839 of the second data set to Receipt 3, so that ID 127 /
# Session 2 has the same receipts [1, 2, 3] as the other two data sets.
# The row labels are hard-coded from inspection; check that they still point
# at the intended session before overwriting anything.
rows_to_move = dfs[1].loc[831:839]
assert not rows_to_move.empty and ((rows_to_move.ID == 127) & (rows_to_move.Session == 2)).all(), \
    'rows 831-839 of dfs[1] are no longer ID 127 / Session 2'
dfs[1].loc[831:839, 'Receipt'] = 3

In [17]:
# ID: 127, Session: 5, Receipts: [1, 2, 3, 4]	[1, 2, 3]	[1, 2, 3, 4]
# Receipt and Item of this session from each data set, aligned by row position.
session_views = [
    frame.loc[(frame['ID'] == 127) & (frame['Session'] == 5), ['Receipt', 'Item']].reset_index(drop=True)
    for frame in dfs
]
pd.concat(session_views, axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,1,ground beef,1.0,ground beef,1,ground beef
1,1,sausage,1.0,red hot dog,1,sausage
2,1,egg potato salad,1.0,egg and potato salad,1,egg potato salad
3,1,popcorn,1.0,popcorn,1,popcorn
4,1,mangos,1.0,mango,1,mangos
5,1,bananas,1.0,bananas,1,bananas
6,1,taboule salad,1.0,fresh tabboule,1,taboule salad
7,1,black bean salad,1.0,bean salad,1,black bean salad
8,1,stuffed grape leaves,1.0,grapes leaves,1,stuffed grape leaves
9,1,string cheese,1.0,goat cheese,1,string cheese


In [18]:
# Same session in the second data set only, keeping the original row labels.
second = dfs[1]
second.loc[(second['ID'] == 127) & (second['Session'] == 5), ['Receipt', 'Item']]

Unnamed: 0,Receipt,Item
840,1,ground beef
841,1,red hot dog
842,1,egg and potato salad
843,1,popcorn
844,1,mango
845,1,bananas
846,1,fresh tabboule
847,1,bean salad
848,1,grapes leaves
849,1,goat cheese


In [19]:
# Move rows 880-901 of the second data set to Receipt 4, so that ID 127 /
# Session 5 has the same receipts [1, 2, 3, 4] as the other two data sets.
# The row labels are hard-coded from inspection; check that they still point
# at the intended session before overwriting anything.
rows_to_move = dfs[1].loc[880:901]
assert not rows_to_move.empty and ((rows_to_move.ID == 127) & (rows_to_move.Session == 5)).all(), \
    'rows 880-901 of dfs[1] are no longer ID 127 / Session 5'
dfs[1].loc[880:901, 'Receipt'] = 4

In [20]:
# ID: 135, Session: 2, Receipts: [1, 2, 3, 4, 5, 6, 7, 8]	[1, 2, 3, 4, 5, 6, 7, 8, 9]	[1, 2, 3, 4, 5, 6, 7, 8]
# Receipt and Item of this session from each data set, aligned by row position.
session_views = [
    frame.loc[(frame['ID'] == 135) & (frame['Session'] == 2), ['Receipt', 'Item']].reset_index(drop=True)
    for frame in dfs
]
pd.concat(session_views, axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,1.0,orange juice,1.0,orange juice,1,orange juice
1,1.0,hummus,1.0,hummus,1,hummus
2,1.0,hummus,1.0,hummus,1,hummus
3,1.0,ricotta cheese,1.0,ricotta cheese,1,ricotta cheese
4,1.0,crackers,1.0,crackers,1,crackers
5,1.0,cheese,1.0,cheese,1,block cheese
6,1.0,veggie mix,1.0,frozen vegetables,1,frozen vegetables
7,1.0,tomatoes,1.0,tomatoes,1,tomatoes
8,1.0,organic greens salad mix,1.0,lettuce spinach 50 50 blend,1,greens blend
9,2.0,red potatoes,2.0,potatoes,2,red potatoes


In [21]:
# Remove the Receipt 9 rows that only the second data set records for ID 135, Session 2.
second = dfs[1]
stub = (second['ID'] == 135) & (second['Session'] == 2) & (second['Receipt'] == 9)
second.drop(second.index[stub], inplace=True)

Data sets are passed partially through the merge algorithm to test their computability. Note that a divergence higher than 7 is considered infeasible. One receipt with a divergence of 8 is identified and dropped, and typos in another are fixed by inspection.

In [22]:
# Re-import the local merge module so that edits to merge.py are picked up without restarting the kernel.
reload(merge)

<module 'merge' from 'C:\\Users\\Joseph\\Documents\\Projects\\ALab\\CookiesAndCognition\\Scripts\\merge.py'>

In [23]:
# Report the divergence per (ID, Session, Receipt) between the first and second data sets.
# NOTE(review): the bracketed numbers in the output are presumably the unmatched item positions — confirm in merge.py.
merge.divergence([dfs[0], dfs[1]], word_vectors)

ID: 114, Session: 1, Receipt: 1, Div: 0! []
ID: 114, Session: 1, Receipt: 2, Div: 1! [4]
ID: 114, Session: 1, Receipt: 3, Div: 0! []
ID: 114, Session: 2, Receipt: 1, Div: 0! []
ID: 114, Session: 2, Receipt: 2, Div: 1! [7]
ID: 114, Session: 3, Receipt: 1, Div: 3! [7, 13, 14]
ID: 114, Session: 3, Receipt: 2, Div: 0! []
ID: 114, Session: 5, Receipt: 1, Div: 0! []
ID: 114, Session: 6, Receipt: 1, Div: 1! [3]
ID: 114, Session: 6, Receipt: 2, Div: 1! [2]
ID: 114, Session: 6, Receipt: 3, Div: 0! []

ID: 137, Session: 1, Receipt: 1, Div: 6! [1, 4, 10, 20, 23, 24]
ID: 137, Session: 1, Receipt: 2, Div: 5! [5, 6, 7, 8, 9]
ID: 137, Session: 4, Receipt: 1, Div: 6! [9, 10, 14, 15, 16, 17]
ID: 137, Session: 4, Receipt: 2, Div: 0! []

ID: 153, Session: 1, Receipt: 1, Div: 1! [27]
ID: 153, Session: 1, Receipt: 2, Div: 4! [12, 13, 14, 15]
ID: 153, Session: 5, Receipt: 1, Div: 7! [26, 27, 28, 29, 30, 31, 32]
ID: 153, Session: 5, Receipt: 2, Div: 6! [8, 9, 10, 11, 12, 13]

ID: 127, Session: 1, Receipt: 1,

In [24]:
# ID: 135, Session: 2, Receipt: 3, Div: 8! [1, 3, 4, 5, 6, 7, 8, 9]
# Items of this receipt from each data set, aligned by row position.
receipt_items = [
    frame.loc[(frame['ID'] == 135) & (frame['Session'] == 2) & (frame['Receipt'] == 3), 'Item'].reset_index(drop=True)
    for frame in dfs
]
pd.concat(receipt_items, axis=1, ignore_index=True)

Unnamed: 0,0,1,2
0,cheese,sargento cheese,cheese
1,feta cheese,cream,feta cheese
2,mashed potatoes,mashed potatoes,mashed potato
3,toothpaste,,toothpaste
4,toothpaste,,toothpaste
5,soup,,canned soup
6,soup,,canned soup
7,soup,,canned soup
8,romaine lettuce,,romaine hearts
9,almond milk,,almond milk


In [25]:
# Drop ID 135, Session 2, Receipt 3 from every data set.
for frame in dfs:
    infeasible = (frame['ID'] == 135) & (frame['Session'] == 2) & (frame['Receipt'] == 3)
    frame.drop(frame.index[infeasible], inplace=True)

In [26]:
# ID: 135, Session: 2, Receipt: 6, Div: 8! [0, 1, 2, 10, 14, 15, 16, 17]
# Items of this receipt from each data set, aligned by row position.
receipt_items = [
    frame.loc[(frame['ID'] == 135) & (frame['Session'] == 2) & (frame['Receipt'] == 6), 'Item'].reset_index(drop=True)
    for frame in dfs
]
pd.concat(receipt_items, axis=1, ignore_index=True)

Unnamed: 0,0,1,2
0,laundry detergent,fabric softener,laundry detergent softener
1,potato chips,wavy crisps,potato chips
2,potato chips,wavy crisps,potato chips
3,caramel,quino brown bread,caramels
4,basmiti rice,basmiti rice,basmati rice
5,quinoa,white grain bread,quiona
6,whole grain rice,tomato soup,whole grain rice
7,tomato soup,tomato soup,tomato soup
8,tomato soup,tomato soup,tomato soup
9,tomato soup,tomato soup,tomato soup


In [27]:
# Correct misspelled items in the second data set for ID 135, Session 2, Receipt 6.
second = dfs[1]
in_receipt = (second['ID'] == 135) & (second['Session'] == 2) & (second['Receipt'] == 6)
for wrong, right in [('quino brown bread', 'quinoa brown bread'), ('sweetner', 'sweetener')]:
    second.loc[in_receipt & (second['Item'] == wrong), 'Item'] = right

In [28]:
# Report the divergence per (ID, Session, Receipt) between the first and third data sets.
# NOTE(review): the bracketed numbers in the output are presumably the unmatched item positions — confirm in merge.py.
merge.divergence([dfs[0], dfs[2]], word_vectors)

ID: 114, Session: 1, Receipt: 1, Div: 0! []
ID: 114, Session: 1, Receipt: 2, Div: 0! []
ID: 114, Session: 1, Receipt: 3, Div: 0! []
ID: 114, Session: 2, Receipt: 1, Div: 0! []
ID: 114, Session: 2, Receipt: 2, Div: 0! []
ID: 114, Session: 3, Receipt: 1, Div: 1! [9]
ID: 114, Session: 3, Receipt: 2, Div: 0! []
ID: 114, Session: 5, Receipt: 1, Div: 0! []
ID: 114, Session: 6, Receipt: 1, Div: 1! [2]
ID: 114, Session: 6, Receipt: 2, Div: 0! []
ID: 114, Session: 6, Receipt: 3, Div: 0! []

ID: 137, Session: 1, Receipt: 1, Div: 0! []
ID: 137, Session: 1, Receipt: 2, Div: 1! [10]
ID: 137, Session: 4, Receipt: 1, Div: 0! []
ID: 137, Session: 4, Receipt: 2, Div: 0! []

ID: 153, Session: 1, Receipt: 1, Div: 0! []
ID: 153, Session: 1, Receipt: 2, Div: 0! []
ID: 153, Session: 5, Receipt: 1, Div: 0! []
ID: 153, Session: 5, Receipt: 2, Div: 0! []

ID: 127, Session: 1, Receipt: 1, Div: 1! [21]
ID: 127, Session: 1, Receipt: 2, Div: 0! []
ID: 127, Session: 1, Receipt: 3, Div: 0! []
ID: 127, Session: 2, Re

In [29]:
# Report the divergence per (ID, Session, Receipt) between the second and third data sets.
# NOTE(review): the bracketed numbers in the output are presumably the unmatched item positions — confirm in merge.py.
merge.divergence([dfs[1], dfs[2]], word_vectors)

ID: 153, Session: 1, Receipt: 1, Div: 1! [27]
ID: 153, Session: 1, Receipt: 2, Div: 4! [12, 13, 14, 15]
ID: 153, Session: 5, Receipt: 1, Div: 7! [26, 27, 28, 29, 30, 31, 32]
ID: 153, Session: 5, Receipt: 2, Div: 6! [8, 9, 10, 11, 12, 13]

ID: 135, Session: 1, Receipt: 1, Div: 1! [10]
ID: 135, Session: 1, Receipt: 2, Div: 2! [1, 10]
ID: 135, Session: 1, Receipt: 3, Div: 4! [4, 11, 12, 13]
ID: 135, Session: 1, Receipt: 4, Div: 4! [2, 3, 6, 7]
ID: 135, Session: 2, Receipt: 1, Div: 0! []
ID: 135, Session: 2, Receipt: 2, Div: 3! [1, 7, 8]
ID: 135, Session: 2, Receipt: 4, Div: 6! [8, 9, 10, 11, 12, 13]
ID: 135, Session: 2, Receipt: 5, Div: 0! []
ID: 135, Session: 2, Receipt: 6, Div: 8! [1, 2, 3, 14, 15, 16, 17, 20]
ID: 135, Session: 2, Receipt: 7, Div: 2! [7, 8]
ID: 135, Session: 2, Receipt: 8, Div: 0! []

ID: 137, Session: 1, Receipt: 1, Div: 6! [1, 4, 10, 20, 23, 24]
ID: 137, Session: 1, Receipt: 2, Div: 6! [5, 6, 7, 8, 9, 10]
ID: 137, Session: 4, Receipt: 1, Div: 6! [9, 10, 14, 15, 16, 17

In [30]:
# ID: 135, Session: 2, Receipt: 6, Div: 8! [1, 2, 3, 5, 15, 16, 17, 18]
# Items of this receipt from each data set, aligned by row position.
receipt_items = [
    frame.loc[(frame['ID'] == 135) & (frame['Session'] == 2) & (frame['Receipt'] == 6), 'Item'].reset_index(drop=True)
    for frame in dfs
]
pd.concat(receipt_items, axis=1, ignore_index=True)

Unnamed: 0,0,1,2
0,laundry detergent,fabric softener,laundry detergent softener
1,potato chips,wavy crisps,potato chips
2,potato chips,wavy crisps,potato chips
3,caramel,quinoa brown bread,caramels
4,basmiti rice,basmiti rice,basmati rice
5,quinoa,white grain bread,quiona
6,whole grain rice,tomato soup,whole grain rice
7,tomato soup,tomato soup,tomato soup
8,tomato soup,tomato soup,tomato soup
9,tomato soup,tomato soup,tomato soup


In [31]:
# Correct the misspelled "quiona" in the third data set for ID 135, Session 2, Receipt 6.
third = dfs[2]
in_receipt = (third['ID'] == 135) & (third['Session'] == 2) & (third['Receipt'] == 6)
third.loc[in_receipt & (third['Item'] == 'quiona'), 'Item'] = 'quinoa'

### Results

In [32]:
# Print the info() summary of each cleaned data set, followed by a blank line.
for frame in dfs:
    frame.info()
    print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 932
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       885 non-null    uint8 
 1   Session  885 non-null    uint8 
 2   Receipt  885 non-null    uint8 
 3   Item     885 non-null    object
dtypes: object(1), uint8(3)
memory usage: 16.4+ KB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 861 entries, 0 to 1107
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       861 non-null    uint8 
 1   Session  861 non-null    uint8 
 2   Receipt  861 non-null    uint8 
 3   Item     861 non-null    object
dtypes: object(1), uint8(3)
memory usage: 16.0+ KB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 888 entries, 0 to 1007
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       888 non-null    uint8 
 1   Session  888 non-null 

In [33]:
# Write each cleaned data set next to its input as "<name>_to_merge.csv", with rows renumbered from 0.
for name, frame in zip(FILES, dfs):
    out_path = f'{DATA_PATH}{name}_to_merge.csv'
    frame.reset_index(drop=True).to_csv(out_path)