In [1]:
import numpy as np
import pandas as pd
import datetime
import re

import gensim.downloader as api
# Load the pre-trained "glove-wiki-gigaword-200" vectors through gensim's downloader API.
word_vectors = api.load("glove-wiki-gigaword-200")

# Allow up to 999 rows to be rendered before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 999)

In [2]:
DATA_PATH = '../Data/'

# Read the three cleaned CSV files with identical options: the first column
# becomes the index and column 3 is parsed as dates.
df_max, df_mar, df_sam = (
    pd.read_csv(DATA_PATH + csv_name, index_col=0, parse_dates=[3])
    for csv_name in ('clean_max.csv', 'clean_mar.csv', 'clean_sam.csv')
)

# Keep the frames together for the per-data-set loops below, plus their row counts.
df_all = [df_max, df_mar, df_sam]
df_sizes = [len(frame) for frame in df_all]

In [3]:
### Assign Data Types
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
int_columns = ['ID', 'Session', 'Receipt', 'Quantity']
for df in df_all:
    # Assign through df[...] instead of df.loc[:, ...]: since pandas 2.0 the
    # .loc form first tries to write the new values into the existing column,
    # which can keep the old dtype and silently undo the cast. df[...] = ...
    # replaces the columns, so the new dtype always sticks. The frame object
    # itself is still modified in place, so df_max/df_mar/df_sam see the change.
    df[int_columns] = df[int_columns].astype(int)
    # Unparseable dates become NaT (errors='coerce'); keep only the calendar date.
    df['Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date
    # NOTE: astype(str) turns missing values into the literal text 'nan'.
    df[string_columns] = df[string_columns].astype(str)

In [4]:
### Shared Participants
# IDs that occur in all three data sets.
ids_shared = set(df_max.ID.unique()).intersection(df_mar.ID.unique(), df_sam.ID.unique())

# For every data set keep an independent copy of the rows belonging to those IDs.
df_shared = [frame[frame.ID.isin(ids_shared)].copy() for frame in df_all]

In [5]:
### Align Data Sets by hand
# ID 137 and 114 have low variation; participant 137 is used here.
# Pull that participant's 'Item' column out of each shared data set,
# renumbering the rows from 0 so the frames can be compared side by side.
df1, df2, df3 = (
    shared.loc[shared.ID == 137, ['Item']].copy().reset_index(drop=True)
    for shared in df_shared[:3]
)

# Proof of concept: row 102 of the third set was picked by inspection and is
# dropped before the first and third sets are placed next to each other.
df3 = df3.drop(index=[102]).reset_index(drop=True)
df_hand_aligned = pd.concat(
    [df1.rename(columns={'Item': 'Item1'}), df3.rename(columns={'Item': 'Item3'})],
    axis=1,
)

In [6]:
### Pre-process
def _clean_item_text(items):
    """Strip punctuation and placeholder tokens from a Series of item strings.

    Parameters
    ----------
    items : pandas.Series
        Item descriptions (strings).

    Returns
    -------
    pandas.Series
        A new Series with brackets/question marks removed, slashes turned
        into spaces, and the placeholders 'unknown' and 'nan' removed.
    """
    return (
        items
        .str.replace(r'[()?]', '', regex=True)
        .str.replace('/', ' ', regex=False)
        .str.replace('unknown', '', regex=False)
        # 'nan' is the text form of a missing value. Remove it only as a whole
        # word: a plain substring replace would also damage real words such
        # as 'banana' or 'financial'.
        .str.replace(r'\bnan\b', '', regex=True)
    )


# Apply the same cleaning to both aligned item columns.
for column in ['Item1', 'Item3']:
    df_hand_aligned[column] = _clean_item_text(df_hand_aligned[column])

In [7]:
### Compare
def compare(df):
    """Merge the two aligned item descriptions of one row into a single label.

    Words occurring in both descriptions are kept verbatim, in ``Item1``
    order. If both descriptions still have unmatched words that the embedding
    model knows, the top hit of ``word_vectors.most_similar`` for those words
    combined is appended.

    Parameters
    ----------
    df : pandas.Series or mapping
        A single row (the name is kept for compatibility) providing the keys
        ``'Item1'`` and ``'Item3'``. Non-string values such as NaN/None are
        treated as empty descriptions.

    Returns
    -------
    str
        Space-separated shared words, optionally followed by one
        embedding-derived word; ``''`` when nothing can be derived.
    """
    def _tokens(value):
        # Missing cells (NaN/None) are not strings and contribute no words.
        return value.split() if isinstance(value, str) else []

    item1_split = _tokens(df['Item1'])
    item2_split = _tokens(df['Item3'])

    # shared words are copied to the result and removed from both sides
    result = [word for word in item1_split if word in item2_split]
    shared = set(result)
    rest1 = [word for word in item1_split if word not in shared]
    rest2 = [word for word in item2_split if word not in shared]

    # if either item is already exhausted there is nothing to average
    if not rest1 or not rest2:
        return ' '.join(result)

    # discard words outside the vocabulary; `word in word_vectors` works on
    # gensim 3.x and 4.x, whereas the `.vocab` attribute was removed in 4.0
    item1 = [word for word in rest1 if word in word_vectors]
    item2 = [word for word in rest2 if word in word_vectors]
    if not item1 or not item2:
        return ' '.join(result)

    # use word vectors to average the remaining words; keep only the best match
    wv_result = word_vectors.most_similar(positive=[*item1, *item2], topn=1)
    if wv_result:
        most_similar_key, _ = wv_result[0]
        result.append(most_similar_key)
    return ' '.join(result)
    
# Row-wise apply (axis=1): compare receives each row and returns its merged label,
# which is stored in the new 'WordVec' column.
df_hand_aligned['WordVec'] = df_hand_aligned.apply(compare, axis=1)
# Last expression of the cell: displays the frame including the new column.
df_hand_aligned

Unnamed: 0,Item1,Item3,WordVec
0,,,
1,cleaning wipes,glass wipes,wipes kitchen
2,cleaning spray,spray bottle,spray bottles
3,,,
4,book,children's book,book
5,slippers,slippers,slippers
6,tissues,tissues,tissues
7,tape,tape,tape
8,video creator,video maker kit,video manufacturer
9,pickles dill,pickles dill,pickles dill
