In [1]:
import numpy as np
import pandas as pd
import datetime
import re

import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-200")

pd.options.display.max_rows = 999

In [2]:
DATA_PATH = '../Data/'
df_max = pd.read_csv(DATA_PATH + 'clean_max.csv', index_col=0, parse_dates=[3])
df_mar = pd.read_csv(DATA_PATH + 'clean_mar.csv', index_col=0, parse_dates=[3])
df_sam = pd.read_csv(DATA_PATH + 'clean_sam.csv', index_col=0, parse_dates=[3])

df_all = [df_max, df_mar, df_sam]
df_sizes = [df.shape[0] for df in df_all]

In [3]:
### Assign Data Types 
int_columns = ['ID', 'Session', 'Receipt', 'Quantity']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in df_all:
    df.loc[:, int_columns] = df[int_columns].astype(int)
    df.loc[:, string_columns] = df[string_columns].astype(str)
    df.loc[:, 'Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date

In [4]:
### Shared Participants
ids_shared = set(df_max.ID.unique()) & set(df_mar.ID.unique()) & set(df_sam.ID.unique())

df_shared = []
for df in df_all:
    df_shared.append(df[df.ID.isin(ids_shared)].copy())

In [51]:
### Compare
def compare(df):
    result = []
    df_split = df.str.split()
    
    # return shared words
    shared_words = list(set.intersection(*map(set, df_split)))
    if shared_words:
        for word in shared_words:
            result.append(word)
        df_unique = [[word for word in phrase if word not in shared_words] for phrase in df_split]
    else:
        df_unique = df_split
    
    # discard word outside vocabulary
    df_in_voc = [[word for word in phrase if word in word_vectors.vocab] for phrase in df_unique]
    
    # if any text is exhausted return
    if not all(df_in_voc):
        return ' '.join(result)
    
    # use word vectors to average remaining words
    words = [word for text in df_in_voc for word in text]
    wv_result = word_vectors.most_similar(positive=[*words])
    most_similar_key, similarity = wv_result[0]  # look at the first match
    result.append(most_similar_key)
    return ' '.join(result)

In [53]:
### Align Data Sets by hand
# ID 137 and 114 have low variation
df1 = df_shared[0].loc[df_shared[0].ID == 137, ['Item']].copy().reset_index(drop=True)
df3 = df_shared[2].loc[df_shared[2].ID == 137, ['Item']].copy().reset_index(drop=True)

# align by inspecting for proof of concept
df3_aligned = df3.drop([102]).reset_index(drop=True)
df_hand_aligned = pd.concat([df1, df3_aligned], axis=1)
df_hand_aligned.columns = ['Item1', 'Item3']

### Pre-process
df_hand_aligned['Item1'] = df_hand_aligned['Item1'].str.replace(r'[()?]', '', regex=True)
df_hand_aligned['Item1'] = df_hand_aligned['Item1'].str.replace(r'/', ' ', regex=False)
df_hand_aligned['Item1'] = df_hand_aligned['Item1'].str.replace(r'unknown', '', regex=False)
df_hand_aligned['Item1'] = df_hand_aligned['Item1'].str.replace(r'nan', '', regex=False)

df_hand_aligned['Item3'] = df_hand_aligned['Item3'].str.replace(r'[()?]', '', regex=True)
df_hand_aligned['Item3'] = df_hand_aligned['Item3'].str.replace(r'/', ' ', regex=False)
df_hand_aligned['Item3'] = df_hand_aligned['Item3'].str.replace(r'unknown', '', regex=False)
df_hand_aligned['Item3'] = df_hand_aligned['Item3'].str.replace(r'nan', '', regex=False)

df_hand_aligned['WordVec'] = df_hand_aligned.apply(compare, axis=1)
display(df_hand_aligned)

Unnamed: 0,Item1,Item3,WordVec
0,,,
1,cleaning wipes,glass wipes,wipes kitchen
2,cleaning spray,spray bottle,spray bottles
3,,,
4,book,children's book,book
5,slippers,slippers,slippers
6,tissues,tissues,tissues
7,tape,tape,tape
8,video creator,video maker kit,video manufacturer
9,pickles dill,pickles dill,dill pickles


In [52]:
### Pre-process
df = pd.concat([df1, df2, df3], axis=1)

df.columns = ['Item1', 'Item2', 'Item3']
df['Item1'] = df['Item1'].str.replace(r'[()?]', '', regex=True)
df['Item1'] = df['Item1'].str.replace(r'/', ' ', regex=False)
df['Item1'] = df['Item1'].str.replace(r'unknown', '', regex=False)
df['Item1'] = df['Item1'].str.replace(r'nan', '', regex=False)

df['Item2'] = df['Item2'].str.replace(r'[()?]', '', regex=True)
df['Item2'] = df['Item2'].str.replace(r'/', ' ', regex=False)
df['Item2'] = df['Item2'].str.replace(r'unknown', '', regex=False)
df['Item2'] = df['Item2'].str.replace(r'nan', '', regex=False)

df['Item3'] = df['Item3'].str.replace(r'[()?]', '', regex=True)
df['Item3'] = df['Item3'].str.replace(r'/', ' ', regex=False)
df['Item3'] = df['Item3'].str.replace(r'unknown', '', regex=False)
df['Item3'] = df['Item3'].str.replace(r'nan', '', regex=False)

df = df.fillna('')

df['WordVec'] = df.apply(compare, axis=1)
display(df)

Unnamed: 0,Item1,Item2,Item3,WordVec
0,,chips tortilla,,
1,cleaning wipes,disinfectant wipes,glass wipes,wipes washing
2,cleaning spray,licensed playland ballpit,spray bottle,sprayed
3,,coloring book,,
4,book,snuggletoes,children's book,
5,slippers,facial tissues,slippers,skin
6,tissues,tape,tissues,tissue
7,tape,studio creator kit,tape,video
8,video creator,kosher dills,video maker kit,dvd
9,pickles dill,penne,pickles dill,pickle
