In [None]:
import numpy as np
import pandas as pd
import datetime
import re

import gensim.downloader as api
# Pre-trained embedding fetched through gensim's downloader API; the
# downstream `compare` function reads it as a module-level global.
EMBEDDING_NAME = "glove-wiki-gigaword-200"
word_vectors = api.load(EMBEDDING_NAME)

# Raise the row cap so long frames are displayed without truncation.
pd.set_option("display.max_rows", 999)

In [None]:
DATA_PATH = '../Data/'

# One cleaned export per file; the first column is the index and the
# column at position 3 is parsed as dates.
df_max, df_mar, df_sam = (
    pd.read_csv(DATA_PATH + file_name, index_col=0, parse_dates=[3])
    for file_name in ('clean_max.csv', 'clean_mar.csv', 'clean_sam.csv')
)

# Keep the frames together so later cells can loop over them,
# and record how many rows each one has.
df_all = [df_max, df_mar, df_sam]
df_sizes = [len(frame) for frame in df_all]

In [None]:
### Assign Data Types
int_columns = ['ID', 'Session', 'Receipt', 'Quantity']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in df_all:
    # Assign whole columns with df[...] rather than df.loc[:, ...]:
    # on pandas >= 2.0 the .loc form writes into the existing column in place
    # and keeps its old dtype, so the casts below could silently not take effect.
    df[int_columns] = df[int_columns].astype(int)
    # NOTE: astype(str) turns missing values into the literal text 'nan';
    # the pre-processing cells further down strip that marker again.
    df[string_columns] = df[string_columns].astype(str)
    # Unparseable dates become NaT (errors='coerce'); only the calendar date is kept.
    df['Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date

In [None]:
### Shared Participants
# IDs that occur in every one of the three data sets.
ids_shared = set.intersection(*(set(frame.ID.unique()) for frame in df_all))

# For each data set, an independent copy restricted to the shared IDs.
df_shared = [frame[frame.ID.isin(ids_shared)].copy() for frame in df_all]

In [None]:
### Compare
def compare(df):
    """Summarise the texts of one row as shared words plus one embedding-derived word.

    Written for ``DataFrame.apply(compare, axis=1)``: despite its name, ``df``
    is a single row (a ``pandas.Series`` holding one text per entry).

    Steps:
      1. Words that occur in every text are kept verbatim, in the order they
         appear in the first text.
      2. If some text has no other words left, only the shared words are
         reported, with a similarity of 1.0.
      3. Otherwise the remaining words that are missing from the global
         ``word_vectors`` vocabulary are discarded; if that empties a text,
         the result is again the shared words with similarity 1.0.
      4. Otherwise the single vocabulary entry most similar to all remaining
         words is appended, and its score (rounded to 3 decimals) is reported.

    Parameters
    ----------
    df : pandas.Series
        Row of texts. Entries that are not strings (for example NaN left by a
        column-wise concat of frames with unequal length) count as empty text.

    Returns
    -------
    pandas.Series
        A copy of the row with 'WordVec' and 'Similarity' added. The row that
        was passed in is left untouched.
    """
    row = df.copy()

    # Whitespace-tokenise every text; anything that is not a string is empty.
    phrases = [text.split() if isinstance(text, str) else [] for text in df]

    # Words common to all texts. Emit them in first-text order (de-duplicated)
    # so the output does not depend on set iteration order.
    shared = set.intersection(*map(set, phrases))
    result = [word for word in dict.fromkeys(phrases[0]) if word in shared]
    remaining = [[word for word in phrase if word not in shared] for phrase in phrases]

    # Only consult the vocabulary when every text still has words; an already
    # empty text stays empty after filtering, so the outcome is the same.
    if all(remaining):
        # `word in word_vectors` works on gensim 3.x and 4.x alike
        # (the `.vocab` attribute was removed in gensim 4).
        remaining = [[word for word in phrase if word in word_vectors] for phrase in remaining]

    # If any text is exhausted, the shared words are the whole answer.
    if not all(remaining):
        row['WordVec'] = ' '.join(result)
        row['Similarity'] = 1.0
        return row

    # Let the embedding pick the one word closest to all remaining words.
    words = [word for phrase in remaining for word in phrase]
    most_similar_key, similarity = word_vectors.most_similar(positive=words, topn=1)[0]
    result.append(most_similar_key)
    row['WordVec'] = ' '.join(result)
    row['Similarity'] = round(similarity, 3)
    return row

In [None]:
### Align Data Sets by hand
# ID 137 and 114 have low variation
df1 = df_shared[0].loc[df_shared[0].ID == 137, ['Item']].copy().reset_index(drop=True)
df3 = df_shared[2].loc[df_shared[2].ID == 137, ['Item']].copy().reset_index(drop=True)

# align by inspecting for proof of concept
df3_aligned = df3.drop([102]).reset_index(drop=True)
df_hand_aligned = pd.concat([df1, df3_aligned], axis=1)
df_hand_aligned.columns = ['Item1', 'Item3']

### Pre-process
for column in df_hand_aligned.columns:
    df_hand_aligned[column] = (
        df_hand_aligned[column]
        .str.replace(r'[()?]', '', regex=True)   # drop brackets and question marks
        .str.replace('/', ' ', regex=False)      # a slash separates words
        # Remove the placeholders only as whole words; a plain substring
        # replace of 'nan' would also mangle real items ('banana' -> 'baa').
        .str.replace(r'\b(?:unknown|nan)\b', '', regex=True)
    )

# The two sides can differ in length after the manual alignment, which leaves
# NaN in the shorter column; treat those cells as empty text.
df_hand_aligned = df_hand_aligned.fillna('')

df_hand_aligned = df_hand_aligned.apply(compare, axis=1)
display(df_hand_aligned)

In [None]:
### Pre-process
# df2 was not defined in any earlier cell, so this cell failed on a fresh
# kernel. It is built here the same way df1 and df3 are built above.
# NOTE(review): assumes the second data set (df_shared[1]) and the same
# participant ID 137 are intended -- confirm.
df2 = df_shared[1].loc[df_shared[1].ID == 137, ['Item']].copy().reset_index(drop=True)

df = pd.concat([df1, df2, df3], axis=1)
df.columns = ['Item1', 'Item2', 'Item3']

for column in df.columns:
    df[column] = (
        df[column]
        .str.replace(r'[()?]', '', regex=True)   # drop brackets and question marks
        .str.replace('/', ' ', regex=False)      # a slash separates words
        # Remove the placeholders only as whole words; a plain substring
        # replace of 'nan' would also mangle real items ('banana' -> 'baa').
        .str.replace(r'\b(?:unknown|nan)\b', '', regex=True)
    )

# Columns of unequal length leave NaN after the concat; treat as empty text.
df = df.fillna('')

df = df.apply(compare, axis=1)
display(df)