In [1]:
import numpy as np
import pandas as pd
import datetime
import re

import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-200")

import itertools

pd.options.display.max_rows = 999

In [2]:
DATA_PATH = '../Data/'
# One cleaned CSV per source; the first CSV column becomes the index and the
# column at position 3 is parsed as dates on load.
df_max = pd.read_csv(DATA_PATH + 'clean_max.csv', index_col=0, parse_dates=[3])
df_mar = pd.read_csv(DATA_PATH + 'clean_mar.csv', index_col=0, parse_dates=[3])
df_sam = pd.read_csv(DATA_PATH + 'clean_sam.csv', index_col=0, parse_dates=[3])

df_all = [df_max, df_mar, df_sam]
df_sizes = [df.shape[0] for df in df_all]

### Assign Data Types
int_columns = ['ID', 'Session', 'Receipt', 'Quantity', 'Basket']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in df_all:
    # Assign whole columns with df[...] rather than df.loc[:, ...]:
    # `.loc[:, cols] = values` writes into the existing columns and keeps
    # their old dtype, so the casts below would otherwise not take effect.
    df[int_columns] = df[int_columns].astype(int)
    # NOTE: astype(str) turns missing values into the literal string 'nan'.
    df[string_columns] = df[string_columns].astype(str)
    # Unparseable dates become NaT; the rest are reduced to datetime.date.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.date

### Shared Participants
# IDs that occur in all three data sets.
ids_shared = set(df_max.ID.unique()) & set(df_mar.ID.unique()) & set(df_sam.ID.unique())

# Per-source copies restricted to the shared participants (same order as df_all).
df_shared = []
for df in df_all:
    df_shared.append(df[df.ID.isin(ids_shared)].copy())

In [20]:
def preprocess(col):
    """Clean a Series of item descriptions.

    Strips parentheses and question marks, replaces slashes with spaces, and
    removes the placeholder words ``unknown`` and ``nan``.

    Parameters
    ----------
    col : pandas.Series
        Series of strings (accessed through ``.str``).

    Returns
    -------
    pandas.Series
        A new Series with the cleaned text; ``col`` itself is not modified.
        Surrounding whitespace is left as-is.
    """
    result_col = col.str.replace(r'[()?]', '', regex=True)
    result_col = result_col.str.replace('/', ' ', regex=False)
    # Remove the placeholders only when they stand alone as a word. A plain
    # substring replacement would also corrupt real items that merely contain
    # the letters, e.g. "banana" -> "baa".
    result_col = result_col.str.replace(r'\bunknown\b', '', regex=True)
    result_col = result_col.str.replace(r'\bnan\b', '', regex=True)
    return result_col

In [3]:
def compare(df):
    """Summarise one row of phrases as shared words plus a word-vector match.

    Meant to be used as ``frame.apply(compare, axis=1)``: despite its name,
    ``df`` is a single row, i.e. a Series holding one phrase per column.

    Words common to every phrase are collected first. The remaining words
    that exist in the global ``word_vectors`` model are then pooled and passed
    to ``word_vectors.most_similar``; its top hit is appended to the shared
    words and its score is reported as the similarity.

    Parameters
    ----------
    df : pandas.Series
        Row of phrases. Missing values are treated as empty phrases.

    Returns
    -------
    pandas.Series
        The same row with two entries added:
        ``WordVec`` - shared words (in first-phrase order) followed by the
        most similar vocabulary word, space separated;
        ``Similarity`` - score of that word rounded to 3 decimals, or ``1.0``
        when at least one phrase has no in-vocabulary words left after the
        shared words are removed (no vector lookup is done in that case).
    """
    result = []
    # Missing cells become empty phrases so that set() never receives a NaN.
    df_split = df.fillna('').str.split().tolist()

    # return shared words; keep the order of the first phrase so the output
    # does not depend on set iteration order
    shared = set.intersection(*map(set, df_split))
    result.extend(word for word in dict.fromkeys(df_split[0]) if word in shared)
    df_unique = [[word for word in phrase if word not in shared] for phrase in df_split]

    # discard words outside the vocabulary
    # (`in word_vectors` works on gensim 3 and 4; `.vocab` was removed in gensim 4)
    df_in_voc = [[word for word in phrase if word in word_vectors] for phrase in df_unique]

    # if any text is exhausted return
    if not all(df_in_voc):
        df['WordVec'] = ' '.join(result)
        df['Similarity'] = 1.0
        return df

    # use word vectors to average remaining words
    words = [word for text in df_in_voc for word in text]
    most_similar_key, similarity = word_vectors.most_similar(positive=words)[0]
    result.append(most_similar_key)
    df['WordVec'] = ' '.join(result)
    df['Similarity'] = round(similarity, 3)
    return df

In [4]:
def align(df1, df2):
    """Pair up the items of two Series so that total similarity is maximal.

    Steps:
    1. Both inputs are re-labelled 0..n-1 and the shorter one is padded with
       empty strings so they have equal length.
    2. Items that are literally identical are paired first (each item of
       ``df2`` is used at most once).
    3. The leftovers are paired by brute force: every permutation of the
       remaining ``df2`` items is scored by the sum of ``compare``
       similarities and the first best-scoring permutation wins.

    Parameters
    ----------
    df1, df2 : pandas.Series
        Series of item strings. Missing values are treated as ``''``.

    Returns
    -------
    pandas.DataFrame
        Two columns (``0`` from ``df1``, ``1`` from ``df2``). Identical pairs
        come first, then the similarity-matched pairs; each part keeps its own
        0-based index.

    Notes
    -----
    The winning permutation (labels into the padded ``df2``) is printed.
    Step 3 enumerates n! permutations, so it is only practical for a small
    number of unmatched items.
    """
    # Work on positional labels so the label arithmetic below is valid for
    # any incoming index.
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)

    # equalize lengths
    length = max(len(df1), len(df2))
    df1 = df1.reindex(range(length)).fillna('')
    df2 = df2.reindex(range(length)).fillna('')
    assert len(df1) == len(df2)

    # remove identical matches
    result_pairs = []
    df1_dropped = df1.copy()
    df2_dropped = df2.copy()
    for df1_idx, df1_word in df1.items():
        # Literal equality: the item text must not be interpreted as a regex.
        matches = df2_dropped == df1_word
        if matches.any():
            first_match_index = matches.idxmax()
            result_pairs.append((df1_idx, first_match_index))
            df1_dropped = df1_dropped.drop(index=df1_idx)
            df2_dropped = df2_dropped.drop(index=first_match_index)

    left_labels = list(df1_dropped.index)
    right_labels = list(df2_dropped.index)

    if left_labels:
        # Score every (left, right) pair once instead of re-running compare
        # for each row of each permutation.
        pair_similarity = {
            (left, right): compare(
                pd.Series([df1[left], df2[right]], index=['left', 'right'])
            )['Similarity']
            for left in left_labels
            for right in right_labels
        }

        def total_similarity(perm):
            return sum(pair_similarity[pair] for pair in zip(left_labels, perm))

        # find max permutation (max() keeps the first one on ties)
        result_index = max(itertools.permutations(right_labels), key=total_similarity)
    else:
        # Everything was matched identically; nothing left to permute.
        result_index = ()
    print(result_index)

    def paired(labels_left, labels_right):
        # Side-by-side frame of the selected df1 / df2 items, columns 0 and 1.
        return pd.concat([df1.reindex(list(labels_left)).reset_index(drop=True),
                          df2.reindex(list(labels_right)).reset_index(drop=True)],
                         axis=1, ignore_index=True)

    # return concatenated dataframe: identical pairs on top, matched pairs below
    top_index_left, top_index_right = zip(*result_pairs) if result_pairs else ((), ())
    df_top = paired(top_index_left, top_index_right)
    df_bot = paired(left_labels, result_index)

    parts = [part for part in (df_top, df_bot) if not part.empty]
    df_combined = pd.concat(parts) if parts else df_top
    return df_combined

In [33]:
### Align Data Sets by hand
# ID 137 and 114 have low variation
# Cleaned 'Item' column of participant 137, one Series per shared data set,
# each re-labelled 0..n-1.
df1, df2, df3 = (
    preprocess(shared.loc[shared.ID == 137, 'Item'].copy().reset_index(drop=True))
    for shared in df_shared
)

# align by inspecting for proof of concept
# (row 102 of the third data set is dropped before pairing)
df3_aligned = df3.drop([102]).reset_index(drop=True)
df_hand_aligned = pd.concat({'Item1': df1, 'Item3': df3_aligned}, axis=1)

# add the WordVec / Similarity columns row by row
df_hand_aligned = df_hand_aligned.apply(compare, axis=1)
display(df_hand_aligned)

Unnamed: 0,Item1,Item3,WordVec,Similarity
0,,,,1.0
1,cleaning wipes,glass wipes,wipes kitchen,0.621
2,cleaning spray,spray bottle,spray bottles,0.666
3,,,,1.0
4,book,children's book,book,1.0
5,slippers,slippers,slippers,1.0
6,tissues,tissues,tissues,1.0
7,tape,tape,tape,1.0
8,video creator,video maker kit,video manufacturer,0.66
9,pickles dill,pickles dill,pickles dill,1.0


In [35]:
### Test hand alignment against algorithm

# To list the available baskets for this participant:
#print(df_shared[0].loc[df_shared[0].ID == 137, 'Basket'].unique())
#print(df_shared[2].loc[df_shared[2].ID == 137, 'Basket'].unique())

# Cleaned items of participant 137, basket 1, from the first and third data set.
df1, df3 = (
    preprocess(
        shared.loc[(shared.ID == 137) & (shared.Basket == 1), 'Item']
        .copy()
        .reset_index(drop=True)
    )
    for shared in (df_shared[0], df_shared[2])
)

display(align(df1, df3))

(2, 1, 4, 8, 10, 13)


Unnamed: 0,0,1
0,,
1,,
2,slippers,slippers
3,tissues,tissues
4,tape,tape
5,pickles dill,pickles dill
6,carrots,carrots
7,coffee beans,coffee beans
8,peanuts,peanuts
0,cleaning wipes,spray bottle
