In [1]:
import numpy as np
import pandas as pd
import datetime
import gc

import itertools
from tqdm.auto import tqdm

import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-50")

pd.options.display.max_rows = 999

In [2]:
DATA_PATH = '../Data/'
SHEETS = ['clean_max.csv', 'clean_mar.csv', 'clean_sam.csv']
dfs = []

for sheet in SHEETS:
    dfs.append(pd.read_csv(DATA_PATH + sheet, index_col=0, parse_dates=[3]))
    
ids_shared = set.intersection(*[set(df.ID.unique()) for df in dfs])
for df in dfs:
    df = df[df.ID.isin(ids_shared)]

# Assign Data Types 
int_columns = ['ID', 'Session', 'Receipt', 'Quantity', 'Basket']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in dfs:
    df.loc[:, int_columns] = df[int_columns].astype('int16')
    df.loc[:, string_columns] = df[string_columns].fillna('').astype(str)
    df.loc[:, 'Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date

In [3]:
def equalize_length(df1, df2):
    df1_length, df2_length = len(df1), len(df2)

    if df1_length > df2_length:
        length_diff = df1_length - df2_length
        df2 = df2.reindex(df2.index.union(list(range(df2_length, df2_length + length_diff))))
    elif df2_length > df1_length:
        length_diff = df2_length - df1_length
        df1 = df1.reindex(df1.index.union(list(range(df1_length, df1_length + length_diff))))

    assert len(df1) == len(df2)
    return df1, df2

In [4]:
def compare(df_row):
    result = []
    df_split = df_row.str.split()
    
    # return shared words
    shared_words = set.intersection(*map(set, df_split))
    if shared_words:
        for word in shared_words:
            result.append(word)
        df_unique = [[word for word in phrase if word not in shared_words] for phrase in df_split]
    else:
        df_unique = df_split
    
    # discard word outside vocabulary
    df_in_voc = [[word for word in phrase if word in word_vectors.vocab] for phrase in df_unique]
    
    # if any text is exhausted return
    if not all(df_in_voc):
        df_row['WordVec'] = ' '.join(result)
        df_row['Similarity'] = 1.0
        return df_row
    
    # use word vectors to average remaining words
    words = [word for text in df_in_voc for word in text]
    wv_result = word_vectors.most_similar(positive=[*words])
    most_similar_key, similarity = wv_result[0]
    result.append(most_similar_key)
    df_row['WordVec'] = ' '.join(result)
    df_row['Similarity'] = round(similarity, 3)
    return df_row

In [5]:
def align(df1, df2):
    df1, df2 = equalize_length(df1, df2)
    
    # remove identical matches
    result_pairs = []
    df1_dropped, df2_dropped = df1.index, df2.index
    
    for df1_idx, df1_word in enumerate(df1):
        matches = df2.reindex(df2_dropped).str.contains(df1_word, regex=False)
        if any(matches):
            first_match_index = matches.idxmax()
            result_pairs.append((df1_idx, first_match_index))
            df1_dropped = df1_dropped.drop(df1_idx)
            df2_dropped = df2_dropped.drop(first_match_index)

    # all permutations of remaining indices
    perms = list(itertools.permutations(df2_dropped))
    print(len(perms), end=' ')
    
    # generate word vectors and similarity
    total_similarity = []
    if len(perms) > 1:
        for p in tqdm(perms, desc="Permutations", leave=False):
            total_similarity.append(
                sum(pd.concat(
                    [df1.reindex(df1_dropped).reset_index(drop=True), 
                     df2.reindex(df2_dropped).reindex(p).reset_index(drop=True)], axis=1).apply(compare, axis=1).Similarity))
            gc.collect()    
        # find max permutation
        result_index = perms[total_similarity.index(max(total_similarity))]
    else:
        result_index = perms[0]
    
    # return concatendated dataframe with word vectors
    top_index_left, top_index_right = zip(*result_pairs)
    bot_index_left, bot_index_right = df1_dropped, result_index

    df_combined = pd.concat([pd.concat([df1.reindex(top_index_left).reset_index(drop=True), 
                                        df2.reindex(top_index_right).reset_index(drop=True)], axis=1, ignore_index=True), 
                             pd.concat([df1.reindex(bot_index_left).reset_index(drop=True), 
                                        df2.reindex(bot_index_right).reset_index(drop=True)], axis=1, ignore_index=True)], ignore_index=True)
    return df_combined

In [6]:
### Align Data Sets by hand
def align_by_hand():
    # ID 137 and 114 have low variation
    df3 = dfs[2].loc[dfs[2].ID == 137, 'Item'].copy().reset_index(drop=True)

    # align by inspecting for proof of concept
    df3_aligned = df3.drop([102]).reset_index(drop=True)
    df_hand_aligned = pd.concat([df1, df3_aligned], axis=1)
    df_hand_aligned.columns = ['Item1', 'Item3']

    df_hand_aligned = df_hand_aligned.apply(compare, axis=1)
    display(df_hand_aligned)
    
#align_by_hand()

In [7]:
### Test hand alignment against algorithm

#12 total baskets
df_final = pd.DataFrame()
for basket in tqdm(range(1,6), desc="Basket"):
    df_final = pd.concat([df_final, align(dfs[0].loc[(dfs[0].ID == 137) & (dfs[0].Basket == basket), 'Item'].reset_index(drop=True),
                                          dfs[2].loc[(dfs[2].ID == 137) & (dfs[2].Basket == basket), 'Item'].reset_index(drop=True))], ignore_index=True)

display(df_final)

Basket:   0%|          | 0/5 [00:00<?, ?it/s]

120 

Permutations:   0%|          | 0/120 [00:00<?, ?it/s]

6 

Permutations:   0%|          | 0/6 [00:00<?, ?it/s]

1 1 24 

Permutations:   0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,0,1
0,,
1,,glass wipes
2,book,children's book
3,slippers,slippers
4,tissues,tissues
5,tape,tape
6,dill pickles,dill pickles
7,penne,penne pasta
8,carrots,carrots
9,peanuts,peanuts


In [8]:
#display(pd.concat([dfs[0].loc[(dfs[0].ID == 137) & (dfs[0].Basket == 7), 'Item'].reset_index(drop=True),
#                   dfs[2].loc[(df[2].ID == 137) & (dfs[2].Basket == 7), 'Item'].reset_index(drop=True)],
#                   axis=1, ignore_index=True))