In [None]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
from matplotlib_venn import venn3

%matplotlib inline

In [None]:
# Load each coder's sheet from the shared workbook.
data_path = '../Data/Max, Samantha, Maria data.xlsx'

# Open the workbook once and close it deterministically when done; a bare
# pd.ExcelFile(...) keeps the file handle open for the life of the kernel.
with pd.ExcelFile(data_path) as xls:
    # parse_dates=[3]: the fourth column of every sheet is parsed as a date.
    max_df = pd.read_excel(xls, sheet_name='Max', parse_dates=[3])
    mar_df = pd.read_excel(xls, sheet_name='Maria', parse_dates=[3])
    sam_df = pd.read_excel(xls, sheet_name='Samantha', parse_dates=[3])

# Later cells iterate over this list and mutate the frames in place.
all_df = [max_df, mar_df, sam_df]

In [None]:
# Set logic for the venn diagram: which IDs each coder recorded, and how they overlap.
max_ids = set(max_df.ID.unique())
mar_ids = set(mar_df.ID.unique())
sam_ids = set(sam_df.ID.unique())

# IDs shared by exactly two coders, and by all three.
# Parentheses are explicit because `-` binds tighter than `&` in Python.
max_mar_ids = (max_ids & mar_ids) - sam_ids
max_sam_ids = (max_ids & sam_ids) - mar_ids
mar_sam_ids = (mar_ids & sam_ids) - max_ids
max_mar_sam_ids = max_ids & mar_ids & sam_ids

print('Max-Mar Cross IDs: ', max_mar_ids)
print('Max-Sam Cross IDs: ', max_sam_ids)
print('Mar-Sam Cross IDs: ', mar_sam_ids)
print('Max-Mar-Sam Cross IDs: ', max_mar_sam_ids)

plt.figure(figsize=(11,11))
v = venn3([max_ids, mar_ids, sam_ids], ('Max', 'Maria', 'Samantha'))

# Region id -> IDs to list inside that region. The three digits flag membership
# in (Max, Maria, Samantha), in the order the sets were passed to venn3.
region_ids = {
    '100': max_ids - mar_ids - sam_ids,
    '110': max_mar_ids,
    '010': mar_ids - max_ids - sam_ids,
    '101': max_sam_ids,
    '111': max_mar_sam_ids,
    '011': mar_sam_ids,
    '001': sam_ids - max_ids - mar_ids,
}

# Replace the default count labels with the actual IDs.
for region, ids in region_ids.items():
    label = v.get_label_by_id(region)
    # get_label_by_id returns None for a region that was not drawn (e.g. an
    # empty one); skip it instead of failing on None.set_text.
    if label is not None:
        label.set_text('\n'.join(str(s) for s in ids))

# plt.savefig('../Output/cross-ref_venn_diagram.png')

In [None]:
# Build the columns for the (ID, BasketNum, ItemNum) multi-index,
# where BasketNum is a flattening of (TP-Date, RecNum) per ID.
#
# Each frame is modified in place (column assignment / drop(inplace=True)) so
# that max_df, mar_df and sam_df, which later cells reference by name, see the
# new columns. Note: this cell drops 'TP-Date' and 'RecNum' at the end, so it
# cannot be re-run without reloading the data.
for df in all_df:
    # When the receipt number is null, assume all items came from a single
    # basket that day. Assign the result back rather than calling
    # fillna(inplace=True) on df['RecNum']: that chained in-place call does not
    # update df when pandas Copy-on-Write is active.
    df['RecNum'] = df['RecNum'].fillna(1).astype('int32')

    # BasketNum: per ID, number the unique (TP-Date, RecNum) pairs starting
    # at 1. sort=False makes groupby yield keys in order of first appearance,
    # so dates are visited in first-seen order and receipts in first-seen order
    # within each date. Rows whose ID or TP-Date is missing fall in no group
    # and keep <NA>.
    # Assumes the row index is unique (the default after read_excel) -- TODO confirm.
    basket_num = pd.Series(pd.NA, index=df.index, dtype='Int64')
    for id_num, id_rows in df.groupby('ID', sort=False):
        basket_counter = 0
        for tp_date, date_rows in id_rows.groupby('TP-Date', sort=False):
            for rec_num, basket_rows in date_rows.groupby('RecNum', sort=False):
                basket_counter += 1
                basket_num.loc[basket_rows.index] = basket_counter
    df['BasketNum'] = basket_num

    # ItemNum: 1-based position of each row within its (ID, BasketNum) basket,
    # in row order. Rows without a basket number get <NA>.
    item_num = df.groupby(['ID', 'BasketNum'], sort=False).cumcount() + 1
    df['ItemNum'] = item_num.where(df['BasketNum'].notna()).astype('Int64')

    # The raw date / receipt columns are now encoded in BasketNum.
    df.drop(columns=['TP-Date', 'RecNum'], inplace=True)

In [None]:
# Promote the three key columns to a MultiIndex. The change is made in place so
# the frames held in all_df (max_df, mar_df, sam_df) are the ones re-indexed.
index_levels = ['ID', 'BasketNum', 'ItemNum']
for df in all_df:
    df.set_index(keys=index_levels, inplace=True)

In [None]:
# Sort the multi-index in prep for comparisons.
for df in all_df:
    # Sort in place: `df = df.sort_index()` would only rebind the loop variable
    # to a sorted copy and leave max_df / mar_df / sam_df unsorted.
    df.sort_index(inplace=True)
    # Sanity check: should print True for every frame.
    print(df.index.is_monotonic_increasing)

In [None]:
# Compare the (ID, BasketNum, ItemNum) keys of IDs that several coders share.

# .loc does not accept a set as an indexer (pandas 2.x raises TypeError),
# so turn the shared-ID sets into lists first.
mar_sam_id_list = list(mar_sam_ids)
max_mar_sam_id_list = list(max_mar_sam_ids)

# Two coders: keys present in exactly one of Maria's / Samantha's frames.
mar_shared_idx = mar_df.loc[mar_sam_id_list].index
sam_shared_idx = sam_df.loc[mar_sam_id_list].index
mar_sam_cross = mar_shared_idx.symmetric_difference(sam_shared_idx).tolist()

# Three coders: keys NOT present in all three frames, i.e. (A | B | C) - (A & B & C).
# A chained symmetric difference minus the intersection would keep only keys
# found in exactly one frame and miss keys that two coders have but the third lacks.
max_idx = max_df.loc[max_mar_sam_id_list].index
mar_idx = mar_df.loc[max_mar_sam_id_list].index
sam_idx = sam_df.loc[max_mar_sam_id_list].index
in_any = max_idx.union(mar_idx).union(sam_idx)
in_all = max_idx.intersection(mar_idx).intersection(sam_idx)
max_mar_sam_cross = set(in_any.tolist()) - set(in_all.tolist())

print('Mar-Sam Index Incongruities\n', mar_sam_cross)
# print('Max-Mar-Sam Index Incongruities\n', max_mar_sam_cross)

In [None]:
# Tidy up and begin verification: spot-check one basket that Samantha and
# Maria both recorded, side by side.

check_key = (148, 3)  # (ID, BasketNum) of the basket being inspected
check_cols = ['Item (modifier)', 'NumP']

sam_basket = sam_df.loc[check_key, check_cols]
mar_basket = mar_df.loc[check_key, check_cols]

# Outer join on ItemNum so rows present for only one coder still show up.
print(sam_basket.join(mar_basket, how='outer', lsuffix='_s', rsuffix='_m'))

# skipna=False keeps a missing NumP visible as NaN in the total (as the
# builtin sum() did) instead of silently ignoring it.
sam_total = sam_basket['NumP'].sum(skipna=False)
mar_total = mar_basket['NumP'].sum(skipna=False)
print(f'Sam total items: {sam_total}, Mar total items: {mar_total}')
# print(mar_df.loc[(148, 3), 'Item (modifier)'].value_counts(), sam_df.loc[(148, 3), 'Item (modifier)'].value_counts())