In [1]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
from matplotlib_venn import venn3

%matplotlib inline

# Let pandas print up to 999 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 999)

In [2]:
# Location of the workbook that holds one sheet per coder.
data_path = '../Data/'
file_name = 'Max, Samantha, Maria data.xlsx'
xls = pd.ExcelFile(data_path + file_name)

# Read the three sheets in a fixed order (Max, Maria, Samantha); the column at
# position 3 is parsed as a date on load.
df_max, df_mar, df_sam = (
    pd.read_excel(xls, sheet_name = sheet, parse_dates = [3])
    for sheet in ('Max', 'Maria', 'Samantha')
)

# df_all holds references to the same frame objects, so in-place edits made
# through either name are visible through the other.
df_all = [df_max, df_mar, df_sam]
# Row count of each sheet as loaded, in the same order as df_all.
df_sizes = [len(df) for df in df_all]

  warn(msg)


In [3]:
### Standardize number of columns
# The Maria and Samantha sheets carry a coupon column (under different names);
# the Max data set lacks a coupon column. Drop it so all frames share a layout.
# The drop is done in place because df_all holds references to these same
# objects; errors='ignore' keeps the cell re-runnable once the column is gone.
df_mar.drop(columns = 'coupon', inplace = True, errors = 'ignore')
df_sam.drop(columns = 'Coupon (#)', inplace = True, errors = 'ignore')

# A misspelled or renamed coupon column is still caught here, since the
# column counts would then disagree.
assert df_max.columns.size == df_mar.columns.size == df_sam.columns.size, \
    'sheets have differing column counts after dropping the coupon columns'

In [4]:
### Standardize column names
# A single shared schema, applied positionally to every sheet.
column_names = [
    'ID', 'Session', 'ReceiptNum', 'ReceiptDate', 'Item', 'Item2',
    'Uncertain', 'Unknown', 'Quantity', 'Hit', 'Miss', 'Category', 'Comment',
]
for df in df_all:
    df.columns = column_names

# All three frames must now expose exactly the same column index.
assert df_max.columns.equals(df_mar.columns)
assert df_mar.columns.equals(df_sam.columns)

In [5]:
### Fill NaN values
# Replacement value -> columns that receive it.
# A null receipt number is taken to mean that all items of that session came
# from a single receipt, hence the default of 1.
fill_plan = {
    1: ['ReceiptNum', 'Quantity'],
    '': ['Item2', 'Comment'],
}
for df in df_all:
    for fill_value, cols in fill_plan.items():
        df.loc[:, cols] = df[cols].fillna(value = fill_value)

    # None of the filled columns may contain a missing value afterwards.
    assert df[['ReceiptNum', 'Quantity', 'Item2', 'Comment']].notna().all(axis = None)

In [6]:
### Assign data types

# Typos which produced errors during subsequent data type conversion
df_sam.loc[df_sam.ReceiptNum == datetime.datetime(1900, 1, 1, 0, 0), 'ReceiptNum'] = 1
df_sam.loc[df_sam['Quantity'] == '??', 'Quantity'] = 1

integer_columns = ['ID', 'Session', 'ReceiptNum', 'Quantity']
string_columns = ['Item', 'Item2', 'Category', 'Comment']
for df in df_all:
    # Replace the columns (df[cols] = ...) instead of writing through
    # df.loc[:, cols]: a .loc assignment sets values into the existing columns
    # and, depending on the pandas version, keeps their old dtype, which would
    # silently discard the conversion.
    df[integer_columns] = df[integer_columns].astype(pd.Int16Dtype())

    # Unparseable dates become NaT rather than raising.
    df['ReceiptDate'] = pd.to_datetime(df['ReceiptDate'], errors = 'coerce')

    # Convert to str and lower-case; note that astype(str) turns any remaining
    # missing value into the literal string 'nan'.
    for col in string_columns:
        df[col] = df[col].astype(str).str.lower()

    # Guard against the conversion being lost.
    assert all(str(dtype) == 'Int16' for dtype in df[integer_columns].dtypes), \
        'integer columns were not converted to Int16'
    assert pd.api.types.is_datetime64_any_dtype(df['ReceiptDate']), \
        'ReceiptDate was not converted to datetime'

In [7]:
### TODO: Validate ID
# Every ID found in any sheet must be one of the known participant IDs.
valid_ids = [129, 136, 144, 147, 151, 156, 160, 112, 117, 120,
             128, 134, 143, 146, 150, 154, 159, 110, 115, 119,
             131, 139, 145, 149, 152, 157, 162, 113, 118, 126,
             121, 114, 137, 153, 141, 127, 130, 135, 148, 158]

assert all(df['ID'].isin(valid_ids).all() for df in df_all)

### Participant Assignment Verification
# IDs each coder was assigned; the last ten IDs are common to all three sets.
ids_assigned_max = {129, 136, 144, 147, 151, 156, 160, 112, 117, 120, 121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
ids_assigned_mar = {128, 134, 143, 146, 150, 154, 159, 110, 115, 119, 121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
ids_assigned_sam = {131, 139, 145, 149, 152, 157, 162, 113, 118, 126, 121, 114, 137, 153, 141, 127, 130, 135, 148, 158}

print("Missing participant IDs")
# All should be empty: assigned IDs that never appear in the coder's sheet
print("Max:", ids_assigned_max.difference(df_max['ID'].unique()))
print("Maria:", ids_assigned_mar.difference(df_mar['ID'].unique()))
print("Samantha:", ids_assigned_sam.difference(df_sam['ID'].unique()))

# Shared participant verification
max_ids = set(df_max.ID.unique())
mar_ids = set(df_mar.ID.unique())
sam_ids = set(df_sam.ID.unique())

# Pair-wise shared participants (in exactly two sheets) and participants
# present in all three. Parenthesized explicitly: set difference binds tighter
# than intersection, so the grouping is otherwise easy to misread.
max_mar_ids = (max_ids & mar_ids) - sam_ids
max_sam_ids = (max_ids & sam_ids) - mar_ids
mar_sam_ids = (mar_ids & sam_ids) - max_ids
shared_ids = max_ids & mar_ids & sam_ids

# Venn diagram
plt.figure(figsize=(11,11))
v = venn3([max_ids, mar_ids, sam_ids], ('Max', 'Maria', 'Samantha'))

# Region id -> member IDs. The three characters of a region id flag membership
# in (Max, Maria, Samantha) respectively.
region_members = {
    '100': max_ids - mar_ids - sam_ids,
    '110': max_mar_ids,
    '010': mar_ids - max_ids - sam_ids,
    '101': max_sam_ids,
    '111': shared_ids,
    '011': mar_sam_ids,
    '001': sam_ids - max_ids - mar_ids,
}
for region_id, members in region_members.items():
    label = v.get_label_by_id(region_id)
    # get_label_by_id returns None for a region that was not drawn (e.g. an
    # empty one); calling set_text on it would raise AttributeError.
    if label is None:
        continue
    # Sort so the label text is stable; set iteration order is arbitrary.
    label.set_text('\n'.join(str(s) for s in sorted(members)))

plt.savefig('../Output/id_verification_venn_diagram.png')

# Rows of each sheet restricted to the participants coded by all three coders;
# copies, so later edits do not touch the source frames.
df_shared = [df[df['ID'].isin(shared_ids)].copy() for df in df_all]

# Shared participants: {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
print('Max-Mar shared IDs: ', max_mar_ids)
print('Max-Sam shared IDs: ', max_sam_ids)
print('Mar-Sam shared IDs: ', mar_sam_ids)
print('Max-Mar-Sam shared IDs: ', shared_ids)

(129, 1) <IntegerArray>
[1]
Length: 1, dtype: Int16
(129, 2) <IntegerArray>
[1]
Length: 1, dtype: Int16
(129, 3) <IntegerArray>
[1, 2, 3, 4]
Length: 4, dtype: Int16
(129, 4) <IntegerArray>
[1, 2]
Length: 2, dtype: Int16
(129, 5) <IntegerArray>
[1]
Length: 1, dtype: Int16
(136, 1) <IntegerArray>
[1, 2, 4, 5, 6, 7]
Length: 6, dtype: Int16
(136, 2) <IntegerArray>
[1, 2, 3, 4, 5, 6, 7]
Length: 7, dtype: Int16
(136, 4) <IntegerArray>
[1, 2, 3, 4, 5, 6, 7, 8]
Length: 8, dtype: Int16
(136, 5) <IntegerArray>
[1, 2, 3, 4, 5, 6, 7, 8, 9]
Length: 9, dtype: Int16
(121, 1) <IntegerArray>
[1]
Length: 1, dtype: Int16
(121, 2) <IntegerArray>
[1]
Length: 1, dtype: Int16
(121, 3) <IntegerArray>
[1]
Length: 1, dtype: Int16
(121, 4) <IntegerArray>
[1]
Length: 1, dtype: Int16
(121, 5) <IntegerArray>
[1]
Length: 1, dtype: Int16
(121, 6) <IntegerArray>
[1]
Length: 1, dtype: Int16
(114, 1) <IntegerArray>
[1, 2, 3]
Length: 3, dtype: Int16
(114, 2) <IntegerArray>
[1, 2]
Length: 2, dtype: Int16
(114, 3) <Integer

In [None]:
### TODO: Validate Session
# Only session numbers 1 through 6 are accepted, in every sheet.
valid_sessions = list(range(1, 7))

assert all(df['Session'].isin(valid_sessions).all() for df in df_all)

In [None]:
### TODO: Validate ReceiptNum

# TODO: receipt numbers within a (participant, session) pair are expected to be
# contiguous; for now they are only printed for manual inspection (Max only).
# Each line shows an IntegerArray because .unique() on a nullable Int16 column
# returns that extension array type.
for pid in df_max['ID'].unique():
    # Narrow to this participant once, then split by session.
    pid_rows = df_max.loc[df_max['ID'] == pid]
    for session in pid_rows['Session'].unique():
        receipts = pid_rows.loc[pid_rows['Session'] == session, 'ReceiptNum'].unique()
        print(f'({pid}, {session})', receipts)