In [None]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
from matplotlib_venn import venn3

%matplotlib inline

# Show up to 999 rows when a frame is displayed, so long tables are not truncated.
pd.set_option('display.max_rows', 999)

In [None]:
DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'

# Open the workbook once and read one sheet per person; column index 3 is parsed as dates.
xls = pd.ExcelFile(DATA_PATH + FILE_NAME)
df_max, df_mar, df_sam = (
    pd.read_excel(xls, sheet_name=sheet, parse_dates=[3])
    for sheet in ('Max', 'Maria', 'Samantha')
)

# df_all shares the frame objects above; df_sizes records the row count of each sheet as loaded.
df_all = [df_max, df_mar, df_sam]
df_sizes = [len(df) for df in df_all]

In [None]:
### Standardize Number of Columns

# Max's data set lacks a coupon column, so remove it from the other two sheets.
# The drops are deliberately done in place: df_all holds references to these same
# frame objects, and rebinding df_mar / df_sam would leave df_all pointing at the
# old, un-dropped frames.
# errors='ignore' keeps this cell re-runnable -- without it a second run raises
# KeyError because the column is already gone. A misspelled column name is still
# caught by the column-count assert below.
df_mar.drop(columns='coupon', inplace=True, errors='ignore')
df_sam.drop(columns='Coupon (#)', inplace=True, errors='ignore')

assert df_max.columns.size == df_mar.columns.size == df_sam.columns.size, (
    'sheets have different numbers of columns: '
    f'{df_max.columns.size}, {df_mar.columns.size}, {df_sam.columns.size}'
)

In [None]:
### Standardize Column Names

# Headers are assigned by position, so every sheet must already have its columns
# in this order.
column_names = [
    'ID', 'Session', 'Scan', 'Date',
    'Item', 'ItemMore', 'Uncertain', 'Unknown',
    'Quantity', 'Hit', 'Miss',
    'Category', 'Comment',
]
for df in df_all:
    df.columns = column_names

# All three frames must now expose the same header.
assert all(other.columns.equals(df_max.columns) for other in (df_mar, df_sam))

In [None]:
### Fill NaN values

filled_columns = ['Scan', 'Quantity', 'ItemMore', 'Comment']
for df in df_all:
    # Report how much of the Scan column is missing before it is overwritten with -1.
    scan = df['Scan']
    missing_scans = scan.isna().sum()
    missing_percent = round(missing_scans / scan.shape[0] * 100)
    print(f'Scan {missing_percent}% null')

    # Defaults: -1 marks a missing scan, a missing quantity counts as 1,
    # and the two free-text columns fall back to the empty string.
    df.loc[:, 'Scan'] = scan.fillna(value=-1)
    df.loc[:, 'Quantity'] = df['Quantity'].fillna(value=1)
    df.loc[:, ['ItemMore', 'Comment']] = df[['ItemMore', 'Comment']].fillna(value='')

    assert df[filled_columns].notna().all(axis=None)

In [None]:
### Assign data types

# Typos which produced errors during subsequent data type conversion
df_sam.loc[df_sam.Scan == datetime.datetime(1900, 1, 1, 0, 0), 'Scan'] = 1
df_sam.loc[df_sam.Quantity == '??', 'Quantity'] = 1

int_columns = ['ID', 'Session', 'Scan', 'Quantity']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in df_all:
    # Assign whole columns with df[...] rather than df.loc[:, ...]: since pandas 2.0
    # `.loc[:, cols] = values` writes into the existing arrays and keeps their old
    # dtype where it can, so the cast could be silently lost (e.g. Scan staying
    # float/object). Column assignment replaces the column and keeps the new dtype.
    df[int_columns] = df[int_columns].astype(int)

    # Parse and reduce to datetime.date in one step, so `.dt` is always applied to a
    # real datetime Series. Unparseable values become NaT (errors='coerce').
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.date

    # NOTE(review): in the cells visible here only ItemMore and Comment are NaN-filled,
    # so astype(str) turns missing Item / Category values into the literal string
    # 'nan' -- confirm that is intended.
    df[string_columns] = df[string_columns].astype(str)

    # clean strings
    for col in string_columns:
        df[col] = df[col].str.lower()

In [None]:
### Validate ID
valid_ids = [129, 136, 144, 147, 151, 156, 160, 112, 117, 120,
             128, 134, 143, 146, 150, 154, 159, 110, 115, 119,
             131, 139, 145, 149, 152, 157, 162, 113, 118, 126,
             121, 114, 137, 153, 141, 127, 130, 135, 148, 158]

# Every ID found in any sheet must be one of the known participant IDs.
assert all(df.ID.isin(valid_ids).all() for df in df_all)

# IDs assigned to Max, Maria and Samantha alike, plus the IDs specific to each of them.
ids_shared = {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
ids_assigned_max = {129, 136, 144, 147, 151, 156, 160, 112, 117, 120} | ids_shared
ids_assigned_mar = {128, 134, 143, 146, 150, 154, 159, 110, 115, 119} | ids_shared
ids_assigned_sam = {131, 139, 145, 149, 152, 157, 162, 113, 118, 126} | ids_shared

# IDs actually present in each sheet.
ids_max, ids_mar, ids_sam = (
    set(frame.ID.unique()) for frame in (df_max, df_mar, df_sam)
)

print("Missing participant IDs")
for label, assigned, present in (
    ("Max:", ids_assigned_max, ids_max),
    ("Maria:", ids_assigned_mar, ids_mar),
    ("Samantha:", ids_assigned_sam, ids_sam),
):
    print(label, assigned - present)
# TODO: Replace with assert empty

In [None]:
### Validate Session
# Sessions are numbered 1 through 6; any other value in a sheet fails the check.
valid_sessions = list(range(1, 7))

assert all(df.Session.isin(valid_sessions).all() for df in df_all)

In [None]:
### Validate ReceiptNum (checked through the Scan column)

# Typos identified by discontinuous scan numbers
mar_137_scan_11 = (df_mar.ID == 137) & (df_mar.Scan == 11)
df_mar.loc[mar_137_scan_11, 'Scan'] = 1
mar_130_session_2 = (
    (df_mar.ID == 130)
    & (df_mar.Session == 2)
    & (df_mar.Date == datetime.date(2020, 8, 3))
)
df_mar.loc[mar_130_session_2, 'Scan'] = 1

print('Discontinuous receipt numbers')
# (136, 1, 3) is empty
# (119, 6, 1) is empty
# (145, 3, 1&2) DNE
for df in df_all:
    for pid in df.ID.unique():
        rows_for_pid = df.ID == pid
        for session in df.loc[rows_for_pid, 'Session'].unique():
            # Scan numbers in order of first appearance within this (ID, Session) group.
            scans_seen = list(df.loc[rows_for_pid & (df.Session == session), 'Scan'].unique())
            expected = list(range(1, len(scans_seen) + 1))
            out_of_sequence = scans_seen != expected
            # Groups containing the -1 "missing scan" sentinel are not reported.
            if not out_of_sequence or -1 in scans_seen:
                continue
            print(f'({pid}, {session}):', scans_seen)
    print()

In [None]:
### Validate Date

# Typos found by examining min and max dates (wrong value -> corrected value)
date_typos = {
    datetime.date(2002, 9, 10): datetime.date(2020, 9, 10),
    datetime.date(2002, 9, 21): datetime.date(2020, 9, 21),
    datetime.date(2020, 4, 6): datetime.date(2020, 6, 4),
    datetime.date(2020, 1, 7): datetime.date(2020, 7, 1),
}
for wrong_date, right_date in date_typos.items():
    df_mar.loc[df_mar.Date == wrong_date, 'Date'] = right_date

for df in df_all:
    print(df.Date.dropna().min(), df.Date.dropna().max(), '\n')

# The checks must be wrapped in all(): asserting a bare list comprehension always
# passes, because a non-empty list is truthy regardless of its contents.
earliest_allowed = datetime.date(2020, 5, 1)
latest_allowed = datetime.date(2020, 12, 31)
assert all(earliest_allowed < df.Date.dropna().min() for df in df_all), \
    f'found a date on or before {earliest_allowed}'
assert all(df.Date.dropna().max() < latest_allowed for df in df_all), \
    f'found a date on or after {latest_allowed}'

In [None]:
### Validate Category

In [None]:
# Write each cleaned frame next to the source workbook (row index included, as by default).
clean_outputs = {
    'clean_max.csv': df_max,
    'clean_mar.csv': df_mar,
    'clean_sam.csv': df_sam,
}
for csv_name, frame in clean_outputs.items():
    frame.to_csv(DATA_PATH + csv_name)