In [1]:
import pandas as pd
import datetime
import re

In [2]:
DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'

# Open the workbook once and read one sheet per coder. The context manager
# closes the underlying file handle as soon as the three sheets are loaded
# (previously the ExcelFile was left open for the life of the kernel).
# parse_dates=[3] asks pandas to parse the fourth column of each sheet as dates.
with pd.ExcelFile(DATA_PATH + FILE_NAME) as xls:
    df_max = pd.read_excel(xls, sheet_name='Max', parse_dates=[3])
    df_mar = pd.read_excel(xls, sheet_name='Maria', parse_dates=[3])
    df_sam = pd.read_excel(xls, sheet_name='Samantha', parse_dates=[3])

# Later cells loop over this list and mutate the frames in place, so the list
# entries must remain the very same objects as df_max / df_mar / df_sam.
df_all = [df_max, df_mar, df_sam]

  warn(msg)


In [3]:
### Standardize the number of columns

# Max's sheet has no coupon column, so remove it from the other two sheets.
# The drop is done in place on purpose: df_all holds references to these
# exact frame objects.
for frame, coupon_col in ((df_mar, 'coupon'), (df_sam, 'Coupon (#)')):
    frame.drop(columns=coupon_col, inplace=True)

# All three frames must now have the same number of columns.
column_counts = [frame.columns.size for frame in (df_max, df_mar, df_sam)]
assert column_counts[0] == column_counts[1] == column_counts[2]

In [4]:
### Standardize the column names

# TODO: Receipt vs Scan
# Positional names applied to every sheet (13 columns each after the coupon
# column was removed).
column_names = [
    'ID', 'Session', 'Receipt', 'Date',
    'Item', 'ItemMore', 'Uncertain', 'Unknown',
    'Quantity', 'Hit', 'Miss', 'Category', 'Comment',
]

# Column groups used by the later fill / type-conversion cells.
str_columns = ['Item', 'ItemMore', 'Category', 'Comment']
int_columns = ['ID', 'Session', 'Receipt', 'Quantity']

for df in df_all:
    df.columns = column_names

# Every frame must carry exactly the same labels in the same order.
assert df_max.columns.equals(df_mar.columns)
assert df_mar.columns.equals(df_sam.columns)

In [5]:
### Fill NaN values

# Default used when a numeric cell is empty: receipt 0, quantity 1.
fill_defaults = {'Receipt': 0, 'Quantity': 1}

for df in df_all:
    for col, default in fill_defaults.items():
        df.loc[:, col] = df[col].fillna(value=default)
    # Empty text cells become empty strings.
    df.loc[:, str_columns] = df[str_columns].fillna(value='')

    # No missing values may remain in any integer or string column.
    checked_columns = int_columns + str_columns
    assert df[checked_columns].notna().all(axis=None)

In [6]:
# One parenthesised group with no nested parentheses; group 1 is the content.
# The previous greedy pattern r'\(.+\)' ran from the first '(' to the last ')',
# so text holding several groups lost everything between them.
paren = re.compile(r'\(([^()]+)\)')

def reformat_modifier(text):
    """Move parenthesised modifiers to the front of ``text``.

    Every ``(...)`` group is removed from its position and its content is
    prepended, in order of appearance, separated by single spaces.
    Whitespace that surrounded a removed group is left untouched, e.g.
    ``'milk (whole)'`` -> ``'whole milk '``.

    Parameters
    ----------
    text : str
        The string to reformat.

    Returns
    -------
    str
        ``text`` unchanged when it holds no non-empty parenthesised group,
        otherwise the modifiers followed by the remaining text.

    Notes
    -----
    For nested parentheses only the innermost group is moved; the outer
    parentheses stay in the text.
    """
    modifiers = paren.findall(text)
    if not modifiers:
        return text
    remainder = paren.sub('', text)
    return ' '.join([*modifiers, remainder])

In [7]:
### Assign Data Types

# Typos which produced errors during data type conversion
typo = datetime.datetime(1900, 1, 1, 0, 0)
df_sam.loc[df_sam.Receipt == typo, 'Receipt'] = 1
df_sam.loc[df_sam.Quantity == '??', 'Quantity'] = 1

for df in df_all:
    # Assign with df[col] = ... rather than df.loc[:, col] = ...: on
    # pandas >= 2.0 the .loc form writes into the existing column in place and
    # can keep the old dtype, silently discarding the cast. Column assignment
    # replaces the column, and still mutates the same frame object that
    # df_all / df_max / df_mar / df_sam refer to.
    for col in int_columns:
        df[col] = df[col].astype('int16')
    for col in str_columns:
        df[col] = df[col].astype(str)
    # Unparseable dates become NaT; the rest are reduced to datetime.date.
    df['Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date

    # clean strings
    for col in str_columns:
        cleaned = df[col].str.lower()
        cleaned = cleaned.apply(reformat_modifier)
        # Character class, so it must be a regex. With regex=False the literal
        # text '[/()]' was searched for and nothing was ever replaced.
        cleaned = cleaned.str.replace(r'[/()]', ' ', regex=True)
        cleaned = cleaned.str.replace('unknown', '', regex=False)
        # Remove only the stand-alone token 'nan' (stringified missing value);
        # a plain substring replace also mangled words such as 'banana'.
        cleaned = cleaned.str.replace(r'\bnan\b', '', regex=True)
        cleaned = cleaned.str.replace('?', '', regex=False)
        df[col] = cleaned

In [8]:
### Validate ID
# IDs assigned to all three coders.
ids_assigned_all = {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}

# IDs assigned to each coder: the shared set plus ten coder-specific IDs.
ids_assigned_max = ids_assigned_all.union(
    {129, 136, 144, 147, 151, 156, 160, 112, 117, 120})
ids_assigned_mar = ids_assigned_all.union(
    {128, 134, 143, 146, 150, 154, 159, 110, 115, 119})
ids_assigned_sam = ids_assigned_all.union(
    {131, 139, 145, 149, 152, 157, 162, 113, 118, 126})

valid_ids = set().union(ids_assigned_max, ids_assigned_mar, ids_assigned_sam)

# Every ID in every frame must be one of the assigned IDs.
for df in df_all:
    assert df.ID.isin(valid_ids).all()

In [9]:
### Validate Session
# Sessions are numbered 1 through 6.
valid_sessions = list(range(1, 7))

for df in df_all:
    assert df.Session.isin(valid_sessions).all()

In [10]:
### Validate Receipt

# Typos identified by discontinuous receipt numbers
typo = datetime.date(2020, 8, 3)
df_mar.loc[(df_mar.ID == 137) & (df_mar.Receipt == 11), 'Receipt'] = 1
df_mar.loc[(df_mar.ID == 130) & (df_mar.Session == 2) & (df_mar.Date == typo), 'Receipt'] = 1

print('Discontinuous receipt numbers')
# Gaps that are known and accepted, as (ID, session, receipt):
# (136, 1, 3) is an empty receipt on box
# (119, 6, 1) is an empty receipt on box
# (145, 3, 1&2) do not exist
# (153, 6, 1) is labeled 153-6, receipt 0
for df in df_all:
    for pid in df.ID.unique():
        pid_rows = df.ID == pid
        for session in df.loc[pid_rows, 'Session'].unique():
            session_rows = pid_rows & (df.Session == session)
            # Receipt numbers in order of first appearance, ignoring 0.
            receipts = [number
                        for number in df.loc[session_rows, 'Receipt'].unique()
                        if number != 0]
            expected = list(range(1, len(receipts) + 1))
            # Report any session whose receipts are not exactly 1, 2, ..., n.
            if receipts != expected:
                print(f'({pid}, {session}):', receipts)
    print()

Discontinuous receipt numbers
(136, 1): [1, 2, 4, 5, 6, 7]

(119, 6): [2, 3, 4, 5]

(145, 3): [3, 4]
(153, 6): [2]



In [11]:
### Validate Date

# Typos found by examining date range
df_mar.loc[df_mar.Date == datetime.date(2002, 9, 10), 'Date'] = datetime.date(2020, 9, 10)
df_mar.loc[df_mar.Date == datetime.date(2002, 9, 21), 'Date'] = datetime.date(2020, 9, 21)
df_mar.loc[df_mar.Date == datetime.date(2020, 4, 6), 'Date'] = datetime.date(2020, 6, 4)
df_mar.loc[df_mar.Date == datetime.date(2020, 1, 7), 'Date'] = datetime.date(2020, 7, 1)

# Show the earliest and latest known date of each frame (missing dates ignored).
for df in df_all:
    known_dates = df.Date.dropna()
    print(known_dates.min(), known_dates.max(), '\n')

# Every known date must fall strictly between 2020-05-01 and 2020-12-31.
# The checks are wrapped in all(): asserting the bare list was always true,
# because a non-empty list is truthy whatever it contains.
assert all(datetime.date(2020, 5, 1) < df.Date.dropna().min() for df in df_all)
assert all(df.Date.dropna().max() < datetime.date(2020, 12, 31) for df in df_all)

2020-06-04 2020-10-17 

2020-05-06 2020-12-08 

2020-06-04 2020-11-08 



In [12]:
### TODO: Validate Category

In [13]:
# Drop duplicate receipts
# A row counts as a duplicate when its comment contains 'duplicate' or 'repeat'.
for df in df_all:
    duplicate_mask = df.Comment.str.contains(r'duplicate|repeat')
    duplicate_drop_count = int(duplicate_mask.sum())
    total_rows = df.shape[0]

    # Drop in place: `df = df.drop(...)` only rebound the loop variable, so the
    # frames in df_all (and the CSVs written from them) kept every duplicate.
    df.drop(index=df.index[duplicate_mask], inplace=True)

    # Share of the rows present before the drop; guard against an empty frame.
    dropped_share = duplicate_drop_count / total_rows if total_rows else 0
    print(f'{duplicate_drop_count} duplicate rows dropped, {dropped_share:.0%}')

187 duplicate rows dropped, 9%
0 duplicate rows dropped, 0%
25 duplicate rows dropped, 1%


In [14]:
### Create Baskets
# Number the baskets of each participant (ID) 1, 2, 3, ... in order of first
# appearance. A basket is one (Session, Receipt, Date) combination; rows of a
# receipt whose Date is missing form one additional basket of their own.
for df in df_all:
    df['Basket'] = None
    date_missing = df.Date.isna()
    for pid in df.ID.unique():
        pid_rows = df.ID == pid
        basket_counter = 0
        for session in df.loc[pid_rows, 'Session'].unique():
            session_rows = pid_rows & (df.Session == session)
            for receipt in df.loc[session_rows, 'Receipt'].unique():
                receipt_rows = session_rows & (df.Receipt == receipt)

                # Skip missing dates here: `df.Date == NaT` never matches, so
                # they used to advance the counter without labelling any row.
                for date in df.loc[receipt_rows, 'Date'].dropna().unique():
                    basket_counter += 1
                    df.loc[receipt_rows & (df.Date == date), 'Basket'] = basket_counter

                # Undated rows get a fresh number and the counter is advanced.
                # Previously they were given `basket_counter + 1` without the
                # increment, so the next basket reused the same number.
                undated_rows = receipt_rows & date_missing
                if undated_rows.any():
                    basket_counter += 1
                    df.loc[undated_rows, 'Basket'] = basket_counter

    df['Basket'] = df['Basket'].astype(int)

In [15]:
# Write each cleaned frame to its own CSV file in the data directory.
for frame, out_name in ((df_max, 'clean_max.csv'),
                        (df_mar, 'clean_mar.csv'),
                        (df_sam, 'clean_sam.csv')):
    frame.to_csv(DATA_PATH + out_name)