In [1]:
import pandas as pd
import datetime

In [2]:
data_path = '../Data/'
file_name = 'Max, Samantha, Maria data.xlsx'
xls = pd.ExcelFile(data_path + file_name)
df_max = pd.read_excel(xls, sheet_name = 'Max', parse_dates = [3])
df_mar = pd.read_excel(xls, sheet_name = 'Maria', parse_dates = [3])
df_sam = pd.read_excel(xls, sheet_name = 'Samantha', parse_dates = [3])

df_all = [df_max, df_mar, df_sam]
df_sizes = [df.shape[0] for df in df_all]

  warn(msg)


In [3]:
# Standize number of columns

# Max data set lacks coupon column; dropping
df_mar.drop(columns = 'coupon', inplace = True)
df_sam.drop(columns = 'Coupon (#)', inplace= True)

assert df_max.columns.size == df_mar.columns.size == df_sam.columns.size

In [4]:
# Standarize column names
column_names = ['ID', 'Session', 'ReceiptNum', 'ReceiptDate', 'Item', 'Item2',
                 'Uncertain', 'Unknown', 'Quantity', 'Hit', 'Miss', 'Category', 'Comment']
for df in df_all:
    df.columns = column_names

assert df_max.columns.equals(df_mar.columns) and df_mar.columns.equals(df_sam.columns)

In [6]:
# Fill NaN values
for df in df_all:
    # when receipt number is null, assume all items came from a single basket that session
    df.loc[:, ['ReceiptNum', 'Quantity']] = df[['ReceiptNum', 'Quantity']].fillna(value = 1)
    df.loc[:, ['Item2', 'Comment']] = df[['Item2', 'Comment']].fillna(value = '')
    
    assert df[['ReceiptNum', 'Quantity', 'Item2', 'Comment']].notna().all(axis = None)

In [7]:
# Assign data types 

# Typos which produced errors during subsequent data type conversion
df_sam.loc[df_sam.ReceiptNum == datetime.datetime(1900, 1, 1, 0, 0), 'ReceiptNum'] = 1
df_sam.loc[df_sam['Quantity'] == '??', 'Quantity'] = 1

string_columns = ['Item', 'Item2', 'Category', 'Comment']
for df in df_all:
    df.loc[:, ['ID', 'Session', 'ReceiptNum', 'Quantity']] = df[['ID', 'Session', 'ReceiptNum', 'Quantity']].astype(pd.Int16Dtype())
    df.loc[:, 'ReceiptDate'] = pd.to_datetime(df['ReceiptDate'], errors = 'coerce')
    df.loc[:, string_columns] = df[string_columns].astype(str)
    
    # clean strings
    for col in string_columns:
        df.loc[:, col] = df[col].str.lower()

In [8]:
# Drop unusable rows
essential_drop_count, unknown_drop_count, duplicate_drop_count, total_drop_count = [], [], [], []
for df in df_all:
    # missing essential values
    essential_drop_count.append(df[['ID', 'Session', 'ReceiptNum', 'Item']].isna().sum().sum())
    df.dropna(subset = ['ID', 'Session', 'ReceiptNum', 'Item'], inplace = True)
    
    # unreliable
    unknown_drop_count.append(df[df['Unknown'] == 'x'].isna().sum().sum())
    df.drop(df[df['Unknown'] == 'x'].index, inplace = True)
    
    # duplicate receipts
    duplicate_drop_count.append(df['Comment'].str.contains(r'duplicate|repeat').sum())
    df.drop(df[df['Comment'].str.contains(r'duplicate|repeat')].index, inplace = True)
    
    assert df[['ID', 'Session', 'ReceiptNum', 'Item']].notna().all(axis = None)
    assert df_max['Unknown'].isna().all()
    assert not df['Comment'].str.contains(r'duplicate|repeat').any()

print('Number of columns dropped due to missing essential columns:', essential_drop_count)
print('Number of columns dropped due to item being marked as "unknown":', unknown_drop_count)
print('Number of duplicate columns dropped:', duplicate_drop_count)

total_drop_count = [sum(c) for c in zip(essential_drop_count, unknown_drop_count, duplicate_drop_count)]
print('Total number of dropped rows:', total_drop_count)

drop_percentage = [round(c[0] / c[1] * 100) for c in zip(total_drop_count, df_sizes)]
print('Drop percentage:', drop_percentage)

Number of columns dropped due to missing essential columns: [0, 0, 0]
Number of columns dropped due to item being marked as "unknown": [267, 322, 265]
Number of duplicate columns dropped: [185, 0, 25]
Total number of dropped rows: [452, 322, 290]
Drop percentage: [21, 7, 9]


In [9]:
# Export clean data sets
df_max.to_csv(data_path + 'clean_max.csv')
df_mar.to_csv(data_path + 'clean_mar.csv')
df_sam.to_csv(data_path + 'clean_sam.csv')