In [1]:
import pandas as pd
import datetime
import re

In [2]:
DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'

# Open the workbook once and read one sheet per rater. The context manager
# closes the underlying file handle as soon as the three sheets are loaded.
# parse_dates=[3] asks pandas to parse the fourth column (later named 'Date').
with pd.ExcelFile(DATA_PATH + FILE_NAME) as xls:
    df_max = pd.read_excel(xls, sheet_name='Max', parse_dates=[3])
    df_mar = pd.read_excel(xls, sheet_name='Maria', parse_dates=[3])
    df_sam = pd.read_excel(xls, sheet_name='Samantha', parse_dates=[3])

# Shared list used by later cells to apply the same step to every frame.
# It holds references, so in-place changes to df_max/df_mar/df_sam are visible
# through df_all as well.
df_all = [df_max, df_mar, df_sam]

  warn(msg)


In [3]:
### Standardize the number of columns

# Max's data set lacks a coupon column, so drop it from the other two.
# errors='ignore' makes the cell safe to re-run after the columns are already
# gone; a misspelled column name is still caught by the size check below.
df_mar.drop(columns='coupon', inplace=True, errors='ignore')
df_sam.drop(columns='Coupon (#)', inplace=True, errors='ignore')

assert (df_max.columns.size == df_mar.columns.size == df_sam.columns.size)

In [4]:
### Standardize the column names

# TODO: Receipt vs Scan
# Common header applied positionally to all three frames.
column_names = [
    'ID', 'Session', 'Receipt', 'Date',
    'Item', 'ItemMore', 'Uncertain', 'Unknown',
    'Quantity', 'Hit', 'Miss', 'Category', 'Comment',
]

# Column groups reused by the fill / dtype / cleaning cells.
str_columns = ['Item', 'ItemMore', 'Category', 'Comment']
int_columns = ['ID', 'Session', 'Receipt', 'Quantity']

for frame in df_all:
    frame.columns = column_names

# Every frame must now carry exactly the same header.
headers_match = (df_max.columns.equals(df_mar.columns)
                 and df_mar.columns.equals(df_sam.columns))
assert headers_match

In [5]:
### Fill NaN values

# Defaults for missing numeric entries: receipt 0, quantity 1.
numeric_defaults = {'Receipt': 0, 'Quantity': 1}

for df in df_all:
    for col, default in numeric_defaults.items():
        df.loc[:, col] = df[col].fillna(value=default)
    # Missing text becomes an empty string.
    df.loc[:, str_columns] = df[str_columns].fillna(value='')

    # Disabled not-null check, kept for reference:
    #assert df[[*int_columns, *str_columns]].notna().all(axis=None)

In [6]:
# Matches one innermost parenthesised group such as "(2%)"; the capture group
# holds the text between the parentheses. Empty "()" is deliberately not matched.
paren = re.compile(r'\(([^()]+)\)')


def reformat_modifier(text):
    """Move parenthesised modifiers to the front of ``text``.

    Every ``(...)`` group is removed from its position and its content is
    placed, in order of appearance, before the remaining text, joined by single
    spaces. For example ``'milk (2%)'`` becomes ``'2% milk '``. Surrounding
    whitespace is not trimmed here.

    Each group is matched on its own, so text with several groups keeps all of
    its content (a greedy match from the first ``(`` to the last ``)`` would
    swallow everything in between).

    Parameters
    ----------
    text : str
        The string to reformat.

    Returns
    -------
    str
        ``text`` unchanged when it has no non-empty parenthesised group,
        otherwise the modifiers followed by the text with the groups removed.
    """
    modifiers = paren.findall(text)
    if not modifiers:
        return text
    return ' '.join([*modifiers, paren.sub('', text)])

In [7]:
### Assign Data Types

# Typos which produced errors during data type conversion
typo = datetime.datetime(1900, 1, 1, 0, 0)
df_sam.loc[df_sam.Receipt == typo, 'Receipt'] = 1
df_sam.loc[df_sam.Quantity == '??', 'Quantity'] = 1

for df in df_all:
    # Assign whole columns with df[cols] = ... rather than df.loc[:, cols] = ...
    # Since pandas 2.0 the .loc form writes into the existing columns and keeps
    # their old dtype where it can, so the casts below would be silently lost.
    df[int_columns] = df[int_columns].astype('uint8')
    df[str_columns] = df[str_columns].astype(str)
    # Unparseable dates become NaT; .dt.date keeps only the calendar date.
    df['Date'] = pd.to_datetime(df.Date, errors='coerce').dt.date

    # clean strings
    for col in str_columns:
        cleaned = df[col].str.lower()
        # Bring parenthesised modifiers to the front, e.g. "milk (2%)".
        cleaned = cleaned.apply(reformat_modifier)
        # Remaining slashes and parentheses act as word separators.
        cleaned = cleaned.str.replace(r'[/()]', ' ', regex=True)
        cleaned = cleaned.str.replace(r'unknown', '', regex=False)
        # Remove "nan" only as a standalone token (what astype(str) writes for
        # a missing value); a plain substring replace would turn "banana"
        # into "baa".
        cleaned = cleaned.str.replace(r'\bnan\b', '', regex=True)
        cleaned = cleaned.str.replace(r'?', '', regex=False)
        df[col] = cleaned.str.strip()

In [8]:
### Validate ID

# IDs common to all three assignments, followed by each rater's full
# assignment (the common IDs plus ten more).
ids_assigned_all = {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}
ids_assigned_max = ids_assigned_all.union(
    {129, 136, 144, 147, 151, 156, 160, 112, 117, 120})
ids_assigned_mar = ids_assigned_all.union(
    {128, 134, 143, 146, 150, 154, 159, 110, 115, 119})
ids_assigned_sam = ids_assigned_all.union(
    {131, 139, 145, 149, 152, 157, 162, 113, 118, 126})

# Any ID that appears in any assignment is acceptable in any frame.
valid_ids = ids_assigned_max.union(ids_assigned_mar, ids_assigned_sam)

assert all(frame.ID.isin(valid_ids).all() for frame in df_all)

In [9]:
### Validate Session

# Sessions are numbered 1 through 6.
valid_sessions = list(range(1, 7))

assert all(frame.Session.isin(valid_sessions).all() for frame in df_all)

In [10]:
### Validate Receipt

# Typos identified by discontinuous receipt numbers
typo = datetime.date(2020, 8, 3)
mar_137_receipt_11 = (df_mar.ID == 137) & (df_mar.Receipt == 11)
df_mar.loc[mar_137_receipt_11, 'Receipt'] = 1
mar_130_typo_date = (df_mar.ID == 130) & (df_mar.Session == 2) & (df_mar.Date == typo)
df_mar.loc[mar_130_typo_date, 'Receipt'] = 1

print('Discontinuous receipt numbers')
# Known cases, written as (ID, Session, Receipt):
# (136, 1, 3) is an empty receipt on box
# (119, 6, 1) is an empty receipt on box
# (145, 3, 1&2) do not exist
# (153, 6, 1) is labeled 153-6 (receipt number missing)
# 3 different responses
# Maria infers receipt 1
# Sam infers receipt _
# Max drops it entirely
for df in df_all:
    for pid in df.ID.unique():
        in_pid = df.ID == pid
        for session in df.loc[in_pid, 'Session'].unique():
            in_session = in_pid & (df.Session == session)
            # Receipt numbers in order of first appearance; 0 is the
            # placeholder for a missing receipt and is ignored.
            receipts = [number
                        for number in df.loc[in_session, 'Receipt'].unique()
                        if number != 0]
            expected = list(range(1, len(receipts) + 1))
            if receipts != expected:
                print(f'({pid}, {session}):', receipts)
    print()

Discontinuous receipt numbers
(136, 1): [1, 2, 4, 5, 6, 7]

(119, 6): [2, 3, 4, 5]

(145, 3): [3, 4]
(153, 6): [2]



In [11]:
### Validate Date

# Typos found by examining date range (wrong date -> corrected date).
date_typos = {
    datetime.date(2002, 9, 10): datetime.date(2020, 9, 10),
    datetime.date(2002, 9, 21): datetime.date(2020, 9, 21),
    datetime.date(2020, 4, 6): datetime.date(2020, 6, 4),
    datetime.date(2020, 1, 7): datetime.date(2020, 7, 1),
}
for wrong_date, right_date in date_typos.items():
    df_mar.loc[df_mar.Date == wrong_date, 'Date'] = right_date

for df in df_all:
    print(df.Date.dropna().min(), df.Date.dropna().max(), '\n')

# all(...) is required: asserting a bare list only checks that the list is
# non-empty, so the range checks could never fail.
assert all(datetime.date(2020, 5, 1) < df.Date.dropna().min() for df in df_all)
assert all(df.Date.dropna().max() < datetime.date(2020, 12, 31) for df in df_all)

2020-06-04 2020-10-17 

2020-05-06 2020-12-08 

2020-06-04 2020-11-08 



In [12]:
### TODO: Validate Category

In [18]:
### Compare session sizes of shared IDs
# For every shared ID, print the row count per session in each frame (in
# df_all order), followed by the total row count per frame.
for pid in ids_assigned_all:
    id_masks = [frame.ID == pid for frame in df_all]
    for session in valid_sessions:
        sizes = [int((in_id & (frame.Session == session)).sum())
                 for frame, in_id in zip(df_all, id_masks)]
        print(f'ID: {pid}, Session: {session}, Sizes:', *sizes, end=' \n')
    totals = [int(in_id.sum()) for in_id in id_masks]
    # Trailing blank line separates the IDs.
    print(f'ID: {pid}, Total:', *totals, end=' \n\n')

ID: 130, Session: 1, Sizes: 0 0 0 
ID: 130, Session: 2, Sizes: 61 61 61 
ID: 130, Session: 3, Sizes: 60 61 60 
ID: 130, Session: 4, Sizes: 57 57 57 
ID: 130, Session: 5, Sizes: 51 53 51 
ID: 130, Session: 6, Sizes: 0 0 0 
ID: 130, Total: 229 232 229 

ID: 153, Session: 1, Sizes: 38 34 38 
ID: 153, Session: 2, Sizes: 69 58 69 
ID: 153, Session: 3, Sizes: 17 17 17 
ID: 153, Session: 4, Sizes: 30 29 30 
ID: 153, Session: 5, Sizes: 31 26 31 
ID: 153, Session: 6, Sizes: 0 0 0 
ID: 153, Total: 185 164 185 

ID: 135, Session: 1, Sizes: 46 47 46 
ID: 135, Session: 2, Sizes: 62 60 62 
ID: 135, Session: 3, Sizes: 0 0 0 
ID: 135, Session: 4, Sizes: 17 20 17 
ID: 135, Session: 5, Sizes: 58 56 58 
ID: 135, Session: 6, Sizes: 46 46 30 
ID: 135, Total: 229 229 213 

ID: 137, Session: 1, Sizes: 32 31 32 
ID: 137, Session: 2, Sizes: 13 13 13 
ID: 137, Session: 3, Sizes: 31 32 31 
ID: 137, Session: 4, Sizes: 14 15 14 
ID: 137, Session: 5, Sizes: 35 33 36 
ID: 137, Session: 6, Sizes: 15 15 15 
ID: 137, T

In [14]:
# Inconsistent session coding
# ID: 130, Session: 1, Sizes: 0 65 0 -> drop
# ID: 153, Session: 6, Sizes: 0 22 28 -> drop
# ID: 158, Session: 1, Sizes: 0 11 13 -> drop
# Each entry is (frame, ID, Session); the matching rows are removed in place
# so that df_all keeps pointing at the trimmed frames.
sessions_to_drop = [
    (df_mar, 130, 1),
    (df_mar, 153, 6),
    (df_sam, 153, 6),
    (df_mar, 158, 1),
    (df_sam, 158, 1),
]

for frame, pid, session in sessions_to_drop:
    doomed_rows = frame.index[(frame.ID == pid) & (frame.Session == session)]
    frame.drop(doomed_rows, inplace=True)

In [15]:
# Side-by-side view of the 'Item' column for ID 127, session 2 in each frame.
# The index is reset so the three columns align by position, not by row label.
items_side_by_side = [
    frame.loc[(frame.ID == 127) & (frame.Session == 2), 'Item'].reset_index(drop=True)
    for frame in (df_max, df_mar, df_sam)
]
display(pd.concat(items_side_by_side, axis=1))

Unnamed: 0,Item,Item.1,Item.2
0,potato salad,coke diet,potato salad
1,coca cola soda,ginger ale,"cola, diet soda"
2,ginger ale soda,frozen greek yogurt bars,diet ginger ale
3,peanut butter yogurt bars,pops,peanut butter green yogurt frozen bars
4,peanut butter chocolate yogurt bars,riced cauliflower,peanut butter chocolate green yogurt frozen bars
5,brownie yogurt bars,lemon riced,brownie greek yogurt frozen bars
6,yogurt bars,2% milk,greek yogurt frozen bars
7,popsicles,soda polar drink,popsicles
8,cauliflower + broccoli rice,sprite,riced cauliflower broccoli
9,lemon garlic cauliflower rice,spring water,lemon garlic riced cauliflower


In [16]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" after every frame.
for df in df_all:
    df.info()
    print()  # blank line between frames

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         2159 non-null   uint8  
 1   Session    2159 non-null   uint8  
 2   Receipt    2159 non-null   uint8  
 3   Date       1718 non-null   object 
 4   Item       2159 non-null   object 
 5   ItemMore   2159 non-null   object 
 6   Uncertain  37 non-null     object 
 7   Unknown    87 non-null     object 
 8   Quantity   2159 non-null   uint8  
 9   Hit        0 non-null      float64
 10  Miss       0 non-null      float64
 11  Category   2159 non-null   object 
 12  Comment    2159 non-null   object 
dtypes: float64(2), object(7), uint8(4)
memory usage: 160.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4226 entries, 0 to 4323
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         4226 non-null   uint8

In [17]:
# Write each cleaned frame to its own CSV under DATA_PATH. No index argument
# is passed, so to_csv also writes the row index.
clean_outputs = (
    ('clean_max.csv', df_max),
    ('clean_mar.csv', df_mar),
    ('clean_sam.csv', df_sam),
)
for csv_name, frame in clean_outputs:
    frame.to_csv(DATA_PATH + csv_name)