In [1]:
import pandas as pd
import datetime

In [2]:
### Load Data

DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'

# Open the workbook once and release the file handle as soon as the three
# sheets have been read, instead of leaving it open for the whole session.
# parse_dates=[3] asks pandas to parse the column at position 3 as dates.
with pd.ExcelFile(DATA_PATH + FILE_NAME) as xls:
    df_max = pd.read_excel(xls, sheet_name = 'Max', parse_dates = [3])
    df_mar = pd.read_excel(xls, sheet_name = 'Maria', parse_dates = [3])
    df_sam = pd.read_excel(xls, sheet_name = 'Samantha', parse_dates = [3])

# Later cells loop over this list; it holds references to the same three
# frames, so in-place edits made through either name are visible through both.
df_all = [df_max, df_mar, df_sam]

  warn(msg)


In [3]:
### Standardize Number of Columns

# Max's data set lacks a coupon column, so remove it from the other two sheets.
# The drops are done in place because df_all refers to these same frame objects.
# errors='ignore' keeps the cell re-runnable once the column is already gone;
# the assertion below still fails if the column counts do not line up.
df_mar.drop(columns = 'coupon', inplace = True, errors = 'ignore')
df_sam.drop(columns = 'Coupon (#)', inplace = True, errors = 'ignore')

assert df_max.columns.size == df_mar.columns.size == df_sam.columns.size, \
    'Sheets have different numbers of columns after dropping the coupon column'

In [4]:
### Standardize Column Names

# One shared header list, applied positionally to every sheet.
column_names = [
    'ID', 'Session', 'Scan', 'Date',
    'Item', 'ItemMore', 'Uncertain', 'Unknown',
    'Quantity', 'Hit', 'Miss',
    'Category', 'Comment',
]
for df in df_all:
    df.columns = column_names

# Compare neighbouring frames pairwise (Max vs Maria, Maria vs Samantha).
assert all(left.columns.equals(right.columns)
           for left, right in zip(df_all, df_all[1:]))

In [5]:
### Fill NaN values

for df in df_all:
    # Report what share of rows has no scan number before it is overwritten.
    scan_na_count = df.Scan.isna().sum()
    scan_na_percent = round(scan_na_count / len(df.Scan) * 100)
    print(f'Scan {scan_na_percent}% null')

    # Missing scan numbers become the sentinel -1; missing quantities become 1.
    for numeric_col, default in (('Scan', -1), ('Quantity', 1)):
        df.loc[:, numeric_col] = df[numeric_col].fillna(value = default)

    # Free-text columns get an empty string instead of NaN.
    text_cols = ['ItemMore', 'Comment']
    df.loc[:, text_cols] = df[text_cols].fillna(value = '')

    # None of the four treated columns may contain a null any more.
    treated = df[['Scan', 'Quantity', 'ItemMore', 'Comment']]
    assert treated.notna().all(axis = None)

Scan 28% null
Scan 50% null
Scan 42% null


In [6]:
### Assign Data Types

# Typos which produced errors during subsequent data type conversion
df_sam.loc[df_sam.Scan == datetime.datetime(1900, 1, 1, 0, 0), 'Scan'] = 1
df_sam.loc[df_sam.Quantity == '??', 'Quantity'] = 1

int_columns = ['ID', 'Session', 'Scan', 'Quantity']
string_columns = ['Item', 'ItemMore', 'Category', 'Comment']
for df in df_all:
    # Columns are replaced with df[col] = ... rather than df.loc[:, col] = ...
    # because, on pandas >= 2.0, the .loc form writes the new values into the
    # existing column and keeps its old dtype, which silently undoes the
    # conversion. df[col] = ... swaps in the converted column, and still
    # mutates the same frame object that df_all refers to.
    for col in int_columns:
        df[col] = df[col].astype(int)

    # Unparseable dates become NaT; the result is stored as datetime.date
    # objects so later cells can compare against datetime.date literals.
    df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce').dt.date

    # clean strings
    # NOTE(review): astype(str) renders any remaining NaN as the text 'nan' --
    # confirm that is acceptable for Item and Category.
    for col in string_columns:
        df[col] = df[col].astype(str).str.lower()

In [7]:
### Validate ID

# Every participant ID accepted in any of the three sheets.
valid_ids = [129, 136, 144, 147, 151, 156, 160, 112, 117, 120,
             128, 134, 143, 146, 150, 154, 159, 110, 115, 119,
             131, 139, 145, 149, 152, 157, 162, 113, 118, 126,
             121, 114, 137, 153, 141, 127, 130, 135, 148, 158]

assert all(frame.ID.isin(valid_ids).all() for frame in df_all)

# IDs that are part of all three assignment sets.
ids_shared = {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}

ids_assigned_max = {129, 136, 144, 147, 151, 156, 160, 112, 117, 120} | ids_shared
ids_assigned_mar = {128, 134, 143, 146, 150, 154, 159, 110, 115, 119} | ids_shared
ids_assigned_sam = {131, 139, 145, 149, 152, 157, 162, 113, 118, 126} | ids_shared

# List, per sheet, the assigned IDs that never occur in its ID column.
print("Missing participant IDs")
for label, assigned, frame in (
    ("Max:", ids_assigned_max, df_max),
    ("Maria:", ids_assigned_mar, df_mar),
    ("Samantha:", ids_assigned_sam, df_sam),
):
    print(label, assigned - set(frame.ID.unique()))
# TODO: Replace with assert empty

Missing participant IDs
Max: {160, 112, 147, 148, 117, 151, 120, 156, 158}
Maria: set()
Samantha: set()


In [8]:
### Validate Session

# Session numbers 1 through 6 are the only accepted values.
valid_sessions = list(range(1, 7))

assert all(frame.Session.isin(valid_sessions).all() for frame in df_all)

In [9]:
### Validate Scan

# Typos identified by discontinuous scan numbers
is_137_scan_11 = (df_mar.ID == 137) & (df_mar.Scan == 11)
df_mar.loc[is_137_scan_11, 'Scan'] = 1

is_130_session_2_aug_3 = (
    (df_mar.ID == 130)
    & (df_mar.Session == 2)
    & (df_mar.Date == datetime.date(2020, 8, 3))
)
df_mar.loc[is_130_session_2_aug_3, 'Scan'] = 1

print('Discontinuous scan numbers')
# (136, 1, 3) is empty
# (119, 6, 1) is empty
# (145, 3, 1&2) DNE
for df in df_all:
    for pid in df.ID.unique():
        rows_of_pid = df.ID == pid
        for session in df.loc[rows_of_pid, 'Session'].unique():
            rows_of_session = rows_of_pid & (df.Session == session)
            scans = list(df.loc[rows_of_session, 'Scan'].unique())

            # Sessions containing the -1 placeholder for a missing scan are skipped.
            if -1 in scans:
                continue

            # In order of first appearance the scan numbers should read 1, 2, ..., k.
            expected = list(range(1, len(scans) + 1))
            if scans != expected:
                print(f'({pid}, {session}):', scans)
    print()

Discontinuous scan numbers
(136, 1): [1, 2, 4, 5, 6, 7]

(119, 6): [2, 3, 4, 5]

(145, 3): [3, 4]



In [10]:
### Validate Date

# Typos found by examining min and max dates (wrong year, or day and month swapped)
date_corrections = [
    (datetime.date(2002, 9, 10), datetime.date(2020, 9, 10)),
    (datetime.date(2002, 9, 21), datetime.date(2020, 9, 21)),
    (datetime.date(2020, 4, 6), datetime.date(2020, 6, 4)),
    (datetime.date(2020, 1, 7), datetime.date(2020, 7, 1)),
]
for wrong_date, right_date in date_corrections:
    df_mar.loc[df_mar.Date == wrong_date, 'Date'] = right_date

# Show the earliest and latest non-null date of each sheet.
for df in df_all:
    known_dates = df.Date.dropna()
    print(known_dates.min(), known_dates.max(), '\n')

# Every non-null date must fall strictly between these two bounds.
# all(...) is required here: asserting on the bare list would always pass,
# because a non-empty list is truthy whatever booleans it contains.
earliest_allowed = datetime.date(2020, 5, 1)
latest_allowed = datetime.date(2020, 12, 31)
assert all(earliest_allowed < df.Date.dropna().min() for df in df_all)
assert all(df.Date.dropna().max() < latest_allowed for df in df_all)

2020-06-04 2020-10-17 

2020-05-06 2020-12-08 

2020-06-04 2020-11-08 



In [11]:
### Validate Category

In [12]:
# Write each cleaned frame to its own CSV file under DATA_PATH.
for frame, csv_name in (
    (df_max, 'clean_max.csv'),
    (df_mar, 'clean_mar.csv'),
    (df_sam, 'clean_sam.csv'),
):
    frame.to_csv(DATA_PATH + csv_name)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         2159 non-null   int32  
 1   Session    2159 non-null   int32  
 2   Scan       2159 non-null   int32  
 3   Date       1718 non-null   object 
 4   Item       2159 non-null   object 
 5   ItemMore   2159 non-null   object 
 6   Uncertain  37 non-null     object 
 7   Unknown    87 non-null     object 
 8   Quantity   2159 non-null   int32  
 9   Hit        0 non-null      float64
 10  Miss       0 non-null      float64
 11  Category   2159 non-null   object 
 12  Comment    2159 non-null   object 
dtypes: float64(2), int32(4), object(7)
memory usage: 185.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4324 entries, 0 to 4323
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         4324 non-null   int32