In [1]:
import re
import datetime

import pandas as pd

In [2]:
%%time
# import Samantha's sheet (selected by SHEET) from the workbook named in FILE_NAME
DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'
SHEET = 'Samantha'

df = pd.read_excel(DATA_PATH + FILE_NAME, sheet_name=SHEET)

# baseline row count; the final summary cell reports total row reduction against it
initial_row_count = df.shape[0]

Wall time: 1.35 s


In [3]:
# overview of the raw sheet: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3138 entries, 0 to 3137
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ID               3138 non-null   int64         
 1   TP-Date          3138 non-null   int64         
 2   RecNum           1822 non-null   object        
 3   RecDate          2509 non-null   datetime64[ns]
 4   Item (modifier)  3069 non-null   object        
 5   Secondary name   135 non-null    object        
 6   Coupon (#)       36 non-null     object        
 7   Uncertain        111 non-null    object        
 8   Unknown          82 non-null     object        
 9   NumPurchased     525 non-null    object        
 10  Hit              27 non-null     float64       
 11  Miss             0 non-null      float64       
 12  FoodCategory     3003 non-null   object        
 13  Comments         284 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(9)

In [4]:
# Standardize column names.
# 'Coupon (#)' goes first: it is rarely used and missing from Max's data set.
df = df.drop(columns=['Coupon (#)'])

# The new names are assigned by position, so their order has to mirror the
# order of the remaining spreadsheet columns.
column_names = [
    'ID', 'Session', 'Receipt', 'Date',
    'Item', 'Item2', 'Uncertain', 'Unknown',
    'Quantity', 'Hit', 'Miss', 'Category', 'Comment',
]
df.columns = column_names

In [5]:
# ID holds participant identification numbers, integers between 110 and 162,
# so an unsigned 8-bit integer is wide enough.
df['ID'] = df['ID'].astype('uint8')

# Participants assigned to all transcribers plus those unique to this data set.
pids_assigned = {121, 114, 137, 153, 141, 127, 130, 135, 148, 158}.union(
    {131, 139, 145, 149, 152, 157, 162, 113, 118, 126}
)

# No participant outside the assigned set may appear in the sheet.
assert set(df['ID'].unique()).issubset(pids_assigned)

In [6]:
# Session is the bookkeeping column for the transcription session of each row.
df['Session'] = df['Session'].astype('uint8')

# Transcription was divided into 6 sessions, numbered 1 through 6.
valid_sessions = list(range(1, 7))

assert df['Session'].isin(valid_sessions).all()

In [7]:
# Receipt is the bookkeeping number of each grocery receipt within a session.
# df.info() showed that the column has missing values; quantify them here.
null_receipt_count = df['Receipt'].isna().sum()
null_receipt_share = null_receipt_count / len(df)
print(f'{null_receipt_count} ({null_receipt_share:.0%}) rows missing Receipt values')

1316 (42%) rows missing Receipt values


In [8]:
# Distinguishing between different receipts is essential to this data set, so
# rows without a Receipt number are unusable and dropped.
# .copy() detaches the filtered frame so the assignments below act on it
# directly instead of on a slice of the original (avoids SettingWithCopyWarning).
df = df[df.Receipt.notna()].copy()

# Data type conversion prompted an error that led to discovery of a typo:
# a cell holding the datetime 1900-01-01 where receipt number 1 was presumably intended.
typo = datetime.datetime(1900, 1, 1, 0, 0)
df.loc[df.Receipt == typo, 'Receipt'] = 1
df.Receipt = df.Receipt.astype('uint8')

# Receipt numbers should be small integers and continuous (1..n) within each
# (participant, session) pair; print every pair that breaks this rule.
for pid in df.ID.unique():
    for session in df.loc[df.ID == pid, 'Session'].unique():
        in_session = (df.ID == pid) & (df.Session == session)
        # Sort before comparing: unique() returns values in order of appearance,
        # and the check is about contiguity, not about row order in the sheet.
        receipt_numbers = sorted(df.loc[in_session, 'Receipt'].unique().tolist())
        if receipt_numbers != list(range(1, len(receipt_numbers) + 1)):
            print(f'({pid}, {session}):', receipt_numbers)

(145, 3): [3, 4]
(153, 6): [2]


In [9]:
# (145, 3, 1&2) do not exist
# (153, 6, 1) is labeled 153-6 (receipt number missing)

In [10]:
# Date column records the purchase date on the receipt, if available.
# Values that cannot be parsed become NaT. normalize() sets the time of day to
# midnight and keeps the datetime64[ns] dtype; the earlier round-trip through
# .dt.date and the unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
df.Date = pd.to_datetime(df.Date, errors='coerce').dt.normalize()

# approximate date range (5/1/2020 - 12/31/2020); missing dates are excluded from the check
assert df.Date.dropna().between(datetime.datetime(2020, 5, 1), datetime.datetime(2020, 12, 31)).all()

In [11]:
# Item holds the grocery description as text in "ITEM (MODIFIER)" format.
# Lower-case it, trim surrounding whitespace and store it as pandas' string dtype.
df['Item'] = df['Item'].str.lower().str.strip().astype('string')

# Most frequent descriptions, with missing values included in the counts.
df['Item'].value_counts(dropna=False).head()

bananas         50
NaN             35
blueberries     24
eggs            22
strawberries    15
Name: Item, dtype: Int64

In [12]:
# Item description is essential data, so unidentifiable items are dropped:
# rows with a missing Item, or whose Item matches a placeholder description.
NULL_ITEM_DESC = r'unknown|n/a|missing'

# Build each mask once. na=False makes a missing Item count as "no match", so
# the placeholder mask contains no NA values and does not overlap item_missing.
item_missing = df.Item.isna()
item_placeholder = df.Item.str.contains(NULL_ITEM_DESC, na=False)

# Take the count and the denominator before filtering, so the percentage is
# relative to the rows that entered this step.
row_count_before = df.shape[0]
null_item_count = item_placeholder.sum() + item_missing.sum()

display(df[item_placeholder])
df = df[~(item_missing | item_placeholder)]
print(f'{null_item_count} ({null_item_count / row_count_before:.0%}) additional rows dropped due to null item')

Unnamed: 0,ID,Session,Receipt,Date,Item,Item2,Uncertain,Unknown,Quantity,Hit,Miss,Category,Comment
2241,127,2,3,2020-07-28,n/a - might just be soda can deposit?,,,x,,,,,"unknown: ""dep sftdk 12pk"""


36 (2%) additional rows dropped due to null item


In [13]:
# Item2 carries an additional description of the grocery; it is too sparse to
# be useful, so the column is removed.
df = df.drop(['Item2'], axis='columns')

In [14]:
# Uncertain marks rows transcribed with low confidence; show the flagged rows.
display(df.loc[df['Uncertain'].notna()])

Unnamed: 0,ID,Session,Receipt,Date,Item,Uncertain,Unknown,Quantity,Hit,Miss,Category,Comment
110,139,1,2,NaT,salad dressing,x,,2.0,,,Condiment,"uncertain: ""newmans own"""
394,145,1,2,2020-07-23,cherry crumble?,x,,,,,Dessert,"unknown: ""nat bak om crumb chy"""
395,145,1,2,2020-07-23,apple crumble?,x,,,,,Dessert,"unknown: ""nat bak om crumb apl"""
397,145,1,2,2020-07-23,green onions,x,,,,,Vegetable,"uncertain: ""gh onion/chv veggie"""
399,145,1,2,2020-07-23,nuts (unsalted),x,,,,,Snack,"uncertain: ""plntr unslt dry rstd"""
412,145,1,3,2020-07-17,cherry crumble?,x,,,,,Dessert,"unknown: ""nat bak om crumb chy"""
413,145,1,3,2020-07-17,apple crumble?,x,,,,,Dessert,"unknown: ""nat bak om crumb apl"""
468,145,2,3,2020-08-10,onions,x,,,,,Vegetable,uncertain: can't see text
477,145,3,3,NaT,salad greens blend,x,,,,,Salad,uncertain: can’t see text
488,145,3,3,NaT,unsweetened vanilla plant milk?,x,,,,,Drink,uncertain: can't see text


In [15]:
# Transcription quality of the uncertain rows seems acceptable, so they stay.
# Strip the literal question marks from the descriptions first, then retire
# the flag column.
df['Item'] = df['Item'].str.replace(r'?', '', regex=False)
df = df.drop(['Uncertain'], axis='columns')

In [16]:
# Unknown marks rows transcribed with very low confidence; show the flagged rows.
display(df.loc[df['Unknown'].notna()])

Unnamed: 0,ID,Session,Receipt,Date,Item,Unknown,Quantity,Hit,Miss,Category,Comment
57,131,4,1,2020-09-15,bakery item,x,,,,,"don't know: ""Bakery - 15.99"""
67,131,5,1,2020-09-15,bakery,x,,,,,"don't know: ""Bakery - 15.99"""
390,145,1,2,2020-07-23,unsweetened vanilla plant milk,x,,,,Drink,"unknown: ""silk unswt vanla org"""
407,145,1,3,2020-07-17,unsweetened vanilla plant milk,x,,,,Drink,"unknown: ""silk unswt vanla org"""
439,145,2,2,2020-08-01,unsweetened vanilla plant milk,x,,,,Drink,"unknown: ""silk unswt vanla org"""
470,145,2,3,2020-08-10,unsweetened vanilla plant milk,x,,,,Drink,"unknown: ""silk unswt vanla org"""
490,145,3,3,NaT,wheat bran,x,,,,Grain,unknown: can't see text
554,145,4,2,2020-09-10,unsweetened vanilla plant milk,x,,,,Drink,"unknown: ""silk unswt vanilla org"""
936,113,2,1,2020-06-19,spinach,x,,,,Vegetable,
2714,135,2,9,NaT,,x,,,,,"all of 2-9: too blurry, can't read"


In [17]:
# Item descriptions of Unknown-flagged rows are too vague to be useful, so the
# rows are dropped and the flag column is removed afterwards.
unknown_mask = df.Unknown.notna()
unknown_count = unknown_mask.sum()
# Report before filtering so the percentage is relative to the current rows.
# The message names the real reason for the drop (the Unknown flag); the earlier
# wording "null item" was copied from the null-item step.
print(f'{unknown_count} ({unknown_count / df.shape[0]:.0%}) additional rows dropped due to unknown item')
df = df[~unknown_mask]
df = df.drop(columns='Unknown')

10 (1%) additional rows dropped due to null item


In [18]:
# Quantity records multiple purchases of the same item; list the distinct
# values with missing entries included.
df['Quantity'].value_counts(dropna=False)

NaN    1483
2       183
3        62
4        23
5        11
6         5
15        2
14        2
8         1
7         1
12        1
??        1
11        1
Name: Quantity, dtype: int64

In [19]:
# fix typo: the '??' entry is replaced with a quantity of 1
df.loc[df['Quantity'] == '??', 'Quantity'] = 1

# examine the rows carrying the large quantities seen in the value counts
large_quantities = [7, 8, 11, 12, 14, 15]
display(df.loc[df['Quantity'].isin(large_quantities)])

Unnamed: 0,ID,Session,Receipt,Date,Item,Quantity,Hit,Miss,Category,Comment
605,149,1,2,2020-07-30,plantain chips,8,,,Snack,"uncertain: ""grc grn plant ch"""
654,149,3,1,2020-09-08,banana chips,7,,,Snack,"uncertain: ""grc grn bnana ch"""
666,149,3,2,2020-08-30,plantain chips,15,,,Snack,"uncertain: ""grc grn plant ch"""
704,149,4,3,2020-09-21,plantain chips,14,,,Snack,"uncertain: ""grc grn plant ch"""
712,149,5,1,2020-09-28,plantain chips,14,,,Snak,"uncertain: ""grc grn plant ch"""
729,149,6,1,2020-10-08,plantain chips,12,,,Snack,"uncertain: ""grc grn plant ch"""
1133,118,1,3,NaT,cat food (wet),15,,,petfood,
1205,126,1,1,2020-06-16,macaroni and cheese with broccoli,11,,,Dish,


In [20]:
# Repeat each row once per unit purchased; a missing Quantity counts as 1.
repeats = df['Quantity'].fillna(1)
print(f'{repeats.sum() - df.shape[0]} rows added from expanding Quantity data')

# Index.repeat yields each label `repeats` times; .loc then materializes the rows.
df = df.loc[df.index.repeat(repeats)]
df = df.drop(['Quantity'], axis='columns')

533 rows added from expanding Quantity data


In [21]:
# Hit and Miss hold little or no data, so both columns are removed.
df = df.drop(['Hit', 'Miss'], axis='columns')

In [22]:
# Category groups groceries by type; store it as pandas' string dtype and
# preview the most frequent labels (missing values included in the counts).
# NOTE(review): displayed rows in this notebook show variant spellings such as
# 'Snak', 'petfood' and 'Vegetables'; consider normalizing labels -- TODO confirm.
df['Category'] = df['Category'].astype('string')
df['Category'].value_counts(dropna=False).head()

Fruit    298
Dairy    245
Drink    218
Snack    159
Dish     149
Name: Category, dtype: Int64

In [23]:
# Comment holds miscellaneous free-text notes from the transcriber; tally the
# distinct notes with missing entries included.
df['Comment'].value_counts(dropna=False)

NaN                                                                                          2104
uncertain: "grc grn plant ch"                                                                  66
new category needed                                                                            17
walmart: "gv 12 jumbo"                                                                         16
repeat of 3-1                                                                                  15
uncertain: "grc grn bnana ch"                                                                  10
repeat of 4-2                                                                                   8
repeat of 4-1                                                                                   5
cut off?                                                                                        4
uncertain: can't see text                                                                       4
cut off             

In [24]:
# Items whose comment mentions "uncertain" seem usable; display them for review.
# na=False treats rows without a comment as non-matching.
display(df.loc[df['Comment'].str.contains(r'uncertain', na=False)])

Unnamed: 0,ID,Session,Receipt,Date,Item,Category,Comment
110,139,1,2,NaT,salad dressing,Condiment,"uncertain: ""newmans own"""
110,139,1,2,NaT,salad dressing,Condiment,"uncertain: ""newmans own"""
114,139,1,2,NaT,croutons,Grain,uncertain on category
397,145,1,2,2020-07-23,green onions,Vegetable,"uncertain: ""gh onion/chv veggie"""
399,145,1,2,2020-07-23,nuts (unsalted),Snack,"uncertain: ""plntr unslt dry rstd"""
...,...,...,...,...,...,...,...
2182,127,1,1,2020-07-01,cucumber,Vegetables,"uncertain - price chopper: ""cuke brpl"""
2226,127,2,1,2020-07-28,soda,Drink,"uncertain - price chopper: ""polar 1 ltr dt chr"""
2336,127,5,4,2020-08-13,cheese,Dairy,"uncertain: ""2 pk half moon cky"""
2697,135,2,6,NaT,advil,Supplement,"walmart - uncertain: ""total adv tp"""


In [25]:
# Drop rows whose comment marks the receipt as a duplicate / repeat.
# case=False matches any capitalization; na=False treats rows without a
# comment as non-duplicates.
duplicate_mask = df.Comment.str.contains(r'duplicate|repeat', case=False, na=False)

# Series.sum() counts the True values directly instead of iterating the Series
# with the builtin sum(); int() keeps a plain Python integer as before.
duplicate_drop_count = int(duplicate_mask.sum())

# Remember the row count before filtering so the percentage is relative to the
# rows that entered this step.
row_count_before = df.shape[0]

# .copy() detaches the filtered frame so the column assignment below does not
# write to a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[~duplicate_mask].copy()
df.Comment = df.Comment.astype('string')
print(f'{duplicate_drop_count} ({duplicate_drop_count / row_count_before:.0%}) duplicate rows dropped')

28 (1%) duplicate rows dropped


In [26]:
# schema check after cleaning: remaining columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2281 entries, 0 to 2923
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ID        2281 non-null   uint8         
 1   Session   2281 non-null   uint8         
 2   Receipt   2281 non-null   uint8         
 3   Date      1819 non-null   datetime64[ns]
 4   Item      2281 non-null   string        
 5   Category  2250 non-null   string        
 6   Comment   177 non-null    string        
dtypes: datetime64[ns](1), string(3), uint8(3)
memory usage: 95.8 KB


In [27]:
# Add up the rows removed by each cleaning step and express the total as a
# share of the rows originally loaded.
# NOTE(review): duplicate_drop_count is taken after the Quantity expansion, so
# it may count expanded rows rather than original sheet rows -- confirm the basis.
drop_counts = [null_receipt_count, null_item_count, unknown_count, duplicate_drop_count]
total_drop = sum(drop_counts)
print(f'Total row reduction: {total_drop} ({total_drop / initial_row_count:.0%})')

Total row reduction: 1390 (44%)


In [28]:
# Renumber the rows 0..n-1 and write the cleaned sheet into DATA_PATH.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# confirm downstream readers expect it.
df = df.reset_index(drop=True)
output_path = f'{DATA_PATH}clean_{SHEET.lower()}.csv'
df.to_csv(output_path)