In [1]:
import re
import datetime

import pandas as pd

In [2]:
%%time
# import Max's data set
DATA_PATH = '../Data/'
FILE_NAME = 'Max, Samantha, Maria data.xlsx'
SHEET = 'Max'

df = pd.read_excel(DATA_PATH + FILE_NAME, sheet_name=SHEET)

initial_row_count = df.shape[0]

Wall time: 1.08 s


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ID               2159 non-null   int64         
 1   TP-Date          2159 non-null   int64         
 2   RecNum           1555 non-null   float64       
 3   RecDate          1718 non-null   datetime64[ns]
 4   Item (Modifier)  2159 non-null   object        
 5   Secondary Name   38 non-null     object        
 6   Uncertain        37 non-null     object        
 7   Unknown          87 non-null     object        
 8   NumPurchased     274 non-null    float64       
 9   Hit              0 non-null      float64       
 10  Miss             0 non-null      float64       
 11  FoodCategory     2059 non-null   object        
 12  Comments         494 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(6)
memory usage: 219.4+ KB


In [4]:
# standardize column names
column_names = ['ID', 'Session', 'Receipt', 'Date', 
                'Item', 'Item2', 'Uncertain', 'Unknown', 
                'Quantity', 'Hit', 'Miss', 'Category', 'Comment']
df.columns = column_names

In [5]:
# ID column stores participant identification numbers as integers between 110 and 162
df.ID = df.ID.astype('uint8')

# participants assigned to all transcribers plus those unique to this data set
pids_assigned = ({121, 114, 137, 153, 141, 127, 130, 135, 148, 158} | 
                 {129, 136, 144, 147, 151, 156, 160, 112, 117, 120})

assert set(df.ID.unique()) <= pids_assigned

In [6]:
# Session column records the transcription session bookkeeping
df.Session = df.Session.astype('uint8')

# transcription was divided into 6 sessions
valid_sessions = [1, 2, 3, 4, 5, 6]

assert df.Session.isin(valid_sessions).all() 

In [7]:
# Receipt column provides bookkeeping of grocery receipes per session

# df.info() has shown existence of missing values
null_receipt_count = df.Receipt.isna().sum()
print(f'{null_receipt_count} ({null_receipt_count / df.shape[0]:.0%}) rows missing Receipt values')

604 (28%) rows missing Receipt values


In [8]:
# distinguishing between different receipts is essential to this data set
# missing Receipt number is unusable and dropped
df = df[df.Receipt.notna()]

# receipt numbers should be small integers and continuous
df.Receipt = df.Receipt.astype('uint8')
for pid in df.ID.unique():
    for session in df.loc[df.ID == pid, 'Session'].unique():
        receipt_numbers = list(df.loc[(df.ID == pid) & (df.Session == session), 'Receipt'].unique())
        if receipt_numbers != list(range(1, len(receipt_numbers) + 1)):
            print(f'({pid}, {session}):', receipt_numbers)

(136, 1): [1, 2, 4, 5, 6, 7]


In [9]:
# Date column records purchase date on receipt if available
df.Date = pd.to_datetime(df.Date, errors='coerce').dt.date.astype('datetime64')

# approximate date range (5/1/2020 - 12/31/2020)
assert df.Date.dropna().between(datetime.datetime(2020, 5, 1), datetime.datetime(2020, 12, 31)).all()

In [10]:
# Item column contains grocery description as string with "ITEM (MODIFIER)" format
df.Item = df.Item.str.lower().str.strip().astype('string')
df.Item.value_counts(dropna=False).head()

bananas        43
unknown        30
blueberries    18
eggs           16
milk (2%)      14
Name: Item, dtype: Int64

In [11]:
# Item description is essential data. Unidentifiable items are dropped
NULL_ITEM_DESC = r'unknown|n/a|missing'
null_item_count = df.Item.str.contains(NULL_ITEM_DESC).sum() + df.Item.isna().sum()
display(df[df.Item.str.contains(NULL_ITEM_DESC)])
df = df[df.Item.notna()]
df = df[~df.Item.str.contains(NULL_ITEM_DESC)]
print(f'{null_item_count} ({null_item_count / df.shape[0]:.0%}) additional rows dropped due to null item')

Unnamed: 0,ID,Session,Receipt,Date,Item,Item2,Uncertain,Unknown,Quantity,Hit,Miss,Category,Comment
17,129,2,1,2020-06-30,unknown,,,x,2.0,,,,"unknown: ""100 Cal Trop/Mixed"""
60,129,3,2,2020-07-28,unknown,,,x,,,,,"unknown: ""GV SF PW SYR"""
196,129,5,1,2020-08-18,unknown,,,x,,,,,"unknown: ""GV 100 WWWP;"" Great Value _____?"
259,136,2,1,2020-08-14,unknown,,,x,,,,,"unknown: ""NGVC 2020 Anniv;"" ""0084932700262"""
269,136,2,1,2020-08-14,unknown,,,x,,,,,"unknown: ""DC $3.45 Natura;"" ""0084932700217"""
270,136,2,1,2020-08-14,unknown,,,x,,,,,"unknown: ""DC $3.45 Natura;"" ""0084932700217"""
271,136,2,1,2020-08-14,unknown,,,x,,,,,"unknown: ""DC $3.49 Lightl;"" ""0004345410080"""
272,136,2,1,2020-08-14,unknown,,,x,,,,,"unknown: ""DC Free Natural;"" ""0084932700134"""
369,136,4,8,2020-09-10,unknown,,,x,2.0,,,,"unknown: ""STO CARTS RN"" Simple Truth Organic _..."
894,137,1,1,2020-07-29,unknown,,,x,,,,,"unknown: ""OTBCTC24OZ"""


31 (2%) additional rows dropped due to null item


In [12]:
# Item2 provides additional description of the grocery, but is too sparse to be useful
df = df.drop(columns='Item2')

In [13]:
# Uncertain denotes low confidence in transcription
display(df[df.Uncertain.notna()])

Unnamed: 0,ID,Session,Receipt,Date,Item,Uncertain,Unknown,Quantity,Hit,Miss,Category,Comment
26,129,2,1,2020-07-02,sandwich rolls,x,,,,,Grain,"uncertain: ""KK SNDWCH RL 15"""
70,129,3,2,2020-07-28,butter (herb),x,,,,,,"uncertain: ""BUTR HERB SL"""
151,129,4,2,2020-08-04,frozen meal,x,,,,,Dish,"uncertain: ""LFC BOWL 11z;"" Life Cuisine"
152,129,4,2,2020-08-04,frozen meal,x,,,,,Dish,"uncertain: ""LFC BOWL 10.875z;"" Life Cuisine"
176,129,5,1,2020-08-17,sandwich rolls,x,,,,,Grain,"uncertain: ""KK SNDWCH RL 15"""
343,136,4,4,2020-09-04,baby food,x,,,,,Dish,"uncertain: ""CMFRTS BABY"" could be diapers, wip..."
357,136,4,6,2020-09-06,bacon,x,,,,,Meat,"uncertain: ""PC HMPL BACON"" assumed Hempler's B..."
362,136,4,7,2020-09-10,red sugar,x,,,,,Seasoning,"uncertain: ""PC STO RED SUG"""
376,136,4,8,2020-09-10,red sugar,x,,,,,Seasoning,"uncertain: ""PC STO RED SUG"" & Duplicate of 4-7"
435,136,5,5,NaT,beef,x,,,,,Meat,"uncertain: ""HTGF BEEF"""


In [14]:
# transcription quality seems acceptable so we'll keep the items
df = df.drop(columns='Uncertain')

In [15]:
# Unknown denotes very low confidence in transcription
display(df[df.Unknown.notna()])

Unnamed: 0,ID,Session,Receipt,Date,Item,Unknown,Quantity,Hit,Miss,Category,Comment
6,129,1,1,2020-06-19,indian meal,x,2.0,,,,"unknown: ""MEAL INDIAN"""
25,129,2,1,2020-07-02,bake shop item,x,2.0,,,Grain,"unknown: ""REDUCE BAKE SHOP"""
29,129,2,1,2020-07-02,deli item,x,,,,,"unknown: ""LOL AM WHT END"""
39,129,2,1,2020-07-02,produce item,x,2.0,,,,"unknown: ""REDUCED PRODUCE"""
44,129,3,1,2020-07-31,dairy item,x,,,,Dairy,"unknown: ""GB WHT PCH 5.3Z"""
48,129,3,1,2020-07-31,deli item,x,,,,,"unknown: ""DELI MI"""
52,129,3,1,2020-07-31,produce item,x,,,,,"unknown: ""REDUCED PRODUCE"""
62,129,3,2,2020-07-28,buttermilk product,x,,,,,"unknown: ""BUTTERMIL"""
65,129,3,2,2020-07-28,lemon snack product,x,,,,Snack,"unknown: ""LEMON SNACK"""
118,129,4,1,2020-08-11,bake shop item,x,,,,,"unknown: ""REDUCE BAKE SHOP"""


In [16]:
# item descriptions are too vague to be useful
unknown_count = df.Unknown.notna().sum()
print(f'{unknown_count} ({unknown_count / df.shape[0]:.0%}) additional rows dropped due to null item')
df = df[df.Unknown.isna()]
df = df.drop(columns='Unknown')

26 (2%) additional rows dropped due to null item


In [17]:
# Quantity represents multiple purchases of the same item
df.Quantity.value_counts(dropna=False)

NaN    1323
2.0     136
3.0      24
4.0       7
5.0       5
6.0       3
Name: Quantity, dtype: int64

In [18]:
# duplicate rows according to quantity purchased
print(f'{df.Quantity.fillna(1).sum() - df.shape[0]} rows added from expanding Quantity data')
df = df.loc[df.index.repeat(df.Quantity.fillna(1))]
df = df.drop(columns='Quantity')

240.0 rows added from expanding Quantity data


In [19]:
# Hit and Miss columns contain little or no data
df = df.drop(columns=['Hit', 'Miss'])

In [20]:
# Category columns groups groceries by type
df.Category = df.Category.astype('string')
df.Category.value_counts(dropna=False).head()

Fruit        241
Vegetable    202
Drink        159
Meat         133
Dairy        128
Name: Category, dtype: Int64

In [21]:
# Comment column contains miscellaneous notes from transcriber
df.Comment.value_counts(dropna=False)

NaN                                                                1352
Duplicate of 1-1                                                     49
category suggestion: "Petproduct"                                    44
category suggestion: "Cleaningproduct"                               22
category suggestion: "Toiletries"                                    19
Duplicate of 6-1                                                     17
Duplicate of 1-2                                                     16
Duplicate of 1-5                                                     15
Duplicate of 3-1                                                     15
category suggestion: "Householdgoods" ??                             14
Duplicate of 1-3                                                     13
category suggestion: "Cookingproduct"                                13
Duplicate of 5-5                                                     12
category suggestion: "Kitchenappliance"                         

In [22]:
# "uncertain" items seem useable
df[df.Comment.str.contains(r'uncertain', na=False)]

Unnamed: 0,ID,Session,Receipt,Date,Item,Category,Comment
26,129,2,1,2020-07-02,sandwich rolls,Grain,"uncertain: ""KK SNDWCH RL 15"""
70,129,3,2,2020-07-28,butter (herb),,"uncertain: ""BUTR HERB SL"""
151,129,4,2,2020-08-04,frozen meal,Dish,"uncertain: ""LFC BOWL 11z;"" Life Cuisine"
152,129,4,2,2020-08-04,frozen meal,Dish,"uncertain: ""LFC BOWL 10.875z;"" Life Cuisine"
176,129,5,1,2020-08-17,sandwich rolls,Grain,"uncertain: ""KK SNDWCH RL 15"""
343,136,4,4,2020-09-04,baby food,Dish,"uncertain: ""CMFRTS BABY"" could be diapers, wip..."
357,136,4,6,2020-09-06,bacon,Meat,"uncertain: ""PC HMPL BACON"" assumed Hempler's B..."
362,136,4,7,2020-09-10,red sugar,Seasoning,"uncertain: ""PC STO RED SUG"""
376,136,4,8,2020-09-10,red sugar,Seasoning,"uncertain: ""PC STO RED SUG"" & Duplicate of 4-7"
435,136,5,5,NaT,beef,Meat,"uncertain: ""HTGF BEEF"""


In [23]:
# drop duplicate receipts
duplicate_mask = df.Comment.str.contains(r'duplicate|repeat', case=False, na=False)
duplicate_drop_count = sum(duplicate_mask)
df = df[~duplicate_mask]
df.Comment = df.Comment.astype('string')
print(f'{duplicate_drop_count} ({duplicate_drop_count / df.shape[0]:.0%}) duplicate rows dropped')

201 (13%) duplicate rows dropped


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1537 entries, 0 to 2137
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ID        1537 non-null   uint8         
 1   Session   1537 non-null   uint8         
 2   Receipt   1537 non-null   uint8         
 3   Date      1221 non-null   datetime64[ns]
 4   Item      1537 non-null   string        
 5   Category  1532 non-null   string        
 6   Comment   185 non-null    string        
dtypes: datetime64[ns](1), string(3), uint8(3)
memory usage: 64.5 KB


In [25]:
# net results
total_drop = null_receipt_count + null_item_count + unknown_count + duplicate_drop_count
print(f'Total row reduction: {total_drop} ({total_drop / initial_row_count:.0%})')

Total row reduction: 862 (40%)


In [26]:
df = df.reset_index(drop=True)
df.to_csv(f'{DATA_PATH}clean_{SHEET.lower()}.csv')