In [1]:
import pandas as pd
import re

In [2]:
# load the raw receipt spreadsheet (path is relative to the notebook's working directory)
data = pd.read_excel(io='Greco receiptData.xlsx')

In [3]:
# columns that are not used by the rest of the cleaning pipeline
unused_columns = [
    'Secondary name',
    'Coupon (#)',
    'Uncertain',
    'Unknown',
    'Hit',
    'Miss',
    'Comments',
    'Unnamed: 14',
    'Unnamed: 15',
]
data.drop(columns=unused_columns, inplace=True)

In [4]:
# a missing purchase quantity is replaced with 1
quantity_missing = data['NumPurchased'].isna()
data.loc[quantity_missing, 'NumPurchased'] = 1

In [5]:
# discard every row whose item name is missing
item_missing = data['Item (modifier)'].isna()
data.drop(index=data.loc[item_missing].index, inplace=True)

In [6]:
# discard the rows that have no food category (non-food items)
category_missing = data['FoodCategory'].isna()
data.drop(index=data.loc[category_missing].index, inplace=True)

In [7]:
# FoodCategory labels that mark a non-food purchase
# (matching is exact, so both capitalisations seen in the data are listed)
nonfood_list = [
    'Cleaningsupplies',
    'Clothing/accessories',
    'Electronics',
    'Flower',
    'Kitchentool',
    'Officesupplies',
    'Paperproduct',
    'Paperproducts',
    'Petfood',
    'petfood',
    'Plasticproduct',
    'plasticproduct',
    'Supplement',
    'Toiletries',
    'toiletries',
    'Utensil',
    'utensil',
]

In [8]:
# remove every row whose category is one of the non-food labels
is_nonfood = data['FoodCategory'].isin(nonfood_list)
data.drop(index=data.loc[is_nonfood].index, inplace=True)

In [9]:
# a missing receipt number is set to 1, i.e. all of that day's items are
# assumed to come from a single basket
receipt_number_missing = data['RecNum'].isna()
data.loc[receipt_number_missing, 'RecNum'] = 1

In [10]:
# drops participants with fewer than MIN_ROWS_PER_PARTICIPANT data points
# (one vectorized pass; the frame is not re-scanned once per participant)
MIN_ROWS_PER_PARTICIPANT = 50

# boolean Series indexed by participant ID: True where the participant is too sparse
low_part = data['ID'].value_counts() < MIN_ROWS_PER_PARTICIPANT
sparse_ids = low_part[low_part].index

from_sparse_participant = data['ID'].isin(sparse_ids)
data.drop(index=data.loc[from_sparse_participant].index, inplace=True)

In [11]:
# crude first pass interpolation of missing dates
# participants with more than 20% of their dates missing are left untouched;
# otherwise all of a participant's missing dates are attributed to the middle
# of the largest gap between that participant's known shopping days
MAX_MISSING_DATE_FRACTION = .2

for participant_id in data['ID'].unique():
    is_participant = data['ID'] == participant_id
    rec_dates = data.loc[is_participant, 'RecDate']

    # nothing to fill for this participant (also covers a NaN ID, which
    # matches no rows and would otherwise divide by zero below)
    nan_count = rec_dates.isna().sum()
    if nan_count == 0:
        continue

    # too much of the participant's data is missing to interpolate
    if nan_count / len(rec_dates) > MAX_MISSING_DATE_FRACTION:
        continue

    # distinct known shopping days in chronological order; missing dates are
    # dropped first so NaT cannot end up in the gap computation
    shopping_days = pd.DatetimeIndex(rec_dates.dropna().unique()).sort_values()
    if len(shopping_days) < 2:
        # a single known day has no gap to interpolate into
        continue

    # gap i is the time between shopping day i and shopping day i + 1;
    # argmax picks the first of any equally wide gaps
    gaps = shopping_days[1:] - shopping_days[:-1]
    widest = gaps.argmax()

    # midpoint of the widest gap, truncated to a whole day
    interp_date = (shopping_days[widest] + gaps[widest] / 2).normalize()

    # fill only THIS participant's missing dates
    data.loc[is_participant & data['RecDate'].isna(), 'RecDate'] = interp_date

In [12]:
# build a clean 'Item' column: the item name with its " (modifier)" suffix removed
# the pattern is greedy, so it strips from the first " (" through the last ")"
paren = re.compile(r' \(.*\)')
item_names = []
for raw_item in data['Item (modifier)']:
    item_names.append(paren.sub('', str(raw_item)))
data['Item'] = item_names

In [13]:
# write the cleaned frame to disk (the row index is written as the first column)
data.to_csv(path_or_buf='Greco cleaned.csv')