In [1]:
import pandas as pd

In [2]:
# this ensures tables are wide and you can scroll through all columns
pd.set_option('display.max_columns', 500)

In [3]:
# read sample data from local copy (covers Exeter two stores for Setp, Oct, Nov)
names = ['Store Number','Period of day code','Period of day','Business Date','POS device','POS_HOLD_TM_SC_QT','Total cost','Unit cost','Quantity','Food cost','Paper cost','Hour','Transaction time','Transaction Date','POS:Till/Kiosk/Drive Thru ID','Unique Kisok/Till/Drive Thru code','Sale Number','Menu item ID','Till/Kiosk/Drive Thru Key','Till/Kiosk/Drive Thru','Transaction type code','Sale or refund','Eat in 1 or Take out 0','Eat In or Take out','Payment Method Type ID','Payment Type']
df = pd.read_csv('BasketDataExeter2SepOctNov.csv', sep='|', names=names, engine='c', index_col=False)

In [5]:
# calculate ratio of cash versus cashless transactions
int_cash = df[df['Payment Type']=='Cash'].count()[0]
int_non_cash = len(df) - int_cash

print ('cash = {:d}, non-cash = {:d}, {:.1%} of people pay by cash'.format(int_cash, int_non_cash, int_cash/(int_non_cash+int_cash)))

cash = 496907, non-cash = 1049254, 32.1% of people pay by cash


In [6]:
# drop columns that are either empty or are duplicates of human-readable columns
df.drop(['Transaction type code','Payment Method Type ID','Eat in 1 or Take out 0','POS_HOLD_TM_SC_QT'], axis=1, inplace=True)

In [7]:
# list remaining columns
df.dtypes

Store Number                           int64
Period of day code                     int64
Period of day                         object
Business Date                         object
POS device                             int64
Total cost                           float64
Unit cost                            float64
Quantity                               int64
Food cost                            float64
Paper cost                           float64
Hour                                   int64
Transaction time                      object
Transaction Date                      object
POS:Till/Kiosk/Drive Thru ID          object
Unique Kisok/Till/Drive Thru code      int64
Sale Number                            int64
Menu item ID                           int64
Till/Kiosk/Drive Thru Key              int64
Till/Kiosk/Drive Thru                 object
Sale or refund                        object
Eat In or Take out                    object
Payment Type                          object
dtype: obj

In [8]:
# investigate where dates don't match
df_temp = df[df[df.columns[3]]!=df[df.columns[13]]]
df_temp[df.columns[3:14]].head()

# The Transaction date appears to be the date when the transaction took place

Unnamed: 0,Business Date,POS device,Total cost,Unit cost,Quantity,Food cost,Paper cost,Hour,Transaction time,Transaction Date,POS:Till/Kiosk/Drive Thru ID
0,2017-10-14,2,0.0,0.0,1,0.0,0.0,16,1900-01-01 16:45:00,2017-10-14,POS0002:595657242
1,2017-10-14,2,3.1,3.1,1,0.7784,0.0385,16,1900-01-01 16:45:00,2017-10-14,POS0002:595657242
2,2017-10-14,2,0.0,0.0,1,0.0,0.0,16,1900-01-01 16:45:00,2017-10-14,POS0002:595657242
3,2017-10-14,2,0.68,0.68,1,0.0,0.0,16,1900-01-01 16:45:00,2017-10-14,POS0002:595657242
4,2017-10-14,2,0.0,0.0,1,0.0,0.0,16,1900-01-01 16:45:00,2017-10-14,POS0002:595657242


In [9]:
# cast dates and times to correct format
df['Business Date'] = pd.to_datetime(df['Business Date'])
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df['Transaction time'] = pd.to_datetime(df['Transaction time'])

df[['Business Date','Transaction Date','Transaction time']].head()

Unnamed: 0,Business Date,Transaction Date,Transaction time
0,2017-10-14,2017-10-14,1900-01-01 16:45:00
1,2017-10-14,2017-10-14,1900-01-01 16:45:00
2,2017-10-14,2017-10-14,1900-01-01 16:45:00
3,2017-10-14,2017-10-14,1900-01-01 16:45:00
4,2017-10-14,2017-10-14,1900-01-01 16:45:00


In [10]:
# list different payment methods
df['Payment Type'].unique()

array(['Cash', 'Cashless', 'UNKNOWN CODE', 'Customer Service Voucher',
       'Luncheon Voucher'], dtype=object)

In [43]:
# now just include cashless payments
df_filtered = df[df['Payment Type']=='Cashless']
# remove refunds
df_filtered = df_filtered[df_filtered['Sale or refund']=='Sale']
# specify date
df_filtered = df_filtered[df_filtered['Transaction Date']=='2017-09-05']
# POS device
df_filtered = df_filtered[df_filtered['POS device']==21]
# and time
#df_filtered = df_filtered[df_filtered['Transaction time']=='1900-01-01 12:00:00']

In [28]:
# used to check POS (tills) presence in card data also
df['POS device'].unique()
# 20, 63, 19, 17 are not in card data

array([ 2, 21, 26,  3, 22,  1, 23, 25, 28, 24, 27, 20, 63, 19, 17])

In [44]:
# sort table by keys
df_sorted = df_filtered.sort_values(by=['Store Number','Transaction Date','POS:Till/Kiosk/Drive Thru ID','Sale Number'], axis=0)
# display sorted table with key info
df_sorted[['Store Number','Transaction Date','POS:Till/Kiosk/Drive Thru ID','Sale Number','Transaction time','Total cost','Unit cost']].head(50)

Unnamed: 0,Store Number,Transaction Date,POS:Till/Kiosk/Drive Thru ID,Sale Number,Transaction time,Total cost,Unit cost
842609,295,2017-09-05,POS0021:902171314,1,1900-01-01 08:15:00,2.99,2.99
842608,295,2017-09-05,POS0021:902171314,2,1900-01-01 08:15:00,2.99,2.99
842611,295,2017-09-05,POS0021:902171314,3,1900-01-01 08:15:00,0.0,0.0
842612,295,2017-09-05,POS0021:902171314,4,1900-01-01 08:15:00,0.0,0.0
842610,295,2017-09-05,POS0021:902171314,5,1900-01-01 08:15:00,0.0,0.0
842613,295,2017-09-05,POS0021:902171314,6,1900-01-01 08:15:00,0.0,0.0
811413,295,2017-09-05,POS0021:902171315,1,1900-01-01 08:30:00,0.83,0.83
860987,295,2017-09-05,POS0021:902171316,1,1900-01-01 09:45:00,1.99,1.99
875496,295,2017-09-05,POS0021:902171317,1,1900-01-01 10:00:00,1.24,1.24
875495,295,2017-09-05,POS0021:902171317,2,1900-01-01 10:00:00,1.16,1.16


In [53]:
# sort table by keys
df_grouped = df_sorted.groupby(['Store Number','Transaction Date','POS:Till/Kiosk/Drive Thru ID']).agg({'Total cost': 'sum', 'Transaction time': 'max'})
# display sorted table with key info
df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Transaction time,Total cost
Store Number,Transaction Date,POS:Till/Kiosk/Drive Thru ID,Unnamed: 3_level_1,Unnamed: 4_level_1
295,2017-09-05,POS0021:902171314,1900-01-01 08:15:00,5.98
295,2017-09-05,POS0021:902171315,1900-01-01 08:30:00,0.83
295,2017-09-05,POS0021:902171316,1900-01-01 09:45:00,1.99
295,2017-09-05,POS0021:902171317,1900-01-01 10:00:00,2.40
295,2017-09-05,POS0021:902171318,1900-01-01 10:00:00,6.91
295,2017-09-05,POS0021:902171319,1900-01-01 10:00:00,6.23
295,2017-09-05,POS0021:902171320,1900-01-01 10:15:00,7.47
295,2017-09-05,POS0021:902171321,1900-01-01 10:15:00,3.16
295,2017-09-05,POS0021:902171322,1900-01-01 10:45:00,2.31
295,2017-09-05,POS0021:902171323,1900-01-01 10:45:00,3.07
