# McDonald's Joined Basket Analysis - v2 all card tokens
## April 2018
### Dr Jose M Albornoz

This notebook analyses joined basket data using all card tokens to examine i) unmatched transactions; ii) multiple payments between succesive baskets

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
import math
from pyspark.sql.window import *
import pandas as pd

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic functions to load data from a text-based file

In [3]:
# a function to load a colon-separated value file
def load_data_colon(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [4]:
# a function to load a pipe-separated value file
def load_data_pipe(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", "|").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [5]:
# a function to load a comma-separated value file
def load_data_comma(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ",").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Schema for joined basket file

In [6]:
schema_basket = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('timestamp_basket', TimestampType(), True), 
    StructField('pos_code', IntegerType(), True), 
    StructField('pos_id', StringType(), True), 
    StructField('sale_number', IntegerType(), True),
    StructField('total_cost', FloatType(), True),
    StructField('unit_cost', FloatType(), True),
    StructField('quantity', IntegerType(), True), 
    StructField('menu_item_id', IntegerType(), True), 
    StructField('transaction_amount', FloatType(), True), 
    StructField('pan_token', StringType(), True), 
    StructField('timestamp_cards', TimestampType(), True)
])

# 4.- Load joined basket data

In [7]:
df_basket = load_data_comma("joined_basket_reading_all_tokens.csv/part-00000-e68b3079-c1d0-427a-b29e-80eb1114b0a4-c000.csv", \
                           schema_basket)

In [10]:
df_basket = df_basket.select('store_number', 'timestamp_basket', 'pos_code', 'pos_id', 'sale_number', 'pan_token', \
                             'timestamp_cards', 'total_cost').orderBy('store_number', 'pos_code', 'pos_id', \
                                                                      'sale_number')

In [11]:
df_basket.show(20)

+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+----------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|           pan_token|    timestamp_cards|total_cost|
+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+----------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          1|CC2919271D518CE2B...|2017-09-01 00:01:00|       0.0|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          2|CC2919271D518CE2B...|2017-09-01 00:01:00|      0.91|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          3|CC2919271D518CE2B...|2017-09-01 00:01:00|       0.0|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          4|CC2919271D518CE2B...|2017-09-01 00:01:00|      3.33|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|          1|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|    

In [12]:
df_basket.count()

1732243

# 5.- Compute aggregated basket

In [13]:
df_basket_aggregated = df_basket.groupBy("store_number", "timestamp_basket", "pos_code", "pos_id", \
                                         "pan_token", "timestamp_cards").\
                                 agg(sum("total_cost").alias("total_cost")). \
                                                                         orderBy("store_number", \
                                                                                 "pos_code", "pos_id", \
                                                                                 "timestamp_basket")

In [14]:
df_basket_aggregated.show(10)

+------------+-------------------+--------+-----------------+--------------------+-------------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|           pan_token|    timestamp_cards|        total_cost|
+------------+-------------------+--------+-----------------+--------------------+-------------------+------------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|CC2919271D518CE2B...|2017-09-01 00:01:00| 4.239999949932098|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|  5.68999981880188|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|F8FD3E6E72ECDF9D6...|2017-09-01 00:08:00|2.4800000190734863|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|CA03E173868FFB0F5...|2017-09-01 00:08:00|2.4800000190734863|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|                null|               null| 3.559999942779541|
|         102|2017-09-01

In [15]:
df_basket_aggregated.count()

371702

# 6.- Count of unmatched baskets

In [16]:
df_unmatched = df_basket_aggregated.where(col('pan_token').isNull())

In [17]:
df_unmatched.show(10)

+------------+-------------------+--------+-----------------+---------+---------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|pan_token|timestamp_cards|        total_cost|
+------------+-------------------+--------+-----------------+---------+---------------+------------------+
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|     null|           null| 3.559999942779541|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|     null|           null|3.3199999928474426|
|         102|2017-09-01 00:36:16|       1|POS0001:216437530|     null|           null|3.7200000286102295|
|         102|2017-09-01 00:52:39|       1|POS0001:216437538|     null|           null| 4.750000059604645|
|         102|2017-09-01 01:02:22|       1|POS0001:216437545|     null|           null| 4.070000112056732|
|         102|2017-09-01 01:47:04|       1|POS0001:216437575|     null|           null| 2.990000069141388|
|         102|2017-09-01 02:03:28|   

## 6.1.- Are there any repeated pos_ids in the unmatched baskets?

In [23]:
len([i.pos_id for i in df_unmatched.select('pos_id').distinct().collect()])

51067

In [24]:
df_unmatched.count()

51067

# 7.- Count of multiple payments between transactions

In [19]:
df_basket_multiple_payments = df_basket_aggregated.groupBy('pos_id').count()

In [20]:
df_basket_multiple_payments.show(10)

+-----------------+-----+
|           pos_id|count|
+-----------------+-----+
|POS0001:216439381|    1|
|POS0001:216439620|    1|
|POS0001:216439677|    2|
|POS0001:216439714|    1|
|POS0001:216439730|    1|
|POS0001:216439756|    1|
|POS0001:216442041|    1|
|POS0001:216442906|    2|
|POS0002:563011450|    2|
|POS0002:563011646|    1|
+-----------------+-----+
only showing top 10 rows



In [21]:
df_basket_multiple_payments.count()

178274

In [22]:
df_basket_multiple_payments.where(df_basket_multiple_payments['count'] != 1).count()

45223

# 8.- Convert to Pandas

In [25]:
pdf_basket = df_basket.toPandas()

In [26]:
pdf_basket.head(50)

Unnamed: 0,store_number,timestamp_basket,pos_code,pos_id,sale_number,pan_token,timestamp_cards,total_cost
0,102,2017-09-01 00:00:51,1,POS0001:216437503,1,CC2919271D518CE2B36A278F42A66B95E4A5293502AAC7...,2017-09-01 00:01:00,0.0
1,102,2017-09-01 00:00:51,1,POS0001:216437503,2,CC2919271D518CE2B36A278F42A66B95E4A5293502AAC7...,2017-09-01 00:01:00,0.91
2,102,2017-09-01 00:00:51,1,POS0001:216437503,3,CC2919271D518CE2B36A278F42A66B95E4A5293502AAC7...,2017-09-01 00:01:00,0.0
3,102,2017-09-01 00:00:51,1,POS0001:216437503,4,CC2919271D518CE2B36A278F42A66B95E4A5293502AAC7...,2017-09-01 00:01:00,3.33
4,102,2017-09-01 00:01:40,1,POS0001:216437504,1,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,-0.8
5,102,2017-09-01 00:01:40,1,POS0001:216437504,2,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,4.43
6,102,2017-09-01 00:01:40,1,POS0001:216437504,3,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,0.0
7,102,2017-09-01 00:01:40,1,POS0001:216437504,4,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,0.82
8,102,2017-09-01 00:01:40,1,POS0001:216437504,5,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,1.24
9,102,2017-09-01 00:01:40,1,POS0001:216437504,6,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,0.0


In [27]:
pdf_basket.tail(50)

Unnamed: 0,store_number,timestamp_basket,pos_code,pos_id,sale_number,pan_token,timestamp_cards,total_cost
1732193,1339,2017-09-30 20:52:41,24,POS0024:334180056,4,F449204BD064632100C703086BEB8BCE28844ECB1C7202...,2017-09-30 20:55:00,0.0
1732194,1339,2017-09-30 20:52:41,24,POS0024:334180056,4,1883FF9A5C214E0E491A59F87B908D9C03B11FF36A374C...,2017-09-30 20:59:00,0.0
1732195,1339,2017-09-30 20:52:41,24,POS0024:334180056,5,F449204BD064632100C703086BEB8BCE28844ECB1C7202...,2017-09-30 20:55:00,0.87
1732196,1339,2017-09-30 20:52:41,24,POS0024:334180056,5,1883FF9A5C214E0E491A59F87B908D9C03B11FF36A374C...,2017-09-30 20:59:00,0.87
1732197,1339,2017-09-30 20:52:41,24,POS0024:334180056,6,F449204BD064632100C703086BEB8BCE28844ECB1C7202...,2017-09-30 20:55:00,2.13
1732198,1339,2017-09-30 20:52:41,24,POS0024:334180056,6,1883FF9A5C214E0E491A59F87B908D9C03B11FF36A374C...,2017-09-30 20:59:00,2.13
1732199,1339,2017-09-30 20:52:41,24,POS0024:334180056,7,F449204BD064632100C703086BEB8BCE28844ECB1C7202...,2017-09-30 20:55:00,-0.33
1732200,1339,2017-09-30 20:52:41,24,POS0024:334180056,7,1883FF9A5C214E0E491A59F87B908D9C03B11FF36A374C...,2017-09-30 20:59:00,-0.33
1732201,1339,2017-09-30 20:52:41,24,POS0024:334180056,8,F449204BD064632100C703086BEB8BCE28844ECB1C7202...,2017-09-30 20:55:00,1.7
1732202,1339,2017-09-30 20:52:41,24,POS0024:334180056,8,1883FF9A5C214E0E491A59F87B908D9C03B11FF36A374C...,2017-09-30 20:59:00,1.7


In [28]:
pdf_basket_aggregated = df_basket_aggregated.toPandas()

In [29]:
pdf_basket_aggregated.head(50)

Unnamed: 0,store_number,timestamp_basket,pos_code,pos_id,pan_token,timestamp_cards,total_cost
0,102,2017-09-01 00:00:51,1,POS0001:216437503,CC2919271D518CE2B36A278F42A66B95E4A5293502AAC7...,2017-09-01 00:01:00,4.24
1,102,2017-09-01 00:01:40,1,POS0001:216437504,9DCFC4FE00D7565E74BDE0513410B33F611CAD8F1F0196...,2017-09-01 00:02:00,5.69
2,102,2017-09-01 00:07:28,1,POS0001:216437511,F8FD3E6E72ECDF9D6CA64E0ADF20911842138238C5D388...,2017-09-01 00:08:00,2.48
3,102,2017-09-01 00:07:28,1,POS0001:216437511,CA03E173868FFB0F5B130D6D0B9066FAA326B19E0F5693...,2017-09-01 00:08:00,2.48
4,102,2017-09-01 00:08:34,1,POS0001:216437512,,NaT,3.56
5,102,2017-09-01 00:10:50,1,POS0001:216437513,820B52BFA2328482537E8115C6646D66CF273ABB9CEFFE...,2017-09-01 00:11:00,4.89
6,102,2017-09-01 00:10:50,1,POS0001:216437513,D34051D8DACF1DFB6057C2FD34BAAC66CAA027EB1E1EF0...,2017-09-01 00:11:00,4.89
7,102,2017-09-01 00:11:14,1,POS0001:216437514,1F3A3D6A9B05A6DCD791E6B16ABDC1D7E0B34B79695066...,2017-09-01 00:12:00,3.74
8,102,2017-09-01 00:12:16,1,POS0001:216437516,,NaT,3.32
9,102,2017-09-01 00:14:50,1,POS0001:216437517,2742D137D0DDA803F3D22F9D94DED129D2BFDA4104BC85...,2017-09-01 00:16:00,2.9


In [30]:
pdf_basket_aggregated.tail(50)

Unnamed: 0,store_number,timestamp_basket,pos_code,pos_id,pan_token,timestamp_cards,total_cost
371652,1339,2017-09-30 17:44:56,24,POS0024:334180022,E1AA50D7367349DC9D6AB56B2DD69507C8894C964EDEE1...,2017-09-30 17:47:00,16.8
371653,1339,2017-09-30 18:00:53,24,POS0024:334180023,C0AE1560C5D44C2D257754607F65782534415B284E5E04...,2017-09-30 18:04:00,10.18
371654,1339,2017-09-30 18:07:58,24,POS0024:334180024,91B8A5AD3A7E8EA199A4892C75B42964F30B8E91AC6A81...,2017-09-30 18:10:00,15.52
371655,1339,2017-09-30 18:16:25,24,POS0024:334180025,ACD432B37169C0F4B74515058F0DE2F131F7A71F0E0A2E...,2017-09-30 18:23:00,7.69
371656,1339,2017-09-30 18:16:25,24,POS0024:334180025,FDD70CC8A51CFE302863E111809E8D9D7ED3D7C528510F...,2017-09-30 18:17:00,7.69
371657,1339,2017-09-30 18:23:12,24,POS0024:334180027,,NaT,6.82
371658,1339,2017-09-30 18:25:37,24,POS0024:334180028,3CCB15FAA4994B989D9ECE25DFA4B9E2C0563463E03C6C...,2017-09-30 18:28:00,11.82
371659,1339,2017-09-30 18:25:37,24,POS0024:334180028,3759000D80F655AF4A0EADA88726BE592E6017DED52317...,2017-09-30 18:26:00,11.82
371660,1339,2017-09-30 18:28:02,24,POS0024:334180029,C2BAE4D3A3232922C5D1F514E758678854ECD47C20DFB5...,2017-09-30 18:33:00,3.18
371661,1339,2017-09-30 18:33:25,24,POS0024:334180030,,NaT,5.33
