# McDonald's Joined Basket Analysis - v3 
## April 2018
### Dr Jose M Albornoz

This notebook analyses joined basket data using Ingenico card tokens form March 2018 in the Reading area stores to examine i) unmatched transactions; ii) multiple payments between succesive baskets

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
import math
from pyspark.sql.window import *
import pandas as pd

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic functions to load data from a text-based file

In [3]:
# a function to load a colon-separated value file
def load_data_colon(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [4]:
# a function to load a pipe-separated value file
def load_data_pipe(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", "|").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [5]:
# a function to load a comma-separated value file
def load_data_comma(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ",").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Schema for joined basket file

In [6]:
schema_basket = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('timestamp_basket', TimestampType(), True), 
    StructField('pos_code', IntegerType(), True), 
    StructField('pos_id', StringType(), True), 
    StructField('sale_number', IntegerType(), True), 
    StructField('total_cost', FloatType(), True),\
    StructField('menu_item_id', IntegerType(), True),
    StructField('channel', StringType(), True), 
    StructField('pan_token', StringType(), True),     
    StructField('timestamp_cards', TimestampType(), True),
    StructField('next_timestamp', TimestampType(), True)
])

# 4.- Load joined basket data

In [7]:
df_basket0 = load_data_comma("joined_basket_ReadingMarch2018.csv/part-00000-ccefe0fd-a5df-450d-b884-807f401ceb93-c000.csv", \
                           schema_basket)

In [8]:
df_basket0 = df_basket0.select('store_number', 'pos_code', 'pos_id', 'sale_number', 'pan_token', 'total_cost', \
                               'timestamp_basket', 'timestamp_cards', 'next_timestamp').\
                               orderBy('store_number', 'pos_code', 'timestamp_basket', 'sale_number')

In [9]:
df_basket0.show(20)

+------------+--------+-----------------+-----------+-------------------+----------+-------------------+-------------------+-------------------+
|store_number|pos_code|           pos_id|sale_number|          pan_token|total_cost|   timestamp_basket|    timestamp_cards|     next_timestamp|
+------------+--------+-----------------+-----------+-------------------+----------+-------------------+-------------------+-------------------+
|         102|       1|POS0001:216506961|          1|5573611294275381348|      1.66|2018-03-01 12:13:19|2018-03-01 12:14:19|2018-03-01 12:14:40|
|         102|       1|POS0001:216506961|          2|5573611294275381348|       0.0|2018-03-01 12:13:19|2018-03-01 12:14:19|2018-03-01 12:14:40|
|         102|       1|POS0001:216506962|          1|5573611294275381348|      1.33|2018-03-01 12:14:40|2018-03-01 12:15:10|2018-03-01 13:24:55|
|         102|       1|POS0001:216506962|          2|5573611294275381348|     -0.76|2018-03-01 12:14:40|2018-03-01 12:15:10|2018-0

In [10]:
df_basket0.count()

1979699

# 5.- Compute aggregated basket

In [11]:
df_basket_aggregated = df_basket0.groupBy("store_number", "pos_code", "pos_id", "pan_token", "timestamp_basket", \
                                          "timestamp_cards", "next_timestamp").\
                                 agg(sum("total_cost").alias("total_cost")). \
                                                                         orderBy("store_number", \
                                                                                 "pos_code", \
                                                                                 "timestamp_basket")

In [12]:
df_basket_aggregated.show(10)

+------------+--------+-----------------+-------------------+-------------------+-------------------+-------------------+------------------+
|store_number|pos_code|           pos_id|          pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|        total_cost|
+------------+--------+-----------------+-------------------+-------------------+-------------------+-------------------+------------------+
|         102|       1|POS0001:216506961|5573611294275381348|2018-03-01 12:13:19|2018-03-01 12:14:19|2018-03-01 12:14:40| 1.659999966621399|
|         102|       1|POS0001:216506962|5573611294275381348|2018-03-01 12:14:40|2018-03-01 12:15:10|2018-03-01 13:24:55| 5.480000197887421|
|         102|       1|POS0001:216506988|3770647320610855465|2018-03-01 13:24:55|2018-03-01 13:25:13|2018-03-01 13:31:58| 4.990000009536743|
|         102|       1|POS0001:216506993|4658598774717247027|2018-03-01 13:31:58|2018-03-01 13:32:12|2018-03-01 13:41:34|0.7400000095367432|
|         102

In [13]:
total_transactions = df_basket_aggregated.count()

In [14]:
total_transactions

390172

# 6.- Count of unmatched baskets

In [15]:
df_unmatched = df_basket_aggregated.where(col('pan_token').isNull())

In [16]:
df_unmatched.show(10)

+------------+--------+-----------------+---------+-------------------+-------------------+-------------------+------------------+
|store_number|pos_code|           pos_id|pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|        total_cost|
+------------+--------+-----------------+---------+-------------------+-------------------+-------------------+------------------+
|         102|       1| POS0023:35435297|     null|2018-03-02 12:05:53|2018-03-07 19:32:26|2018-03-10 13:14:47|  3.39000004529953|
|         102|       1| POS0023:35435297|     null|2018-03-02 12:05:53|2018-03-03 21:02:37|2018-03-10 13:14:47|  3.39000004529953|
|         102|       1|POS0022:184234456|     null|2018-03-03 11:34:10|2018-03-03 21:02:37|2018-03-06 01:27:02|3.3100000619888306|
|         102|       1|POS0001:216508027|     null|2018-03-03 21:01:16|2018-03-03 21:02:37|2018-03-03 21:08:05| 3.319999933242798|
|         102|       1|POS0024:685699383|     null|2018-03-06 01:16:03|2018-03-13 1

## 6.1.- Proportion of unmatched transactions

In [17]:
unmatched_transactions = df_unmatched.count()

In [18]:
unmatched_transactions

832

In [19]:
unmatched_transactions*100/total_transactions

0.21323928933906072

## 6.2.- Are there any repeated timestamps in the unmatched baskets?

In [20]:
len([i.timestamp_basket for i in df_unmatched.select('timestamp_basket').distinct().collect()])

418

# 7.- Count of multiple payments between transactions

In [21]:
df_basket_payment_counts = df_basket_aggregated.groupBy("store_number", "pos_code", "pos_id", "pan_token", \
                                                        "timestamp_basket", "timestamp_cards", "next_timestamp").\
                                 agg(count('pos_id').alias("payment_count")). \
                                 orderBy("store_number", "pos_id", "timestamp_basket")

In [22]:
df_basket_payment_counts = df_basket_aggregated.groupBy('store_number', 'pos_code', 'pos_id', 'timestamp_basket').count()

In [23]:
df_basket_payment_counts.show(10)

+------------+--------+-------------------+-----+
|store_number|pos_code|   timestamp_basket|count|
+------------+--------+-------------------+-----+
|         102|       1|2018-03-01 12:13:19|    1|
|         102|       1|2018-03-01 12:14:40|    1|
|         102|       1|2018-03-01 13:24:55|    1|
|         102|       1|2018-03-01 13:31:58|    1|
|         102|       1|2018-03-01 13:41:34|    1|
|         102|       1|2018-03-01 13:45:02|    1|
|         102|       1|2018-03-01 13:48:46|    1|
|         102|       1|2018-03-01 13:49:23|    1|
|         102|       1|2018-03-01 13:59:07|    1|
|         102|       1|2018-03-01 13:59:38|    1|
+------------+--------+-------------------+-----+
only showing top 10 rows



In [24]:
df_basket_payment_counts.count()

182006

In [32]:
df_single_payments = df_basket_payment_counts.where(df_basket_payment_counts['count'] == 1)

In [33]:
single_payments = df_single_payments.count()

In [34]:
single_payments

157056

In [25]:
df_multiple_payments = df_basket_payment_counts.where(df_basket_payment_counts['count'] != 1)

In [26]:
df_multiple_payments.show()

+------------+--------+-------------------+-----+
|store_number|pos_code|   timestamp_basket|count|
+------------+--------+-------------------+-----+
|         102|       1|2018-03-02 02:19:31|    2|
|         102|       1|2018-03-02 12:05:53| 1496|
|         102|       1|2018-03-02 13:32:37|    2|
|         102|       1|2018-03-03 10:56:45|    2|
|         102|       1|2018-03-03 11:31:25|    2|
|         102|       1|2018-03-03 11:34:10|  480|
|         102|       1|2018-03-03 11:57:21|    2|
|         102|       1|2018-03-03 12:32:25|    2|
|         102|       1|2018-03-03 13:55:23|    2|
|         102|       1|2018-03-03 21:01:16|    2|
|         102|       1|2018-03-04 12:33:38|    2|
|         102|       1|2018-03-04 13:31:49|    2|
|         102|       1|2018-03-04 15:24:52|    2|
|         102|       1|2018-03-05 14:06:13|    2|
|         102|       1|2018-03-06 01:14:55|    2|
|         102|       1|2018-03-06 01:16:03| 3801|
|         102|       1|2018-03-06 01:26:19|    2|


In [27]:
multiple_payments = df_multiple_payments.count()

In [28]:
multiple_payments

24950

## 7.1.- A closer look at multiple payments

In [38]:
df_basket_aggregated[(col('store_number') == 102) & \
                     (col('pos_code') == 1) & \
                    (col('timestamp_basket') == '2018-03-02 12:05:53')].\
                    select('pos_id', 'pan_token', 'timestamp_basket', 'timestamp_cards', \
                           'next_timestamp').show()

+----------------+-------------------+-------------------+-------------------+-------------------+
|          pos_id|          pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|
+----------------+-------------------+-------------------+-------------------+-------------------+
|POS0023:35435297|4929147876412187001|2018-03-02 12:05:53|2018-03-02 13:33:03|2018-03-10 13:14:47|
|POS0023:35435297|4762305177088802330|2018-03-02 12:05:53|2018-03-02 23:30:51|2018-03-10 13:14:47|
|POS0023:35435297|4832044082018736749|2018-03-02 12:05:53|2018-03-03 10:09:32|2018-03-10 13:14:47|
|POS0023:35435297|4386621274821512081|2018-03-02 12:05:53|2018-03-03 19:19:39|2018-03-10 13:14:47|
|POS0023:35435297|4462913067970885939|2018-03-02 12:05:53|2018-03-04 15:28:24|2018-03-10 13:14:47|
|POS0023:35435297|5355620656579955723|2018-03-02 12:05:53|2018-03-04 20:28:03|2018-03-10 13:14:47|
|POS0023:35435297|4659439603208941941|2018-03-02 12:05:53|2018-03-06 14:41:09|2018-03-10 13:14:47|
|POS0023:3

In [29]:
df_basket_aggregated[(col('store_number') == 102) & \
                     (col('pos_code') == 1) & \
                    (col('timestamp_basket') == '2018-03-06 01:52:13')].show()

+------------+--------+-----------------+-------------------+-------------------+-------------------+-------------------+-----------------+
|store_number|pos_code|           pos_id|          pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|       total_cost|
+------------+--------+-----------------+-------------------+-------------------+-------------------+-------------------+-----------------+
|         102|       1|POS0001:216509065|4386623393317280841|2018-03-06 01:52:13|2018-03-06 01:53:31|2018-03-06 01:53:41|7.659999907016754|
|         102|       1|POS0001:216509065|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|7.659999907016754|
+------------+--------+-----------------+-------------------+-------------------+-------------------+-------------------+-----------------+



In [30]:
df_basket0[(col('store_number') == 102) & \
                     (col('pos_code') == 1) & \
                    (col('timestamp_basket') == '2018-03-06 01:52:13')].\
                    sort('store_number', 'pos_code', 'timestamp_basket', 'timestamp_cards', 'sale_number').\
                    select('pos_id', 'sale_number', 'pan_token', 'timestamp_basket', 'timestamp_cards', \
                           'next_timestamp').show()

+-----------------+-----------+-------------------+-------------------+-------------------+-------------------+
|           pos_id|sale_number|          pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|
+-----------------+-----------+-------------------+-------------------+-------------------+-------------------+
|POS0001:216509065|          1|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|
|POS0001:216509065|          2|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|
|POS0001:216509065|          3|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|
|POS0001:216509065|          4|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|
|POS0001:216509065|          5|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01:53:41|
|POS0001:216509065|          6|4832045180974836027|2018-03-06 01:52:13|2018-03-06 01:52:33|2018-03-06 01

In [31]:
df_basket0[(col('pos_id') == 'POS0021:172009093')].\
                    sort('store_number', 'pos_code', 'timestamp_basket', 'timestamp_cards', 'sale_number').\
                    select('pos_id', 'sale_number', 'pan_token', 'timestamp_basket', 'timestamp_cards', \
                           'next_timestamp').show()

+-----------------+-----------+-------------------+-------------------+-------------------+-------------------+
|           pos_id|sale_number|          pan_token|   timestamp_basket|    timestamp_cards|     next_timestamp|
+-----------------+-----------+-------------------+-------------------+-------------------+-------------------+
|POS0021:172009093|          1|4659435373866350219|2018-03-29 13:58:26|2018-03-29 13:58:39|2018-03-29 14:00:11|
|POS0021:172009093|          2|4659435373866350219|2018-03-29 13:58:26|2018-03-29 13:58:39|2018-03-29 14:00:11|
|POS0021:172009093|          3|4659435373866350219|2018-03-29 13:58:26|2018-03-29 13:58:39|2018-03-29 14:00:11|
|POS0021:172009093|          1|4658581689937518013|2018-03-29 13:58:26|2018-03-29 13:59:37|2018-03-29 14:00:11|
|POS0021:172009093|          2|4658581689937518013|2018-03-29 13:58:26|2018-03-29 13:59:37|2018-03-29 14:00:11|
|POS0021:172009093|          3|4658581689937518013|2018-03-29 13:58:26|2018-03-29 13:59:37|2018-03-29 14

# 8.- Convert to Pandas

In [None]:
pdf_basket = df_basket.toPandas()

In [None]:
pdf_basket.head(50)

In [None]:
pdf_basket.tail(50)

In [None]:
pdf_basket_aggregated = df_basket_aggregated.toPandas()

In [None]:
pdf_basket_aggregated.head(50)

In [None]:
pdf_basket_aggregated.tail(50)