# McDonald's Joined Basket Analysis - v2 all card tokens
## April 2018
### Dr Jose M Albornoz

This notebook analyses joined basket data using all card tokens to examine i) unmatched transactions; ii) multiple payments between succesive baskets

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
import math
from pyspark.sql.window import *
import pandas as pd

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic functions to load data from a text-based file

In [3]:
# a function to load a colon-separated value file
def load_data_colon(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [4]:
# a function to load a pipe-separated value file
def load_data_pipe(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", "|").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [5]:
# a function to load a comma-separated value file
def load_data_comma(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ",").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Schema for joined basket file

In [6]:
schema_basket = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('timestamp_basket', TimestampType(), True), 
    StructField('pos_code', IntegerType(), True), 
    StructField('pos_id', StringType(), True), 
    StructField('sale_number', IntegerType(), True),
    StructField('total_cost', FloatType(), True),
    StructField('unit_cost', FloatType(), True),
    StructField('quantity', IntegerType(), True), 
    StructField('menu_item_id', IntegerType(), True), 
    StructField('transaction_amount', FloatType(), True), 
    StructField('pan_token', StringType(), True), 
    StructField('timestamp_cards', TimestampType(), True)
])

# 4.- Load joined basket data

In [7]:
df_basket = load_data_comma("joined_basket_reading_all_tokens.csv/part-00000-e68b3079-c1d0-427a-b29e-80eb1114b0a4-c000.csv", \
                           schema_basket)

In [8]:
df_basket = df_basket.select('store_number', 'timestamp_basket', 'pos_code', 'pos_id', 'sale_number', 'pan_token', \
                             'timestamp_cards', 'total_cost').orderBy('store_number', 'pos_code', 'pos_id', \
                                                                      'sale_number')

In [9]:
df_basket.show(20)

+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+----------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|           pan_token|    timestamp_cards|total_cost|
+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+----------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          1|CC2919271D518CE2B...|2017-09-01 00:01:00|       0.0|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          2|CC2919271D518CE2B...|2017-09-01 00:01:00|      0.91|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          3|CC2919271D518CE2B...|2017-09-01 00:01:00|       0.0|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          4|CC2919271D518CE2B...|2017-09-01 00:01:00|      3.33|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|          1|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|    

In [10]:
df_basket.count()

1732243

In [11]:
df_basket.select('store_number', 'timestamp_basket', 'pos_code', 'pos_id', 'sale_number', 'pan_token', \
                             'timestamp_cards').orderBy('store_number', 'pos_code', 'pos_id', \
                                                                      'sale_number').show(10)

+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|           pan_token|    timestamp_cards|
+------------+-------------------+--------+-----------------+-----------+--------------------+-------------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          1|CC2919271D518CE2B...|2017-09-01 00:01:00|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          2|CC2919271D518CE2B...|2017-09-01 00:01:00|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          3|CC2919271D518CE2B...|2017-09-01 00:01:00|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          4|CC2919271D518CE2B...|2017-09-01 00:01:00|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|          1|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|          2|9DCFC4F

# 5.- Compute aggregated basket

In [12]:
df_basket_aggregated = df_basket.groupBy("store_number", "timestamp_basket", "pos_code", "pos_id", \
                                         "pan_token", "timestamp_cards").\
                                 agg(sum("total_cost").alias("total_cost")). \
                                                                         orderBy("store_number", \
                                                                                 "pos_code", "pos_id", \
                                                                                 "timestamp_basket")

In [13]:
df_basket_aggregated.show(10)

+------------+-------------------+--------+-----------------+--------------------+-------------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|           pan_token|    timestamp_cards|        total_cost|
+------------+-------------------+--------+-----------------+--------------------+-------------------+------------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|CC2919271D518CE2B...|2017-09-01 00:01:00| 4.239999949932098|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|  5.68999981880188|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|F8FD3E6E72ECDF9D6...|2017-09-01 00:08:00|2.4800000190734863|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|CA03E173868FFB0F5...|2017-09-01 00:08:00|2.4800000190734863|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|                null|               null| 3.559999942779541|
|         102|2017-09-01

In [14]:
df_basket_aggregated.count()

371702

# 6.- Count of unmatched baskets

In [15]:
df_unmatched = df_basket_aggregated.where(col('pan_token').isNull())

In [16]:
df_unmatched.show(10)

+------------+-------------------+--------+-----------------+---------+---------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|pan_token|timestamp_cards|        total_cost|
+------------+-------------------+--------+-----------------+---------+---------------+------------------+
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|     null|           null| 3.559999942779541|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|     null|           null|3.3199999928474426|
|         102|2017-09-01 00:36:16|       1|POS0001:216437530|     null|           null|3.7200000286102295|
|         102|2017-09-01 00:52:39|       1|POS0001:216437538|     null|           null| 4.750000059604645|
|         102|2017-09-01 01:02:22|       1|POS0001:216437545|     null|           null| 4.070000112056732|
|         102|2017-09-01 01:47:04|       1|POS0001:216437575|     null|           null| 2.990000069141388|
|         102|2017-09-01 02:03:28|   

In [20]:
df_basket_aggregated.where(col('pan_token').isNotNull()).select('store_number', 'timestamp_basket', 'pos_id', \
                                                               'timestamp_cards').show(20)

+------------+-------------------+-----------------+-------------------+
|store_number|   timestamp_basket|           pos_id|    timestamp_cards|
+------------+-------------------+-----------------+-------------------+
|         102|2017-09-01 00:00:51|POS0001:216437503|2017-09-01 00:01:00|
|         102|2017-09-01 00:01:40|POS0001:216437504|2017-09-01 00:02:00|
|         102|2017-09-01 00:07:28|POS0001:216437511|2017-09-01 00:08:00|
|         102|2017-09-01 00:07:28|POS0001:216437511|2017-09-01 00:08:00|
|         102|2017-09-01 00:10:50|POS0001:216437513|2017-09-01 00:11:00|
|         102|2017-09-01 00:10:50|POS0001:216437513|2017-09-01 00:11:00|
|         102|2017-09-01 00:11:14|POS0001:216437514|2017-09-01 00:12:00|
|         102|2017-09-01 00:14:50|POS0001:216437517|2017-09-01 00:16:00|
|         102|2017-09-01 00:14:50|POS0001:216437517|2017-09-01 00:16:00|
|         102|2017-09-01 00:16:23|POS0001:216437518|2017-09-01 00:29:00|
|         102|2017-09-01 00:29:23|POS0001:216437526

## 6.1.- Are there any repeated pos_ids in the unmatched baskets?

In [17]:
len([i.pos_id for i in df_unmatched.select('pos_id').distinct().collect()])

51067

In [18]:
df_unmatched.count()

51067

# 7.- Count of multiple payments between transactions

In [None]:
df_basket_multiple_payments = df_basket_aggregated.groupBy('pos_id').count()

In [None]:
df_basket_multiple_payments.show(10)

In [None]:
df_basket_multiple_payments.count()

In [None]:
df_basket_multiple_payments.where(df_basket_multiple_payments['count'] != 1).count()

# 8.- Convert to Pandas

In [None]:
pdf_basket = df_basket.toPandas()

In [None]:
pdf_basket.head(50)

In [None]:
pdf_basket.tail(50)

In [None]:
pdf_basket_aggregated = df_basket_aggregated.toPandas()

In [None]:
pdf_basket_aggregated.head(50)

In [None]:
pdf_basket_aggregated.tail(50)