# McDonald's Basket Data Analysis - v2: Toshiba card tokens
## April 2018
### Dr Jose M Albornoz

This notebook performs basket data and card data merge for September 2017, accounting for all 7 stores in the Reading area. Only Toshiba tokens are being considered.

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.functions import row_number
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
import math
from pyspark.sql.window import *

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic functions to load data from a text-based file

In [3]:
# a function to load a colon-separated value file
def load_data_colon(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [4]:
# a function to load a pipe-separated value file
def load_data_pipe(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", "|").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Schema for card data files

In [5]:
schema_card = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('terminal_number', IntegerType(), True), 
    StructField('transaction_date', StringType(), True), 
    StructField('transaction_time', IntegerType(), True), 
    StructField('transaction_amount', FloatType(), True),
    StructField('card_scheme', StringType(), True),
    StructField('pan_token', StringType(), True),
    StructField('empty_field', IntegerType(), True)    
])

# 4.- Schema for basket data files

In [6]:
schema_basket = StructType([ 
    StructField('store_number', StringType(), True), 
    StructField('time_period', StringType(), True), 
    StructField('time_of_day', StringType(), True), 
    StructField('business_date', StringType(), True), 
    StructField('pos_code', StringType(), True),
    StructField('c6', StringType(), True),
    StructField('total_cost', StringType(), True),
    StructField('unit_cost', StringType(), True),
    StructField('quantity', StringType(), True), 
    StructField('food_cost', StringType(), True), 
    StructField('paper_cost', StringType(), True), 
    StructField('hour', StringType(), True), 
    StructField('transaction_time', StringType(), True), 
    StructField('transaction_date', StringType(), True),
    StructField('pos_id', StringType(), True),
    StructField('unique_till_code', StringType(), True),
    StructField('sale_number', StringType(), True),
    StructField('menu_item_id', StringType(), True), 
    StructField('till_key', StringType(), True), 
    StructField('channel', StringType(), True), 
    StructField('c20', StringType(), True), 
    StructField('sale_or_refund', StringType(), True),
    StructField('eatin_eatout', StringType(), True),
    StructField('eat_in_eatout_str', StringType(), True),
    StructField('payment_type_id', StringType(), True),
    StructField('payment_type', StringType(), True) 
])

# 5.- Basket data preprocessing

## 5.1.- Load basket data

In [7]:
df_basket = load_data_pipe("BasketDataReading7Sept_not15/000", schema_basket)

In [8]:
#df_basket.show(10)

## 5.2.- Cast basket data columns to the correct types

In [9]:
df_basket = df_basket.withColumn("store_number", df_basket["store_number"].cast(IntegerType()))        
df_basket = df_basket.withColumn("time_period", df_basket["time_period"].cast(IntegerType()))
df_basket = df_basket.withColumn("time_of_day", df_basket["time_of_day"].cast(StringType()))
df_basket = df_basket.withColumn("business_date", df_basket["business_date"].cast(DateType()))
df_basket = df_basket.withColumn("pos_code", df_basket["pos_code"].cast(IntegerType()))
df_basket = df_basket.withColumn("c6", df_basket["c6"].cast(StringType()))
df_basket = df_basket.withColumn("total_cost", df_basket["total_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("unit_cost", df_basket["unit_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("quantity", df_basket["quantity"].cast(IntegerType()))
df_basket = df_basket.withColumn("food_cost", df_basket["food_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("paper_cost", df_basket["paper_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("hour", df_basket["hour"].cast(IntegerType()))
df_basket = df_basket.withColumn("transaction_time", df_basket["transaction_time"].cast(StringType()))
df_basket = df_basket.withColumn("transaction_date", df_basket["transaction_date"].cast(StringType()))
df_basket = df_basket.withColumn("pos_id", df_basket["pos_id"].cast(StringType()))
df_basket = df_basket.withColumn("unique_till_code", df_basket["unique_till_code"].cast(IntegerType()))
df_basket = df_basket.withColumn("sale_number", df_basket["sale_number"].cast(IntegerType()))
df_basket = df_basket.withColumn("menu_item_id", df_basket["menu_item_id"].cast(IntegerType()))
df_basket = df_basket.withColumn("till_key", df_basket["till_key"].cast(IntegerType()))
df_basket = df_basket.withColumn("channel", df_basket["channel"].cast(StringType()))
df_basket = df_basket.withColumn("c20", df_basket["c20"].cast(StringType()))
df_basket = df_basket.withColumn("sale_or_refund", df_basket["sale_or_refund"].cast(StringType()))
df_basket = df_basket.withColumn("eatin_eatout", df_basket["eatin_eatout"].cast(IntegerType()))
df_basket = df_basket.withColumn("eat_in_eatout_str", df_basket["eat_in_eatout_str"].cast(StringType()))
df_basket = df_basket.withColumn("payment_type_id", df_basket["payment_type_id"].cast(IntegerType()))
df_basket = df_basket.withColumn("payment_type", df_basket["payment_type"].cast(StringType()))

In [10]:
all_transactions = df_basket.count()

In [11]:
all_transactions

1278812

## 5.3.- Only 'Sales' and not 'Refunds' will be considered

In [12]:
[i.sale_or_refund for i in df_basket.select('sale_or_refund').distinct().collect()]

['Sale']

In [13]:
df_basket = df_basket.filter(df_basket['sale_or_refund'] == 'Sale')

In [14]:
df_basket.count()

1278812

## 5.4.- Convert transaction date and time into a timestamp, filter so that only data for September is considered

In [15]:
col = to_timestamp(df_basket['transaction_time'], 'yyyy-MM-dd HH:mm:ss')

In [16]:
df_basket = df_basket.withColumn('timestamp_basket', col)

In [17]:
df_basket = df_basket.drop('transaction_date', 'transaction_time', 'hour').orderBy('timestamp_basket')

In [18]:
from pyspark.sql.functions import col
df_basket = df_basket.filter(col('timestamp_basket') >= '2017-09-01 00:00:00')

In [19]:
df_basket.count()

1278605

## 5.5.- Select relevant columns from basket data

In [20]:
df_basket_reduced = df_basket.select('store_number', 'timestamp_basket', 'pos_code', 'pos_id', 'sale_number', 'total_cost', \
                                     'unit_cost', 'quantity', 'menu_item_id', 'sale_or_refund', \
                                     'payment_type_id', 'channel')

In [21]:
df_basket_reduced.show(5)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+--------------+---------------+-------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|sale_or_refund|payment_type_id|      channel|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+--------------+---------------+-------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          2|      0.91|     0.91|       1|        4810|          Sale|              2|FRONT COUNTER|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          1|       0.0|      0.0|       2|        4600|          Sale|              2|FRONT COUNTER|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          3|       0.0|      0.0|       2|        4650|          Sale|              2|FRONT COUNTER|
|         102|2017-09-01 00:

## 5.7.- Split basket data into 'cash' and 'cashless' transaction, sort by store number, timestamp, till ID and sale number

In [22]:
df_basket_cash = df_basket_reduced.filter(df_basket["payment_type_id"] == 1)

In [23]:
df_basket_cash = df_basket_cash.orderBy("store_number", "timestamp_basket", "pos_id", "sale_number")

In [24]:
df_basket_cash = df_basket_cash.drop('sale_or_refund', 'payment_type_id')

In [25]:
df_basket_cashless = df_basket_reduced.filter(df_basket["payment_type_id"] == 2)

In [26]:
df_basket_cashless = df_basket_cashless.orderBy("store_number", "timestamp_basket", "pos_id", "sale_number")

In [27]:
df_basket_cashless = df_basket_cashless.drop('sale_or_refund', 'payment_type_id')

In [28]:
df_basket_cash.show(20)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+-------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|      channel|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+-------------+
|         102|2017-09-01 00:03:25|       1|POS0001:216437505|          1|      0.83|     0.83|       1|        1010|FRONT COUNTER|
|         102|2017-09-01 00:03:25|       1|POS0001:216437505|          2|      0.83|     0.83|       1|        3405|FRONT COUNTER|
|         102|2017-09-01 00:04:18|       1|POS0001:216437506|          1|      1.16|     1.16|       1|        4216|FRONT COUNTER|
|         102|2017-09-01 00:04:56|       1|POS0001:216437507|          1|      3.43|     3.43|       1|        7233|FRONT COUNTER|
|         102|2017-09-01 00:04:56|       1|POS0001:216437507|          2|      2.74

In [29]:
df_basket_cashless.show(20)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+-------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|      channel|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+-------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          1|       0.0|      0.0|       2|        4600|FRONT COUNTER|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          2|      0.91|     0.91|       1|        4810|FRONT COUNTER|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          3|       0.0|      0.0|       2|        4650|FRONT COUNTER|
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|          4|      3.33|     3.33|       1|        1420|FRONT COUNTER|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|          1|      -0.8

In [30]:
cash_transactions = df_basket_cash.count()

In [31]:
cashless_transactions = df_basket_cashless.count()

In [32]:
cash_transactions

439217

In [33]:
cashless_transactions

835069

## 5.8.- Compute percentage of 'cash' and 'cashless' transaction

In [34]:
cash_transactions*100/all_transactions

34.34570523266907

In [35]:
cashless_transactions*100/all_transactions

65.30037253325743

## 5.9.- Get Reading store numbers

In [36]:
stores_list = [i.store_number for i in df_basket.select('store_number').distinct().collect()]

In [37]:
stores_list

[1339, 787, 980, 960, 1036, 102, 1262]

# 6.- Card data preprocessing

## 6.1.- Load and filter card data by store number, drop irrelevant fields

In [38]:
df_Sep2017T = sqlContext.read.csv("McD_Card_Data/Sep2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema_card)

In [39]:
df_cards = df_Sep2017T.filter(df_Sep2017T['store_number'].isin(stores_list))

In [40]:
df_cards = df_cards.orderBy("store_number", "transaction_date", "transaction_time")

In [41]:
df_cards = df_cards.drop('card_scheme', 'empty_field')

In [42]:
df_cards.show(5)

+------------+---------------+----------------+----------------+------------------+--------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|           pan_token|
+------------+---------------+----------------+----------------+------------------+--------------------+
|         102|              1|      2017/09/01|               1|             508.0|CC2919271D518CE2B...|
|         102|              1|      2017/09/01|               2|             778.0|9DCFC4FE00D7565E7...|
|         102|              1|      2017/09/01|               8|             298.0|F8FD3E6E72ECDF9D6...|
|         102|              1|      2017/09/01|               8|             427.0|CA03E173868FFB0F5...|
|         102|              1|      2017/09/01|              11|             448.0|820B52BFA23284825...|
+------------+---------------+----------------+----------------+------------------+--------------------+
only showing top 5 rows



In [43]:
df_cards.count()

181844

## 6.2.- Convert transaction amount to pounds

In [44]:
def get_amount_pounds(amount_pence):
    amount_pounds = amount_pence/100.0
    return amount_pounds

get_amount_pounds_udf = udf(get_amount_pounds, FloatType())

In [45]:
df_cards = df_cards.withColumn("transaction_amount", get_amount_pounds_udf(df_cards['transaction_amount']))

In [46]:
df_cards.show(5)

+------------+---------------+----------------+----------------+------------------+--------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|           pan_token|
+------------+---------------+----------------+----------------+------------------+--------------------+
|         102|              1|      2017/09/01|               1|              5.08|CC2919271D518CE2B...|
|         102|              1|      2017/09/01|               2|              7.78|9DCFC4FE00D7565E7...|
|         102|              1|      2017/09/01|               8|              2.98|F8FD3E6E72ECDF9D6...|
|         102|              1|      2017/09/01|               8|              4.27|CA03E173868FFB0F5...|
|         102|              1|      2017/09/01|              11|              4.48|820B52BFA23284825...|
+------------+---------------+----------------+----------------+------------------+--------------------+
only showing top 5 rows



## 6.3.- Convert transaction time to hour and minutes

In [47]:
def convert_time(time_int):
    time_flt = time_int/99.9999999
    time = math.modf(time_flt)
    hour = str(int(time[1]))
    minutes = str(int(time[0]*100))
    
    if len(hour) == 1:
        hour = hour.zfill(2)
    if len(minutes) == 1:
        minutes = minutes.zfill(2)
    
    time_str = hour + ':' + minutes + ':' + '00'
    return time_str

convert_time_udf = udf(convert_time, StringType())

In [48]:
df_cards = df_cards.withColumn("transaction_time", convert_time_udf(df_cards['transaction_time']))

In [49]:
df_cards.show(5)

+------------+---------------+----------------+----------------+------------------+--------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|           pan_token|
+------------+---------------+----------------+----------------+------------------+--------------------+
|         102|              1|      2017/09/01|        00:01:00|              5.08|CC2919271D518CE2B...|
|         102|              1|      2017/09/01|        00:02:00|              7.78|9DCFC4FE00D7565E7...|
|         102|              1|      2017/09/01|        00:08:00|              2.98|F8FD3E6E72ECDF9D6...|
|         102|              1|      2017/09/01|        00:08:00|              4.27|CA03E173868FFB0F5...|
|         102|              1|      2017/09/01|        00:11:00|              4.48|820B52BFA23284825...|
+------------+---------------+----------------+----------------+------------------+--------------------+
only showing top 5 rows



## 6.4.- Combine transaction date and time into a single timestamp, drop transaction_date and transaction_time

In [50]:
df_cards = df_cards.withColumn('timestamp_tmp', concat(df_cards["transaction_date"], lit(" "), \
                                                       df_cards["transaction_time"]))

In [51]:
col = to_timestamp(df_cards['timestamp_tmp'], 'yyyy/MM/dd HH:mm:ss')

In [52]:
df_cards = df_cards.withColumn('timestamp_cards', col)

In [53]:
df_cards = df_cards.drop('transaction_date', 'transaction_time', 'timestamp_tmp')

In [54]:
df_cards = df_cards.orderBy('store_number', 'terminal_number', 'timestamp_cards')

In [55]:
df_cards.show(20)

+------------+---------------+------------------+--------------------+-------------------+
|store_number|terminal_number|transaction_amount|           pan_token|    timestamp_cards|
+------------+---------------+------------------+--------------------+-------------------+
|         102|              1|              5.08|CC2919271D518CE2B...|2017-09-01 00:01:00|
|         102|              1|              7.78|9DCFC4FE00D7565E7...|2017-09-01 00:02:00|
|         102|              1|              4.27|CA03E173868FFB0F5...|2017-09-01 00:08:00|
|         102|              1|              2.98|F8FD3E6E72ECDF9D6...|2017-09-01 00:08:00|
|         102|              1|              5.86|D34051D8DACF1DFB6...|2017-09-01 00:11:00|
|         102|              1|              4.48|820B52BFA23284825...|2017-09-01 00:11:00|
|         102|              1|              3.97|1F3A3D6A9B05A6DCD...|2017-09-01 00:12:00|
|         102|              1|              2.98|2742D137D0DDA803F...|2017-09-01 00:16:00|

# 7.- Remove unnecessary data from memory

In [56]:
df_Sep2017T.unpersist()

DataFrame[store_number: int, terminal_number: int, transaction_date: string, transaction_time: int, transaction_amount: float, card_scheme: string, pan_token: string, empty_field: int]

# 8.- Linkage

## 8.1.- Compute aggregated basket with sale total plus VAT

In [57]:
df_basket_cashless_aggregated = df_basket_cashless.groupBy("store_number", "timestamp_basket", "pos_code", \
                                                           "pos_id", "channel").agg(sum("total_cost").\
                                                            alias("total_cost")). \
                                                            orderBy("store_number", "pos_code", "pos_id", \
                                                                                 "timestamp_basket")

In [58]:
df_basket_cashless_aggregated.show(20)

+------------+-------------------+--------+-----------------+-------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|      channel|        total_cost|
+------------+-------------------+--------+-----------------+-------------+------------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|FRONT COUNTER| 4.239999949932098|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|FRONT COUNTER|  5.68999981880188|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|FRONT COUNTER|2.4800000190734863|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|FRONT COUNTER| 3.559999942779541|
|         102|2017-09-01 00:10:50|       1|POS0001:216437513|FRONT COUNTER| 4.889999985694885|
|         102|2017-09-01 00:11:14|       1|POS0001:216437514|FRONT COUNTER|3.7399999499320984|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|FRONT COUNTER|3.3199999928474426|
|         102|2017-09-01 00:14:50|       1|POS0001

In [59]:
from pyspark.sql.functions import col
df_basket_cashless_aggregated = df_basket_cashless_aggregated.withColumn('sale_total', col('total_cost')*1.2)

In [60]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.\
                                withColumn("total_cost", round(df_basket_cashless_aggregated["total_cost"], 2))

In [61]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.\
                                withColumn("sale_total", round(df_basket_cashless_aggregated['sale_total'], 2))

In [62]:
df_basket_cashless_aggregated.show(20)

+------------+-------------------+--------+-----------------+-------------+----------+----------+
|store_number|   timestamp_basket|pos_code|           pos_id|      channel|total_cost|sale_total|
+------------+-------------------+--------+-----------------+-------------+----------+----------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|FRONT COUNTER|      4.24|      5.09|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|FRONT COUNTER|      5.69|      6.83|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|FRONT COUNTER|      2.48|      2.98|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|FRONT COUNTER|      3.56|      4.27|
|         102|2017-09-01 00:10:50|       1|POS0001:216437513|FRONT COUNTER|      4.89|      5.87|
|         102|2017-09-01 00:11:14|       1|POS0001:216437514|FRONT COUNTER|      3.74|      4.49|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|FRONT COUNTER|      3.32|      3.98|
|         102|2017-0

In [63]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.select("store_number", "timestamp_basket", \
                                                           "pos_code", "pos_id", "channel", "sale_total")

In [64]:
df_basket_cashless_aggregated.show(20)

+------------+-------------------+--------+-----------------+-------------+----------+
|store_number|   timestamp_basket|pos_code|           pos_id|      channel|sale_total|
+------------+-------------------+--------+-----------------+-------------+----------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|FRONT COUNTER|      5.09|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|FRONT COUNTER|      6.83|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|FRONT COUNTER|      2.98|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|FRONT COUNTER|      4.27|
|         102|2017-09-01 00:10:50|       1|POS0001:216437513|FRONT COUNTER|      5.87|
|         102|2017-09-01 00:11:14|       1|POS0001:216437514|FRONT COUNTER|      4.49|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|FRONT COUNTER|      3.98|
|         102|2017-09-01 00:14:50|       1|POS0001:216437517|FRONT COUNTER|      3.48|
|         102|2017-09-01 00:16:23|       1|

In [65]:
df_basket_cashless_aggregated.count()

178274

## 8.2.- Adds a column with row number to aggregated basket data

In [66]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.withColumn("row_num", row_number().\
                                                                         over(Window.orderBy("store_number", \
                                                                                 "pos_code", "pos_id", \
                                                                                 "timestamp_basket")))

In [67]:
df_basket_cashless_aggregated.show(20)

+------------+-------------------+--------+-----------------+-------------+----------+-------+
|store_number|   timestamp_basket|pos_code|           pos_id|      channel|sale_total|row_num|
+------------+-------------------+--------+-----------------+-------------+----------+-------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|FRONT COUNTER|      5.09|      1|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|FRONT COUNTER|      6.83|      2|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|FRONT COUNTER|      2.98|      3|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|FRONT COUNTER|      4.27|      4|
|         102|2017-09-01 00:10:50|       1|POS0001:216437513|FRONT COUNTER|      5.87|      5|
|         102|2017-09-01 00:11:14|       1|POS0001:216437514|FRONT COUNTER|      4.49|      6|
|         102|2017-09-01 00:12:16|       1|POS0001:216437516|FRONT COUNTER|      3.98|      7|
|         102|2017-09-01 00:14:50|       1|POS0001

## 8.3.- Adds a column with next basket timestamp to aggregated basket data

In [68]:
w = Window().partitionBy().orderBy(col("row_num"))
df_basket_cashless_aggregated = df_basket_cashless_aggregated.select("*", lead("timestamp_basket"). \
                                                                     over(w).alias("next_timestamp")). \
                                                                     orderBy("store_number", \
                                                                             "pos_code", "pos_id", \
                                                                             "timestamp_basket")

In [69]:
df_basket_cashless_aggregated.show()

+------------+-------------------+--------+-----------------+-------------+----------+-------+-------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|      channel|sale_total|row_num|     next_timestamp|
+------------+-------------------+--------+-----------------+-------------+----------+-------+-------------------+
|         102|2017-09-01 00:00:51|       1|POS0001:216437503|FRONT COUNTER|      5.09|      1|2017-09-01 00:01:40|
|         102|2017-09-01 00:01:40|       1|POS0001:216437504|FRONT COUNTER|      6.83|      2|2017-09-01 00:07:28|
|         102|2017-09-01 00:07:28|       1|POS0001:216437511|FRONT COUNTER|      2.98|      3|2017-09-01 00:08:34|
|         102|2017-09-01 00:08:34|       1|POS0001:216437512|FRONT COUNTER|      4.27|      4|2017-09-01 00:10:50|
|         102|2017-09-01 00:10:50|       1|POS0001:216437513|FRONT COUNTER|      5.87|      5|2017-09-01 00:11:14|
|         102|2017-09-01 00:11:14|       1|POS0001:216437514|FRONT COUNTER|     

### 8.3.1- Filter out last row of aggregated basket dataframe

In [70]:
df_basket_cashless_aggregated.count()

178274

In [71]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.filter(col('row_num') != 178274) 

In [72]:
df_basket_cashless_aggregated.count()

178273

## 8.3.- Join aggregated basket with card data 

In [73]:
df_basket_cashless_aggregated.registerTempTable('basket_aggregated')
df_cards.registerTempTable('cards')

In [77]:
df_joined_basket0 = sqlContext.sql("SELECT basket_aggregated.store_number, basket_aggregated.pos_code, \
                             basket_aggregated.pos_id, basket_aggregated.channel, basket_aggregated.sale_total, \
                             cards.transaction_amount, cards.pan_token, basket_aggregated.timestamp_basket, \
                             cards.timestamp_cards, basket_aggregated.next_timestamp \
                             FROM basket_aggregated \
                             JOIN cards ON \
                             (basket_aggregated.store_number = cards.store_number AND \
                             basket_aggregated.pos_code = cards.terminal_number) AND \
                             ((basket_aggregated.timestamp_basket < cards.timestamp_cards AND \
                             basket_aggregated.next_timestamp > cards.timestamp_cards) AND \
                             (basket_aggregated.sale_total < 1.3*cards.transaction_amount OR \
                             basket_aggregated.sale_total > 0.7*cards.transaction_amount)) \
                             ORDER BY basket_aggregated.store_number,  basket_aggregated.pos_code, \
                             basket_aggregated.pos_id, basket_aggregated.timestamp_basket")

In [78]:
df_joined_basket0.show(100)

KeyboardInterrupt: 

In [None]:
df_joined_basket0.count()

In [None]:
df = df_joined_basket0.select('store_number', 'pos_id', 'timestamp_basket', 'timestamp_cards', 'next_timestamp', \
                             'sale_total', 'transaction_amount')

In [None]:
df.show()

In [None]:
df_joined_basket0.registerTempTable('joined_basket0')

## 8.4.- Join total sale - basket data with card data 

In [None]:
df_basket_cashless.registerTempTable('basket')

In [None]:
df_joined_basket1 = sqlContext.sql("SELECT basket.*, joined_basket0.transaction_amount, joined_basket0.pan_token, \
                             joined_basket0.timestamp_cards \
                             FROM basket \
                             LEFT JOIN joined_basket0 ON \
                             basket.store_number = joined_basket0.store_number AND \
                             basket.pos_id = joined_basket0.pos_id \
                             ORDER BY basket.store_number, basket.pos_code, basket.pos_id, \
                             basket.sale_number, basket.timestamp_basket")

In [None]:
df_joined_basket1.show(100)

In [None]:
df_joined_basket1.count()

## 9.4.- Save joined basket data to disk 

In [None]:
df_joined_basket1.repartition(1).write.format('com.databricks.spark.csv').save('joined_basket_reading_v2.csv', header = 'true')