# McDonald's Basket Data Analysis - v1: Toshiba card tokens
## April 2018
### Dr Jose M Albornoz

This notebook explores basket data corresponding to September, October and November 2017 for stores in Exeter, matching it with card data for the same period and the same store. We examine only Toshiba card tokens in this notebook. The basket data timestamps are grouped into 15-minutes periods, unlike the card data timestamps.

# 1.- Import necessary modules, define SQLContext

In [76]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
import math

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic functions to load data from a text-based file

In [3]:
# a function to load a colon-separated value file
def load_data_colon(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='true'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

In [4]:
# a function to load a pipe-separated value file
def load_data_pipe(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", "|").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Schema for card data files

In [5]:
schema_card = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('terminal_number', IntegerType(), True), 
    StructField('transaction_date', StringType(), True), 
    StructField('transaction_time', IntegerType(), True), 
    StructField('transaction_amount', FloatType(), True),
    StructField('card_scheme', StringType(), True),
    StructField('pan_token', StringType(), True),
    StructField('empty_field', IntegerType(), True)    
])

# 4.- Schema for basket data files

In [6]:
schema_basket = StructType([ 
    StructField('store_number', StringType(), True), 
    StructField('time_period', StringType(), True), 
    StructField('time_of_day', StringType(), True), 
    StructField('business_date', StringType(), True), 
    StructField('pos_code', StringType(), True),
    StructField('c6', StringType(), True),
    StructField('total_cost', StringType(), True),
    StructField('unit_cost', StringType(), True),
    StructField('quantity', StringType(), True), 
    StructField('food_cost', StringType(), True), 
    StructField('paper_cost', StringType(), True), 
    StructField('hour', StringType(), True), 
    StructField('transaction_time', StringType(), True), 
    StructField('transaction_date', StringType(), True),
    StructField('pos_id', StringType(), True),
    StructField('unique_till_code', StringType(), True),
    StructField('sale_number', StringType(), True),
    StructField('menu_item_id', StringType(), True), 
    StructField('till_key', StringType(), True), 
    StructField('till_location', StringType(), True), 
    StructField('c20', StringType(), True), 
    StructField('sale_or_refund', StringType(), True),
    StructField('eatin_eatout', StringType(), True),
    StructField('eat_in_eatout_str', StringType(), True),
    StructField('payment_type_id', StringType(), True),
    StructField('payment_type', StringType(), True) 
])

# 5.- Basket data preprocessing

## 5.1.- Load basket data

In [7]:
df_basket = load_data_pipe("BasketDataExeter2SeptOctNov/000", schema_basket)

In [8]:
df_basket.show(10)

+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+-------------------+----------------+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+
|store_number|time_period|time_of_day|business_date|pos_code|  c6|total_cost|unit_cost|quantity|food_cost|paper_cost|hour|   transaction_time|transaction_date|           pos_id|unique_till_code|sale_number|menu_item_id|till_key|till_location|c20|sale_or_refund|eatin_eatout|eat_in_eatout_str|payment_type_id|payment_type|
+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+-------------------+----------------+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+
|         295|          3|  Aftern

In [9]:
df_basket.count()

1546161

## 5.2.- Cast basket data columns to the correct types

In [10]:
df_basket = df_basket.withColumn("store_number", df_basket["store_number"].cast(IntegerType()))        
df_basket = df_basket.withColumn("time_period", df_basket["time_period"].cast(IntegerType()))
df_basket = df_basket.withColumn("time_of_day", df_basket["time_of_day"].cast(StringType()))
df_basket = df_basket.withColumn("business_date", df_basket["business_date"].cast(DateType()))
df_basket = df_basket.withColumn("pos_code", df_basket["pos_code"].cast(IntegerType()))
df_basket = df_basket.withColumn("c6", df_basket["c6"].cast(StringType()))
df_basket = df_basket.withColumn("total_cost", df_basket["total_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("unit_cost", df_basket["unit_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("quantity", df_basket["quantity"].cast(IntegerType()))
df_basket = df_basket.withColumn("food_cost", df_basket["food_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("paper_cost", df_basket["paper_cost"].cast(FloatType()))
df_basket = df_basket.withColumn("hour", df_basket["hour"].cast(IntegerType()))
df_basket = df_basket.withColumn("transaction_time", df_basket["transaction_time"].cast(StringType()))
df_basket = df_basket.withColumn("transaction_date", df_basket["transaction_date"].cast(StringType()))
df_basket = df_basket.withColumn("pos_id", df_basket["pos_id"].cast(StringType()))
df_basket = df_basket.withColumn("unique_till_code", df_basket["unique_till_code"].cast(IntegerType()))
df_basket = df_basket.withColumn("sale_number", df_basket["sale_number"].cast(IntegerType()))
df_basket = df_basket.withColumn("menu_item_id", df_basket["menu_item_id"].cast(IntegerType()))
df_basket = df_basket.withColumn("till_key", df_basket["till_key"].cast(IntegerType()))
df_basket = df_basket.withColumn("till_location", df_basket["till_location"].cast(StringType()))
df_basket = df_basket.withColumn("c20", df_basket["c20"].cast(StringType()))
df_basket = df_basket.withColumn("sale_or_refund", df_basket["sale_or_refund"].cast(StringType()))
df_basket = df_basket.withColumn("eatin_eatout", df_basket["eatin_eatout"].cast(IntegerType()))
df_basket = df_basket.withColumn("eat_in_eatout_str", df_basket["eat_in_eatout_str"].cast(StringType()))
df_basket = df_basket.withColumn("payment_type_id", df_basket["payment_type_id"].cast(IntegerType()))
df_basket = df_basket.withColumn("payment_type", df_basket["payment_type"].cast(StringType()))

In [11]:
all_transactions = df_basket.count()

In [12]:
all_transactions

1546161

## 5.3.- Only 'Sales' and not 'Refunds' will be considered

In [13]:
df_basket = df_basket.filter((df_basket['sale_or_refund'] != 'REFUND') | (df_basket['sale_or_refund'] != 'Refund'))

In [14]:
df_basket.count()

1546161

## 5.4.- Extract transaction time from column 'transaction_time'

In [15]:
def get_time(data_str):
    time_str = data_str.split(' ')[1]
    return time_str

get_time_udf = udf(get_time, StringType())

In [16]:
df_basket = df_basket.withColumn("transaction_time", get_time_udf(df_basket['transaction_time']))

In [17]:
df_basket.show(5)

+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+----------------+----------------+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+
|store_number|time_period|time_of_day|business_date|pos_code|  c6|total_cost|unit_cost|quantity|food_cost|paper_cost|hour|transaction_time|transaction_date|           pos_id|unique_till_code|sale_number|menu_item_id|till_key|till_location|c20|sale_or_refund|eatin_eatout|eat_in_eatout_str|payment_type_id|payment_type|
+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+----------------+----------------+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+
|         295|          3|  Afternoon|   20

## 5.5.- Combine transaction date and time into a single timestamp, drop transaction_date and transaction_time

In [18]:
df_basket = df_basket.withColumn('timestamp_tmp', concat(col("transaction_date"), lit(" "), col("transaction_time")))

In [19]:
col = to_timestamp(df_basket['timestamp_tmp'], 'yyyy-MM-dd HH:mm:ss')

In [20]:
df_basket = df_basket.withColumn('timestamp_basket', col)

In [21]:
df_basket = df_basket.drop('transaction_date', 'transaction_time', 'timestamp_tmp')

In [22]:
df_basket.show(5)

+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+-------------------+
|store_number|time_period|time_of_day|business_date|pos_code|  c6|total_cost|unit_cost|quantity|food_cost|paper_cost|hour|           pos_id|unique_till_code|sale_number|menu_item_id|till_key|till_location|c20|sale_or_refund|eatin_eatout|eat_in_eatout_str|payment_type_id|payment_type|   timestamp_basket|
+------------+-----------+-----------+-------------+--------+----+----------+---------+--------+---------+----------+----+-----------------+----------------+-----------+------------+--------+-------------+---+--------------+------------+-----------------+---------------+------------+-------------------+
|         295|          3|  Afternoon|   2017-10-14|       2|null|       0.0|      0.

## 5.6.- Select relevant columns from basket data

In [24]:
df_basket_reduced = df_basket.select('store_number', 'timestamp_basket', 'pos_code', 'pos_id', 'sale_number', 'total_cost', \
                                     'unit_cost', 'quantity', 'menu_item_id', 'sale_or_refund', \
                                     'payment_type_id')

In [25]:
df_basket_reduced.show(5)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+--------------+---------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|sale_or_refund|payment_type_id|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+--------------+---------------+
|         295|2017-10-14 16:45:00|       2|POS0002:595657242|         10|       0.0|      0.0|       1|        4650|          Sale|              1|
|         295|2017-10-14 16:45:00|       2|POS0002:595657242|         12|       3.1|      3.1|       1|        6120|          Sale|              1|
|         295|2017-10-14 16:45:00|       2|POS0002:595657242|          8|       0.0|      0.0|       1|        2619|          Sale|              1|
|         295|2017-10-14 16:45:00|       2|POS0002:595657242|          9|      0.68|     0.68|       1|        2

## 5.7.- Split basket data into 'cash' and 'cashless' transaction, sort by store number, timestamp, till ID and sale number

In [26]:
df_basket_cash = df_basket_reduced.filter(df_basket["payment_type_id"] == 1)

In [27]:
df_basket_cash = df_basket_cash.orderBy("store_number", "timestamp_basket", "pos_id", "sale_number")

In [28]:
df_basket_cash = df_basket_cash.drop('sale_or_refund', 'payment_type_id')

In [29]:
df_basket_cashless = df_basket_reduced.filter(df_basket["payment_type_id"] == 2)

In [30]:
df_basket_cashless = df_basket_cashless.orderBy("store_number", "timestamp_basket", "pos_id", "sale_number")

In [31]:
df_basket_cashless = df_basket_cashless.drop('sale_or_refund', 'payment_type_id')

In [32]:
df_basket_cash.show(20)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+
|         295|2017-09-01 05:45:00|       2|POS0002:595643113|          1|      0.66|     0.66|       1|        3750|
|         295|2017-09-01 05:45:00|       2|POS0002:595643113|          2|       0.0|      0.0|       1|        3483|
|         295|2017-09-01 05:45:00|       2|POS0002:595643113|          3|      0.83|     0.83|       1|        5410|
|         295|2017-09-01 06:00:00|       2|POS0002:595643115|          1|       0.0|      0.0|       1|        5501|
|         295|2017-09-01 06:00:00|       2|POS0002:595643115|          2|      1.41|     1.41|       1|        5095|
|         295|2017-09-01 06:00:00|       2|POS0002:595643115|   

In [71]:
df_basket_cashless.show(20)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          1|      0.25|     0.25|       1|        3939|
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          2|       0.0|      0.0|       1|        5501|
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          3|      2.57|     2.57|       1|        5127|
|         295|2017-09-01 06:00:00|      24|POS0024:977032388|          1|      2.82|     1.41|       2|        5095|
|         295|2017-09-01 06:15:00|       2|POS0002:595643120|          1|       0.0|      0.0|       1|        3718|
|         295|2017-09-01 06:15:00|       2|POS0002:595643120|   

In [34]:
cash_transactions = df_basket_cash.count()

In [35]:
cashless_transactions = df_basket_cashless.count()

In [36]:
cash_transactions

496907

In [37]:
cashless_transactions

1043511

## 5.8.- Compute percentage of 'cash' and 'cashless' transaction

In [38]:
cash_transactions*100/all_transactions

32.13811498285107

In [39]:
cashless_transactions*100/all_transactions

67.49044892478855

## 5.9.- Get Exeter store numbers

In [40]:
stores_list = [i.store_number for i in df_basket.select('store_number').distinct().collect()]

In [41]:
stores_list

[972, 295]

# 6.- Card data preprocessing

## 6.1.- Load and filter card data by store number

In [42]:
df_Sep2017T = sqlContext.read.csv("McD_Card_Data/Sep2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema_card)

In [43]:
df_Sep2017T = df_Sep2017T.filter(df_Sep2017T['store_number'].isin(stores_list))

In [44]:
df_Oct2017T = sqlContext.read.csv("McD_Card_Data/Oct2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema_card)

In [45]:
df_Oct2017T = df_Oct2017T.filter(df_Oct2017T['store_number'].isin(stores_list))

In [46]:
df_Nov2017T = sqlContext.read.csv("McD_Card_Data/Nov2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema_card)

In [47]:
df_Nov2017T = df_Nov2017T.filter(df_Nov2017T['store_number'].isin(stores_list))

## 6.2.- Concatenate and sort card data, drop irrelevant fields

In [48]:
df_cards = df_Sep2017T.unionAll(df_Oct2017T)

In [49]:
df_cards = df_cards.unionAll(df_Nov2017T)

In [50]:
df_cards = df_cards.orderBy("store_number", "transaction_date", "transaction_time")

In [51]:
df_cards = df_cards.drop('card_scheme', 'empty_field')

In [52]:
df_cards.count()

202023

## 6.3.- Convert transaction amount to pounds

In [53]:
def get_amount_pounds(amount_pence):
    amount_pounds = amount_pence/100.0
    return amount_pounds

get_amount_pounds_udf = udf(get_amount_pounds, FloatType())

In [54]:
df_cards = df_cards.withColumn("transaction_amount", get_amount_pounds_udf(df_cards['transaction_amount']))

In [55]:
df_cards.show(5)

+------------+---------------+----------------+----------------+------------------+--------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|           pan_token|
+------------+---------------+----------------+----------------+------------------+--------------------+
|         295|              2|      2017/09/01|             604|              3.39|5C538EB879B4645AF...|
|         295|             24|      2017/09/01|             606|              3.38|ECDFB189903D31473...|
|         295|              2|      2017/09/01|             626|              3.59|B73494120D3F178C3...|
|         295|              2|      2017/09/01|             629|              3.28|96C37352E2BE6EEA2...|
|         295|             26|      2017/09/01|             637|              1.78|7115627C2EE250A21...|
+------------+---------------+----------------+----------------+------------------+--------------------+
only showing top 5 rows



## 6.4.- Convert transaction time to hour and minutes

In [56]:
def convert_time(time_int):
    time_flt = time_int/99.9999999
    time = math.modf(time_flt)
    hour = str(int(time[1]))
    minutes = str(int(time[0]*100))
    
    if len(hour) == 1:
        hour = hour.zfill(2)
    if len(minutes) == 1:
        minutes = minutes.zfill(2)
    
    time_str = hour + ':' + minutes + ':' + '00'
    return time_str

convert_time_udf = udf(convert_time, StringType())

In [57]:
df_cards = df_cards.withColumn("transaction_time", convert_time_udf(df_cards['transaction_time']))

In [58]:
df_cards.show(5)

+------------+---------------+----------------+----------------+------------------+--------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|           pan_token|
+------------+---------------+----------------+----------------+------------------+--------------------+
|         295|              2|      2017/09/01|        06:04:00|              3.39|5C538EB879B4645AF...|
|         295|             24|      2017/09/01|        06:06:00|              3.38|ECDFB189903D31473...|
|         295|              2|      2017/09/01|        06:26:00|              3.59|B73494120D3F178C3...|
|         295|              2|      2017/09/01|        06:29:00|              3.28|96C37352E2BE6EEA2...|
|         295|             26|      2017/09/01|        06:37:00|              1.78|7115627C2EE250A21...|
+------------+---------------+----------------+----------------+------------------+--------------------+
only showing top 5 rows



## 6.5.- Combine transaction date and time into a single timestamp, drop transaction_date and transaction_time

In [59]:
df_cards = df_cards.withColumn('timestamp_tmp', concat(df_cards["transaction_date"], lit(" "), \
                                                       df_cards["transaction_time"]))

In [60]:
col = to_timestamp(df_cards['timestamp_tmp'], 'yyyy/MM/dd HH:mm:ss')

In [61]:
df_cards = df_cards.withColumn('timestamp_cards', col)

In [62]:
df_cards = df_cards.drop('transaction_date', 'transaction_time', 'timestamp_tmp')

In [63]:
df_cards.show(20)

+------------+---------------+------------------+--------------------+-------------------+
|store_number|terminal_number|transaction_amount|           pan_token|    timestamp_cards|
+------------+---------------+------------------+--------------------+-------------------+
|         295|              2|              3.39|5C538EB879B4645AF...|2017-09-01 06:04:00|
|         295|             24|              3.38|ECDFB189903D31473...|2017-09-01 06:06:00|
|         295|              2|              3.59|B73494120D3F178C3...|2017-09-01 06:26:00|
|         295|              2|              3.28|96C37352E2BE6EEA2...|2017-09-01 06:29:00|
|         295|             26|              1.78|7115627C2EE250A21...|2017-09-01 06:37:00|
|         295|             24|              3.99|9A0B40CD994F4C878...|2017-09-01 06:38:00|
|         295|             26|              6.18|DA5AC4203EB66A5D4...|2017-09-01 06:39:00|
|         295|             22|             14.55|B4C3C406966A533ED...|2017-09-01 06:43:00|

# 7.- Remove unnecessary data from memory

In [64]:
df_Sep2017T.unpersist()
df_Oct2017T.unpersist()
df_Nov2017T.unpersist()

DataFrame[store_number: int, terminal_number: int, transaction_date: string, transaction_time: int, transaction_amount: float, card_scheme: string, pan_token: string, empty_field: int]

# 8.- Save preprocessed data to disk

In [None]:
df_basket_cash.repartition(1).write.format('com.databricks.spark.csv').save('exeter_basket_cash.csv', header = 'true')

In [None]:
df_basket_cashless.repartition(1).write.format('com.databricks.spark.csv').save('exeter_basket_cashless.csv', header = 'true')

In [None]:
df_cards.repartition(1).write.format('com.databricks.spark.csv').save('exeter_cards.csv', header = 'true')

# 9.- Linkage

## 9.1.- Compute basket total plus VAT

In [96]:
df_basket_cashless_aggregated = df_basket_cashless.groupBy("store_number", "timestamp_basket", "pos_code", \
                                                           "pos_id").agg(sum("total_cost").alias("total_cost")). \
                                                                         orderBy("store_number", \
                                                                                 "timestamp_basket", 
                                                                                 "pos_code", \
                                                                                 "pos_id")

In [97]:
df_basket_cashless_aggregated.show(5)

+------------+-------------------+--------+-----------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|        total_cost|
+------------+-------------------+--------+-----------------+------------------+
|         295|2017-09-01 06:00:00|       2|POS0002:595643114| 2.819999933242798|
|         295|2017-09-01 06:00:00|      24|POS0024:977032388| 2.819999933242798|
|         295|2017-09-01 06:15:00|       2|POS0002:595643120| 2.990000009536743|
|         295|2017-09-01 06:15:00|       2|POS0002:595643121|2.7300000190734863|
|         295|2017-09-01 06:30:00|      22|POS0022:802104822|12.120000004768372|
+------------+-------------------+--------+-----------------+------------------+
only showing top 5 rows



In [98]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.withColumn('sale_total', col('total_cost')*1.2)

In [99]:
df_basket_cashless_aggregated.show(5)

+------------+-------------------+--------+-----------------+------------------+------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|        total_cost|        sale_total|
+------------+-------------------+--------+-----------------+------------------+------------------+
|         295|2017-09-01 06:00:00|       2|POS0002:595643114| 2.819999933242798|3.3839999198913575|
|         295|2017-09-01 06:00:00|      24|POS0024:977032388| 2.819999933242798|3.3839999198913575|
|         295|2017-09-01 06:15:00|       2|POS0002:595643120| 2.990000009536743|3.5880000114440915|
|         295|2017-09-01 06:15:00|       2|POS0002:595643121|2.7300000190734863|3.2760000228881836|
|         295|2017-09-01 06:30:00|      22|POS0022:802104822|12.120000004768372|14.544000005722046|
+------------+-------------------+--------+-----------------+------------------+------------------+
only showing top 5 rows



In [100]:
df_basket_cashless_aggregated = df_basket_cashless_aggregated.\
                                withColumn("sale_total", round(df_basket_cashless_aggregated["sale_total"], 2))

In [102]:
df_basket_cashless_aggregated.show(5)

+------------+-------------------+--------+-----------------+------------------+----------+
|store_number|   timestamp_basket|pos_code|           pos_id|        total_cost|sale_total|
+------------+-------------------+--------+-----------------+------------------+----------+
|         295|2017-09-01 06:00:00|       2|POS0002:595643114| 2.819999933242798|      3.38|
|         295|2017-09-01 06:00:00|      24|POS0024:977032388| 2.819999933242798|      3.38|
|         295|2017-09-01 06:15:00|       2|POS0002:595643120| 2.990000009536743|      3.59|
|         295|2017-09-01 06:15:00|       2|POS0002:595643121|2.7300000190734863|      3.28|
|         295|2017-09-01 06:30:00|      22|POS0022:802104822|12.120000004768372|     14.54|
+------------+-------------------+--------+-----------------+------------------+----------+
only showing top 5 rows



## 9.2.- Join aggregated basket with card data using sale total

In [103]:
df_basket_cashless.registerTempTable('basket')
df_basket_cashless_aggregated.registerTempTable('basket_aggregated')
df_cards.registerTempTable('cards')

In [108]:
# we consider transaction amounts that are 5% above and below computed sale totals
df_joined_basket0 = sqlContext.sql("SELECT basket_aggregated.*, cards.transaction_amount, cards.pan_token, \
                             cards.timestamp_cards \
                             FROM basket_aggregated \
                             JOIN cards ON \
                             basket_aggregated.store_number = cards.store_number AND \
                             basket_aggregated.pos_code = cards.terminal_number AND \
                             cards.timestamp_cards >= basket_aggregated.timestamp_basket AND \
                             cards.timestamp_cards < basket_aggregated.timestamp_basket + INTERVAL 15 MINUTES AND \
                             cards.transaction_amount < basket_aggregated.sale_total*1.05 AND \
                             cards.transaction_amount > basket_aggregated.sale_total*0.95 \
                             ORDER BY basket_aggregated.store_number, basket_aggregated.timestamp_basket, \
                             basket_aggregated.pos_code, basket_aggregated.pos_id")

In [109]:
df_joined_basket0.show(100)

+------------+-------------------+--------+------------------+------------------+----------+------------------+--------------------+-------------------+
|store_number|   timestamp_basket|pos_code|            pos_id|        total_cost|sale_total|transaction_amount|           pan_token|    timestamp_cards|
+------------+-------------------+--------+------------------+------------------+----------+------------------+--------------------+-------------------+
|         295|2017-09-01 06:00:00|       2| POS0002:595643114| 2.819999933242798|      3.38|              3.39|5C538EB879B4645AF...|2017-09-01 06:04:00|
|         295|2017-09-01 06:00:00|      24| POS0024:977032388| 2.819999933242798|      3.38|              3.38|ECDFB189903D31473...|2017-09-01 06:06:00|
|         295|2017-09-01 06:15:00|       2| POS0002:595643120| 2.990000009536743|      3.59|              3.59|B73494120D3F178C3...|2017-09-01 06:26:00|
|         295|2017-09-01 06:15:00|       2| POS0002:595643121|2.7300000190734863| 

In [110]:
df_joined_basket0.registerTempTable('joined_basket0')

## 9.3.- Join total sale - basket data with card data 

In [112]:
df_joined_basket1 = sqlContext.sql("SELECT basket.*, joined_basket0.transaction_amount, joined_basket0.pan_token, \
                             joined_basket0.timestamp_cards \
                             FROM basket \
                             JOIN joined_basket0 ON \
                             basket.store_number = joined_basket0.store_number AND \
                             basket.pos_code = joined_basket0.pos_code AND \
                             joined_basket0.timestamp_cards >= basket.timestamp_basket AND \
                             joined_basket0.timestamp_cards < basket.timestamp_basket + INTERVAL 15 MINUTES \
                             ORDER BY basket.store_number, basket.timestamp_basket, basket.pos_code, basket.sale_number")

In [113]:
df_joined_basket1.show(100)

+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+------------------+--------------------+-------------------+
|store_number|   timestamp_basket|pos_code|           pos_id|sale_number|total_cost|unit_cost|quantity|menu_item_id|transaction_amount|           pan_token|    timestamp_cards|
+------------+-------------------+--------+-----------------+-----------+----------+---------+--------+------------+------------------+--------------------+-------------------+
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          1|      0.25|     0.25|       1|        3939|              3.39|5C538EB879B4645AF...|2017-09-01 06:04:00|
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          2|       0.0|      0.0|       1|        5501|              3.39|5C538EB879B4645AF...|2017-09-01 06:04:00|
|         295|2017-09-01 06:00:00|       2|POS0002:595643114|          3|      2.57|     2.57|       1|        5127

## 9.4.- Save joined basket data to disk 

In [114]:
df_joined_basket1.repartition(1).write.format('com.databricks.spark.csv').save('joined_basket1.csv', header = 'true')

In [115]:
df_joined_basket1.count()

5837615