# McDonald's Card Data Analysis - v2
## March 2018
### Dr Jose M Albornoz

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import mean, min, max, to_date, stddev
from pyspark.sql.types import *

In [2]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [3]:
def load_data(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Data schema

In [4]:
schema = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('terminal_number', IntegerType(), True), 
    StructField('transaction_date', StringType(), True), 
    StructField('transaction_time', IntegerType(), True), 
    StructField('transaction_amount', IntegerType(), True),
    StructField('card_scheme', StringType(), True),
    StructField('pan_token', StringType(), True),
    StructField('empty_field', IntegerType(), True)    
])

# 4.- Load data

In [5]:
filename = 'McD_Card_Data/CT_201709_p1.csv'
df_p1 = load_data(filename, schema)

In [6]:
filename = 'McD_Card_Data/CT_201709_p2.csv'
df_p2 = load_data(filename, schema)

In [7]:
filename = 'McD_Card_Data/CT_201709_p3.csv'
df_p3 = load_data(filename, schema)

# 5.- Concatenate data

In [8]:
df_data1 = df_p1.unionAll(df_p2)

In [9]:
df_data = df_data1.unionAll(df_p3)

In [10]:
df_data.count()

27083389

# 6.- Remove unnecessary data from memory

In [11]:
df_p1.unpersist()
df_p2.unpersist()
df_p3.unpersist()
df_data1.unpersist()

DataFrame[store_number: int, terminal_number: int, transaction_date: string, transaction_time: int, transaction_amount: int, card_scheme: string, pan_token: string, empty_field: int]

# 7.- Register data as table

In [12]:
df_data.registerTempTable("data")

# 8.- Identify unique cards, register as table

In [13]:
df_unique_cards = sqlContext.sql("SELECT DISTINCT pan_token FROM data")

In [14]:
df_unique_cards.count()

13309332

In [15]:
df_unique_cards.show(5)

+--------------------+
|           pan_token|
+--------------------+
|A0ECABECC29BE59C7...|
|AB812BB8925729B75...|
|F6E97F708E1F38531...|
|84D1DD25D16F6B6C4...|
|C4C38EA5A6A1FEBAD...|
+--------------------+
only showing top 5 rows



In [16]:
df_unique_cards.registerTempTable("unique_cards")

# 8.- Match unique cards with basket & date 

In [None]:
df_matched_data_1 = sqlContext.sql("SELECT DISTINCT pan_token AS unique_cards, store_number, terminal_number, \
                                    transaction_date, transaction_time, transaction_amount, card_scheme \
                                    FROM data ORDER BY unique_cards, transaction_date, transaction_time")

In [20]:
df_matched_data_1 = sqlContext.sql("SELECT * \
                                    FROM data \
                                    GROUP BY pan_token \
                                    ORDER BY transaction_date, transaction_time")

AnalysisException: u"expression 'store_number' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;"

In [None]:
df_matched_data_1.show(40)

In [None]:
df_matched_data_1.count()

In [None]:
# df_matched_data_1 = sqlContext.sql("SELECT * FROM data \
#                                    WHERE pan_token IN (SELECT pan_token FROM unique_cards) \
#                                    ORDER BY pan_token, transaction_date, transaction_time")