# McDonald's Card Data Analysis - v3
## March 2018
### Dr Jose M Albornoz

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
# Create the SQLContext used for every CSV read and SQL query in this notebook.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# SparkContext is provided by the PySpark kernel/shell -- confirm.
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [3]:
def load_data(filename, schema, columns = None):
    """Read a semicolon-delimited, header-less CSV file into a DataFrame.

    The file is read through the notebook-level ``sqlContext``.

    filename -- path handed to the reader's ``load``
    schema   -- schema applied while reading; its ``names`` are the default
                column selection
    columns  -- columns to keep; when None, every column in ``schema``

    Returns the loaded DataFrame restricted to the selected columns.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    reader = reader.option("delimiter", ";").options(header='false')
    loaded = reader.load(filename, schema = schema)

    # Fall back to the full schema when the caller did not narrow the columns
    selected = schema.names if columns is None else columns
    return loaded.select(selected)

# 3.- Data schema

In [4]:
# Schema passed to load_data() for the card-transaction files.
# Every column is declared nullable; the date is kept as a string and parsed
# later with unix_timestamp().
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('store_number', IntegerType()),
        ('terminal_number', IntegerType()),
        ('transaction_date', StringType()),
        ('transaction_time', IntegerType()),
        ('transaction_amount', IntegerType()),
        ('card_scheme', StringType()),
        ('pan_token', StringType()),
        ('empty_field', IntegerType()),
    ]
])

# 4.- Load data

In [5]:
filename = 'McD_Card_Data/CT_201709_p1.csv'
df_p1 = load_data(filename, schema)

In [6]:
filename = 'McD_Card_Data/CT_201709_p2.csv'
df_p2 = load_data(filename, schema)

In [7]:
filename = 'McD_Card_Data/CT_201709_p3.csv'
df_p3 = load_data(filename, schema)

In [8]:
# filename = 'McD_Card_Data/CT_201710_p1.csv'
# df_p4 = load_data(filename, schema)

In [9]:
# filename = 'McD_Card_Data/CT_201710_p2.csv'
# df_p5 = load_data(filename, schema)

In [10]:
# filename = 'McD_Card_Data/CT_201710_p3.csv'
# df_p6 = load_data(filename, schema)

In [11]:
# filename = 'McD_Card_Data/CT_201711_p1.csv'
# df_p7 = load_data(filename, schema)

In [12]:
# filename = 'McD_Card_Data/CT_201711_p2.csv'
# df_p8 = load_data(filename, schema)

In [13]:
# filename = 'McD_Card_Data/CT_201711_p3.csv'
# df_p9 = load_data(filename, schema)

In [14]:
# filename = 'McD_Card_Data/CT_201712_p1.csv'
# df_p10 = load_data(filename, schema)

In [15]:
# filename = 'McD_Card_Data/CT_201712_p2.csv'
# df_p11 = load_data(filename, schema)

In [16]:
# filename = 'McD_Card_Data/CT_201712_p3.csv'
# df_p12 = load_data(filename, schema)

# 5.- Concatenate data

In [17]:
df_data1 = df_p1.unionAll(df_p2)

In [18]:
# Append the third file to the running union held in df_data1.
# (`df_data` is never defined in this notebook, so chaining from it raises
# NameError on a clean run and would drop the p1/p2 union built in the previous cell.)
df_data1 = df_data1.unionAll(df_p3)

In [19]:
#df_data = df_data.unionAll(df_p4)

In [20]:
#df_data = df_data.unionAll(df_p5)

In [21]:
#df_data = df_data.unionAll(df_p6)

In [22]:
# df_data = df_data.unionAll(df_p7)

In [23]:
# df_data = df_data.unionAll(df_p8)

In [24]:
# df_data = df_data.unionAll(df_p9)

In [25]:
# df_data = df_data.unionAll(df_p10)

In [26]:
# df_data = df_data.unionAll(df_p11)

In [27]:
# df_data = df_data.unionAll(df_p12)

In [28]:
df_data1.count()

27083389

# 6.- Remove unnecessary data from memory

In [29]:
# Drop any cached data held by the per-file DataFrames after the union step.
# NOTE(review): no cache()/persist() call on these DataFrames is visible in
# this notebook, so these calls presumably have nothing to release -- confirm.
# The commented-out lines correspond to the files whose load cells are disabled.
df_p1.unpersist()
df_p2.unpersist()
df_p3.unpersist()
# df_p4.unpersist()
# df_p5.unpersist()
# df_p6.unpersist()
# df_p7.unpersist()
# df_p8.unpersist()
# df_p9.unpersist()
# df_p10.unpersist()
# df_p11.unpersist()
# df_p12.unpersist()

DataFrame[store_number: int, terminal_number: int, transaction_date: string, transaction_time: int, transaction_amount: int, card_scheme: string, pan_token: string, empty_field: int]

# 7.- Register data as table

In [30]:
df_data1.registerTempTable("data1")

# 8.- Identify unique cards, register as table

In [31]:
df_unique_cards = sqlContext.sql("SELECT DISTINCT pan_token FROM data1")

In [32]:
df_unique_cards.count()

13309332

In [33]:
df_unique_cards.registerTempTable("unique_cards")

# 9.- Order data by card token, transaction date and transaction time

In [34]:
df_data1 = sqlContext.sql("SELECT * FROM data1 ORDER BY pan_token, transaction_date, transaction_time")

In [35]:
#df_data.show(40)

# 10.- Find frequency of visits per customer

## 10.1.- Number of unique customers that have visited at least twice in any calendar week (H) 

In [36]:
# Number of transactions per card and per week of the year.
# The date string is parsed to a timestamp first so weekofyear() can be applied.
week_of_year = weekofyear(from_unixtime(unix_timestamp('transaction_date', 'yyy/MM/dd'))).alias('week')
visit_count = count("*").alias('visits')
df_weekly1 = df_data1.groupBy("pan_token", week_of_year).agg(visit_count).sort('pan_token')
#df_weekly1.show(20)

In [37]:
df_weekly1.registerTempTable("weekly_visit1")

In [38]:
# Keep the distinct (card, visits) pairs for weeks with two or more transactions.
# The backslash continuations are inside the string literal, so the query is
# sent to Spark as a single line of SQL.
df_weekly2 = sqlContext.sql("SELECT DISTINCT pan_token, visits \
                             FROM weekly_visit1 \
                             WHERE visits >= 2")

In [39]:
#df_weekly2.show(20)

In [40]:
df_weekly2.registerTempTable("weekly_visit2")

In [41]:
df_weekly3 = sqlContext.sql("SELECT DISTINCT pan_token \
                             FROM weekly_visit2")

In [42]:
#df_weekly3.show(20)

In [43]:
H_count = df_weekly3.count()
H_count

3163482

### 10.1.1.- Register table with high-frequency customers (H) 

In [44]:
df_weekly3.registerTempTable('customersH')

In [46]:
df_weekly1.unpersist()
df_weekly2.unpersist()
df_weekly3.unpersist()

DataFrame[pan_token: string]

### 10.1.2.- Remove high-frequency customers from data

In [47]:
df_data2 = df_data1.join(df_weekly3, ["pan_token"], "leftanti")

In [48]:
df_data2.show()

+--------------------+------------+---------------+----------------+----------------+------------------+-----------+-----------+
|           pan_token|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|card_scheme|empty_field|
+--------------------+------------+---------------+----------------+----------------+------------------+-----------+-----------+
|000101B312159D938...|         774|             24|      2017/09/03|             724|               956|          S|       null|
|000101B312159D938...|         565|             20|      2017/09/20|            1733|               607|          S|       null|
|000101B312159D938...|         565|             20|      2017/09/26|            1308|               628|          S|       null|
|000221A645A0F23D7...|         540|              3|      2017/09/10|            1724|               707|          S|       null|
|00028B75A6B5BBAC0...|        1257|              1|      2017/09/17|            1624|            

In [49]:
df_data2.count()

13640384

In [None]:
df_data2.registerTempTable('data2')

In [None]:
df_data1.unpersist()

## 10.2.- Number of unique customers that visit exactly twice in any calendar month (M)

In [None]:
# Number of transactions per card and per calendar month, computed on df_data2
# (the data set with high-frequency cards already removed in section 10.1.2).
# `df_data` is not defined at this point in the notebook, so using it raises
# NameError on a clean run.
df_monthly1 = df_data2.groupBy("pan_token", month(from_unixtime(unix_timestamp('transaction_date', 'yyy/MM/dd'))).\
                            alias('month')).agg(count("*").alias('visits')).sort('pan_token')
#df_monthly1.show(20)

In [None]:
df_monthly1.registerTempTable("monthly_visit1")

In [None]:
df_monthly2 = sqlContext.sql("SELECT DISTINCT pan_token, visits \
                             FROM monthly_visit1 \
                             WHERE visits = 2")

In [None]:
M_count = df_monthly2.count()
M_count

In [None]:
df_monthly2.registerTempTable('monthly_visit2')

In [None]:
df_monthly3 = sqlContext.sql("SELECT DISTINCT pan_token FROM monthly_visit2")

### 10.2.1.- Register table with medium-frequency customers (M) 

In [None]:
df_monthly3.registerTempTable('customersM')

In [None]:
df_monthly1.unpersist()
df_monthly2.unpersist()

In [None]:
# Transactions of every card that is not a high-frequency (H) customer.
# The query must read from `data1`, the temp table registered from the full
# data set; no table named `data` is ever registered in this notebook.
# NOTE(review): if the intent was to also exclude medium-frequency cards here,
# the subquery should read from customersM instead -- confirm.
df_data = sqlContext.sql("SELECT * FROM data1 WHERE pan_token NOT IN (SELECT * FROM customersH)")

## 10.3.- Number of unique customers that visit once in any 6-calendar-week period (L)

In [None]:
interval = 42 * 24 * 60 * 60     # 6 weeks = 42 days, expressed in seconds

# Assign every transaction to a 6-week window anchored at 2017/09/01.
# The offset from the anchor is floored, not the raw epoch value: flooring the
# raw epoch value and then adding the anchor's remainder keeps the window
# boundaries on multiples of 42 days since 1970 and only shifts the label, so
# a boundary would fall inside the first month of data.
window_start = unix_timestamp(lit('2017/09/01'), 'yyy/MM/dd')
transaction_ts = unix_timestamp('transaction_date', 'yyy/MM/dd')
gdf = df_data.\
withColumn('6week_interval', \
           from_unixtime(floor((transaction_ts - window_start) / interval) * interval + window_start))

# Number of transactions per card within each 6-week window
df_6weekly1 = gdf.groupBy('pan_token', '6week_interval').agg(count("*").alias('visits')).sort('pan_token')

df_6weekly1.show(20)

In [None]:
df_6weekly1.registerTempTable("6weekly_visit1")

In [None]:
df_6weekly2 = sqlContext.sql("SELECT DISTINCT pan_token, visits \
                             FROM 6weekly_visit1 \
                             WHERE visits = 1")

In [None]:
df_6weekly2.show(40)

In [None]:
L_count = df_6weekly2.count()
L_count

In [None]:
df_6weekly2.registerTempTable("6weekly_visit2")

In [None]:
df_6weekly3 = sqlContext.sql("SELECT DISTINCT pan_token FROM 6weekly_visit2")

In [None]:
df_6weekly3.registerTempTable('customersLI')

In [None]:
gdf.unpersist()
df_6weekly1.unpersist()
df_6weekly2.unpersist()

## 10.4.- Number of unique customers that visit once in a 3 calendar month period (I)

In [None]:
interval = 90 * 24 * 60 * 60     # 3 months = 90 days, expressed in seconds

# Assign every transaction to a 90-day window anchored at 2017/09/01.
# The offset from the anchor is floored, not the raw epoch value: flooring the
# raw epoch value and then adding the anchor's remainder keeps the window
# boundaries on multiples of 90 days since 1970 and only shifts the label.
window_start = unix_timestamp(lit('2017/09/01'), 'yyy/MM/dd')
transaction_ts = unix_timestamp('transaction_date', 'yyy/MM/dd')
gdf = df_data.\
withColumn('quarter_interval', \
           from_unixtime(floor((transaction_ts - window_start) / interval) * interval + window_start))

# Number of transactions per card within each 90-day window
df_3month1 = gdf.groupBy('pan_token', 'quarter_interval').agg(count("*").alias('visits')).sort('pan_token')

In [None]:
df_3month1.registerTempTable("3month_visit1")

In [None]:
df_3month2 = sqlContext.sql("SELECT DISTINCT pan_token, visits \
                             FROM 3month_visit1 \
                             WHERE visits = 1")

In [None]:
df_3month2.registerTempTable('3month_visit2')

In [None]:
df_3month3 = sqlContext.sql("SELECT DISTINCT pan_token FROM 3month_visit2")

In [None]:
df_3month3.registerTempTable('customersI')

In [None]:
I_count = df_3month3.count()
I_count

In [None]:
H_count_pct = H_count*100/26994812

In [None]:
M_count_pct = M_count*100/26994812

In [None]:
L_count_pct = L_count*100/26994812

In [None]:
I_count_pct = I_count*100/26994812

In [None]:
numcust = [H_count, M_count, L_count, I_count]

In [None]:
numcust_pct = [H_count_pct, M_count_pct, L_count_pct, I_count_pct]

In [None]:
x = ['Twice a week or more', 'Twice a month', 'Once every 6 weeks', 'Once every 3 months']

In [None]:
# Bar chart of the absolute number of customers in each visit-frequency segment
# (labels in `x`, heights in `numcust`).
# NOTE(review): the `matplotlib` module import, MultipleLocator,
# FormatStrFormatter and the `fig`/`ax1` handles are not used in this cell.
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
# IPython magic (not Python syntax): selects the notebook's inline backend
%matplotlib inline

fig, ax1 = plt.subplots(figsize=(15, 8))

# The pyplot calls below act on the current figure, i.e. the one just created
plt.bar(x, numcust, align='center', alpha=0.5)
plt.ylabel('Number of customers')
plt.title('Frequency of visits', fontsize=20)
plt.xticks(fontsize=14)

In [None]:
# Bar chart of each visit-frequency segment as a percentage
# (labels in `x`, heights in `numcust_pct`).
# NOTE(review): the `matplotlib` module import, MultipleLocator,
# FormatStrFormatter and the `fig`/`ax1` handles are not used in this cell.
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
# IPython magic (not Python syntax): selects the notebook's inline backend
%matplotlib inline

fig, ax1 = plt.subplots(figsize=(15, 8))

# The pyplot calls below act on the current figure, i.e. the one just created
plt.bar(x, numcust_pct, align='center', alpha=0.5)
plt.ylabel('Number of customers (percentage of total unique cards)')
plt.title('Frequency of visits', fontsize=20)
plt.xticks(fontsize=14)