# McDonald's Card Data Analysis - v6: Toshiba card tokens
## April 2018
### Dr Jose M Albornoz

This notebook generates plots of customer types as determined by the frequency of their visits; possible overlaps of the different categories have been accounted for. Customer duplication has been accounted for as well by using only card tokens generated by Toshiba. Card tokens are split according to frequency of visits and saved to disk.

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
sc.stop()
conf = SparkConf()
conf.set("spark.executor.memory", "8g")
conf.set("spark.driver.memory", "8g")
sc = SparkContext(conf=conf)

In [3]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [4]:
def load_data(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Data schema

In [5]:
schema = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('terminal_number', IntegerType(), True), 
    StructField('transaction_date', StringType(), True), 
    StructField('transaction_time', IntegerType(), True), 
    StructField('transaction_amount', IntegerType(), True),
    StructField('card_scheme', StringType(), True),
    StructField('pan_token', StringType(), True),
    StructField('empty_field', IntegerType(), True)    
])

# 4.- Load data

In [6]:
df_Sep2017T = sqlContext.read.csv("McD_Card_Data/Sep2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

In [7]:
df_Oct2017T  = sqlContext.read.csv("McD_Card_Data/Oct2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

In [8]:
df_Nov2017T = sqlContext.read.csv("McD_Card_Data/Nov2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

In [9]:
df_Dec2017T = sqlContext.read.csv("McD_Card_Data/Dec2017_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

In [10]:
df_Jan2018T = sqlContext.read.csv("McD_Card_Data/Jan2018_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

In [11]:
df_Feb2018T = sqlContext.read.csv("McD_Card_Data/Feb2018_T.csv", header=True, mode="DROPMALFORMED", schema=schema)

# 5.- Concatenate data

In [12]:
df_data0 = df_Sep2017T.unionAll(df_Oct2017T)

In [13]:
df_data0 = df_data0.unionAll(df_Nov2017T)

In [14]:
df_data0 = df_data0.unionAll(df_Dec2017T)

In [15]:
df_data0 = df_data0.unionAll(df_Jan2018T)

In [16]:
df_data0 = df_data0.unionAll(df_Feb2018T)

In [17]:
df_data0.count()

128061238

# 6.- Remove unnecessary data from memory

In [18]:
df_Sep2017T.unpersist()
df_Oct2017T.unpersist()
df_Nov2017T.unpersist()
df_Dec2017T.unpersist()
df_Jan2018T.unpersist()
df_Feb2018T.unpersist()

DataFrame[store_number: int, terminal_number: int, transaction_date: string, transaction_time: int, transaction_amount: int, card_scheme: string, pan_token: string, empty_field: int]

# 7.- Register data0 as table

In [19]:
df_data0.registerTempTable("data0")

# 8.- Compute count of unique cards

In [20]:
df_unique_cards = sqlContext.sql("SELECT DISTINCT pan_token FROM data0")

In [21]:
unique_cards = df_unique_cards.count()

In [22]:
unique_cards

30846516

# 9.- Order data0 by transaction date and card token

In [23]:
df_data0 = sqlContext.sql("SELECT * FROM data0 ORDER BY pan_token, transaction_date, transaction_time")

In [24]:
#df_data.show(40)

# 10.- Find frequency of visits per customer

## 10.1.- High-frequency customers: number of unique customers that have visited at least twice in any calendar week (H) 

In [25]:
df_weekly1 = df_data0.groupBy("pan_token", weekofyear(from_unixtime(unix_timestamp('transaction_date', 'yyy/MM/dd'))).\
                            alias('week')).agg(count("*").alias('visits')).sort('pan_token')
#df_weekly1.show(20)

In [26]:
df_weekly1.registerTempTable("weekly_visit1")

In [27]:
df_weekly2 = sqlContext.sql("SELECT DISTINCT pan_token \
                             FROM weekly_visit1 \
                             WHERE visits >= 2")

In [28]:
#df_weekly2.show(20)

In [29]:
H_count = df_weekly2.count()
H_count

9412618

### 10.1.1.- Sets aside high-frequency card tokens, save them to disk

In [30]:
from pyspark.sql.functions import col
df_HF = df_weekly2.select('pan_token').distinct().withColumn('Frequency', lit('H'))

In [31]:
#df_HF.show(5)

In [32]:
df_HF.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens_HF.csv', header = 'true')

### 10.1.2.- Register table with high-frequency customers (H) 

In [33]:
df_weekly2.registerTempTable('customersH')

### 10.1.3.- Remove high-frequency customers from data0 => data1 is generated

In [34]:
df_data1 = df_data0.join(df_weekly2, ["pan_token"], "leftanti")

In [35]:
#df_data1.count()

In [36]:
df_data0.unpersist()
df_weekly1.unpersist()
df_weekly2.unpersist()

DataFrame[pan_token: string]

## 10.2.- Medium_frequency customers: number of unique customers that visit exactly twice in any calendar month (M)

In [37]:
df_monthly1 = df_data1.groupBy("pan_token", month(from_unixtime(unix_timestamp('transaction_date', 'yyy/MM/dd'))).\
                            alias('month')).agg(count("*").alias('visits')).sort('pan_token')
#df_monthly1.show(20)

In [38]:
df_monthly1.registerTempTable("monthly_visit1")

In [39]:
df_monthly2 = sqlContext.sql("SELECT DISTINCT pan_token\
                             FROM monthly_visit1 \
                             WHERE visits = 2")

In [40]:
M_count = df_monthly2.count()
M_count

4583395

### 10.2.1.- Sets aside medium-frequency card tokens, saves them to disk

In [41]:
from pyspark.sql.functions import col
df_MF = df_monthly2.select('pan_token').distinct().withColumn('Frequency', lit('M'))

In [42]:
#df_MF.show(5)

In [43]:
df_MF.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens_MF.csv', header = 'true')

### 10.2.2.- Register table with medium-frequency customers (M) 

In [44]:
df_monthly2.registerTempTable('customersM')

### 10.2.3.- Remove medium-frequency customers from data1 => data2 is generated

In [45]:
df_data2 = df_data1.join(df_monthly2, ["pan_token"], "leftanti")

In [46]:
#df_data2.count()

In [47]:
df_data1.unpersist()
df_monthly1.unpersist()
df_monthly2.unpersist()

DataFrame[pan_token: string]

## 10.3.- Infrequent customers: number of unique customers that visit once in a 3 calendar month period (I)

In [48]:
interval = 90 * 24 * 60 * 60     # 3 months = 90 days  
gdf = df_data2.\
withColumn('quarter_interval', \
           from_unixtime(floor(unix_timestamp('transaction_date', 'yyy/MM/dd') / interval) * interval\
           + ((unix_timestamp(lit('2017/09/01'),'yyy/MM/dd')/interval)%1)*interval))

df_3month1 = gdf.groupBy('pan_token', 'quarter_interval').agg(count("*").alias('visits')).sort('pan_token')

In [49]:
df_3month1.registerTempTable("3month_visit1")

In [50]:
df_3month2 = sqlContext.sql("SELECT DISTINCT pan_token\
                             FROM 3month_visit1 \
                             WHERE visits = 1")

In [51]:
I_count = df_3month2.count()
I_count

14806895

### 10.3.1.- Sets aside infrequent card tokens, saves them to disk

In [52]:
from pyspark.sql.functions import col
df_IF = df_3month2.select('pan_token').distinct().withColumn('Frequency', lit('I'))

In [53]:
#df_IF.show(5)

In [54]:
df_IF.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens_IF.csv', header = 'true')

### 10.3.2.- Register table with infrequent customers (I) 

In [55]:
df_3month2.registerTempTable('customersI')

### 10.3.3.- Remove infrequent customers from data2 => data3 is generated

In [56]:
df_data3 = df_data2.join(df_3month2, ["pan_token"], "leftanti")

In [57]:
#df_data3.count()

In [58]:
df_data2.unpersist()
df_3month1.unpersist()
df_3month2.unpersist()

DataFrame[pan_token: string]

## 10.4.- Low-frequency customers: number of unique customers that visit once in any 6 calendar weeks lapse (L)

In [59]:
interval = 42 * 24 * 60 * 60     # 6 weeks = 42 days  
gdf = df_data3.\
withColumn('6week_interval', \
           from_unixtime(floor(unix_timestamp('transaction_date', 'yyy/MM/dd') / interval) * interval\
           + ((unix_timestamp(lit('2017/09/01'),'yyy/MM/dd')/interval)%1)*interval))

df_6weekly1 = gdf.groupBy('pan_token', '6week_interval').agg(count("*").alias('visits')).sort('pan_token')

#df_6weekly1.show(20)

In [60]:
df_6weekly1.registerTempTable("6weekly_visit1")

In [61]:
df_6weekly2 = sqlContext.sql("SELECT DISTINCT pan_token\
                             FROM 6weekly_visit1 \
                             WHERE visits = 1")

In [62]:
#df_6weekly2.show(40)

In [63]:
L_count = df_6weekly2.count()
L_count

1417376

### 10.4.1.- Sets aside low-frequency card tokens, saves them to disk

In [64]:
from pyspark.sql.functions import col
df_LF = df_6weekly2.select('pan_token').distinct().withColumn('Frequency', lit('L'))

In [65]:
#df_LF.show(5)

In [66]:
df_LF.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens_LF.csv', header = 'true')

In [67]:
df_6weekly2.registerTempTable('customersL')

### 10.4.2.- Remove infrequent customers from data3 => data4 is generated

In [68]:
df_data4 = df_data3.join(df_6weekly2, ["pan_token"], "leftanti")

In [69]:
#df_data4.count()

In [70]:
df_data3.unpersist()
gdf.unpersist()
df_6weekly1.unpersist()
df_6weekly2.unpersist()

DataFrame[pan_token: string]

## 10.5.- Rarely visiting customers: number of unique customers that less than once in a 3 calendar month period (R)

In [71]:
from pyspark.sql.functions import col
df_RF = df_data4.select('pan_token').distinct().withColumn('Frequency', lit('R'))

In [73]:
df_RF.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens_RF.csv', header = 'true')

AnalysisException: 'path file:/root/notebooks/Codebase/McDonalds/Toshiba_tokens_RF.csv already exists.;'

In [None]:
#df_RF.show(5)

In [None]:
df_RF.count()

In [None]:
total = H_count + M_count + L_count + I_count

In [None]:
total

In [None]:
R_count = unique_cards - total

In [None]:
R_count

In [None]:
total + R_count

# 11 Card tokens and their frequency labels are saved to disk

In [None]:
df_tokens = df_HF.unionAll(df_MF)

In [None]:
df_tokens = df_tokens.unionAll(df_IF)

In [None]:
df_tokens = df_tokens.unionAll(df_RF)

In [None]:
df_tokens = df_tokens.unionAll(df_LF)

In [None]:
df_tokens.count()

In [None]:
df_tokens.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens.csv', header = 'true')

# 12 Plots

In [None]:
H_count_pct = H_count*100/unique_cards

In [None]:
M_count_pct = M_count*100/unique_cards

In [None]:
L_count_pct = L_count*100/unique_cards

In [None]:
I_count_pct = I_count*100/unique_cards

In [None]:
R_count_pct = R_count*100/unique_cards

In [None]:
numcust = [H_count, M_count, L_count, I_count, R_count]

In [None]:
numcust_pct = [H_count_pct, M_count_pct, L_count_pct, I_count_pct, R_count_pct]

In [None]:
x = ['Twice a week or more', 'Twice a month', 'Once every 6 weeks', 'Once every 3 months', 'Rarely visits']

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
%matplotlib inline

fig, ax1 = plt.subplots(figsize=(15, 8))

plt.bar(x, numcust, align='center', alpha=0.5)
plt.ylabel('Number of customers')
plt.title('Frequency of visits - Ingenico card tokens', fontsize=20)
plt.xticks(fontsize=14)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
%matplotlib inline

fig, ax1 = plt.subplots(figsize=(15, 8))

plt.bar(x, numcust_pct, align='center', alpha=0.5)
plt.ylabel('Number of customers (percentage of total unique cards)')
plt.title('Frequency of visits - Ingenico card tokens', fontsize=20)
plt.xticks(fontsize=14)

In [None]:
total = H_count + M_count + L_count + I_count

In [None]:
total

In [None]:
unique_cards - total