# McDonald's Card Data Split v2
## March 2018
### Dr Jose M Albornoz

This notebook splits card data into two sets: one containing Toshiba card tokens, and the other containing Ingenico card tokens. A plot of this split is obtained for every store.

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

In [2]:
# Define SQLContext
# Reuse the SparkContext that is already running (for example the `sc` that a
# PySpark kernel injects) or create one if none exists, so this cell does not
# fail with a NameError on a kernel that has not pre-defined `sc`.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# 2.- Data schema

In [3]:
# Declared layout of a card-data record: eight nullable fields.
# 'pan_token' is the 7th field, which lines up with the `_c6` column queried
# later in this notebook (assuming the CSV files follow this layout).
# NOTE(review): none of the read.csv calls visible in this notebook pass this
# schema, so the loaded frames keep Spark's default column names.
schema = (
    StructType()
    .add('store_number', IntegerType(), True)
    .add('terminal_number', IntegerType(), True)
    .add('transaction_date', StringType(), True)
    .add('transaction_time', IntegerType(), True)
    .add('transaction_amount', IntegerType(), True)
    .add('card_scheme', StringType(), True)
    .add('pan_token', StringType(), True)
    .add('empty_field', IntegerType(), True)
)

# 3.- Load data

In [4]:
# Load one CSV per month and token family, September 2017 to February 2018.
# Suffix T = Toshiba tokens, suffix I = Ingenico tokens.
# No schema or header option is passed, so columns get Spark's default
# names (_c0, _c1, ...), which is why later queries refer to `_c6`.
df_Sep2017T, df_Sep2017I = (
    sqlContext.read.csv("McD_Card_Data/Sep2017_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Sep2017_I.csv"),
)
df_Oct2017T, df_Oct2017I = (
    sqlContext.read.csv("McD_Card_Data/Oct2017_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Oct2017_I.csv"),
)
df_Nov2017T, df_Nov2017I = (
    sqlContext.read.csv("McD_Card_Data/Nov2017_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Nov2017_I.csv"),
)
df_Dec2017T, df_Dec2017I = (
    sqlContext.read.csv("McD_Card_Data/Dec2017_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Dec2017_I.csv"),
)
df_Jan2018T, df_Jan2018I = (
    sqlContext.read.csv("McD_Card_Data/Jan2018_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Jan2018_I.csv"),
)
df_Feb2018T, df_Feb2018I = (
    sqlContext.read.csv("McD_Card_Data/Feb2018_T.csv"),
    sqlContext.read.csv("McD_Card_Data/Feb2018_I.csv"),
)

# 4.- Concatenate data 

## 4.1.- Toshiba tokens

In [16]:
# Stack the six monthly Toshiba frames (Sep 2017 through Feb 2018) into a
# single DataFrame, in chronological order.
df_dataT = (
    df_Sep2017T
    .unionAll(df_Oct2017T)
    .unionAll(df_Nov2017T)
    .unionAll(df_Dec2017T)
    .unionAll(df_Jan2018T)
    .unionAll(df_Feb2018T)
)

## 4.2.- Ingenico tokens

In [21]:
# Stack the six monthly Ingenico frames (Sep 2017 through Feb 2018) into a
# single DataFrame, in chronological order.
# Every step extends the Ingenico accumulator. The earlier version unioned each
# month onto df_dataT, which left df_dataI holding all Toshiba rows plus only
# the February 2018 Ingenico rows.
df_dataI = (
    df_Sep2017I
    .unionAll(df_Oct2017I)
    .unionAll(df_Nov2017I)
    .unionAll(df_Dec2017I)
    .unionAll(df_Jan2018I)
    .unionAll(df_Feb2018I)
)

# 5.- Counts and plots

Monthly transaction counts (rows) for Toshiba tokens

In [27]:
# Row count of each monthly Toshiba frame, evaluated in chronological order.
# Each count() is a Spark action, so this cell runs six jobs.
Sep2017T, Oct2017T, Nov2017T, Dec2017T, Jan2018T, Feb2018T = (
    monthly.count()
    for monthly in (
        df_Sep2017T,
        df_Oct2017T,
        df_Nov2017T,
        df_Dec2017T,
        df_Jan2018T,
        df_Feb2018T,
    )
)

# Counts collected in month order for plotting; shown as the cell output.
countT = [Sep2017T, Oct2017T, Nov2017T, Dec2017T, Jan2018T, Feb2018T]
countT

[27048023, 31339688, 27885607, 21639273, 14268797, 5879856]

Unique Toshiba tokens in September 2017

In [47]:
# Expose the September 2017 Toshiba rows to Spark SQL as 'SepToshiba'.
# createOrReplaceTempView supersedes the deprecated registerTempTable and
# registers the same temporary view.
df_Sep2017T.createOrReplaceTempView('SepToshiba')

# `_c6` is the default name of the 7th CSV column, presumably the card token
# ('pan_token' in the schema cell); confirm against the data files.
# NOTE(review): DISTINCT keeps NULL as one value, so a missing token, if any
# exist, would add one to this count.
Toshiba_count_Sep2017 = sqlContext.sql("SELECT DISTINCT _c6 FROM SepToshiba").count()

Toshiba_count_Sep2017

13284372

Monthly transaction counts (rows) for Ingenico tokens

In [38]:
# Row count of each monthly Ingenico frame, evaluated in chronological order.
# Each count() is a Spark action, so this cell runs six jobs.
Sep2017I, Oct2017I, Nov2017I, Dec2017I, Jan2018I, Feb2018I = (
    monthly.count()
    for monthly in (
        df_Sep2017I,
        df_Oct2017I,
        df_Nov2017I,
        df_Dec2017I,
        df_Jan2018I,
        df_Feb2018I,
    )
)

# Counts collected in month order for plotting; shown as the cell output.
countI = [Sep2017I, Oct2017I, Nov2017I, Dec2017I, Jan2018I, Feb2018I]
countI

[35360, 205706, 2905783, 9802622, 14525831, 24836486]

Unique Ingenico tokens in September 2017

In [54]:
# Expose the September 2017 Ingenico rows to Spark SQL as 'SepIngenico'.
# createOrReplaceTempView supersedes the deprecated registerTempTable and
# registers the same temporary view.
df_Sep2017I.createOrReplaceTempView('SepIngenico')

# `_c6` is the default name of the 7th CSV column, presumably the card token
# ('pan_token' in the schema cell); confirm against the data files.
# NOTE(review): DISTINCT keeps NULL as one value, so a missing token, if any
# exist, would add one to this count.
Ingenico_count_Sep2017 = sqlContext.sql("SELECT DISTINCT _c6 FROM SepIngenico").count()

Ingenico_count_Sep2017

24961

Total unique card tokens in September 2017, and the percentage share of each token type

In [60]:
# Sum of the two distinct-token counts for September 2017.
# NOTE(review): treating this as the number of unique cards assumes Toshiba
# and Ingenico tokens never coincide; confirm.
total_unique_card_tokens_Sep2017 = sum([Toshiba_count_Sep2017, Ingenico_count_Sep2017])

In [61]:
# Percentage of September 2017 unique tokens that are Toshiba tokens.
# The multiplication happens before the division, as before, so the float result is unchanged.
100 * Toshiba_count_Sep2017 / total_unique_card_tokens_Sep2017

99.81245491415686

In [62]:
# Percentage of September 2017 unique tokens that are Ingenico tokens.
# The multiplication happens before the division, as before, so the float result is unchanged.
100 * Ingenico_count_Sep2017 / total_unique_card_tokens_Sep2017

0.1875450858431448

In [None]:
# Month labels for the x axis, in the same order as countT and countI.
x = [
    'Sep2017', 'Oct2017', 'Nov2017',
    'Dec2017', 'Jan2018', 'Feb2018',
]

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
%matplotlib inline

fig, ax = plt.subplots(figsize=(15, 8))

ind = np.arange(len(x))  # the x locations for the groups
width = 0.35       # the width of the bars

rects1 = ax.bar(ind, countT, width, color='r', label='Toshiba')
rects2 = ax.bar(ind + width, countI, width, color='y', label='Ingenico')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(x)

plt.legend(fontsize=16)
plt.title('Split of card hashes', fontsize=20)
plt.ylabel('Transaction count', fontsize=14)

In [None]:
# Combined (Toshiba + Ingenico) transaction count for each month.
count = [toshiba + ingenico for toshiba, ingenico in zip(countT, countI)]

# Share of each month's transactions per token family.
# Despite the "_pct" suffix these are fractions in [0, 1], not percentages
# (assumes Python 3 true division, as the float outputs earlier indicate).
countT_pct = [toshiba / total for toshiba, total in zip(countT, count)]
countI_pct = [ingenico / total for ingenico, total in zip(countI, count)]

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
style.use('fivethirtyeight')
%matplotlib inline

fig, ax = plt.subplots(figsize=(15, 8))

ind = np.arange(len(x))  # the x locations for the groups
width = 0.35       # the width of the bars

rects1 = ax.bar(ind, countT_pct, width, color='r', label='Toshiba')
rects2 = ax.bar(ind + width, countI_pct, width, color='y', label='Ingenico')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(x)

plt.legend(fontsize=16)
plt.title('Split of card hashes', fontsize=20)
plt.ylabel('Fraction of total transaction count', fontsize=14)