# McDonald's Card Token Analysis - v1
## March 2018
### Dr Jose M Albornoz

In this notebook an initial analysis of the card token data is performed using Pandas

# 1.- Import necessary modules, define SQLContext

In [3]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

import pandas as pd
import gc

In [4]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [5]:
def load_data(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Data schema

In [6]:
schema = StructType([ 
    StructField('store_number', IntegerType(), True), 
    StructField('terminal_number', IntegerType(), True), 
    StructField('transaction_date', StringType(), True), 
    StructField('transaction_time', IntegerType(), True), 
    StructField('transaction_amount', IntegerType(), True),
    StructField('card_scheme', StringType(), True),
    StructField('pan_token', StringType(), True),
    StructField('empty_field', IntegerType(), True)    
])

# 4.- Load data

In [7]:
cols = ['store_number', 'terminal_number', 'transaction_date', 'transaction_time', 'transaction_amount', 'card_scheme',
       'pan_token', 'empty_field']

In [8]:
filename = 'McD_Card_Data/CT_201709_p1.csv'
#df_p1 = load_data(filename, schema)
df_p1 = pd.read_csv(filename, names=cols, sep=";")

In [9]:
filename = 'McD_Card_Data/CT_201709_p2.csv'
# df_p2 = load_data(filename, schema)
df_p2 = pd.read_csv(filename, names=cols, sep=";")

In [10]:
filename = 'McD_Card_Data/CT_201709_p3.csv'
# df_p3 = load_data(filename, schema)
df_p3 = pd.read_csv(filename, names=cols, sep=";")

In [11]:
# filename = 'McD_Card_Data/CT_201710_p1.csv'
# df_p4 = load_data(filename, schema)

In [12]:
# filename = 'McD_Card_Data/CT_201710_p2.csv'
# df_p5 = load_data(filename, schema)

In [13]:
# filename = 'McD_Card_Data/CT_201710_p3.csv'
# df_p6 = load_data(filename, schema)

In [14]:
# filename = 'McD_Card_Data/CT_201711_p1.csv'
# df_p7 = load_data(filename, schema)

In [15]:
# filename = 'McD_Card_Data/CT_201711_p2.csv'
# df_p8 = load_data(filename, schema)

In [16]:
# filename = 'McD_Card_Data/CT_201711_p3.csv'
# df_p9 = load_data(filename, schema)

In [17]:
# filename = 'McD_Card_Data/CT_201712_p1.csv'
# df_p10 = load_data(filename, schema)

In [18]:
# filename = 'McD_Card_Data/CT_201712_p2.csv'
# df_p11 = load_data(filename, schema)

In [19]:
# filename = 'McD_Card_Data/CT_201712_p3.csv'
# df_p12 = load_data(filename, schema)

In [20]:
# filename = 'McD_Card_Data/CT_201801_p1.csv'
# df_p13 = load_data(filename, schema)

In [21]:
# filename = 'McD_Card_Data/CT_201801_p2.csv'
# df_p14 = load_data(filename, schema)

In [22]:
# filename = 'McD_Card_Data/CT_201801_p3.csv'
# df_p15 = load_data(filename, schema)

In [23]:
filename = 'McD_Card_Data/CT_201802_p1.csv'
# df_p16 = load_data(filename, schema)
df_p16 = pd.read_csv(filename, names=cols, sep=";")

In [24]:
filename = 'McD_Card_Data/CT_201802_p2.csv'
# df_p17 = load_data(filename, schema)
df_p17 = pd.read_csv(filename, names=cols, sep=";")

In [25]:
filename = 'McD_Card_Data/CT_201802_p3.csv'
#df_p18 = load_data(filename, schema)
df_p18 = pd.read_csv(filename, names=cols, sep=";")

# 5.- Concatenate data, delete redundant dataframes from memory

In [26]:
df_Sep2017 = pd.concat([df_p1, df_p2, df_p3])

In [27]:
df_Feb2018 = pd.concat([df_p16, df_p17, df_p18])

In [28]:
del [[df_p1, df_p2, df_p3, df_p16, df_p17, df_p18]]
gc.collect()
# df_1 = pd.DataFrame()
# df_2 = pd.DataFrame()
# df_3 = pd.DataFrame()
# df_16 = pd.DataFrame()
# df_17 = pd.DataFrame()
# df_18 = pd.DataFrame()

28

28

# 5.- Examine tokens 

## 5.1.- September 2017

In [29]:
df_Sep2017.shape

(27083389, 8)

(27083389, 8)

In [30]:
df_Sep2017.reset_index(inplace=True)

In [31]:
df_Sep2017.drop('index', axis=1, inplace=True)

In [32]:
df_Sep2017['transaction_date'] = pd.to_datetime(df_Sep2017['transaction_date'])

In [33]:
df_Sep2017.sort_values(by='transaction_date', inplace=True)

In [34]:
df_Sep2017.head()

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
0,262,3,2017-09-01,0,437,S,D08D751E07A1775D5E4923655B97F6270FE414B7B554E9...,
786462,550,20,2017-09-01,22,1146,S,8EC6A49D074628439EC4E4D9D202F1E87ED9AC96AB536F...,
786463,550,20,2017-09-01,22,1374,S,4AD0A225D6468A1869F9ACBE29FE7D3923426264346174...,
786464,550,20,2017-09-01,25,597,S,6F4A782C4DC871C4418F22933DA475B0E5194BA8A19C43...,
786465,550,20,2017-09-01,26,149,S,B5787424942B8F5B637AD0BC24D5C5771A647BF28BE709...,


Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
0,262,3,2017-09-01,0,437,S,D08D751E07A1775D5E4923655B97F6270FE414B7B554E9...,
786462,550,20,2017-09-01,22,1146,S,8EC6A49D074628439EC4E4D9D202F1E87ED9AC96AB536F...,
786463,550,20,2017-09-01,22,1374,S,4AD0A225D6468A1869F9ACBE29FE7D3923426264346174...,
786464,550,20,2017-09-01,25,597,S,6F4A782C4DC871C4418F22933DA475B0E5194BA8A19C43...,
786465,550,20,2017-09-01,26,149,S,B5787424942B8F5B637AD0BC24D5C5771A647BF28BE709...,


In [35]:
df_Sep2017.tail()

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
26212476,516,23,2017-09-30,2138,1203,S,6A4DE23D48BB4C032673B1BA12B9F4C856DEE1B7A45136...,
26212475,516,20,2017-09-30,2137,617,S,760A206960D13774E7E9F5F2FEB1B4C88AC73A8EBE05D2...,
26212474,516,20,2017-09-30,2136,838,S,A12A083B406711F991B14696FD1AD07F6A27A92F32D744...,
26212481,516,2,2017-09-30,2141,467,S,51F136ADE4A3533973559E8D5F144A254E71C3E6DE95B5...,
25089372,454,20,2017-09-30,20,1008,S,1A81ABCA28758ADF3B66904E7FBE7E600C0EE0126EC53C...,


Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
26212476,516,23,2017-09-30,2138,1203,S,6A4DE23D48BB4C032673B1BA12B9F4C856DEE1B7A45136...,
26212475,516,20,2017-09-30,2137,617,S,760A206960D13774E7E9F5F2FEB1B4C88AC73A8EBE05D2...,
26212474,516,20,2017-09-30,2136,838,S,A12A083B406711F991B14696FD1AD07F6A27A92F32D744...,
26212481,516,2,2017-09-30,2141,467,S,51F136ADE4A3533973559E8D5F144A254E71C3E6DE95B5...,
25089372,454,20,2017-09-30,20,1008,S,1A81ABCA28758ADF3B66904E7FBE7E600C0EE0126EC53C...,


In [36]:
df_Sep2017.iloc[0,6]

'D08D751E07A1775D5E4923655B97F6270FE414B7B554E9ECE3DDA6D6C66CD07F'

'D08D751E07A1775D5E4923655B97F6270FE414B7B554E9ECE3DDA6D6C66CD07F'

In [37]:
df_Sep2017.iloc[25089372,6]

'7CC6FA44B144EF4674141EA9B906AC1AB2676BFB608796BF5362BA234634C967'

'7CC6FA44B144EF4674141EA9B906AC1AB2676BFB608796BF5362BA234634C967'

## 5.2.- February 2018

In [38]:
df_Feb2018.shape

(30719007, 8)

(30719007, 8)

In [39]:
df_Feb2018.reset_index(inplace=True)

In [40]:
df_Feb2018.drop('index', axis=1, inplace=True)

In [41]:
df_Feb2018['transaction_date'] = pd.to_datetime(df_Feb2018['transaction_date'])

In [42]:
df_Feb2018.sort_values(by='transaction_date', inplace=True)

In [43]:
df_Feb2018.head()

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
0,335,21,2018-02-01,1704,548,S,C39C0855893EBF95142FBD62DF5A3111F7BEC647758377...,
3380201,996,20,2018-02-01,1938,228,S,4658595665007961033,
3380200,628,24,2018-02-01,1937,668,S,5460161838029131772,
3380199,480,20,2018-02-01,1937,1090,S,4539795660775487123,
3380198,1168,20,2018-02-01,1937,489,S,4658599299214145023,


Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
0,335,21,2018-02-01,1704,548,S,C39C0855893EBF95142FBD62DF5A3111F7BEC647758377...,
3380201,996,20,2018-02-01,1938,228,S,4658595665007961033,
3380200,628,24,2018-02-01,1937,668,S,5460161838029131772,
3380199,480,20,2018-02-01,1937,1090,S,4539795660775487123,
3380198,1168,20,2018-02-01,1937,489,S,4658599299214145023,


In [44]:
df_Feb2018.tail()

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
27622315,242,20,2018-02-28,1729,1145,S,4659354199601960307,
27622316,744,20,2018-02-28,1729,179,S,4462741171052066667,
27622317,1446,22,2018-02-28,1729,2181,S,4763678288171332718,
27622319,1237,20,2018-02-28,1729,1396,S,4023965690400136143,
27778167,73,22,2018-02-28,1348,628,S,4659441202633452834,


Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
27622315,242,20,2018-02-28,1729,1145,S,4659354199601960307,
27622316,744,20,2018-02-28,1729,179,S,4462741171052066667,
27622317,1446,22,2018-02-28,1729,2181,S,4763678288171332718,
27622319,1237,20,2018-02-28,1729,1396,S,4023965690400136143,
27778167,73,22,2018-02-28,1348,628,S,4659441202633452834,


In [45]:
df_Feb2018.iloc[0,6]

'C39C0855893EBF95142FBD62DF5A3111F7BEC6477583779BA4FD35BF55498461'

'C39C0855893EBF95142FBD62DF5A3111F7BEC6477583779BA4FD35BF55498461'

In [46]:
df_Feb2018.iloc[27778167,6]

'6E225B7F083B153BB37A666F7392C2213B1318236E65F8DDD96D8EA0167C2573'

'6E225B7F083B153BB37A666F7392C2213B1318236E65F8DDD96D8EA0167C2573'

# 6.- Determine split by tokens

## 6.1.- September 2017

In [47]:
maskT = (df_Sep2017['pan_token'].str.len() == 64)

In [48]:
df_Sep2017_T = df_Sep2017.loc[maskT]

In [49]:
df_Sep2017_T.shape

(27048022, 8)

(27048022, 8)

In [None]:
maskI = (df_Sep2017['pan_token'].str.len() == 19)

In [None]:
df_Sep2017_I = df_Sep2017.loc[maskI]

In [None]:
df_Sep2017_I.shape

## 6.2.- February 2018

In [None]:
maskT = (df_Feb2018['pan_token'].str.len() == 64)

In [None]:
df_Feb2018_T = df_Sep2018.loc[maskT]

In [None]:
df_Feb2018_T.shape

In [None]:
maskI = (df_Feb2018['pan_token'].str.len() == 19)

In [None]:
df_Feb2018_I = df_Feb2018.loc[maskI]

In [None]:
df_Feb2018_I.shape