# McDonald's Card Data Analysis - v1
## April 2018
### Dr Jose M Albornoz

Investigation of missing tokens on 15th and 16th September 2016, where we have almost zero transactions. 

# Import necessary modules

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
%matplotlib inline

import numpy as np
import datetime


RANDOM_STATE = 19

# 1 Load raw data

In [12]:
df_p2 = pd.read_csv('McD_Card_Data/CT_201709_p2.csv', sep=';', header=None)

In [14]:
df_p2.tail()

Unnamed: 0,0,1,2,3,4,5,6,7
6781151,626,24,2017/09/20,1822,669,S,4751448807894848202,
6781152,626,22,2017/09/20,1821,1185,S,4751299188656548436,
6781153,626,23,2017/09/20,1820,569,S,4547422071238730007,
6781154,626,24,2017/09/20,1820,1406,S,4658582668264628221,
6781155,626,20,2017/09/20,1819,608,S,4658585971792733117,


In [15]:
df_p2.shape

(6781156, 8)

In [16]:
df_p2.dtypes

0      int64
1      int64
2     object
3      int64
4      int64
5     object
6     object
7    float64
dtype: object

In [49]:
labels = ['store_number', 'terminal_number', 'transaction_date', 'transaction_time', 'transaction_amount', \
 'card_scheme', 'pan_token', 'empty_field']

In [50]:
df_p2.columns = labels

## 1.1 Convert dates, filter for date range of interest

In [19]:
df_p2['transaction_date'] = pd.to_datetime(df_p2['transaction_date'])

In [40]:
mask = (df_p2['transaction_date'] >= datetime.date(2017,9,15)) & \
           (df_p2['transaction_date'] <= datetime.date(2017,9,16))

In [45]:
df1 = df_p2.loc[mask].sort_values(by=['transaction_date'])

In [46]:
df1.shape

(14631, 8)

In [47]:
df1.tail(100)

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
5541267,1074,20,2017-09-16,1329,298,S,92FFD26D5BAD404A17EF9498CCC480D4D60C2834DF051A...,
5541266,1074,20,2017-09-16,1328,248,S,CD2C41291630DAF81AD4B4CC5BD53C848B76C4060C1E33...,
5541265,1074,20,2017-09-16,1327,489,S,A13E069F8BB2DBCADAC86C8F871EE4E94DB246B983057E...,
5541264,1074,20,2017-09-16,1326,479,S,83AB785D9FAC84227C62DEEC290284490BA000FE167CA2...,
5541263,1074,20,2017-09-16,1326,307,S,C4D13EA89F85A639C239D1100A273CD86A75975C59EEEA...,
5541262,1074,20,2017-09-16,1325,1975,S,6F2D6187261239A9F80FA17A3065802BFF48AE5872709B...,
5541272,1074,20,2017-09-16,1333,1105,S,B38959317DCB96E130D49E1835887CDBC2B2E477BF7BB3...,
5541261,1074,20,2017-09-16,1325,847,S,AD257047FBB78551CB8D5D02F7B3987295F4EBEB784FAE...,
5541259,1074,20,2017-09-16,1324,578,S,B8DC87ACAD18063BD9F25404E447C19D6D40956D5EC817...,
5541258,1074,20,2017-09-16,1324,935,S,82FB5F729E0F18B5011E530647C9FD708B40ABC01FE9CA...,


# 2 - Load Toshiba data for September 2017

In [54]:
df_Sep2017T = pd.read_csv('McD_Card_Data/Sep2017_T.csv/part-00000-04bd0c7a-a5a6-4711-8992-b3ba4cc1de77-c000.csv')

In [55]:
df_Sep2017T.head()

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
0,262,3,2017/09/01,0,437,S,D08D751E07A1775D5E4923655B97F6270FE414B7B554E9...,
1,262,24,2017/09/01,2,319,S,D943C4129260645C5C886E66FFE781692638F1D772CBB0...,
2,262,23,2017/09/01,3,396,S,0F07B1D413871FD689B63BAEC6D6857DF2D1B8792389C8...,
3,262,22,2017/09/01,3,579,S,528D75D2EE6F7232976DD5CBE8227EB1DAADB3A14CAAF0...,
4,262,21,2017/09/01,3,99,S,E9216F9D775D8540D08435890ED5DA507122124FD3E0B8...,


In [56]:
df_Sep2017T.shape

(27048022, 8)

In [57]:
df_Sep2017T.dtypes

store_number            int64
terminal_number         int64
transaction_date       object
transaction_time        int64
transaction_amount      int64
card_scheme            object
pan_token              object
empty_field           float64
dtype: object

## 2.1 Convert dates, filter for date range of interest

In [58]:
df_Sep2017T['transaction_date'] = pd.to_datetime(df_Sep2017T['transaction_date'])

In [59]:
mask = (df_Sep2017T['transaction_date'] >= datetime.date(2017,9,15)) & \
           (df_Sep2017T['transaction_date'] <= datetime.date(2017,9,16))

In [60]:
df2 = df_Sep2017T.loc[mask].sort_values(by=['transaction_date'])

In [61]:
df2.shape

(11498, 8)

In [62]:
df2.head(100)

Unnamed: 0,store_number,terminal_number,transaction_date,transaction_time,transaction_amount,card_scheme,pan_token,empty_field
12053794,151,31,2017-09-15,0,109,S,B62A0DB381C21AD85673A6F6C282984F8494695B118793...,
12961372,379,26,2017-09-15,111,1337,S,ED982EA786CFD77E21E6A799F027542C661BACFB716C64...,
12961371,379,1,2017-09-15,110,308,S,743A36FB9E5F8EB4C197C7CE8CCA2FAA6C9A9FDC982EC9...,
12961370,379,22,2017-09-15,110,248,S,BF3995ED2AE90734533B02BB87D459F98BAA11DAF17460...,
12961369,379,21,2017-09-15,110,149,S,638EF57D534C52887D06FE4BCA15890ECC76DEBB72586C...,
12961368,379,1,2017-09-15,110,958,S,9F204B54B860432BD19B20D0A95FE31BE210FCAB4FC1DD...,
12961367,379,24,2017-09-15,110,298,S,49AFC566F93DD1A7AAE77CE17C8225B498279140E953DC...,
12961366,379,1,2017-09-15,108,578,S,C467713B8F11E8A78E4176E01A837FA14E8090DE39165A...,
12961365,379,28,2017-09-15,108,409,S,DBBA553DC25008CAE306272BC032AD00D46E6A97D2B72C...,
12961364,379,22,2017-09-15,108,499,S,1944D5F3D5BBED45AA7306A4D1D338F6F579C0603123AB...,


# 3 - Group raw data by transaction date and perform count of transactions

In [63]:
mask = (df_p2['transaction_date'] >= datetime.date(2017,9,10)) & \
           (df_p2['transaction_date'] <= datetime.date(2017,9,20))

In [None]:
df3 = df_p2.loc[mask].sort_values(by=['transaction_date'])

In [None]:
df3 = df_ps.