# McDonald's Card Token Analysis - v1
## March 2018
### Dr Jose M Albornoz

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
# Define SQLContext
# `sc` is not defined anywhere in the visible notebook (presumably it is
# injected by the PySpark shell/kernel). getOrCreate() hands back the active
# SparkContext when one exists and starts one otherwise, so this cell no longer
# fails with a NameError when no `sc` has been provided.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [3]:
def load_data(filename, schema, columns = None):
    """Load a semicolon-delimited, header-less CSV file into a DataFrame.

    The file is read through the module-level ``sqlContext`` using the
    'com.databricks.spark.csv' data source, with ``schema`` applied on load.

    Parameters:
        filename: path of the CSV file to read.
        schema: schema passed to the reader; its ``names`` attribute supplies
            the default column selection.
        columns: columns to keep. When None, every column named in ``schema``
            is selected.

    Returns:
        The loaded DataFrame restricted to the selected columns.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    reader = reader.option("delimiter", ";").options(header='false')
    frame = reader.load(filename, schema = schema)

    # Fall back to the full set of schema columns when none were requested
    selected = schema.names if columns is None else columns
    return frame.select(selected)

# 3.- Data schema

In [4]:
# Column layout used when loading the card-transaction CSV files, written as
# (name, Spark type) pairs in the order the schema declares them.
# Every field is declared nullable.
schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in [
        ('store_number', IntegerType),
        ('terminal_number', IntegerType),
        ('transaction_date', StringType),
        ('transaction_time', IntegerType),
        ('transaction_amount', IntegerType),
        ('card_scheme', StringType),
        ('pan_token', StringType),
        ('empty_field', IntegerType),
    ]
])

# 4.- Load data

In [5]:
# Card-token extracts to load: files CT_201709 .. CT_201712, three parts each,
# listed in the order that maps onto df_p1 .. df_p12 below.
data_files = [
    'McD_Card_Data/CT_201709_p1.csv',
    'McD_Card_Data/CT_201709_p2.csv',
    'McD_Card_Data/CT_201709_p3.csv',
    'McD_Card_Data/CT_201710_p1.csv',
    'McD_Card_Data/CT_201710_p2.csv',
    'McD_Card_Data/CT_201710_p3.csv',
    'McD_Card_Data/CT_201711_p1.csv',
    'McD_Card_Data/CT_201711_p2.csv',
    'McD_Card_Data/CT_201711_p3.csv',
    'McD_Card_Data/CT_201712_p1.csv',
    'McD_Card_Data/CT_201712_p2.csv',
    'McD_Card_Data/CT_201712_p3.csv',
]

# Load every extract with the shared schema. The individual df_pN names are
# kept because later cells refer to them directly (e.g. df_p1, df_p12).
(df_p1, df_p2, df_p3, df_p4, df_p5, df_p6,
 df_p7, df_p8, df_p9, df_p10, df_p11, df_p12) = [
    load_data(path, schema) for path in data_files
]

# Keep `filename` bound to the last file loaded, as the original cells did
filename = data_files[-1]

## 4.1.- Examine tokens for first and last dataframes

In [17]:
# Print up to 20000 (transaction_date, pan_token) rows of the first extract,
# ordered by transaction date
(df_p1
    .orderBy('transaction_date')
    .select('transaction_date', 'pan_token')
    .show(20000))

+----------------+--------------------+
|transaction_date|           pan_token|
+----------------+--------------------+
|      2017/09/01|4AE342DF7304B548A...|
|      2017/09/01|C17781F600FB3DDBB...|
|      2017/09/01|D92A40476B50976E1...|
|      2017/09/01|F9C29200F79856A14...|
|      2017/09/01|2170FE9018ADCAA26...|
|      2017/09/01|FCB11EF9AE83806FE...|
|      2017/09/01|2C26FB60054E97D6F...|
|      2017/09/01|9A40393BAE28F80AE...|
|      2017/09/01|D8556B013BB1D64C0...|
|      2017/09/01|67B2A20C980B1E952...|
|      2017/09/01|8BF4E793B41C9BB91...|
|      2017/09/01|50E9B4B180167D652...|
|      2017/09/01|E10E21C2E256C97B6...|
|      2017/09/01|57A8661BDA188A931...|
|      2017/09/01|D32670A08A7FD28C2...|
|      2017/09/01|94F7ABA4D05B439E6...|
|      2017/09/01|DAAE37D3653A5891A...|
|      2017/09/01|C6F99AB1AB370E33F...|
|      2017/09/01|0C11D5A3318FC8C9F...|
|      2017/09/01|B3E72DDF949B32913...|
|      2017/09/01|5E2D29C84D4F79BEA...|
|      2017/09/01|8911741A4DC972DB5...|


In [None]:
# Print up to 20000 (transaction_date, pan_token) rows of the last extract,
# ordered by transaction date
(df_p12
    .orderBy('transaction_date')
    .select('transaction_date', 'pan_token')
    .show(20000))