# McDonald's Card Data Analysis - v6: Toshiba card tokens - Stitching
## April 2018
### Dr Jose M Albornoz

This notebook stitches together files for Toshiba tokens and their frequencies.

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
sc.stop()
conf = SparkConf()
conf.set("spark.executor.memory", "8g")
conf.set("spark.driver.memory", "8g")
sc = SparkContext(conf=conf)

In [3]:
# Define SQLContext
sqlContext = SQLContext(sc)

# 2.- Generic function to load data from a csv file

In [4]:
def load_data(filename, schema, columns = None):
    df = sqlContext.read.format('com.databricks.spark.csv').option("delimiter", ";").options(header='false'). \
    load(filename, schema = schema)
    if columns is None:
        # If no columns are given, then select all
        columns = schema.names
    return df.select(columns)

# 3.- Data schema

In [5]:
schema = StructType([ 
    StructField('pan_token', StringType(), True), 
    StructField('Frequency', StringType(), True)   
])

# 4.- Load data

In [6]:
df_HF = sqlContext.read.csv("Toshiba_tokens_HF.csv/part-00000-61f8df92-3108-46c3-b1a2-50001f21b9b5-c000.csv", \
                            header=True, mode="DROPMALFORMED", schema=schema)

In [7]:
df_MF = sqlContext.read.csv("Toshiba_tokens_MF.csv/part-00000-38d3f592-65ff-4994-ba12-7fac904fc245-c000.csv", \
                            header=True, mode="DROPMALFORMED", schema=schema)

In [8]:
df_IF = sqlContext.read.csv("Toshiba_tokens_IF.csv/part-00000-2fb0cf69-f9e4-44f1-b7f5-1bb0baeea649-c000.csv", \
                            header=True, mode="DROPMALFORMED", schema=schema)

In [9]:
df_LF = sqlContext.read.csv("Toshiba_tokens_LF.csv/part-00000-0f592a49-7795-4128-893f-6c4f0e7e57b7-c000.csv", \
                            header=True, mode="DROPMALFORMED", schema=schema)

In [11]:
df_RF = sqlContext.read.csv("Toshiba_tokens_RF.csv/part-00000-75a9a073-c3be-4d5c-87d9-94dd22b718ac-c000.csv", \
                            header=True, mode="DROPMALFORMED", schema=schema)

# 5.- Concatenate data

In [12]:
df_data0 = df_HF.unionAll(df_MF)

In [13]:
df_data0 = df_data0.unionAll(df_IF)

In [14]:
df_data0 = df_data0.unionAll(df_LF)

In [15]:
df_data0 = df_data0.unionAll(df_RF)

In [16]:
df_data0.count()

30846516

# 6.- Remove unnecessary data from memory

In [17]:
df_HF.unpersist()
df_MF.unpersist()
df_IF.unpersist()
df_LF.unpersist()
df_RF.unpersist()

DataFrame[pan_token: string, Frequency: string]

# 7.- Write concatenated dataframe to disk

In [19]:
df_data0.repartition(1).write.format('com.databricks.spark.csv').save('Toshiba_tokens.csv', header = 'true')

AnalysisException: 'path file:/root/notebooks/Codebase/McDonalds/Toshiba_tokens_RF.csv already exists.;'