# March 2018 Ingenico card data stitch
## April 2018
### Dr Jose M Albornoz

This notebook reads the individual csv files containing card data for March 2018 and combines them into a single file.

NOTE: the received files contain data only up to March 30th

# 1.- Import necessary modules, define SQLContext

In [1]:
# Import required modules
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [2]:
# Define SQLContext
# Obtain the SparkContext explicitly instead of relying on a pre-defined `sc`
# global: getOrCreate() returns the already-running context when there is one,
# and otherwise starts a new one, so the cell also works on a fresh kernel.
sqlContext = SQLContext(SparkContext.getOrCreate())

# 2.- Generic function to load data from a csv file

In [3]:
def load_data_colon(filename, schema, columns = None):
    """Read a semicolon-delimited text file into a Spark DataFrame.

    The file is parsed through the module-level ``sqlContext`` with ';' as
    the field delimiter and the header option enabled.

    Parameters
    ----------
    filename : path of the file to read.
    schema : StructType applied to the file when loading.
    columns : optional list of column names to keep; when omitted, every
        field named in ``schema`` is selected.

    Returns
    -------
    DataFrame restricted to the requested columns.
    """
    reader = (
        sqlContext.read
        .format('com.databricks.spark.csv')
        .option("delimiter", ";")
        .option("header", "true")
        .schema(schema)
    )
    frame = reader.load(filename)
    # Fall back to the full set of schema fields when no subset is requested
    wanted = schema.names if columns is None else columns
    return frame.select(wanted)

# 3.- Data schema

In [4]:
# Column layout applied to each daily extract, as (name, Spark type) pairs,
# listed in the order the columns are declared in the schema.
schema_fields = [
    ('store_number', IntegerType()),
    ('terminal_number', IntegerType()),
    ('transaction_date', StringType()),
    ('transaction_time', IntegerType()),
    ('transaction_amount', IntegerType()),
    ('card_scheme', StringType()),
    ('card_provider', StringType()),
    ('pan_token', StringType()),
]

# Every column is declared nullable.
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in schema_fields]
)

# 4.- Load data

In [6]:
# Daily extracts are named INGENICO_EXTRACT_2018_03_DD.csv; this cell reads
# DD = 02..31 and stacks them into a single DataFrame, df_data0.
first_day, last_day = 2, 31

# Start from an empty accumulator so the cell gives the same result on re-run
# and works for any day range, including a single file.
df_data0 = None

for k in range(first_day, last_day + 1):

    # zfill pads single-digit days and leaves two-digit days unchanged
    day = str(k).zfill(2)

    name = "Ingenico/INGENICO_EXTRACT_2018_03_" + day + ".csv"
    print(name)

    df_tmp = load_data_colon(name, schema=schema)

    # The first file seeds the accumulator; each later file is appended.
    # unionAll matches columns by position, which is safe here because every
    # file is loaded with the same schema.
    if df_data0 is None:
        df_data0 = df_tmp
    else:
        df_data0 = df_data0.unionAll(df_tmp)

Ingenico/INGENICO_EXTRACT_2018_03_02.csv
Ingenico/INGENICO_EXTRACT_2018_03_03.csv
Ingenico/INGENICO_EXTRACT_2018_03_04.csv
Ingenico/INGENICO_EXTRACT_2018_03_05.csv
Ingenico/INGENICO_EXTRACT_2018_03_06.csv
Ingenico/INGENICO_EXTRACT_2018_03_07.csv
Ingenico/INGENICO_EXTRACT_2018_03_08.csv
Ingenico/INGENICO_EXTRACT_2018_03_09.csv
Ingenico/INGENICO_EXTRACT_2018_03_10.csv
Ingenico/INGENICO_EXTRACT_2018_03_11.csv
Ingenico/INGENICO_EXTRACT_2018_03_12.csv
Ingenico/INGENICO_EXTRACT_2018_03_13.csv
Ingenico/INGENICO_EXTRACT_2018_03_14.csv
Ingenico/INGENICO_EXTRACT_2018_03_15.csv
Ingenico/INGENICO_EXTRACT_2018_03_16.csv
Ingenico/INGENICO_EXTRACT_2018_03_17.csv
Ingenico/INGENICO_EXTRACT_2018_03_18.csv
Ingenico/INGENICO_EXTRACT_2018_03_19.csv
Ingenico/INGENICO_EXTRACT_2018_03_20.csv
Ingenico/INGENICO_EXTRACT_2018_03_21.csv
Ingenico/INGENICO_EXTRACT_2018_03_22.csv
Ingenico/INGENICO_EXTRACT_2018_03_23.csv
Ingenico/INGENICO_EXTRACT_2018_03_24.csv
Ingenico/INGENICO_EXTRACT_2018_03_25.csv
Ingenico/INGENIC

In [7]:
# Order the stitched rows by date, then by time within each date.
# transaction_date is a string column, so the ordering is lexicographic.
df_data0 = df_data0.orderBy('transaction_date', 'transaction_time')

In [8]:
# Preview the first 10 rows of the sorted, stitched data
df_data0.show(10)

+------------+---------------+----------------+----------------+------------------+-----------+-------------+-------------------+
|store_number|terminal_number|transaction_date|transaction_time|transaction_amount|card_scheme|card_provider|          pan_token|
+------------+---------------+----------------+----------------+------------------+-----------+-------------+-------------------+
|         599|             23|      2018/03/01|               0|               878|          S|         VISA|4751298424585222103|
|        1011|              4|      2018/03/01|               0|               298|          S|         VISA|4659419926128791813|
|         913|             20|      2018/03/01|               1|              1045|          S|         VISA|4757142953193939201|
|        1521|              2|      2018/03/01|               2|               318|          S|         VISA|4929456025967632008|
|         843|              2|      2018/03/01|               2|               529|       

In [9]:
# Total number of rows across all the daily files that were stitched together
df_data0.count()

33080275

# 5.- Write data to disk

In [None]:
# repartition(1) shuffles every row into a single partition, and that shuffle
# does not retain the ordering produced by an earlier sort. Re-sort inside the
# single partition so the one output part file is ordered by date and time.
# Note: Spark writes 'March2018.csv' as a directory holding the part file.
(df_data0
    .repartition(1)
    .sortWithinPartitions('transaction_date', 'transaction_time')
    .write.format('com.databricks.spark.csv')
    .save('March2018.csv', header = 'true'))

In [None]:
|