In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, DateType, TimestampType
from decimal import Decimal
from datetime import datetime
# Get (or create) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD-based reads below.
spark = (
    SparkSession.builder
    .appName('DataIngestion')
    .getOrCreate()
)
sc = spark.sparkContext

In [0]:
# Row factory shared by both parsers: every record is built with these field
# names, in this order.
CommonEvent = Row(
    'trade_dt',
    'rec_type',
    'symbol',
    'exchange',
    'event_tm',
    'event_seq_nb',
    'arrival_tm',
    'trade_pr',
    'bid_pr',
    'bid_size',
    'ask_pr',
    'ask_size',
    'partition',
    'raw',
)

In [0]:
# Price columns need an explicit precision/scale: a bare DecimalType() means
# decimal(10, 0), which would drop the fractional part of every price.
# NOTE(review): (18, 6) is a conservative choice -- confirm against the
# precision required by downstream consumers.
_PRICE_TYPE = DecimalType(18, 6)

# Schema applied to the rows produced by the parsers; field order matches the
# CommonEvent row factory. Every column is nullable because rejected records
# carry None in all data fields.
CommonEventSchema = StructType([
                StructField('trade_dt', DateType(), True),
                StructField('rec_type', StringType(), True),
                StructField('symbol', StringType(), True),
                StructField('exchange', StringType(), True),
                StructField('event_tm', TimestampType(), True),
                StructField('event_seq_nb', IntegerType(), True),
                StructField('arrival_tm', TimestampType(), True),
                StructField('trade_pr', _PRICE_TYPE, True),
                StructField('bid_pr', _PRICE_TYPE, True),
                StructField('bid_size', IntegerType(), True),
                StructField('ask_pr', _PRICE_TYPE, True),
                StructField('ask_size', IntegerType(), True),
                StructField('partition', StringType(), True),
                StructField('raw', StringType(), True)
])

In [0]:
# Read one CSV part file as an RDD of text lines.
rawcsv = sc.textFile(
    "/mnt/source/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt"
)

In [0]:
def parse_csv(line):
    """Parse one comma-separated record into a ``CommonEvent`` row.

    The record type is taken from the third field.  ``"T"`` records fill
    ``trade_pr``; ``"Q"`` records fill the bid/ask price and size fields.
    Field positions read: 0 trade date, 1 arrival time, 2 record type,
    3 symbol, 4 event time, 5 sequence number, 6 exchange, 7 and above
    prices and sizes.

    Args:
        line: One raw text line from the source file.

    Returns:
        A ``CommonEvent`` whose ``partition`` is ``"T"`` or ``"Q"`` when the
        line parses.  Any line that cannot be parsed -- including one whose
        record type is neither ``"T"`` nor ``"Q"`` -- yields a row with every
        data field ``None``, ``partition`` set to ``"B"`` and ``raw`` holding
        the original line, so a rejected record is never returned as ``None``.
    """
    type_pos = 2
    try:
        items = line.split(',')
        rec_type = items[type_pos]
        if rec_type not in ("T", "Q"):
            # Route unknown record types through the same rejection path as
            # malformed lines instead of implicitly returning None.
            raise ValueError("unsupported record type: %r" % (rec_type,))

        # Fields shared by trade and quote records.
        trade_dt = datetime.strptime(items[0], '%Y-%m-%d')
        symbol = items[3]
        exchange = items[6]
        event_tm = datetime.strptime(items[4], "%Y-%m-%d %H:%M:%S.%f")
        event_seq_nb = int(items[5])
        arrival_tm = datetime.strptime(items[1], "%Y-%m-%d %H:%M:%S.%f")

        if rec_type == "T":
            return CommonEvent(trade_dt,
                               rec_type,
                               symbol,
                               exchange,
                               event_tm,
                               event_seq_nb,
                               arrival_tm,
                               Decimal(items[7]),
                               None,
                               None,
                               None,
                               None,
                               "T",
                               None)

        return CommonEvent(trade_dt,
                           rec_type,
                           symbol,
                           exchange,
                           event_tm,
                           event_seq_nb,
                           arrival_tm,
                           None,
                           Decimal(items[7]),
                           int(items[8]),
                           Decimal(items[9]),
                           int(items[10]),
                           "Q",
                           None)
    except Exception:
        # Deliberately broad: any failure quarantines the record in the "B"
        # partition. The original line is kept in `raw` (as parse_json does)
        # so the record can be inspected or reprocessed later.
        return CommonEvent(None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           "B",
                           line)

In [0]:
# Apply the CSV parser to every line of the RDD.
parsedcsv = rawcsv.map(parse_csv)

In [0]:
# Build a DataFrame from the parsed rows using the shared event schema.
csvdata = spark.createDataFrame(
    parsedcsv,
    schema=CommonEventSchema,
)

In [0]:
# Read one JSON part file as an RDD of text lines.
rawjson = sc.textFile(
    "/mnt/source/data/json/2020-08-05/NASDAQ/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt"
)

In [0]:
import json
def parse_json(line):
    """Parse one JSON-encoded record into a ``CommonEvent`` row.

    The record type is read from the ``event_type`` key.  ``"T"`` records
    fill ``trade_pr`` from ``price``; ``"Q"`` records fill the bid/ask price
    and size fields.  ``file_tm`` is stored as ``arrival_tm``.

    Args:
        line: One raw text line holding a single JSON object.

    Returns:
        A ``CommonEvent`` whose ``partition`` is ``"T"`` or ``"Q"`` when the
        line parses.  Any line that cannot be parsed -- invalid JSON, a
        missing key, an unconvertible value, or an ``event_type`` that is
        neither ``"T"`` nor ``"Q"`` -- yields a row with every data field
        ``None``, ``partition`` set to ``"B"`` and ``raw`` holding the
        original line, so a rejected record is never returned as ``None``.
    """
    try:
        # Decode JSON numbers with a fraction straight into Decimal so prices
        # keep their written value instead of passing through binary floats.
        line_obj = json.loads(line, parse_float=Decimal)
        event_type = line_obj['event_type']
        if event_type not in ('T', 'Q'):
            # Route unknown event types through the same rejection path as
            # malformed lines instead of implicitly returning None.
            raise ValueError("unsupported event_type: %r" % (event_type,))

        # Fields shared by trade and quote records.
        trade_dt = datetime.strptime(line_obj['trade_dt'], '%Y-%m-%d')
        symbol = line_obj['symbol']
        exchange = line_obj['exchange']
        event_tm = datetime.strptime(line_obj['event_tm'], "%Y-%m-%d %H:%M:%S.%f")
        event_seq_nb = int(line_obj['event_seq_nb'])
        arrival_tm = datetime.strptime(line_obj['file_tm'], "%Y-%m-%d %H:%M:%S.%f")

        if event_type == 'T':
            return CommonEvent(trade_dt,
                               event_type,
                               symbol,
                               exchange,
                               event_tm,
                               event_seq_nb,
                               arrival_tm,
                               Decimal(line_obj['price']),
                               None,
                               None,
                               None,
                               None,
                               'T',
                               None)

        return CommonEvent(trade_dt,
                           event_type,
                           symbol,
                           exchange,
                           event_tm,
                           event_seq_nb,
                           arrival_tm,
                           None,
                           Decimal(line_obj['bid_pr']),
                           int(line_obj['bid_size']),
                           Decimal(line_obj['ask_pr']),
                           int(line_obj['ask_size']),
                           'Q',
                           None)
    except Exception:
        # Deliberately broad: any failure quarantines the record in the "B"
        # partition with the original line kept in `raw`.
        return CommonEvent(None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           "B",
                           line)


In [0]:
# Apply the JSON parser to every line of the RDD.
parsedjson = rawjson.map(parse_json)

In [0]:
# Build a DataFrame from the parsed rows using the shared event schema.
jsondata = spark.createDataFrame(
    parsedjson,
    schema=CommonEventSchema,
)

In [0]:
def get_txt_files(ls_path):
    """Collect the paths of all ``.txt`` files found under ``ls_path``.

    Folders are listed with ``dbutils.fs.ls`` and visited in first-in,
    first-out order, starting with ``ls_path`` itself.

    Args:
        ls_path: Folder at which the listing starts.

    Returns:
        A list of the paths that end in ``.txt``, in the order they were
        encountered.
    """
    pending_dirs = [ls_path]
    txt_paths = []
    while pending_dirs:
        folder = pending_dirs.pop(0)
        for entry in dbutils.fs.ls(folder):
            # Only queue directories whose path differs from the folder being
            # listed, so the same folder is not queued again from its own
            # listing.
            if entry.isDir() and entry.path != folder:
                pending_dirs.append(entry.path)
            elif entry.path.endswith('.txt'):
                txt_paths.append(entry.path)
    return txt_paths

In [0]:
# Parse each .txt file found under the CSV source folder and append the rows
# to the Parquet output, split into folders by the 'partition' column.
for path in get_txt_files('/mnt/source/data/csv/'):
    rawcsv = sc.textFile(path)
    parsedcsv = rawcsv.map(parse_csv)
    csvdata = spark.createDataFrame(parsedcsv, schema=CommonEventSchema)
    (
        csvdata.write
        .partitionBy('partition')
        .mode('append')
        .parquet('/mnt/parsed/')
    )

In [0]:
# Parse each .txt file found under the JSON source folder and append the rows
# to the Parquet output, split into folders by the 'partition' column.
for path in get_txt_files('/mnt/source/data/json/'):
    rawjson = sc.textFile(path)
    parsedjson = rawjson.map(parse_json)
    jsondata = spark.createDataFrame(parsedjson, schema=CommonEventSchema)
    (
        jsondata.write
        .partitionBy('partition')
        .mode('append')
        .parquet('/mnt/parsed/')
    )