In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, DateType, TimestampType
from decimal import Decimal
from datetime import datetime
spark = SparkSession.builder.appName('DataIngestion').getOrCreate()
sc = spark.sparkContext

In [0]:
dbutils.fs.mount(
  source = "wasbs://source@sbguidedcapstorage.blob.core.windows.net/",
  mount_point = "/mnt/source",
  extra_configs  = {"fs.azure.account.key.sbguidedcapstorage.blob.core.windows.net" : dbutils.secrets.get('key-vault-scope', 'sbguidedcapstorage')}
)

In [0]:
CommonEvent = Row('trade_dt', 'rec_type', 'symbol', 'exchange', 'event_tm', 'event_seq_nb', 'arrival_tm', 'trade_pr', 'bid_pr', 'bid_size', 'ask_pr', 'ask_size', 'partition', 'raw')

In [0]:
CommonEventSchema = StructType([
                StructField('trade_dt', DateType(), True),
                StructField('rec_type', StringType(), True),
                StructField('symbol', StringType(), True),
                StructField('exchange', StringType(), True),
                StructField('event_tm', TimestampType(), True),
                StructField('event_seq_nb', IntegerType(), True),
                StructField('arrival_tm', TimestampType(), True),
                StructField('trade_pr', DecimalType(), True),
                StructField('bid_pr', DecimalType(), True),
                StructField('bid_size', IntegerType(), True),
                StructField('ask_pr', DecimalType(), True),
                StructField('ask_size', IntegerType(), True),
                StructField('partition', StringType(), True),
                StructField('raw', StringType(), True)
])

In [0]:
rawcsv = sc.textFile("/mnt/source/data/csv/2020-08-05/NYSE/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt")

In [0]:
rawcsv.collect()

['2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 09:34:51.505,1,NYSE,75.30254839137037,100,75.35916738004924,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 09:40:52.586,2,NYSE,77.20874619466693,100,78.90918015646369,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 09:50:04.681,3,NYSE,77.15973273251218,100,77.3320469411047,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 09:57:46.343,4,NYSE,79.29977331004093,100,80.08399307353596,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 10:06:50.886,5,NYSE,77.8634951217078,100,78.30821537434917,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 10:11:43.839,6,NYSE,78.74799689143143,100,80.32905299746953,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 10:17:02.044,7,NYSE,77.16837620945849,100,77.97598027909252,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 10:23:58.216,8,NYSE,78.06551496692607,100,78.80516026628197,100',
 '2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-

In [0]:
def parse_csv(line):
    items = line.split(',')
    type_pos = 2
    try:
        if items[type_pos] == "T":
            event = CommonEvent(datetime.strptime(items[0], '%Y-%m-%d'), 
                               items[2], 
                               items[3], 
                               items[6], 
                               datetime.strptime(items[4], "%Y-%m-%d %H:%M:%S.%f"), 
                               int(items[5]), 
                               datetime.strptime(items[1], "%Y-%m-%d %H:%M:%S.%f"), 
                               Decimal(items[7]),
                               None, 
                               int(items[8]),
                               None,
                               None,
                               "T",
                               None)
            return event
        elif items[type_pos] == "Q":
            event = CommonEvent(datetime.strptime(items[0], '%Y-%m-%d'),
                               items[2],
                               items[3],
                               items[6],
                               datetime.strptime(items[4], "%Y-%m-%d %H:%M:%S.%f"), 
                               int(items[5]), 
                               datetime.strptime(items[1], "%Y-%m-%d %H:%M:%S.%f"), 
                               None,
                               Decimal(items[7]),
                               int(items[8]),
                               Decimal(items[9]),
                               int(items[10]),
                               "Q",
                               None)
            return event
    except Exception as e:
        return CommonEvent(None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           "B",
                           str(e))

In [0]:
parsedcsv = rawcsv.map(lambda line: parse_csv(line))

In [0]:
parsedcsv.collect()

[Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NYSE', event_tm=datetime.datetime(2020, 8, 5, 9, 34, 51, 505000), event_seq_nb=1, arrival_tm=datetime.datetime(2020, 8, 5, 9, 30), trade_pr=None, bid_pr=Decimal('75.30254839137037'), bid_size=100, ask_pr=Decimal('75.35916738004924'), ask_size=100, partition='Q', raw=None),
 Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NYSE', event_tm=datetime.datetime(2020, 8, 5, 9, 40, 52, 586000), event_seq_nb=2, arrival_tm=datetime.datetime(2020, 8, 5, 9, 30), trade_pr=None, bid_pr=Decimal('77.20874619466693'), bid_size=100, ask_pr=Decimal('78.90918015646369'), ask_size=100, partition='Q', raw=None),
 Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NYSE', event_tm=datetime.datetime(2020, 8, 5, 9, 50, 4, 681000), event_seq_nb=3, arrival_tm=datetime.datetime(2020, 8, 5, 9, 30), trade_pr=None, bid_pr=Decimal('77.15973273251218'), 

In [0]:
csvdata = spark.createDataFrame(parsedcsv, schema=CommonEventSchema)

In [0]:
csvdata.groupby(csvdata.partition).count().show()

+---------+-----+
|partition|count|
+---------+-----+
|        Q|  270|
|        T|   30|
+---------+-----+



In [0]:
rawjson = sc.textFile("/mnt/source/data/json/2020-08-05/NASDAQ/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt")

In [0]:
rawjson.collect()

['{"trade_dt":"2020-08-05","file_tm":"2020-08-05 09:30:00.000","event_type":"Q","symbol":"SYMA","event_tm":"2020-08-05 09:36:55.284","event_seq_nb":1,"exchange":"NASDAQ","bid_pr":76.10016521142818,"bid_size":100,"ask_pr":77.9647975908747,"ask_size":100}',
 '{"trade_dt":"2020-08-05","file_tm":"2020-08-05 09:30:00.000","event_type":"Q","symbol":"SYMA","event_tm":"2020-08-05 09:42:32.247","event_seq_nb":2,"exchange":"NASDAQ","bid_pr":75.44372945251948,"bid_size":100,"ask_pr":75.94452858561046,"ask_size":100}',
 '{"trade_dt":"2020-08-05","file_tm":"2020-08-05 09:30:00.000","event_type":"Q","symbol":"SYMA","event_tm":"2020-08-05 09:48:06.767","event_seq_nb":3,"exchange":"NASDAQ","bid_pr":78.84798564828422,"bid_size":100,"ask_pr":80.69114407667608,"ask_size":100}',
 '{"trade_dt":"2020-08-05","file_tm":"2020-08-05 09:30:00.000","event_type":"Q","symbol":"SYMA","event_tm":"2020-08-05 09:53:09.803","event_seq_nb":4,"exchange":"NASDAQ","bid_pr":74.98336890552693,"bid_size":100,"ask_pr":76.162565

In [0]:
import json
def parse_json(line):
    try:
        line_obj = json.loads(line)
        if line_obj['event_type'] == 'T':
            event = CommonEvent(datetime.strptime(line_obj['trade_dt'], '%Y-%m-%d'),
                                line_obj['event_type'],
                                line_obj['symbol'],
                                line_obj['exchange'],
                                datetime.strptime(line_obj['event_tm'], "%Y-%m-%d %H:%M:%S.%f"), 
                                int(line_obj['event_seq_nb']),
                                datetime.strptime(line_obj['file_tm'],"%Y-%m-%d %H:%M:%S.%f"), 
                                Decimal(line_obj['price']),
                                None,
                                int(line_obj['size']),
                                None,
                                None,
                                'T',
                                None
            )
            return event
        elif line_obj['event_type'] == 'Q':
            event = CommonEvent(datetime.strptime(line_obj['trade_dt'], '%Y-%m-%d'),
                                line_obj['event_type'],
                                line_obj['symbol'],
                                line_obj['exchange'],
                                datetime.strptime(line_obj['event_tm'], "%Y-%m-%d %H:%M:%S.%f"), 
                                int(line_obj['event_seq_nb']),
                                datetime.strptime(line_obj['file_tm'],"%Y-%m-%d %H:%M:%S.%f"), 
                                None,
                                Decimal(line_obj['bid_pr']),
                                int(line_obj['bid_size']),
                                Decimal(line_obj['ask_pr']),
                                int(line_obj['ask_size']),
                                'Q',
                                None
            )
            return event
    except Exception as e:
        return CommonEvent(None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           None,
                           "B",
                           line)


In [0]:
parsedjson = rawjson.map(lambda line: parse_json(line))

In [0]:
parsedjson.collect()

[Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NASDAQ', event_tm=datetime.datetime(2020, 8, 5, 9, 36, 55, 284000), event_seq_nb=1, arrival_tm=datetime.datetime(2020, 8, 5, 9, 30), trade_pr=None, bid_pr=Decimal('76.1001652114281768035652930848300457000732421875'), bid_size=100, ask_pr=Decimal('77.9647975908746957429684698581695556640625'), ask_size=100, partition='Q', raw=None),
 Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NASDAQ', event_tm=datetime.datetime(2020, 8, 5, 9, 42, 32, 247000), event_seq_nb=2, arrival_tm=datetime.datetime(2020, 8, 5, 9, 30), trade_pr=None, bid_pr=Decimal('75.443729452519477263194858096539974212646484375'), bid_size=100, ask_pr=Decimal('75.9445285856104561617030412890017032623291015625'), ask_size=100, partition='Q', raw=None),
 Row(trade_dt=datetime.datetime(2020, 8, 5, 0, 0), rec_type='Q', symbol='SYMA', exchange='NASDAQ', event_tm=datetime.datetime(2020, 8, 5, 9, 48, 6,

In [0]:
jsondata = spark.createDataFrame(parsedjson, schema=CommonEventSchema)

In [0]:
jsondata.groupby(jsondata.partition).count().show()

+---------+-----+
|partition|count|
+---------+-----+
|        Q|  270|
|        T|   30|
+---------+-----+



In [0]:
jsondata.show()

+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+----+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|bid_pr|bid_size|ask_pr|ask_size|partition| raw|
+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+----+
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:36:...|           1|2020-08-05 09:30:00|    NULL|    76|     100|    78|     100|        Q|NULL|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:42:...|           2|2020-08-05 09:30:00|    NULL|    75|     100|    76|     100|        Q|NULL|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:48:...|           3|2020-08-05 09:30:00|    NULL|    79|     100|    81|     100|        Q|NULL|
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 09:53:...|           4|2020-08-05 09:30:00|    NULL|    75|     

In [0]:
def get_txt_files(ls_path):
    dir_stack = []
    dir_stack.append(ls_path)
    file_stack = []
    while len(dir_stack) > 0:
        current_folder = dir_stack.pop(0)
        for file in dbutils.fs.ls(current_folder):
            if file.isDir() and file.path != current_folder:
                dir_stack.append(file.path)
            elif file.path.endswith('.txt'):
                file_stack.append(file.path)
    return file_stack

In [0]:
dbutils.fs.mount(
  source = "wasbs://parsed@sbguidedcapstorage.blob.core.windows.net/",
  mount_point = "/mnt/parsed",
  extra_configs  = {"fs.azure.account.key.sbguidedcapstorage.blob.core.windows.net" : dbutils.secrets.get('key-vault-scope', 'sbguidedcapstorage')}
)

True

In [0]:
for path in get_txt_files('/mnt/source/data/csv/'):
    rawcsv = sc.textFile(path)
    parsedcsv = rawcsv.map(lambda line: parse_csv(line))
    csvdata = spark.createDataFrame(parsedcsv, schema=CommonEventSchema)
    csvdata.write.partitionBy('partition').mode('overwrite').parquet('/mnt/parsed/')

In [0]:
for path in get_txt_files('/mnt/source/data/json/'):
    rawjson = sc.textFile(path)
    parsedjson = rawjson.map(lambda line: parse_json(line))
    jsondata = spark.createDataFrame(parsedjson, schema=CommonEventSchema)
    jsondata.write.partitionBy('partition').mode('overwrite').parquet('/mnt/parsed/')

In [0]:
dbutils.fs.ls('/mnt/parsed/partition=Q')

[FileInfo(path='dbfs:/mnt/parsed/partition=Q/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1712442343000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_committed_5514179102496910154', name='_committed_5514179102496910154', size=224, modificationTime=1712442339000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_committed_5876140355702659561', name='_committed_5876140355702659561', size=423, modificationTime=1712442341000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_committed_8012874387339221082', name='_committed_8012874387339221082', size=423, modificationTime=1712442343000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_committed_8546968045100134472', name='_committed_8546968045100134472', size=434, modificationTime=1712442340000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_started_5514179102496910154', name='_started_5514179102496910154', size=0, modificationTime=1712442338000),
 FileInfo(path='dbfs:/mnt/parsed/partition=Q/_started_5876140355702659561', name='_started_5