In [0]:
import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, StringType, DateType, TimestampType
from pyspark.sql import functions as F
from decimal import Decimal
from datetime import datetime
# Obtain the Spark session for this notebook (created if none exists yet),
# registered under the application name 'DataIngestion'.
spark = (
    SparkSession.builder
    .appName('DataIngestion')
    .getOrCreate()
)

In [0]:
# Load every parsed record from the parquet dataset under /mnt/parsed/.
all_data = spark.read.parquet('/mnt/parsed/')

# Split the records by their 'partition' column: rows tagged 'T' feed the
# trade pipeline below, rows tagged 'Q' feed the quote pipeline.
trade_common = all_data.where(F.col('partition') == 'T')
quote_common = all_data.where(F.col('partition') == 'Q')

In [0]:
# Inspect the column names of the combined dataset before projecting the
# trade-specific and quote-specific subsets in the cells below.
all_data.columns

['trade_dt',
 'rec_type',
 'symbol',
 'exchange',
 'event_tm',
 'event_seq_nb',
 'arrival_tm',
 'trade_pr',
 'bid_pr',
 'bid_size',
 'ask_pr',
 'ask_size',
 'raw',
 'partition']

In [0]:
# Keep only the columns relevant to trades: the identifying fields,
# the arrival timestamp, and the trade price.
trade = trade_common.select(
    "trade_dt",
    "symbol",
    "exchange",
    "event_tm",
    "event_seq_nb",
    "arrival_tm",
    "trade_pr",
)

In [0]:
# Group trade rows that share the same
# (trade_dt, symbol, exchange, event_tm, event_seq_nb) combination.
trade_window = Window.partitionBy(
    ["trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb"]
)

# Within each group keep only the row(s) whose arrival_tm equals the group's
# maximum arrival_tm. Rows tied on that maximum are all retained, and the
# helper column "latest_arrival" remains in the resulting DataFrame.
trade_corrected = (
    trade
    .withColumn("latest_arrival", F.max("arrival_tm").over(trade_window))
    .where(F.col("arrival_tm") == F.col("latest_arrival"))
)

In [0]:
# Print five rows of the quote subset as a quick visual sanity check.
quote_common.show(n=5)

+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+----+---------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|bid_pr|bid_size|ask_pr|ask_size| raw|partition|
+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+----+---------+
|2020-08-05|       Q|  SYMB|    NYSE|2020-08-05 15:38:...|          51|2020-08-05 09:30:00|    NULL|    32|     100|    34|     100|NULL|        Q|
|2020-08-05|       Q|  SYMB|    NYSE|2020-08-05 15:26:...|          49|2020-08-05 09:30:00|    NULL|    34|     100|    35|     100|NULL|        Q|
|2020-08-05|       Q|  SYMB|    NYSE|2020-08-05 15:17:...|          48|2020-08-05 09:30:00|    NULL|    34|     100|    35|     100|NULL|        Q|
|2020-08-05|       Q|  SYMB|    NYSE|2020-08-05 15:11:...|          47|2020-08-05 09:30:00|    NULL|    33|     

In [0]:
# Keep only the columns relevant to quotes: the identifying fields,
# the arrival timestamp, and the bid/ask price and size.
quote = quote_common.select(
    'trade_dt',
    'symbol',
    'exchange',
    'event_tm',
    'event_seq_nb',
    'arrival_tm',
    'bid_pr',
    'bid_size',
    'ask_pr',
    'ask_size',
)

In [0]:
# Group quote rows by their identifying fields only. The value columns
# (bid_pr, bid_size, ask_pr, ask_size) are deliberately NOT part of the window:
# if they were, a later-arriving row with the same identifying fields but
# different prices/sizes would fall into its own group and the older row it is
# meant to replace would never be dropped.
# NOTE(review): assumes quotes are identified by the same composite key used
# for the trade window (trade_dt, symbol, exchange, event_tm, event_seq_nb) -
# confirm against the feed specification.
quote_window = Window.partitionBy(
    ['trade_dt', 'symbol', 'exchange', 'event_tm', 'event_seq_nb']
)

# Within each group keep only the row(s) whose arrival_tm equals the group's
# maximum arrival_tm. Rows tied on that maximum are all retained, and the
# helper column "latest_arrival" remains in the resulting DataFrame.
quote_corrected = (
    quote
    .withColumn("latest_arrival", F.max("arrival_tm").over(quote_window))
    .filter(F.col("latest_arrival") == F.col("arrival_tm"))
)

In [0]:
# Persist the de-duplicated datasets as parquet, one sub-directory per
# trade_dt value, replacing whatever was previously written at each location.
quote_corrected.write.mode('overwrite').partitionBy('trade_dt').parquet('/mnt/loaded/quote/')

# The trade path must be absolute like the quote path above; without the
# leading slash it is a relative path and the data is written to a different
# location than the /mnt mount used by the rest of this notebook.
trade_corrected.write.mode('overwrite').partitionBy('trade_dt').parquet('/mnt/loaded/trade/')