In [58]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

import os

In [59]:

# Build the notebook's SparkSession (app name "reader", local master),
# obtained through the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("reader")
    .master("local[*]")
    .getOrCreate()
)

In [60]:
# Explicit nine-column schema for the order data; every field is nullable.
# NOTE(review): no read in this notebook applies this schema, and its types
# differ from what the parquet data reports when printed (order_id: long,
# created_at: timestamp) — confirm the intended dataset before using it.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("order_id", IntegerType()),
        ("symbol", StringType()),
        ("order_side", StringType()),
        ("size", DoubleType()),
        ("price", DoubleType()),
        ("status", StringType()),
        ("created_at", IntegerType()),
        ("total", DoubleType()),
        ("cum_sum", DoubleType()),
    ]
])

In [61]:
# Locate the input and output data relative to the notebook's working directory.
# Note: despite its name, this holds the kernel's current working directory.
this_script_dir = os.getcwd()

# Parquet output partition written by the job:
# <parent of cwd>/../src/main/scala/org/pintu/output/date_partition=2023-11-06
output_dir_path_string = os.path.join(
    os.path.dirname(this_script_dir),
    '..',
    'src', 'main', 'scala', 'org', 'pintu', 'output',
    'date_partition=2023-11-06',
)

# Raw order-book CSV: <parent of cwd>/producer/order_book_mockup.csv
source_dir_path_string = os.path.join(
    os.path.dirname(this_script_dir),
    'producer',
    'order_book_mockup.csv',
)

In [62]:
# Load the parquet output partition.
df = spark.read.parquet(output_dir_path_string)

# Load the raw source CSV, treating its first line as the header row.
# No schema is supplied; the printed schema shows every column as string.
source_df = spark.read.csv(source_dir_path_string, header=True)


In [48]:
# Print the column names and types of the raw CSV DataFrame.
source_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- order_side: string (nullable = true)
 |-- size: string (nullable = true)
 |-- price: string (nullable = true)
 |-- status: string (nullable = true)
 |-- created_at: string (nullable = true)



In [56]:
# Print the column names and types of the parquet output DataFrame.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- symbol: string (nullable = true)
 |-- order_side: string (nullable = true)
 |-- size: double (nullable = true)
 |-- price: double (nullable = true)
 |-- status: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- total: double (nullable = true)
 |-- cum_sum: double (nullable = true)



In [63]:
# Print a tabular preview of the parquet output DataFrame's rows.
df.show()

+--------+--------+----------+----+-------+------+-------------------+------------------+------------------+
|order_id|  symbol|order_side|size|  price|status|         created_at|             total|           cum_sum|
+--------+--------+----------+----+-------+------+-------------------+------------------+------------------+
|       4|BTC-USDT|       BUY|0.74|33990.0|  OPEN|2023-09-06 11:11:48|           25152.6|           25152.6|
|       6|BTC-USDT|       BUY|0.33|33965.0|  OPEN|2023-09-06 11:15:08|          11208.45|          36361.05|
|       8|BTC-USDT|       BUY|0.15|33965.0|  OPEN|2023-09-06 11:18:28|           5094.75|           41455.8|
|      10|BTC-USDT|       BUY|0.87|33965.0|  OPEN|2023-09-06 11:21:48|          29549.55|          71005.35|
|      15|BTC-USDT|       BUY| 0.4|33960.0|  OPEN|2023-09-06 11:30:08|           13584.0|          84589.35|
|      22|BTC-USDT|       BUY|0.35|33955.0|  OPEN|2023-09-06 11:41:48|          11884.25|           96473.6|
|      23|BTC-USDT|

In [64]:
# Row count of the parquet output DataFrame.
df.count()
# Recorded results from two runs:
#   Before: 7825
#   After Filtered: 5985
# NOTE(review): presumably "before"/"after" refer to a filtering change in the
# job that writes the parquet output — confirm.

5985

In [65]:
# Row count of the raw CSV DataFrame.
source_df.count()
# Recorded results from two runs (unchanged between them):
#   Before: 7826
#   After: 7826

7826

In [66]:
# Count the rows of the parquet output whose status is CLOSED.
# Recorded results — Before: 30, After: 0.
filtered_status_df = df.where(col('status') == "CLOSED")
filtered_status_df.count()

0

In [68]:
# Count the BUY-side rows of the parquet output.
# Recorded results — SELL: 2990, BUY: 2995, Total: 5985.
filtered_order_side_df = df.where(col('order_side') == "BUY")
filtered_order_side_df.count()

2995

In [33]:
# Show every raw CSV row recorded for order 110.
# The created_at epoch values in the output, converted to UTC:
#   1693984108 -> Wed, 06 Sep 2023 07:08:28 GMT
#   1693982308 -> Wed, 06 Sep 2023 06:38:28 GMT
filtered_df = source_df.where(col('order_id') == 110)
filtered_df.show()

+--------+--------+----------+----+-------+------+----------+
|order_id|  symbol|order_side|size|  price|status|created_at|
+--------+--------+----------+----+-------+------+----------+
|     110|BTC-USDT|       BUY|0.93|33975.0|  OPEN|1693984108|
|     110|    NULL|      NULL|NULL|   NULL|CLOSED|1693982308|
+--------+--------+----------+----+-------+------+----------+

