Connected to Python 3.11.4

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, unix_timestamp

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Define the schema for the JSON structure
json_schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("type", StringType(), True),
    StructField("data", StructType([
        StructField("event", StringType(), True),
        StructField("result", StructType([
            StructField("message", StringType(), True)
        ]), True)
    ]), True),
    StructField("extra", StructType([
        StructField("headers", MapType(StringType(), StringType()), True)
    ]), True),
    StructField("session", StructType([
            StructField("id", StringType(), True),
            StructField("start", StringType(), True),
            StructField("end", StringType(), True),
        ]), True)
])


In [3]:
# Create a Spark session
spark = SparkSession.builder.appName("Example").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

# Sample JSON data
json_data = """
{
    "id": "rfc-recommendation-model-prod-id",
    "name": "rfc-recommendation-model-prod",
    "type": "EVENT",
    "data": {
        "event": "PREDICT_COMPLETED",
        "result": {
        "message": "pricejump endpoint completed"
        }
    },
    "session": {
        "id": "788b0cb26eb0469eb6c37b598948a2e4",
        "start": "2023-11-28 21:24:11.480165",
        "end": "2023-11-28 21:24:20.480165"
    },
    "extra": {
        "headers": {
        "Host": "svc-rfc-recommendation-v2.gvd-services.svc.cluster.local:5000",
        "Project-ID": "project-id-3"
        }
    }
}
"""

# Read JSON data with the specified schema
df = spark.read.json(spark.sparkContext.parallelize([json_data]), schema=json_schema)
selected_columns = ["id", "name", "type", "session.id", "session.start", "session.end", "data.event", "extra.headers.Project-ID"]
final_stream_df = df.select(*selected_columns)
final_stream_df = final_stream_df.withColumn("duration", (unix_timestamp("end") - unix_timestamp("start")).cast("int") * 1000)

final_stream_df.show()


# Access data within the "headers" field under "extra"
# headers_df = df.select("extra.headers")

# Show the result
# headers_df.show(truncate=False)


23/11/30 14:57:54 WARN Utils: Your hostname, codespaces-e43946 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
23/11/30 14:57:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/30 14:57:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----------------+------------+--------+
|                  id|                name| type|                  id|               start|                 end|            event|  Project-ID|duration|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----------------+------------+--------+
|rfc-recommendatio...|rfc-recommendatio...|EVENT|788b0cb26eb0469eb...|2023-11-28 21:24:...|2023-11-28 21:24:...|PREDICT_COMPLETED|project-id-3|    9000|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----------------+------------+--------+

