In [5]:
import pyspark
from pyspark import SparkContext

# Reuse the running SparkContext when this cell is re-executed: calling
# SparkContext() a second time in the same kernel raises a ValueError because
# only one context may be active at a time.
sc = SparkContext.getOrCreate()

# Obtain the session through the public builder API instead of calling the
# SparkSession constructor directly; getOrCreate() attaches to the context
# created above and returns the existing session on re-runs.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [12]:
import os

# Base directory that contains the notebook's "data" folder.
HOME = "/home/jovyan"

# Folder the later cells read log.json from and write their output to.
DATA_PATH = os.path.join(HOME, "data")

# Bare expression so the resolved path is echoed as the cell output.
DATA_PATH

'/home/jovyan/data'

In [26]:
# Load the JSON log file into a DataFrame (spark.read.json returns a
# DataFrame, not an RDD).
log_path = os.path.join(DATA_PATH, "log.json")
logDF = spark.read.json(log_path)

In [27]:
# Print the schema of logDF as a tree (column names, types and nullability),
# so the nested structure of the loaded log records can be inspected.
logDF.printSchema()

root
 |-- cart_data: struct (nullable = true)
 |    |-- created_time: string (nullable = true)
 |    |-- departure_district_tikicode: string (nullable = true)
 |    |-- departure_region_tikicode: string (nullable = true)
 |    |-- departure_ward_tikicode: string (nullable = true)
 |    |-- destination_address_type: string (nullable = true)
 |    |-- destination_district_tikicode: string (nullable = true)
 |    |-- destination_region_tikicode: string (nullable = true)
 |    |-- destination_ward_tikicode: string (nullable = true)
 |    |-- estimated_transit_time: long (nullable = true)
 |    |-- fulfillment_type: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- inventory_type: string (nullable = true)
 |    |    |    |-- is_stock_available: boolean (nullable = true)
 |    |    |    |-- po_type: string (nullable = true)
 |    |    |    |-- product_type: string (nullable = true)
 |    |    |    |-- se

In [28]:
# Trigger an action: materialise and print the first 20 rows of logDF,
# truncating long cell values (these are the defaults of show()).
logDF.show(n=20, truncate=True)

+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-------------+-------------+--------------------+--------------------+-----------+--------------------+
|           cart_data|       created_time|             factors|         fluentd_tag| k8s_container_image|  k8s_container_name|        k8s_pod_name| model_code|model_version|         name|          prediction|          request_id|status_code|           timestamp|
+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-------------+-------------+--------------------+--------------------+-----------+--------------------+
|{2022-07-05 06:59...|2022-07-05 06:59:59|{4.92793402777777...|smart_pdd.prediction|asia.gcr.io/tikiv...|smart-pdd-prediction|smart-pdd-predict...|light_saber| 1.5.dropship|         null|{[{start_func, , ...|-1-

In [34]:
# Transformation: collect the distinct model_version values recorded for the
# rows whose model_code is 'light_saber'.

# Step 1: project the request columns and keep only light_saber rows.
light_saber_rows = (
    logDF
    .select("request_id", "model_code", "model_version", "created_time")
    .filter("model_code = 'light_saber'")
)

# Step 2: reduce to the unique model versions and display them.
df = light_saber_rows.select("model_version").distinct()
df.show()

+-------------+
|model_version|
+-------------+
|          1.4|
| 1.5.dropship|
|  1.4.instock|
+-------------+



In [37]:
# Write the distinct model versions as JSON under DATA_PATH.
# Spark's default save mode raises an error when the target already exists,
# which makes this cell fail on every re-run; "overwrite" keeps the cell
# idempotent by replacing the previous output.
# NOTE: Spark writes a directory of part files at this path, not a single
# JSON file (see the DataFrameWriter.json documentation).
df.write.mode("overwrite").json(os.path.join(DATA_PATH, "model_version.json"))