<a href="https://colab.research.google.com/github/joao-dias-25/dataeng-spark/blob/main/spark_streaming/example_3_api_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

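# Use case 2
- Reading vehicle data from a JSON REST API and saving it as Parquet
- Streaming the Parquet files and writing each micro-batch as JSON with `foreachBatch`
- Checking the results produced by the streaming query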

# Setting up PySpark

In [1]:
%pip install pyspark



In [8]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session; the Spark UI port is set to 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
# Shortcut to the underlying SparkContext of the session.
sc = spark.sparkContext

In [9]:
#Get data from API - Vehicles


In [12]:

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

class ETLFlow:
    """Reusable extract/load helpers bound to a single SparkSession.

    Every helper reads from or writes with ``self.spark``, the session passed
    to the constructor.
    """

    def __init__(self, spark: SparkSession) -> None:
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` using the Spark data source named by ``format``.

        Args:
            format: Spark data source name (e.g. ``"parquet"``, ``"csv"``).
            path: Location to load.
            **kwargs: Forwarded to the reader as data source options.

        Returns:
            The loaded DataFrame.
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30) -> DataFrame:
        """Fetch a JSON payload over HTTP GET and turn it into a DataFrame.

        Args:
            url: Endpoint returning a JSON array of objects (a single JSON
                object is treated as a one-row payload).
            schema: Optional schema to apply; when omitted Spark infers one.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A DataFrame with one row per JSON object.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server does not answer within ``timeout``.
        """
        # Local import keeps this block self-contained without touching the
        # notebook's import cell.
        import json

        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing an error body as data.
        response.raise_for_status()

        payload = response.json()
        # Parallelizing a dict would iterate its keys, so wrap a lone object.
        records = payload if isinstance(payload, list) else [payload]

        # spark.read.json expects JSON *strings*; serialising here avoids
        # relying on the Python repr of each dict, which is not valid JSON
        # for None/True/False values.
        json_lines = self.spark.sparkContext.parallelize(
            [json.dumps(record) for record in records]
        )

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(json_lines)

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        """Write ``df`` to ``path`` in the given ``format``, replacing existing data.

        Args:
            df: DataFrame to persist.
            format: Spark data source name used for writing.
            path: Destination location; existing content is overwritten.
            **kwargs: Forwarded to the writer as data source options.
        """
        df.write.mode("overwrite").format(format).options(**kwargs).save(path)

class ETLTask(ETLFlow):
    """Concrete ETL tasks built on top of the ETLFlow helpers.

    The constructor is inherited from ETLFlow: ``ETLTask(spark)``.
    """

    # Schema applied to the vehicles payload. Kept at class level so other
    # code can reuse it as ``ETLTask.VEHICLE_SCHEMA``.
    # NOTE(review): lat/lon/speed use 32-bit FloatType; consider DoubleType if
    # full coordinate precision matters.
    VEHICLE_SCHEMA = StructType([
        StructField('bearing', IntegerType(), True),
        StructField('block_id', StringType(), True),
        StructField('current_status', StringType(), True),
        StructField('id', StringType(), True),
        StructField('lat', FloatType(), True),
        StructField('line_id', StringType(), True),
        StructField('lon', FloatType(), True),
        StructField('pattern_id', StringType(), True),
        StructField('route_id', StringType(), True),
        StructField('schedule_relationship', StringType(), True),
        StructField('shift_id', StringType(), True),
        StructField('speed', FloatType(), True),
        StructField('stop_id', StringType(), True),
        StructField('timestamp', TimestampType(), True),
        StructField('trip_id', StringType(), True),
    ])

    def ingestion_vehicles(
        self,
        url: str = "https://api.carrismetropolitana.pt/vehicles",
        path: str = "/content/vehicles",
    ) -> None:
        """Download the vehicles feed and store it as parquet.

        Args:
            url: Endpoint to fetch; defaults to the Carris Metropolitana
                vehicles API.
            path: Output folder; existing content is overwritten by ``load``.
        """
        df = self.extract_from_api(url=url, schema=self.VEHICLE_SCHEMA)
        self.load(df=df, format="parquet", path=path)



if __name__ == '__main__':

    # Obtain the Spark session handed to the ETL tasks (getOrCreate reuses a
    # session that is already running).
    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Only task currently wired in: fetch the vehicles feed and persist it.
    print("Running Task - Ingestion Vehicles")
    etl.ingestion_vehicles()

    print("ETL program completed")


Starting ETL program
Running Task - Ingestion Vehicles
ETL program completed


In [25]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

from pyspark.sql.streaming import StreamingQuery



def save_json(df, batch_id):
    """Append one micro-batch to the JSON output folder.

    Two columns are added before writing: ``batch_id`` (the id passed in) and
    ``load_time`` (the current timestamp). Has the ``(df, batch_id)`` shape
    expected by ``foreachBatch``.
    """
    tagged = df.withColumn("batch_id", F.lit(batch_id))
    stamped = tagged.withColumn("load_time", F.current_timestamp())
    writer = stamped.write.mode("append")
    writer.json("content/output/json_file")

def streaming_1(queryName: str, path: str = '/content/vehicles/') -> StreamingQuery:
    """Stream the parquet files under ``path`` and hand each micro-batch to ``save_json``.

    Uses the notebook-level ``spark`` session. The sink is ``foreachBatch``:
    ``queryName`` only names the running query, it does not register an
    in-memory table.

    Args:
        queryName: Name given to the streaming query.
        path: Folder of parquet files to stream; it must already contain data
            so the schema can be read from it.

    Returns:
        The started StreamingQuery (call ``.stop()`` on it when done).
    """
    # Streaming file sources need an explicit schema; take it from the parquet
    # data already on disk instead of relying on a variable from another cell.
    source_schema = spark.read.format('parquet').load(path).schema

    # Start read of file stream (parquet) from the input folder
    stream1 = spark.readStream.format('parquet').schema(source_schema).load(path)

    # Check if dataframe is streaming
    print(stream1.isStreaming)

    # Every 5 seconds, pass newly arrived rows to save_json as one micro-batch.
    query = (
        stream1.writeStream
        .queryName(queryName)
        .trigger(processingTime='5 seconds')
        .outputMode('append')
        .foreachBatch(save_json)
        .start()
    )
    return query

In [26]:
# Start the streaming query named "qry1" and keep its handle so it can be stopped later.
query = streaming_1("qry1")

True


In [27]:
# querying data in memory
# NOTE(review): streaming_1 delivers its batches through foreachBatch(save_json); a foreachBatch
# sink does not register an in-memory table, so `qry1` may not exist on a fresh kernel and this
# cell may fail — confirm with Restart & Run All.
spark.sql("select count(1) from qry1").show()

+--------+
|count(1)|
+--------+
|     425|
+--------+



In [24]:
# Stop the streaming query so no further micro-batches are triggered.
query.stop()

In [28]:
# Load everything the streaming batches wrote as JSON and print up to 100 rows without truncating values.
result = spark.read.json("content/output/json_file/")
result.show(n=100, truncate=False)

+--------+-------+------------------------------+--------------+--------+---------+-------+------------------------+----------+----------+--------+---------------------+------------+---------+-------+------------------------+----------------------------------+
|batch_id|bearing|block_id                      |current_status|id      |lat      |line_id|load_time               |lon       |pattern_id|route_id|schedule_relationship|shift_id    |speed    |stop_id|timestamp               |trip_id                           |
+--------+-------+------------------------------+--------------+--------+---------+-------+------------------------+----------+----------+--------+---------------------+------------+---------+-------+------------------------+----------------------------------+
|0       |265    |20241123-64020071-112170000007|IN_TRANSIT_TO |44|12066|38.525196|4412   |2024-11-23T17:22:36.252Z|-8.8765135|4412_0_1  |4412_0  |SCHEDULED            |112170000007|0.0      |160211 |2024-11-23T17:08: