<a href="https://colab.research.google.com/github/jorgeneves16/dataeng-dataprocessing/blob/main/dataprocessing_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.types import *
import requests
from functools import reduce # Import reduce from functools

class ETLFlow:
    """Shared extract/load primitives used by the ETL tasks.

    The class only holds the Spark session; every method runs against
    ``self.spark`` so it does not depend on a module-level ``spark`` name.
    """

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session that all extract/load calls will use."""
        self.spark = spark

    def extract_from_api(self, url: str, schema: StructType = None) -> DataFrame:
        """Download a JSON payload from ``url`` and return it as a DataFrame.

        Args:
            url: HTTP endpoint returning a JSON array of objects (a single
                JSON object is also accepted and treated as one record).
            schema: Optional schema applied while reading; when omitted the
                schema is inferred by Spark.

        Returns:
            A DataFrame with one row per JSON record.

        Raises:
            requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
            requests.Timeout: If the endpoint does not answer in time.
        """
        # Function-scope import so this block does not rely on a file-level
        # ``import json`` being present.
        import json

        # Without a timeout ``requests`` may block forever on a stalled server.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of trying to parse an error page as data.
        response.raise_for_status()

        payload = response.json()
        if isinstance(payload, dict):
            # Iterating a dict would yield its keys, not a record.
            payload = [payload]

        # Spark's JSON reader expects an RDD of JSON *strings*. Passing dicts
        # makes PySpark fall back to ``str(dict)``, which emits ``None`` /
        # ``True`` / ``False`` and therefore invalid JSON for such values.
        json_lines = [json.dumps(record) for record in payload]
        rdd = self.spark.sparkContext.parallelize(json_lines)

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, partition_column: str = None, **kwargs) -> None:
        """Write ``df`` to ``path`` as a single file (per partition value).

        Args:
            df: DataFrame to persist.
            format: Spark data source name, e.g. ``"parquet"``.
            path: Destination directory.
            partition_column: Optional column used with ``partitionBy``.
            **kwargs: ``mode`` selects the save mode (defaults to
                ``"overwrite"``); any remaining entries are forwarded to the
                writer as data source options.
        """
        # Callers already pass ``mode=...``; honour it rather than dropping it.
        mode = kwargs.pop("mode", "overwrite")

        # coalesce(1) keeps the output to one file per written directory.
        writer = df.coalesce(1).write.mode(mode).format(format)
        if kwargs:
            writer = writer.options(**kwargs)
        if partition_column:
            writer = writer.partitionBy(partition_column)
        writer.save(path)

class ETLTask(ETLFlow):
    """Concrete pipeline steps for the Carris Metropolitana API data.

    Ingestion steps pull one API endpoint each into the bronze layer under
    ``/content/lake/bronze``; the transform step reads the bronze data back.
    """

    def __init__(self, spark: SparkSession) -> None:
        """Initialise the task with the Spark session used by every step."""
        # Delegate to the base class instead of duplicating its body.
        super().__init__(spark)

    def ingestion_lines(self) -> None:
        """Ingest the ``/lines`` endpoint into the bronze layer as parquet."""
        # schema
        lines_schema = StructType([
            StructField('color', StringType(), True),
            StructField('facilities', ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField('localities', ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField('municipalities', ArrayType(StringType(), True), True),
            StructField('patterns', ArrayType(StringType(), True), True),
            StructField('routes', ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])

        # ingestion
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)

        # ensure only 1 file is created
        df = df.repartition(1)

        # load
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines", mode="overwrite")

    def ingestion_vehicles(self) -> None:
        """Ingest the ``/vehicles`` endpoint into the bronze layer.

        A ``date`` column is derived from ``timestamp`` and used as the
        partition column of the written parquet dataset.
        """
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)

        # create date column: `timestamp` is read as TimestampType by the
        # schema above, so truncating it to a date is sufficient.
        df = df.withColumn("date", expr("date(timestamp)"))

        df = df.repartition(1)

        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles", mode="overwrite", partition_column="date")

    def ingestion_municipalities(self) -> None:
        """Ingest the ``/municipalities`` endpoint into the bronze layer."""
        municipalities_schema = StructType([
            StructField('id', StringType(), True),
            StructField('name', StringType(), True),
            StructField('prefix', StringType(), True),
            StructField('district_id', StringType(), True),
            StructField('district_name', StringType(), True),
            StructField('region_id', StringType(), True),
            StructField('region_name', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)

        df = df.repartition(1)

        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities", mode="overwrite")

    def transform_vehicles(self) -> None:
        """Read bronze vehicles, rename the coordinate columns and show them.

        ``lat``/``lon`` become ``latitude``/``longitude``. The result is only
        displayed; nothing is written back yet.
        """
        # Use the session held by the instance; relying on a module-level
        # ``spark`` only works when the file is executed as the main script.
        df = self.spark.read.format("parquet").load("/content/lake/bronze/vehicles")
        df = df.withColumnRenamed("lat", "latitude") \
               .withColumnRenamed("lon", "longitude")

        df.show()



    # def cleansing_vehicles(self):
    #   df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

    #   # transformations
    #   df = df.withColumn("new_column", lit("test"))
    #   df = df.drop_duplicates()

    #   self.load(df=df, format="parquet", path="/content/lake/silver/vehicles")

    # def enrich(self):
    #     pass


if __name__ == '__main__':

    # Local import kept so this entry point still resolves SparkSession
    # when the cell is executed on its own.
    from pyspark.sql import SparkSession

    # `spark` is bound at module level on purpose: code outside this block
    # (e.g. the result-check cell) reads the global name.
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Pipeline steps. Only the vehicles transform is enabled right now;
    # uncomment a print/call pair to run that step as well.

    # print("Running Task - Ingestion Vehicles")
    # etl.ingestion_vehicles()

    # print("Running Task - Ingestion Lines")
    # etl.ingestion_lines()

    # print("Running Task - Ingestion Municipalities")
    # etl.ingestion_municipalities()

    print("Running Task - Transform Vehicles")
    etl.transform_vehicles()

    # print("Running Task - Cleansing Vehicles")
    # etl.cleansing_vehicles()

    # etl.enrich()

    # print("ETL program completed")

Starting ETL program
Running Task - Transform Vehicles
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|bearing|            block_id|current_status|      id| latitude|line_id|longitude|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|      date|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+----------+
|      5|20250628-64020025...| IN_TRANSIT_TO|44|12059|38.523964|   4426|-8.894337|  4426_0_1|  4426_0|            SCHEDULED|112040000007|5.8333335| 160105|2025-06-28 09:11:56|4426_0_1|600|1000...|2025-06-28|
|    234|20250628-64020205...| IN_TRANSIT_TO|44|12642|38.522923|   4471|-8.898623|  4471_0_3|  4471_0|           

In [None]:
# Display each bronze dataset to verify what the ingestion steps wrote.
for bronze_path in (
    "/content/lake/bronze/vehicles",
    "/content/lake/bronze/lines",
    "/content/lake/bronze/municipalities",
):
    spark.read.parquet(bronze_path).show()