<a href="https://colab.research.google.com/github/joao-dias-25/dataeng-spark/blob/main/spark/challenges/challenge_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHALLENGE 3
##  Implement ENRICH process
- Set up path in the "lake"
  - !mkdir -p /content/lake/gold

- Read data from SILVER layer
  - Paths:
    - vehicles - path: /content/lake/silver/vehicles
    - lines - path: /content/lake/silver/lines
    - municipalities - path: /content/lake/silver/municipalities
  - Use StructFields to enforce schema

- Enrichment
  - Enrich vehicles dataset with information from the line and municipalities
    - join vehicles with lines and municipalities
      - select all columns from vehicles + lines.long_name (name: line_name, format:string) + municipalities.name (name: municipality_name, format: array)
      - Note that "municipalities.name" is an array

- Write data as PARQUET into the GOLD layer (/content/lake/gold)
  - Dataset name: vehicles_enriched
  - Partition "vehicles_enriched" by "date" column
  - Paths:
    - vehicles_enriched - path: /content/lake/gold/vehicles_enriched
  - Make sure only a single parquet file is created
  - Use overwrite as write mode

# Setting up PySpark

In [1]:
%pip install pyspark



In [48]:

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests

class ETLFlow:
    """Generic extract/load helpers that share a single SparkSession."""

    def __init__(self, spark: SparkSession) -> None:
        """Store the session used by every extract/load helper."""
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` using the Spark data source ``format``.

        Extra keyword arguments are forwarded to ``DataFrameReader.load``
        (for example ``schema=...`` or data-source options).
        """
        return self.spark.read.format(format).load(path, **kwargs)

    def extract_from_api(self, url: str, schema: StructType = None) -> DataFrame:
        """Fetch a JSON payload from ``url`` and return it as a DataFrame.

        Args:
            url: HTTP endpoint returning a JSON object or a JSON array.
            schema: Optional schema to enforce; when omitted (or empty) Spark
                infers the schema from the data.

        Raises:
            requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        """
        import json

        response = requests.get(url)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()

        payload = response.json()
        records = payload if isinstance(payload, list) else [payload]

        # The JSON reader expects an RDD of JSON *strings*. Serialise each
        # record explicitly so values such as None/True are emitted as valid
        # JSON (null/true) rather than as their Python repr.
        rdd = self.spark.sparkContext.parallelize(
            [json.dumps(record) for record in records]
        )

        reader = self.spark.read
        if schema:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        """Write ``df`` to ``path`` in ``format``, overwriting existing data.

        Extra keyword arguments are applied as writer options.
        """
        df.write.mode("overwrite").options(**kwargs).format(format).save(path)

    def load_partition(self, df: DataFrame, partition: str, format: str, path: str, **kwargs) -> None:
        """Write ``df`` to ``path`` partitioned by the ``partition`` column.

        Existing data is overwritten. Extra keyword arguments are applied as
        writer options.
        """
        (
            df.write.mode("overwrite")
            .options(**kwargs)
            .partitionBy(partition)
            .format(format)
            .save(path)
        )

class ETLTask(ETLFlow):
    """Ingestion (bronze), cleansing (silver) and enrichment (gold) tasks."""

    def __init__(self, spark: SparkSession) -> None:
        """Delegate session storage to the parent class."""
        super().__init__(spark)

    def ingestion_vehicles(self):
        """Pull the vehicles feed and store it in the bronze layer.

        A ``date`` column (``yyyyMMdd``) is derived from ``timestamp``.
        """
        vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=vehicle_schema)
        df = df.withColumn('date', date_format('timestamp', "yyyyMMdd"))
        self.load(df=df, format="parquet", path="/content/lake/bronze/vehicles")

    def ingestion_lines(self):
        """Pull the lines feed and store it in the bronze layer."""
        lines_schema = StructType([
            StructField('_corrupt_record', StringType(), True),
            StructField('color', StringType(), True),
            StructField("facilities", ArrayType(StringType(), True), True),
            StructField('id', StringType(), True),
            StructField("localities", ArrayType(StringType(), True), True),
            StructField('long_name', StringType(), True),
            StructField("municipalities", ArrayType(StringType(), True), True),
            StructField("patterns", ArrayType(StringType(), True), True),
            StructField("routes", ArrayType(StringType(), True), True),
            StructField('short_name', StringType(), True),
            StructField('text_color', StringType(), True),
        ])
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=lines_schema)

        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")

    def ingestion_municipalities(self):
        """Pull the municipalities feed and store it in the bronze layer."""
        municipalities_schema = StructType([
            StructField('district_id', StringType(), True),
            StructField('district_name', StringType(), True),
            StructField("id", StringType(), True),
            StructField('name', StringType(), True),
            StructField("prefix", StringType(), True),
            StructField('region_id', StringType(), True),
            StructField("region_name", StringType(), True),
        ])

        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=municipalities_schema)

        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")

    def cleansing_vehicles(self):
        """Clean bronze vehicles and write them to silver, partitioned by ``date``.

        Renames ``lat``/``lon`` to ``latitude``/``longitude``, removes
        duplicate rows and drops rows without a ``current_status``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")

        # transformations
        df = df.withColumnRenamed("lat", "latitude")
        df = df.withColumnRenamed("lon", "longitude")
        df = df.drop_duplicates()
        df = df.filter(df.current_status.isNotNull())

        self.load_partition(df=df, format="parquet", partition='date', path="/content/lake/silver/vehicles")

    def cleansing_lines(self):
        """Clean bronze lines (dedupe, require ``long_name``) and write to silver."""
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")

        # transformations
        df = df.drop_duplicates()
        df = df.filter(df.long_name.isNotNull())
        self.load(df=df, format="parquet", path="/content/lake/silver/lines")

    def cleansing_municipalities(self):
        """Clean bronze municipalities and write them to silver.

        Removes duplicate rows and rows lacking ``name`` or ``district_name``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/municipalities")

        # transformations
        df = df.drop_duplicates()
        df = df.filter(df.name.isNotNull() & df.district_name.isNotNull())
        self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")

    def enrich_vehicles(self):
        """Enrich silver vehicles with line and municipality names (gold layer).

        The output keeps every vehicles column and adds:
          * ``line_name`` - the ``long_name`` of the vehicle's line (string);
          * ``municipality_name`` - names of the municipalities served by that
            line (array of strings).

        The result is written as parquet to
        ``/content/lake/gold/vehicles_enriched``, partitioned by ``date``,
        in overwrite mode.
        """
        df_v = self.extract_from_file(format="parquet", path="/content/lake/silver/vehicles")
        df_l = self.extract_from_file(format="parquet", path="/content/lake/silver/lines")
        df_m = self.extract_from_file(format="parquet", path="/content/lake/silver/municipalities")

        # Only the join key, the display name and the municipality ids are
        # needed from lines; rename them up front to avoid clashing with the
        # vehicles ``id`` column.
        lines = df_l.select(
            df_l.id.alias('line_id'),
            df_l.long_name.alias('line_name'),
            'municipalities',
        )

        # The municipalities table is collected to the driver and turned into
        # a literal map column so each id inside the ``municipalities`` array
        # can be translated in place.
        names_by_id = {row['id']: row['name'] for row in df_m.collect()}
        map_col = create_map([lit(part) for pair in names_by_id.items() for part in pair])

        df = df_v.join(lines, on='line_id', how='left')
        df = df.withColumn(
            'municipality_name',
            transform('municipalities', lambda municipality_id: map_col[municipality_id]),
        )
        # Keep exactly: all vehicles columns + line_name + municipality_name.
        df = df.select(*df_v.columns, 'line_name', 'municipality_name')

        # coalesce(1) so a single parquet file is produced per date partition.
        self.load_partition(
            df=df.coalesce(1),
            partition='date',
            format="parquet",
            path="/content/lake/gold/vehicles_enriched",
        )


if __name__ == '__main__':

    # Create (or reuse) the local Spark session used by every task.
    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Tasks run strictly in this order: ingestion -> cleansing -> enrichment.
    pipeline = (
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion lines", etl.ingestion_lines),
        ("Running Task - ingestion municipalities ", etl.ingestion_municipalities),
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Cleansing lines", etl.cleansing_lines),
        ("Running Task - Cleansing municipalities", etl.cleansing_municipalities),
        ("Running Task - enrich vehicles", etl.enrich_vehicles),
    )
    for banner, task in pipeline:
        print(banner)
        task()

    print("ETL program completed")

Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion lines
Running Task - ingestion municipalities 
Running Task - Cleansing Vehicles
Running Task - Cleansing lines
Running Task - Cleansing municipalities
Running Task - enrich vehicles
ETL program completed


In [47]:

# Scratch cell: prototype of the vehicles enrichment on the silver datasets.
etl_task = ETLTask(spark)
df_v = etl_task.extract_from_file(format="parquet", path="/content/lake/silver/vehicles")
df_l = etl_task.extract_from_file(format="parquet", path="/content/lake/silver/lines")
df_m = etl_task.extract_from_file(format="parquet", path="/content/lake/silver/municipalities")

# Align the join key name with the vehicles dataset.
df_l = df_l.withColumnRenamed("id", "line_id")

# id -> name lookup collected to the driver; deliberately not called `dict`
# so the builtin stays usable in this cell.
municipality_names = {row['id']: row['name'] for row in df_m.collect()}

df = df_v.join(df_l.select('line_id', 'long_name', 'municipalities'), on='line_id', how='left')

# Literal map column used to translate every id in the `municipalities` array.
map_col = create_map([lit(part) for pair in municipality_names.items() for part in pair])
df = df.withColumn(
    'municipality_name',
    transform('municipalities', lambda municipality_id: map_col[municipality_id]),
)
df.printSchema()
df.show()

root
 |-- line_id: string (nullable = true)
 |-- bearing: integer (nullable = true)
 |-- block_id: string (nullable = true)
 |-- current_status: string (nullable = true)
 |-- id: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- pattern_id: string (nullable = true)
 |-- route_id: string (nullable = true)
 |-- schedule_relationship: string (nullable = true)
 |-- shift_id: string (nullable = true)
 |-- speed: float (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- long_name: string (nullable = true)
 |-- municipalities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- municipality_name: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+-------+--------------------+--------------+--------------------+---------+---------+----------+--------+--

In [20]:
from pyspark.sql.functions import *
# Demo: turn a Python dict into a Spark map column.
mapping = {
    0: "negative",
    1: "positive",
    2: "name",
    3: "sequel",
    4: "odd",
}
# create_map takes alternating key/value columns, so flatten the items.
map_col = create_map(*[lit(part) for pair in mapping.items() for part in pair])
print(map_col)

Column<'map(0, negative, 1, positive, 2, name, 3, sequel, 4, odd)'>


In [36]:
# id -> name lookup built from the municipalities rows collected to the driver.
# NOTE: the name `dict` shadows the builtin; it is kept because a later cell
# reads this variable.
dict = {
    municipality_id: municipality_name
    for municipality_id, municipality_name in df_m.select('id', 'name').collect()
}

dict

{'1107': 'Loures',
 '1504': 'Barreiro',
 '1101': 'Alenquer',
 '1511': 'Sesimbra',
 '0712': 'Vendas Novas',
 '1102': 'Arruda dos Vinhos',
 '1510': 'Seixal',
 '1114': 'Vila Franca de Xira',
 '1506': 'Moita',
 '1115': 'Amadora',
 '1512': 'Setúbal',
 '1109': 'Mafra',
 '1113': 'Torres Vedras',
 '1503': 'Almada',
 '1110': 'Oeiras',
 '1111': 'Sintra',
 '1508': 'Palmela',
 '1105': 'Cascais',
 '1116': 'Odivelas',
 '1502': 'Alcochete',
 '1507': 'Montijo',
 '1112': 'Sobral de Monte Agraço',
 '1106': 'Lisboa'}

In [38]:
# Translate each municipality id of a line into its name via a literal map.
map_col = create_map(*[lit(part) for entry in dict.items() for part in entry])
df_l = df_l.withColumn(
    'arraydata',
    transform('municipalities', lambda municipality_id: map_col[municipality_id]),
)
df_l.show()

+---------------+-------+----------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+--------------------+
|_corrupt_record|  color|facilities|  id|          localities|           long_name|      municipalities|            patterns|              routes|short_name|text_color|           arraydata|
+---------------+-------+----------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+--------------------+
|           NULL|#C61D23|        []|2115|             [Mafra]|Codeçal (Tapada N...|              [1109]|[2115_0_1, 2115_0_2]|            [2115_0]|      2115|   #FFFFFF|             [Mafra]|
|           NULL|#C61D23|        []|2532|[Alverca, Vila Fr...|Alverca(Est) - Lo...|        [1114, 1107]|[2532_0_1, 2532_0_2]|            [2532_0]|      2532|   #FFFFFF|[Vila Franca de X...|
|           NULL|#C61D23|        []|3119|[Pinhal d