<a href="https://colab.research.google.com/github/joanadecaa1/data_processing/blob/main/spark/misc/read_from_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read from API
- Read data from API using requests and RDDs

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Spark Course').config('spark.ui.port', '4050').getOrCreate()
sc = spark.sparkContext

# Get data from API - Vehicles

In [2]:
import requests
from pyspark.sql.types import *

def readFromAPI(url: str, schema: StructType = None):
  response = requests.get(url)
  rdd = sc.parallelize(response.json())

  if schema:
    df = spark.read.schema(schema).json(rdd)
  else:
    df = spark.read.json(rdd)
  return df

In [3]:
readFromAPI("https://api.carrismetropolitana.pt/municipalities").show()

+-----------+-------------+----+--------------------+------+---------+----------------+
|district_id|district_name|  id|                name|prefix|region_id|     region_name|
+-----------+-------------+----+--------------------+------+---------+----------------+
|         07|        Évora|0712|        Vendas Novas|    19|    PT187|Alentejo Central|
|         11|       Lisboa|1101|            Alenquer|    20|    PT16B|           Oeste|
|         11|       Lisboa|1102|   Arruda dos Vinhos|    20|    PT16B|           Oeste|
|         11|       Lisboa|1105|             Cascais|    05|    PT170|             AML|
|         11|       Lisboa|1106|              Lisboa|    06|    PT170|             AML|
|         11|       Lisboa|1107|              Loures|    07|    PT170|             AML|
|         11|       Lisboa|1109|               Mafra|    08|    PT170|             AML|
|         11|       Lisboa|1110|              Oeiras|    12|    PT170|             AML|
|         11|       Lisboa|1111|

# Vehicles

In [4]:
vehicle_schema = StructType([StructField('bearing', IntegerType(), True),
                             StructField('block_id', StringType(), True),
                             StructField('current_status', StringType(), True),
                             StructField('id', StringType(), True),
                             StructField('lat', FloatType(), True),
                             StructField('line_id', StringType(), True),
                             StructField('lon', FloatType(), True),
                             StructField('pattern_id', StringType(), True),
                             StructField('route_id', StringType(), True),
                             StructField('schedule_relationship', StringType(), True),
                             StructField('shift_id', StringType(), True),
                             StructField('speed', FloatType(), True),
                             StructField('stop_id', StringType(), True),
                             StructField('timestamp', TimestampType(), True),
                             StructField('trip_id', StringType(), True)])

vehicles = readFromAPI("https://api.carrismetropolitana.pt/vehicles", vehicle_schema)
print(vehicles.count())
vehicles.show()

374
+-------+--------------------+--------------+--------+---------+-------+----------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|       lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+----------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|     95|20241116-64020101...|   INCOMING_AT|44|12067| 38.52939|   4404|  -8.88578|  4404_0_3|  4404_0|            SCHEDULED|112290000007|      0.0| 160067|2024-11-16 16:51:54|4404_0_3|1600|163...|
|    329|20241116-64020034...|   INCOMING_AT|44|12676| 38.74072|   4725| -9.167635|  4725_0_2|  4725_0|            SCHEDULED|113040000007| 8.611111| 060459|2024-11-16 16:52:31|4725_0_2|1600|160...|
|    3

# Routes

In [6]:
routes.printSchema

In [7]:
routes.show()

+--------------------+-------+----------+------+-------+--------------------+--------------------+--------------+--------------------+----------+----------+
|     _corrupt_record|  color|facilities|    id|line_id|          localities|           long_name|municipalities|            patterns|short_name|text_color|
+--------------------+-------+----------+------+-------+--------------------+--------------------+--------------+--------------------+----------+----------+
|                NULL|#C61D23|        []|1001_0|   1001|[Alfragide, Amado...|Alfragide (Estr S...|        [1115]|[1001_0_1, 1001_0_2]|      1001|   #FFFFFF|
|                NULL|#C61D23|        []|1002_0|   1002|[Reboleira, Amado...|Reboleira (Estaçã...|        [1115]|          [1002_0_3]|      1002|   #FFFFFF|
|                NULL|#C61D23|        []|1003_0|   1003|[Amadora, Amadora...|Amadora (Estação ...|        [1115]|[1003_0_1, 1003_0_2]|      1003|   #FFFFFF|
|                NULL|#C61D23|        []|1004_0|   1004|[A

In [8]:
from pyspark.sql.functions import *
routes = readFromAPI("https://api.carrismetropolitana.pt/routes")
print(routes.count())
routes.filter(array_contains(col("localities"), "Colégio Militar") & array_contains(col("localities"), "Brandoa")).show(100, False)

1017
+---------------+-------+----------+------+-------+-----------------------------------------------------------------------------------------+------------------------------------------------------------------+--------------+--------------------+----------+----------+
|_corrupt_record|color  |facilities|id    |line_id|localities                                                                               |long_name                                                         |municipalities|patterns            |short_name|text_color|
+---------------+-------+----------+------+-------+-----------------------------------------------------------------------------------------+------------------------------------------------------------------+--------------+--------------------+----------+----------+
|NULL           |#C61D23|[]        |1706_0|1706   |[Colégio Militar, Lisboa, Amadora, Brandoa, Casal da Mira, Alfornelos, Benfica]          |Colégio Militar (Metro) - UBBO via Brandoa           

In [9]:
#------------------------------------------

In [10]:
routes.schema #Esta é a etsrutura dos dados,mas devemos sempre verificar

StructType([StructField('_corrupt_record', StringType(), True), StructField('color', StringType(), True), StructField('facilities', ArrayType(StringType(), True), True), StructField('id', StringType(), True), StructField('line_id', StringType(), True), StructField('localities', ArrayType(StringType(), True), True), StructField('long_name', StringType(), True), StructField('municipalities', ArrayType(StringType(), True), True), StructField('patterns', ArrayType(StringType(), True), True), StructField('short_name', StringType(), True), StructField('text_color', StringType(), True)])

# Questions

# Q1
- adjust ROUTE dataframe
  - use correct schema to get the data (StructField)
  - make sure _corrupt_record is removed from schema
- find the routes that pass through the localities "Brandoa" and "Colégio Militar"

### API - https://github.com/carrismetropolitana/api