In [1]:
import time
import requests
import datetime as dt
from multiprocessing import Process

import findspark
findspark.init()
from pathlib import Path
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session for this notebook.
# `getOrCreate()` makes the cell safe to re-run: a second `SparkContext(conf=conf)`
# in the same kernel would raise because only one context may be active at a time.
conf = SparkConf().setAppName("spark-streaming-micro-batch")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for any cell that talks to the SparkContext directly.
sc = spark.sparkContext

In [3]:
# Create the HDFS directory that handle_stream writes its CSV output under (-p: no error if it already exists).
! hadoop fs -mkdir -p /user/root/onibus/batch

In [4]:
# Create the HDFS landing directory where call_bus_api writes raw JSON and the streaming reader picks it up.
! hadoop fs -mkdir -p /user/root/onibus/stream

In [5]:
# URL template for the GPS endpoint queried by call_bus_api.
# Placeholders: start/end date (dt_inicial, dt_final) and start/end time
# (hora_inicial, hora_final); date and time are joined with "+" in the query string.
BASE_URL = (
    "https://dados.mobilidade.rio/gps/sppo?"
    "dataInicial={dt_inicial}+{hora_inicial}&dataFinal={dt_final}+{hora_final}"
)

In [6]:
def call_bus_api(interval=60, timeout=30):
    """Fetch the most recent ``interval`` seconds of GPS records and land them on HDFS.

    The response body is parsed as JSON by Spark and written to a new,
    timestamp-named directory under ``/user/root/onibus/stream/``.

    Args:
        interval: Length of the query window in seconds, ending "now".
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    # Start from an explicit UTC reading so the UTC-3 wall clock used in the query
    # does not depend on the host's local timezone setting.
    # NOTE(review): assumes the API expects UTC-3 timestamps — confirm.
    offset = dt.timedelta(hours=3)
    dtf = dt.datetime.now(dt.timezone.utc) - offset
    dti = dtf - dt.timedelta(seconds=interval)

    url = BASE_URL.format(
        dt_inicial=dti.strftime("%Y-%m-%d"),
        dt_final=dtf.strftime("%Y-%m-%d"),
        hora_inicial=dti.strftime("%H:%M:%S"),
        hora_final=dtf.strftime("%H:%M:%S"),
    )

    # Without a timeout a stalled connection would block the poller indefinitely.
    ret = requests.get(url, timeout=timeout)
    # Fail loudly instead of landing an HTTP error page as if it were data.
    ret.raise_for_status()

    rdd = spark.sparkContext.parallelize([ret.text])
    df = spark.read.json(rdd)

    # An empty payload (e.g. "[]") yields a DataFrame with no columns, and Spark
    # refuses to write an empty schema to file-based sinks — nothing to land.
    if not df.columns:
        return

    df.write.json(f"/user/root/onibus/stream/{dtf.strftime('%Y%m%d_%H%M%S')}")

In [7]:
def handle_stream(bdf, ctrl):
    """Cast one micro-batch to typed columns and write it out as CSV.

    Used as the ``foreachBatch`` callback of the streaming query.

    Args:
        bdf: Micro-batch DataFrame whose columns are all read as strings.
        ctrl: Second argument supplied by ``foreachBatch`` (the batch id); unused.

    The output goes to a new directory under ``/user/root/onibus/batch/`` named
    after the current wall-clock time shifted back three hours.
    """
    batch_time = dt.datetime.now() - dt.timedelta(hours=3)

    def comma_decimal(name):
        # Coordinates arrive with a decimal comma; swap it for a dot before casting.
        return regexp_replace(col(name), ",", ".").cast(FloatType())

    # (column, replacement expression) pairs, applied in this order.
    casts = [
        ("latitude", comma_decimal("latitude")),
        ("longitude", comma_decimal("longitude")),
        ("datahora", col("datahora").cast(LongType())),
        ("velocidade", col("velocidade").cast(IntegerType())),
        ("datahoraenvio", col("datahoraenvio").cast(LongType())),
        ("datahoraservidor", col("datahoraservidor").cast(LongType())),
    ]

    typed = bdf
    for column_name, expression in casts:
        typed = typed.withColumn(column_name, expression)

    typed.write.csv(f"/user/root/onibus/batch/{batch_time.strftime('%Y%m%d_%H%M%S')}")

In [8]:
# Schema for the raw JSON records: every field is read as a nullable string.
# Numeric conversion is done afterwards in handle_stream.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "ordem",
        "latitude",
        "longitude",
        "datahora",
        "velocidade",
        "linha",
        "datahoraenvio",
        "datahoraservidor",
    )
])

In [9]:
def loop(max_iter=12, interval=60):
    """Poll the bus API ``max_iter`` times, one call every ``interval`` seconds.

    Args:
        max_iter: Number of API calls to make. Values <= 0 make no calls
            (previously they never terminated).
        interval: Seconds between the start of consecutive calls; also passed
            to ``call_bus_api`` as the length of the window it queries.

    Each call fetches only the last ``interval`` seconds of data, so the time the
    call itself takes is subtracted from the sleep. Sleeping a fixed amount after
    every call would stretch the period and leave gaps between fetched windows.
    """
    for idx in range(1, max_iter + 1):
        started = time.monotonic()
        print(f"{idx} - fetching API")
        try:
            call_bus_api(interval)
        except requests.RequestException as exc:
            # A transient network/HTTP failure should cost one window, not the whole run.
            print(f"{idx} - API request failed: {exc}")
        if idx < max_iter:
            elapsed = time.monotonic() - started
            time.sleep(max(0.0, interval - elapsed))

In [10]:
# Streaming source: JSON files two levels below the landing directory
# (one sub-directory per call_bus_api write), parsed with the explicit schema.
sdf = (
    spark.readStream
    .schema(schema)
    .json("/user/root/onibus/stream/*/*")
)

In [11]:
# Start the streaming query: every 2 minutes the newly landed rows are handed
# to handle_stream as one micro-batch.
# NOTE(review): no checkpointLocation is configured here — confirm whether
# progress should survive a restart of the query.
stream_handle = (
    sdf.writeStream
    .outputMode("append")
    .foreachBatch(handle_stream)
    .trigger(processingTime="2 minutes")
    .start()
)

In [12]:
# Check whether the streaming query is still active.
stream_handle.isActive

True

In [13]:
# Show the query's current status (message, data availability, trigger state).
stream_handle.status

{'message': 'Initializing sources',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [21]:
# Inspect metrics of the most recent micro-batch (row counts, rates, per-phase durations).
stream_handle.lastProgress

{'id': '56b1ed67-31c6-4fe5-9e5d-c6760956fc90',
 'runId': '53dc5900-ee2a-4bff-9c42-b7975df29124',
 'name': None,
 'timestamp': '2023-06-06T00:56:00.000Z',
 'batchId': 0,
 'numInputRows': 6932,
 'inputRowsPerSecond': 92.96462194565888,
 'processedRowsPerSecond': 1226.0346657233817,
 'durationMs': {'addBatch': 4921,
  'getBatch': 237,
  'getOffset': 175,
  'queryPlanning': 147,
  'triggerExecution': 5654,
  'walCommit': 97},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://node-master:9000/user/root/onibus/stream/*/*]',
   'startOffset': None,
   'endOffset': {'logOffset': 0},
   'numInputRows': 6932,
   'inputRowsPerSecond': 92.96462194565888,
   'processedRowsPerSecond': 1226.0346657233817}],
 'sink': {'description': 'ForeachBatchSink'}}

In [15]:
# Uncomment and run to stop the streaming query once ingestion is finished:
# stream_handle.stop()

In [16]:
# Run the polling loop in a separate OS process so this cell returns immediately
# while the API keeps being fetched in the background.
# NOTE(review): loop -> call_bus_api uses the driver's `spark` session from inside
# the child process; sharing one Spark session across processes may be unsafe —
# consider threading.Thread instead. TODO confirm.
p = Process(target=loop)
p.start()

1 - fetching API
2 - fetching API
3 - fetching API
4 - fetching API
5 - fetching API
6 - fetching API
7 - fetching API
