In [1]:
import time
import requests
import datetime as dt
from multiprocessing import Process

import findspark
findspark.init()
from pathlib import Path
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, LongType
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark entry points used throughout the notebook.
# getOrCreate() keeps this cell re-runnable: calling SparkContext(conf=conf)
# a second time in the same kernel fails because a context already exists.
conf = SparkConf().setAppName("spark-streaming-micro-batch")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [3]:
def shape(df):
    """Return a ``(rows, columns)`` tuple for ``df``.

    The row count comes from ``df.count()`` and the column count from
    ``len(df.columns)``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return (n_rows, n_cols)

In [4]:
# Create the HDFS directory that receives the processed micro-batches.
! hadoop fs -mkdir -p /user/root/onibus/sink

In [5]:
# Create the HDFS directory where the raw API responses are landed.
! hadoop fs -mkdir -p /user/root/onibus/source

In [6]:
# Create the HDFS directory used as the streaming query's checkpoint location.
! hadoop fs -mkdir -p /user/root/onibus/checkpoint

In [7]:
# List the pipeline root to confirm the three directories exist.
! hadoop fs -ls /user/root/onibus/

Found 3 items
drwxr-xr-x   - root supergroup          0 2023-08-25 00:04 /user/root/onibus/checkpoint
drwxr-xr-x   - root supergroup          0 2023-08-25 00:03 /user/root/onibus/sink
drwxr-xr-x   - root supergroup          0 2023-08-25 00:04 /user/root/onibus/source


In [8]:
# URL template for the dados.mobilidade.rio GPS endpoint. The four
# placeholders carry the start/end date and time of the requested window.
BASE_URL = (
    "https://dados.mobilidade.rio/gps/sppo?"
    "dataInicial={dt_inicial}+{hora_inicial}&dataFinal={dt_final}+{hora_final}"
)

In [9]:
def call_bus_api(interval=60, timeout=30):
    """Fetch the last ``interval`` seconds of bus GPS data and land it on HDFS.

    The query window ends "now" and starts ``interval`` seconds earlier, both
    expressed as UTC-3 wall-clock time (the fixed 3-hour offset this notebook
    uses; presumably the local time the API expects -- confirm). The response
    body is parsed by Spark as JSON and written as JSON files under
    ``/user/root/onibus/source/<YYYYmmdd_HHMMSS>``, named after the window end.

    Args:
        interval: Length of the query window, in seconds.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an HTTP
            error status. Nothing is written to HDFS in that case.
    """
    # Use an explicit UTC-3 clock rather than ``datetime.now() - 3h``: the
    # latter only yields UTC-3 wall time when the host itself runs in UTC.
    utc_minus_3 = dt.timezone(dt.timedelta(hours=-3))
    dtf = dt.datetime.now(utc_minus_3)
    dti = dtf - dt.timedelta(seconds=interval)

    url = BASE_URL.format(
        dt_inicial=dti.strftime("%Y-%m-%d"),
        hora_inicial=dti.strftime("%H:%M:%S"),
        dt_final=dtf.strftime("%Y-%m-%d"),
        hora_final=dtf.strftime("%H:%M:%S"),
    )
    ret = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of persisting an error payload as data.
    ret.raise_for_status()

    rdd = spark.sparkContext.parallelize([ret.text])
    df = spark.read.json(rdd)
    df.write.json(f"/user/root/onibus/source/{dtf.strftime('%Y%m%d_%H%M%S')}")

In [10]:
def handle_stream(bdf, ctrl):
    """``foreachBatch`` callback: cast the string columns and write the batch as CSV.

    Each call writes to ``/user/root/onibus/sink/<YYYYmmdd_HHMMSS>``, stamped
    with the UTC-3 wall-clock time at which the batch is handled.

    Args:
        bdf: Micro-batch DataFrame containing the columns cast below.
        ctrl: Second positional argument supplied by ``foreachBatch`` (the
            batch id, per the Structured Streaming API); not used here.
    """
    # Use an explicit UTC-3 clock rather than ``datetime.now() - 3h``: the
    # latter only yields UTC-3 wall time when the host itself runs in UTC.
    dtt = dt.datetime.now(dt.timezone(dt.timedelta(hours=-3)))

    df = bdf
    # Coordinates: replace "," with "." (presumably a decimal comma in the
    # feed -- confirm) before casting to float.
    for name in ("latitude", "longitude"):
        df = df.withColumn(name, regexp_replace(col(name), ",", ".").cast(FloatType()))

    # The remaining numeric fields are cast straight from their string form.
    for name, target in (
        ("datahora", LongType()),
        ("velocidade", IntegerType()),
        ("datahoraenvio", LongType()),
        ("datahoraservidor", LongType()),
    ):
        df = df.withColumn(name, col(name).cast(target))

    df.write.csv(f"/user/root/onibus/sink/{dtt.strftime('%Y%m%d_%H%M%S')}")

In [11]:
# Every field is read as a nullable string; handle_stream casts the numeric
# ones afterwards.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "ordem",
        "latitude",
        "longitude",
        "datahora",
        "velocidade",
        "linha",
        "datahoraenvio",
        "datahoraservidor",
    )
])

In [12]:
def loop(max_iter=10, interval=60):
    """Poll the bus API ``max_iter`` times, one fetch every ``interval`` seconds.

    The pause between fetches is shortened by the time the fetch itself took,
    so consecutive query windows line up instead of leaving gaps. A fixed
    ``sleep(60)`` after a multi-second fetch spaces the calls more than 60 s
    apart while each call only asks for the last 60 s. If more than
    ``interval`` seconds have passed since the last successful fetch (slow or
    failed call), the next window is widened to cover the whole gap.

    Args:
        max_iter: Number of fetches to perform. Values below 1 never match the
            stop condition, so the loop then runs until interrupted.
        interval: Target number of seconds between the starts of two fetches,
            and the default window length passed to ``call_bus_api``.
    """
    idx = 1
    last_ok = None  # monotonic start time of the last successful fetch
    while True:
        started = time.monotonic()
        window = interval if last_ok is None else max(interval, started - last_ok)
        print(f"{idx} - fetching API")
        try:
            call_bus_api(window)
            last_ok = started
        except requests.RequestException as exc:
            # A transient HTTP failure should not kill the poller; the next
            # successful fetch widens its window to cover the missed span.
            print(f"{idx} - fetch failed: {exc}")
        if idx == max_iter:
            break
        # Sleep only for what is left of the interval after the fetch.
        time.sleep(max(0.0, interval - (time.monotonic() - started)))
        idx += 1

In [13]:
# Streaming reader over the JSON files matching source/*/*, parsed with the
# all-string `schema` defined above.
sdf = spark.readStream.schema(schema).json("/user/root/onibus/source/*/*")

In [55]:
# Start the streaming query: on a 3-minute processing-time trigger, each
# micro-batch is passed to handle_stream; progress is tracked in the
# checkpoint directory.
stream_handle = (
    sdf.writeStream
    .outputMode("append")
    .foreachBatch(handle_stream)
    .trigger(processingTime="3 minutes")
    .option("checkpointLocation", "/user/root/onibus/checkpoint")
    .start()
)

In [56]:
# Check whether the streaming query is still running.
stream_handle.isActive

True

In [57]:
# Current status of the streaming query (message, data availability, trigger state).
stream_handle.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

#### Batch control

- **batchID** `0` => `6_090` (perdido antes da interrupção / lost before the interruption)
- **batchID** `1` => `7_474`
- **batchID** `2` => `25_059`
- **batchID** `3` => `17_384`
- **batchID** `4` => previsto: `15_482` / real: `15_482`
- **batchID** `5` => `0`
- **total** `49_917 / 56_007` (batches 1–3 / batches 0–3; batch 4 not included)

In [68]:
# Metrics of the most recent progress update (batch id, input rows, durations).
stream_handle.lastProgress

{'id': 'fd28bb24-84b6-45a9-920f-a23b4a07721f',
 'runId': '269cbe2a-a507-4078-acde-69b15140cbca',
 'name': None,
 'timestamp': '2023-08-25T00:30:00.001Z',
 'batchId': 5,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 3669, 'triggerExecution': 3753},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[hdfs://node-master:9000/user/root/onibus/source/*/*]',
   'startOffset': {'logOffset': 4},
   'endOffset': {'logOffset': 4},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'ForeachBatchSink'}}

In [69]:
# Recent progress updates; used here to read the per-batch input row counts.
stream_handle.recentProgress

[{'id': 'fd28bb24-84b6-45a9-920f-a23b4a07721f',
  'runId': '269cbe2a-a507-4078-acde-69b15140cbca',
  'name': None,
  'timestamp': '2023-08-25T00:26:55.603Z',
  'batchId': 4,
  'numInputRows': 15482,
  'processedRowsPerSecond': 2030.4262295081967,
  'durationMs': {'addBatch': 2674,
   'getBatch': 140,
   'getOffset': 3366,
   'queryPlanning': 63,
   'triggerExecution': 7625,
   'walCommit': 550},
  'stateOperators': [],
  'sources': [{'description': 'FileStreamSource[hdfs://node-master:9000/user/root/onibus/source/*/*]',
    'startOffset': {'logOffset': 3},
    'endOffset': {'logOffset': 4},
    'numInputRows': 15482,
    'processedRowsPerSecond': 2030.4262295081967}],
  'sink': {'description': 'ForeachBatchSink'}},
 {'id': 'fd28bb24-84b6-45a9-920f-a23b4a07721f',
  'runId': '269cbe2a-a507-4078-acde-69b15140cbca',
  'name': None,
  'timestamp': '2023-08-25T00:27:03.229Z',
  'batchId': 5,
  'numInputRows': 0,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 0.0,
  'durationMs': {'

In [70]:
# Stop the streaming query.
stream_handle.stop()

> interrupt cell without stopping kernel: `ESC + I (2x)`<br/>
> by Pedro Nora & ChatGPT

In [27]:
# Run the polling loop in a separate process (presumably so the notebook
# stays usable while the API is being polled).
# NOTE(review): the child process uses the `spark` session created in this
# kernel; sharing a Spark session across processes may be fragile -- consider
# a thread instead. TODO confirm.
p = Process(target=loop)
p.start()

1 - fetching API
2 - fetching API
3 - fetching API
4 - fetching API
5 - fetching API
6 - fetching API
7 - fetching API
8 - fetching API
9 - fetching API
10 - fetching API


In [48]:
# List the per-fetch directories written by call_bus_api.
! hadoop fs -ls /user/root/onibus/source/

Found 12 items
drwxr-xr-x   - root supergroup          0 2023-08-25 00:08 /user/root/onibus/source/20230824_210642
drwxr-xr-x   - root supergroup          0 2023-08-25 00:11 /user/root/onibus/source/20230824_210914
drwxr-xr-x   - root supergroup          0 2023-08-25 00:13 /user/root/onibus/source/20230824_211324
drwxr-xr-x   - root supergroup          0 2023-08-25 00:14 /user/root/onibus/source/20230824_211432
drwxr-xr-x   - root supergroup          0 2023-08-25 00:15 /user/root/onibus/source/20230824_211543
drwxr-xr-x   - root supergroup          0 2023-08-25 00:16 /user/root/onibus/source/20230824_211650
drwxr-xr-x   - root supergroup          0 2023-08-25 00:18 /user/root/onibus/source/20230824_211756
drwxr-xr-x   - root supergroup          0 2023-08-25 00:19 /user/root/onibus/source/20230824_211902
drwxr-xr-x   - root supergroup          0 2023-08-25 00:20 /user/root/onibus/source/20230824_212011
drwxr-xr-x   - root supergroup          0 2023-08-25 00:21 /user/root/onibus/source/2

In [47]:
# List the per-batch directories written by handle_stream.
! hadoop fs -ls /user/root/onibus/sink/

Found 4 items
drwxr-xr-x   - root supergroup          0 2023-08-25 00:11 /user/root/onibus/sink/20230824_211105
drwxr-xr-x   - root supergroup          0 2023-08-25 00:16 /user/root/onibus/sink/20230824_211606
drwxr-xr-x   - root supergroup          0 2023-08-25 00:18 /user/root/onibus/sink/20230824_211803
drwxr-xr-x   - root supergroup          0 2023-08-25 00:21 /user/root/onibus/sink/20230824_212101


In [60]:
# Glob patterns matching every file under the source and sink directories.
dir_source, dir_sink = "/user/root/onibus/source/*/*", "/user/root/onibus/sink/*/*"

In [61]:
# Batch-read everything the stream wrote to the sink (CSV), using the all-string schema.
df_sink = spark.read.schema(schema).csv(dir_sink)

In [62]:
# Row/column count of the sink; compared below against the source.
shape(df_sink)

(71489, 8)

In [63]:
# Batch-read every raw JSON file landed by the poller, using the same schema.
df_source = spark.read.schema(schema).json(dir_source)

In [64]:
# Row/column count of the source; matching the sink's count means every landed row was processed.
shape(df_source)

(71489, 8)