In [59]:
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# The S3A endpoint and credentials are taken from the environment. Check them
# up front: os.getenv() returns None for an unset variable, and handing that to
# Spark only surfaces much later as an obscure S3A connection/auth failure.
_required_env = ("S3_ENDPOINT", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

# Local Spark session configured for an S3A-compatible object store
# (path-style access, static access/secret key) with the Delta Lake SQL
# extension and catalog enabled.
spark = (
    SparkSession.builder
    .appName("ingestion-linhas")
    .master("local[*]")
    .config("spark.hadoop.fs.s3a.endpoint", os.environ["S3_ENDPOINT"])
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)
    
# Current local date formatted as YYYY-MM-DD.
today = datetime.now().strftime('%Y-%m-%d')

# Load the silver-layer CSV files, treating the first row of each file as the header.
df = (
    spark.read
    .option('header', 'true')
    .csv('s3a://silver/posicao_by_linha/csv/')
)

In [60]:
# Print the first 10 rows of the DataFrame loaded above.
df.show(n=10)

+-----+-----+----+--------------------+-------------------+------------------+----------+
|   hr|    p|   a|                  ta|                 py|                px|      date|
+-----+-----+----+--------------------+-------------------+------------------+----------+
|08:48|10582|true|2024-11-04T11:47:39Z|         -23.437415|        -46.779865|2024-11-04|
|08:48|10581|true|2024-11-04T11:48:06Z|        -23.4407735|       -46.7593955|2024-11-04|
|08:48|10584|true|2024-11-04T11:47:50Z|-23.486878500000003|       -46.7265645|2024-11-04|
|08:48|10583|true|2024-11-04T11:48:12Z|-23.436503000000002|       -46.7631435|2024-11-04|
|08:48|16531|true|2024-11-04T11:48:08Z|-23.445529999999998|-46.71263999999999|2024-11-04|
|08:48|16523|true|2024-11-04T11:47:39Z|         -23.486697|        -46.726888|2024-11-04|
|08:48|16444|true|2024-11-04T11:48:09Z|        -23.4695535|       -46.7243475|2024-11-04|
|08:48|16401|true|2024-11-04T11:47:30Z|-23.444692500000002|      -46.71084275|2024-11-04|
|08:48|163

In [49]:
# Current local date formatted as YYYY-MM-DD; used to pick today's bronze partitions.
today = datetime.now().strftime('%Y-%m-%d')

# The trailing '*' is a glob: every path starting with dt_ingestion=<today> is read.
df = spark.read.json(
    f's3a://bronze/posicao_by_linha/dt_ingestion={today}*'
)

In [52]:
# Print the first 10 rows of the bronze JSON DataFrame.
df.show(n=10)

+-----+-----+--------------------+
|   cl|   hr|                  vs|
+-----+-----+--------------------+
| 2495|08:48|[{true, null, 105...|
|35263|08:48|[{true, null, 105...|
|  833|08:48|[{true, null, 165...|
|33601|08:48|[{true, null, 164...|
|  621|08:48|[{true, null, 212...|
|33389|08:48|[{true, null, 212...|
|  519|08:48|[{true, null, 222...|
|33287|08:48|[{true, null, 222...|
|  661|08:48|[{true, null, 213...|
|33429|08:48|[{true, null, 213...|
+-----+-----+--------------------+
only showing top 10 rows



In [47]:
# Request headers carrying the SPTrans API session cookie.
# The cookie is a credential, so it is read from the SPTRANS_API_COOKIE
# environment variable instead of being committed in the notebook.
# The variable should hold only the `name=value` pairs
# (e.g. "apiCredentials=<value>"): attributes such as `path=/`, `HttpOnly` or
# `SameSite` belong to a Set-Cookie response header, not to a request Cookie header.
# If the variable is unset the cookie is empty and the API is expected to
# reject the calls as unauthenticated.
headers = {'Cookie': os.getenv('SPTRANS_API_COOKIE', '')}

In [48]:
import requests

# Base URL of the SPTrans "Olho Vivo" API. HTTPS is used so the session cookie
# in `headers` is never sent in clear text.
base_url='https://api.olhovivo.sptrans.com.br/v2.1'

# Read the line catalogue and bring the first 50 line codes (`cl`) to the driver.
linhas=spark.read.option('header', 'true').csv('s3a://silver/linhas/csv/')
linhas=linhas.select('cl').limit(50).collect() #codigo linhas
for row in linhas:
    cl=row['cl']
    # `params` lets requests build and URL-encode the query string; `timeout`
    # keeps a stalled connection from hanging the notebook indefinitely.
    response=requests.get(
        f'{base_url}/Posicao/Linha',
        params={'codigoLinha': cl},
        headers=headers,
        timeout=30,
    )
    # Surface HTTP errors (e.g. an expired cookie) instead of printing an error body.
    response.raise_for_status()
    print(response.json())


{'hr': '08:40', 'vs': [{'p': '10584', 'a': True, 'ta': '2024-11-04T11:40:28Z', 'py': -23.487057999999998, 'px': -46.7364625, 'sv': None, 'is': None}, {'p': '10582', 'a': True, 'ta': '2024-11-04T11:40:16Z', 'py': -23.437415, 'px': -46.779865, 'sv': None, 'is': None}, {'p': '10581', 'a': True, 'ta': '2024-11-04T11:40:02Z', 'py': -23.434534, 'px': -46.776998500000005, 'sv': None, 'is': None}]}
{'hr': '08:40', 'vs': [{'p': '10583', 'a': True, 'ta': '2024-11-04T11:40:05Z', 'py': -23.4639745, 'px': -46.754187, 'sv': None, 'is': None}]}
{'hr': '08:40', 'vs': [{'p': '16531', 'a': True, 'ta': '2024-11-04T11:40:24Z', 'py': -23.4439115, 'px': -46.712223, 'sv': None, 'is': None}, {'p': '16444', 'a': True, 'ta': '2024-11-04T11:40:02Z', 'py': -23.486845000000002, 'px': -46.7270285, 'sv': None, 'is': None}, {'p': '16523', 'a': True, 'ta': '2024-11-04T11:40:15Z', 'py': -23.477185499999997, 'px': -46.728111, 'sv': None, 'is': None}]}
{'hr': '08:40', 'vs': [{'p': '16401', 'a': True, 'ta': '2024-11-04T11

In [26]:
# Hand-pasted sample data: a list whose elements are lists of stop records
# (dicts with the keys 'cp', 'np', 'ed', 'py', 'px'); some elements are empty.
# Presumably one element per bus line as returned by the stops-by-line
# endpoint, with cp/np/ed = stop code/name/address and py/px = latitude/longitude
# -- TODO confirm against the API documentation.
mock=[
[],
[],
[{'cp': 700016620, 'np': 'EDUARDO PRADO - B/C', 'ed': 'AL RIBEIRO DA SILVA / AL EDUARDO PRADO', 'py': -23.530596, 'px': -46.64676}, {'cp': 700016866, 'np': 'NOTHMANN C/B', 'ed': 'R GENERAL JULIO MARCONDES SALGADO/ AL NOTHMANN', 'py': -23.536318, 'px': -46.651201}],
[{'cp': 700016619, 'np': 'EDUARDO PRADO C/B', 'ed': 'AL RIBEIRO DA SILVA / AL EDUARDO PRADO', 'py': -23.529916, 'px': -46.64719}, {'cp': 700016621, 'np': 'GLETE - C/B', 'ed': 'AL GLETE/ AL NOTHMANN', 'py': -23.532573, 'px': -46.645203}],
[{'cp': 290001401, 'np': 'FREGUESIA B/C', 'ed': 'R CESARE BADIALI/ R ENRICO CARAFA - Rua Giácomo Toreli', 'py': -23.494786, 'px': -46.708775}, {'cp': 480012877, 'np': 'MARGINAL B/C', 'ed': 'AC PONTE DO PIQUERI AV EMB MACEDO SOARES/ R PROFESSORA SURAIA AIDAR MENON', 'py': -23.511052, 'px': -46.705493}, {'cp': 480012881, 'np': 'JOSE MARIA B/C', 'ed': 'R BELCHIOR CARNEIRO/ PC SEBASTIAO JAYME PINTO', 'py': -23.514036, 'px': -46.702259}, {'cp': 640001400, 'np': 'RIO VERDE B/C_2', 'ed': 'R CANNER/ R ROBERTO SWICKER JUNIOR', 'py': -23.490768, 'px': -46.710252}, {'cp': 640001403, 'np': 'PETRONIO PORTELA B/C', 'ed': 'AV MIN. PETRONIO PORTELA/ R SEM NOME', 'py': -23.498233, 'px': -46.706861}, {'cp': 640001408, 'np': 'PAULA FERREIRA B/C', 'ed': 'PC RUY CARLOS VIEIRA BERBET/ AV PAULA FERREIRA', 'py': -23.500896, 'px': -46.706633}, {'cp': 640001414, 'np': 'PIQUERI B/C', 'ed': 'AC A AV OTAVIANO ALVES DE LIMA/ R BENEDITO MONTEIRO', 'py': -23.50616, 'px': -46.706061}],
[{'cp': 290001396, 'np': 'FREGUESIA C/B', 'ed': 'R HENRI POTIRON/ R JULIEN BELIN', 'py': -23.493997, 'px': -46.709123}, {'cp': 480012878, 'np': 'MARGINAL C/B', 'ed': 'R ANTONIO SARKIS/ AC AV EMB MACEDO SOARES PONTE DO PIQUERI', 'py': -23.510894, 'px': -46.705402}, {'cp': 480012942, 'np': 'ZANELLA C/B', 'ed': 'R NICOLAU PERRONE/ AV ERMANO MARCHETTI', 'py': -23.51644, 'px': -46.698513}, {'cp': 640001391, 'np': 'PAULA FERREIRA C/B', 'ed': 'AV MIN. PETRONIO PORTELA/ R SEM NOME', 'py': -23.500624, 'px': -46.706488}, {'cp': 640001393, 'np': 'PETRONIO PORTELA C/B', 'ed': 'PC DA. AMALIA G SOLITARI/ AV MIN. PETRONIO PORTELA', 'py': -23.498009, 'px': -46.70677}, {'cp': 640001397, 'np': 'RIO VERDE C/B', 'ed': 'R JOTACA/ R RAIMUNDO DA COSTA E SILVA', 'py': -23.490798, 'px': -46.709998}, {'cp': 640001417, 'np': 'PIQUERI C/B', 'ed': 'AC A AV OTAVIANO ALVES DE LIMA', 'py': -23.506372, 'px': -46.705887}],
[],
[]]

In [27]:
# Wrap each entry of `mock` in a record with a placeholder line code (cl=1)
# and print it, one record per output line.
for row in mock:
    record = dict(cl=1, paradas=row)
    print(record)

{'cl': 1, 'paradas': []}
{'cl': 1, 'paradas': []}
{'cl': 1, 'paradas': [{'cp': 700016620, 'np': 'EDUARDO PRADO - B/C', 'ed': 'AL RIBEIRO DA SILVA / AL EDUARDO PRADO', 'py': -23.530596, 'px': -46.64676}, {'cp': 700016866, 'np': 'NOTHMANN C/B', 'ed': 'R GENERAL JULIO MARCONDES SALGADO/ AL NOTHMANN', 'py': -23.536318, 'px': -46.651201}]}
{'cl': 1, 'paradas': [{'cp': 700016619, 'np': 'EDUARDO PRADO C/B', 'ed': 'AL RIBEIRO DA SILVA / AL EDUARDO PRADO', 'py': -23.529916, 'px': -46.64719}, {'cp': 700016621, 'np': 'GLETE - C/B', 'ed': 'AL GLETE/ AL NOTHMANN', 'py': -23.532573, 'px': -46.645203}]}
{'cl': 1, 'paradas': [{'cp': 290001401, 'np': 'FREGUESIA B/C', 'ed': 'R CESARE BADIALI/ R ENRICO CARAFA - Rua Giácomo Toreli', 'py': -23.494786, 'px': -46.708775}, {'cp': 480012877, 'np': 'MARGINAL B/C', 'ed': 'AC PONTE DO PIQUERI AV EMB MACEDO SOARES/ R PROFESSORA SURAIA AIDAR MENON', 'py': -23.511052, 'px': -46.705493}, {'cp': 480012881, 'np': 'JOSE MARIA B/C', 'ed': 'R BELCHIOR CARNEIRO/ PC SEBASTI

In [19]:
# Preview the Parada/BuscarParadasPorLinha URL for the current value of `cl`.
# Relies on `base_url` and `cl` having been defined by a previously executed
# cell; as the cell's last expression, the resulting string is displayed.
f'{base_url}/Parada/BuscarParadasPorLinha?codigoLinha={cl}'

'https://api.olhovivo.sptrans.com.br/v2.1/Parada/BuscarParadasPorLinha?codigoLinha=33429'

In [None]:
# Reference snippet (not executed): collect a single column into a Python list.
# [row['col'] for row in df.select('col').collect()]

In [12]:
# Stamp every row with the `today` string as a constant `date` column
# (an existing column of that name is replaced), then preview 10 rows.
df = df.withColumn(colName='date', col=lit(today))
df.show(n=10)

+-----+-----+----+---+---+-------------------+-----------------+
|   cl|   lc|  lt| sl| tl|                 tp|               ts|
+-----+-----+----+---+---+-------------------+-----------------+
| 2495|false|1019|  1| 10|     TERM. PIRITUBA|     SOL NASCENTE|
|35263|false|1019|  2| 10|     TERM. PIRITUBA|     SOL NASCENTE|
|  833|false|1021|  1| 10|     TERM. PIRITUBA|COHAB BRASILÂNDIA|
|33601|false|1021|  2| 10|     TERM. PIRITUBA|COHAB BRASILÂNDIA|
|  621|false|107T|  1| 10|    TERM. PINHEIROS|   METRÔ TUCURUVI|
|33389|false|107T|  2| 10|    TERM. PINHEIROS|   METRÔ TUCURUVI|
|  519|false|118C|  1| 10|TERM. AMARAL GURGEL|    JD. PERY ALTO|
|33287|false|118C|  2| 10|TERM. AMARAL GURGEL|    JD. PERY ALTO|
|  661|false|119C|  1| 10|TERM. PRINC. ISABEL|   PQ. EDU CHAVES|
|33429|false|119C|  2| 10|TERM. PRINC. ISABEL|   PQ. EDU CHAVES|
+-----+-----+----+---+---+-------------------+-----------------+
only showing top 10 rows



In [14]:
# Persist `df` in Delta format under s3a://silver/linhas/, partitioned by `date`,
# allowing the stored schema to be replaced by the DataFrame's schema.
# NOTE(review): mode('overwrite') presumably replaces the whole table, including
# partitions written for earlier dates -- confirm this is intended.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .partitionBy('date')
    .save('s3a://silver/linhas/')
)