In [11]:
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Build (or reuse) the local Spark session used by this notebook, wired for an
# S3-compatible object store through the s3a connector and for Delta Lake.
#
# The endpoint and credentials come from the environment. os.getenv returns
# None for an unset variable, so validate up front and stop with a clear
# message instead of handing a missing value to the Spark configuration.
_required_env = ("S3_ENDPOINT", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

spark = (
    SparkSession.builder
    .appName("ingestion-linhas")
    .master("local[*]")
    # Object-store connection settings (endpoint + static credentials).
    .config("spark.hadoop.fs.s3a.endpoint", os.environ["S3_ENDPOINT"])
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    # Delta Lake SQL extension and catalog, needed for format('delta') I/O.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)
    
# Ingestion date as YYYY-MM-DD, taken from the local clock of the machine
# running the notebook.
# NOTE(review): presumably the bronze writer names its dt_ingestion folders
# with the same date convention/time zone — confirm.
today = f"{datetime.now():%Y-%m-%d}"

# Load every bronze JSON object whose path starts with today's dt_ingestion
# prefix (the trailing * is a glob).
bronze_path = f's3a://bronze/linhas/dt_ingestion={today}*'
df = spark.read.json(bronze_path)

In [12]:
# Stamp every row with the ingestion date string; the write step partitions
# the silver table by this column.
df = df.withColumn(colName='date', col=lit(today))

# Preview the first 10 rows.
df.show(n=10)

+-----+-----+----+---+---+-------------------+-----------------+
|   cl|   lc|  lt| sl| tl|                 tp|               ts|
+-----+-----+----+---+---+-------------------+-----------------+
| 2495|false|1019|  1| 10|     TERM. PIRITUBA|     SOL NASCENTE|
|35263|false|1019|  2| 10|     TERM. PIRITUBA|     SOL NASCENTE|
|  833|false|1021|  1| 10|     TERM. PIRITUBA|COHAB BRASILÂNDIA|
|33601|false|1021|  2| 10|     TERM. PIRITUBA|COHAB BRASILÂNDIA|
|  621|false|107T|  1| 10|    TERM. PINHEIROS|   METRÔ TUCURUVI|
|33389|false|107T|  2| 10|    TERM. PINHEIROS|   METRÔ TUCURUVI|
|  519|false|118C|  1| 10|TERM. AMARAL GURGEL|    JD. PERY ALTO|
|33287|false|118C|  2| 10|TERM. AMARAL GURGEL|    JD. PERY ALTO|
|  661|false|119C|  1| 10|TERM. PRINC. ISABEL|   PQ. EDU CHAVES|
|33429|false|119C|  2| 10|TERM. PRINC. ISABEL|   PQ. EDU CHAVES|
+-----+-----+----+---+---+-------------------+-----------------+
only showing top 10 rows



In [14]:
# Persist today's snapshot to the silver Delta table, partitioned by `date`.
#
# A plain mode('overwrite') replaces the whole table, so every daily run would
# discard all previously written `date` partitions. `replaceWhere` restricts
# the overwrite to today's partition: re-running the notebook on the same day
# stays idempotent, while earlier days are preserved.
#
# NOTE(review): `overwriteSchema` is dropped because Delta does not apply it
# to partial (replaceWhere) overwrites; `mergeSchema` is used instead so that
# additive schema changes are still accepted — confirm against the Delta
# version in use.
(
    df.write.format('delta')
    .mode('overwrite')
    .option('replaceWhere', f"date = '{today}'")
    .option('mergeSchema', 'true')
    .partitionBy('date')
    .save('s3a://silver/linhas/')
)