In [11]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.40.1-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.41.0,>=1.40.1 (from boto3)
  Downloading botocore-1.40.1-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.40.1-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.40.1-py3-none-any.whl (13.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.3/85.3 kB[0m [31m13.3 MB/s[0

In [12]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as W
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
import boto3

# 1 - configurando


In [None]:

spark = SparkSession.builder \
    .appName("Teste PySpark") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4",) \
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "1")\
    .getOrCreate()

hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", "http://localstack:4566")
hadoop_conf.set("fs.s3a.access.key", "test")
hadoop_conf.set("fs.s3a.secret.key", "test")
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.committer.name", "directory")


# 2 - lendo arquivos do bucket e unificando

In [None]:

schema = StructType([
    StructField("TP_FUNDO_CLASSE", StringType(), True),
    StructField("CNPJ_FUNDO_CLASSE", StringType(), True),
    StructField("ID_SUBCLASSE", StringType(),True),
    StructField("DT_COMPTC", DateType(), True),
    StructField("NR_COTST", IntegerType(), True),
    StructField("VL_QUOTA", DoubleType(), True),
    StructField("VL_PATRIM_LIQ", DoubleType(), True),
    StructField("CAPTC_DIA", DoubleType(), True),
    StructField("RESG_DIA", DoubleType(), True),
    StructField("VL_TOTAL", DoubleType(), True)
])

try:
    df = spark.read \
        .option("header", "true") \
        .option("encoding", "latin1") \
        .option("sep", ";") \
        .schema(schema) \
        .csv("s3a://s3-cvm-fii/raw/*.csv")
    print('ok \u2705')
except Exception as e:
    print(f'\u270c{e}')

# 3 tratando 

In [2]:

df = (df
    .withColumn('CNPJ_FUNDO_CLASSE',f.regexp_replace(f.col('CNPJ_FUNDO_CLASSE'), r'[./-]', ''))
    .filter(f.col('TP_FUNDO_CLASSE')=='FI')
    .withColumn('ano',f.year(f.col('DT_COMPTC')))
    .select(
        f.col('CNPJ_FUNDO_CLASSE').alias('cnpj_fundo'),
        f.col('NR_COTST').alias('qtd_cotistas'),
        f.col('RESG_DIA').alias('valor_resgates'),
        f.col('CAPTC_DIA').alias('valor_aplicacoes'),
        f.col('VL_QUOTA').alias('cota'),
        f.col('VL_TOTAL').alias('valor_carteira'),
        f.col('VL_PATRIM_LIQ').alias('pl_fundo'),
        f.col('DT_COMPTC').alias('data_referencia'),
        f.col('ano'),
        f.current_date().alias('dt_ingest'))
)

df_25 = df.filter(f.col("ano") == 2025)


ok ✅
✅ [DADOS SALVOS COM SUCESSO EM s3a://s3-cvm-fii/s3a://s3-cvm-fii/stage-test2/]


# 4 - upload

In [None]:

try:
    df_25.write.mode("overwrite").parquet("s3a://s3-cvm-fii/stage-test2/")
    print("\u2705 [DADOS SALVOS COM SUCESSO EM s3a://s3-cvm-fii/s3a://s3-cvm-fii/stage-test2/]")
except Exception as e:
    print(e)

In [8]:
print('\u2705')

✅


# 5 - CONSULTANDO NO STAGE e LENDO PARQUET

In [16]:
s3 = boto3.client(
    "s3",
    endpoint_url="http://localstack:4566",
    aws_access_key_id="test",
    aws_secret_access_key="test",
    region_name="us-east-1"
)

bucket_name = "s3-cvm-fii"
prefix = "stage-test2/"

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

files = [f"s3a://{bucket_name}/{obj['Key']}" for obj in response.get("Contents", []) if obj['Key'].endswith('.csv')]

for path in files:
    print(path)
print('ok \u2705')


ok ✅


In [15]:

try:
    df = spark.read \
        .option("header", "true") \
        .option("encoding", "latin1") \
        .option("sep", ";") \
        .schema(schema) \
        .parquet("s3a://s3-cvm-fii/stage-test2/*.parquet")
    print('ok \u2705')
except Exception as e:
    print(f'\u270c{e}')

[]

In [None]:
s3a://s3-cvm-fii/stage-test2

In [None]:
## LENDO 

In [None]:


# ## variação cota dia
# df_fi = (
#     df_fi.withColumn('cota_dia_anterior',
#                          f.lag(f.col('cota')).over(W.partitionBy(f.col('cnpj_fundo'))
#                                                      .orderBy(f.col('data_referencia'))))
#         .withColumn("variacao_cota_dia",
#             f.when(
#                    (f.col("cota_dia_anterior").isNotNull()) & (f.col("cota_dia_anterior") != 0),
#                     f.round(((f.col("cota") - f.col("cota_dia_anterior")) / f.col("cota_dia_anterior")) * 100,4)))
#         .withColumn("ano", f.year(f.col("data_referencia")))
#         .withColumn("mes",f.month(f.col("data_referencia")))
#         .withColumn("net",
#                    f.col("valor_aplicacoes") - f.col("valor_resgates"))
#         .withColumn("pl_d1",
#                     f.lag(f.col("pl_fundo")).over(W.partitionBy(f.col("cnpj_fundo"))
#                                                     .orderBy(f.col("data_referencia")))
#                     )
#         .withColumn('pnl',f.col('pl_fundo') - f.col('pl_d1') - f.col('net'))
#         .withColumn("dt_ingest", f.current_date())
#         .select(
#              'cnpj_fundo',
#              'pl_fundo',
#              'cota',
#              'qtd_cotistas',
#              'valor_aplicacoes',
#              'valor_resgates',
#              'net',
#              'pnl',
#              'valor_carteira',
#              'data_referencia',
#              'variacao_cota_dia',
#              'data_referencia',
#              'mes',
#              'ano',
#             'dt_ingest')
             
# ).orderBy('data_referencia')