In [1]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as W
from pyspark.sql import SparkSession

## 1 - Configurando spark

In [2]:
spark = SparkSession.builder \
    .appName("Teste PySpark") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4") \
    .getOrCreate()

In [3]:
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.endpoint", "http://localstack:4566")
hadoop_conf.set("fs.s3a.access.key", "test")
hadoop_conf.set("fs.s3a.secret.key", "test")
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")


In [None]:
# data = [("Alice", 25), ("Bob", 30), ("Carol", 27)]
# df = spark.createDataFrame(data, ["Nome", "Idade"])
# df.show()

## 2 - Acessando bucket S3

In [None]:
# !pip install boto3

In [12]:
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://localstack:4566",
    aws_access_key_id="test",
    aws_secret_access_key="test",
    region_name="us-east-1"
)

bucket_name = "s3-cvm-fii"
prefix = "raw/"

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

files = [f"s3a://{bucket_name}/{obj['Key']}" for obj in response.get("Contents", []) if obj['Key'].endswith('.csv')]

# for path in files:
#     print(path)
print('ok \u2705')


ok ✅


## 3 - lendo arquivos do bucket e unificando

In [13]:
try:
    df = spark.read \
        .option("header", "true") \
        .option("encoding", "latin1") \
        .option("sep", ";") \
        .option("inferSchema", "true") \
        .csv("s3a://s3-cvm-fii/raw/*.csv")
    print('ok \u2705')
except Exception as e:
    print(f'\u270c{e}')

ok ✅


In [14]:
df.show(10,truncate=30)

+---------------+------------------+------------+----------+----------+---------------+-------------+---------+--------+--------+
|TP_FUNDO_CLASSE| CNPJ_FUNDO_CLASSE|ID_SUBCLASSE| DT_COMPTC|  VL_TOTAL|       VL_QUOTA|VL_PATRIM_LIQ|CAPTC_DIA|RESG_DIA|NR_COTST|
+---------------+------------------+------------+----------+----------+---------------+-------------+---------+--------+--------+
|             FI|00.017.024/0001-53|        NULL|2024-10-01|1132614.16|36.454622000000|   1137511.23|     0.00|    0.00|       1|
|             FI|00.017.024/0001-53|        NULL|2024-10-02|1133099.67|36.466316600000|   1137876.14|     0.00|    0.00|       1|
|             FI|00.017.024/0001-53|        NULL|2024-10-03|1133598.00|36.478972500000|   1138271.05|     0.00|    0.00|       1|
|             FI|00.017.024/0001-53|        NULL|2024-10-04|1134093.71|36.492039300000|   1138678.78|     0.00|    0.00|       1|
|             FI|00.017.024/0001-53|        NULL|2024-10-07|1134255.06|36.505022800000|   

In [15]:
df.count()

22365065

In [16]:
df.printSchema()

root
 |-- TP_FUNDO_CLASSE: string (nullable = true)
 |-- CNPJ_FUNDO_CLASSE: string (nullable = true)
 |-- ID_SUBCLASSE: string (nullable = true)
 |-- DT_COMPTC: string (nullable = true)
 |-- VL_TOTAL: string (nullable = true)
 |-- VL_QUOTA: string (nullable = true)
 |-- VL_PATRIM_LIQ: string (nullable = true)
 |-- CAPTC_DIA: string (nullable = true)
 |-- RESG_DIA: string (nullable = true)
 |-- NR_COTST: integer (nullable = true)



In [24]:
columns_rename = {
                    "TP_FUNDO_CLASSE": "tipo_fundo",
                     "CNPJ_FUNDO_CLASSE": "cnpj_fundo",
                     "DT_COMPTC": "data_referencia",
                     "NR_COTST": "qtd_cotistas",
                     "RESG_DIA": "valor_resgates",
                     "CAPTC_DIA": "valor_aplicacoes",
                     "VL_QUOTA": "cota",
                     "VL_TOTAL": "valor_carteira",
                     "VL_PATRIM_LIQ":'pl_fundo'
                    }

for old_name, new_name in columns_rename.items():
    df = df.withColumnRenamed(old_name, new_name)


In [25]:
df_fi=df.filter(f.col('TP_FUNDO_CLASSE')=='FI')

In [27]:

df_fi.sample(False, 0.001).show()

+----------+------------------+------------+---------------+--------------+-----------------+-------------+----------------+--------------+------------+
|tipo_fundo|        cnpj_fundo|ID_SUBCLASSE|data_referencia|valor_carteira|             cota|   patrimonio|valor_aplicacoes|valor_resgates|qtd_cotistas|
+----------+------------------+------------+---------------+--------------+-----------------+-------------+----------------+--------------+------------+
|        FI|00.398.561/0001-90|        NULL|     2024-10-21|   23475252.07| 187.734400400000|  23443586.02|            0.00|      11159.76|         984|
|        FI|01.045.435/0001-15|        NULL|     2024-10-02|     496331.16|  19.946286800000|    492609.14|            0.00|          0.00|          81|
|        FI|02.367.527/0001-84|        NULL|     2024-10-03| 3256177058.25| 199.367351400000|3255755175.29|      1253414.03|    6366911.29|       24417|
|        FI|02.846.259/0001-83|        NULL|     2024-10-21|   92986915.66| 220.04

In [28]:
df_fi = (
        df_fi
        .withColumn('cnpj_fundo',f.regexp_replace(f.col('cnpj_fundo'), r'[./-]', ''))
        .select(
             'cnpj_fundo',
             'patrimonio',
             'cota',
             'qtd_cotistas',
             'valor_aplicacoes',
             'valor_resgates',
             'valor_carteira',
             'data_referencia')
        )

In [30]:
df_teste = df_fi.limit(10)
df_teste.show()

+--------------+----------+---------------+------------+----------------+--------------+--------------+---------------+
|    cnpj_fundo|patrimonio|           cota|qtd_cotistas|valor_aplicacoes|valor_resgates|valor_carteira|data_referencia|
+--------------+----------+---------------+------------+----------------+--------------+--------------+---------------+
|00017024000153|1137511.23|36.454622000000|           1|            0.00|          0.00|    1132614.16|     2024-10-01|
|00017024000153|1137876.14|36.466316600000|           1|            0.00|          0.00|    1133099.67|     2024-10-02|
|00017024000153|1138271.05|36.478972500000|           1|            0.00|          0.00|    1133598.00|     2024-10-03|
|00017024000153|1138678.78|36.492039300000|           1|            0.00|          0.00|    1134093.71|     2024-10-04|
|00017024000153|1139083.91|36.505022800000|           1|            0.00|          0.00|    1134255.06|     2024-10-07|
|00017024000153|1136373.51|36.5178166000

In [22]:
df_teste.show()

+--------------+----------+---------------+------------+----------------+--------------+--------------+---------------+
|    cnpj_fundo|patrimonio|           cota|qtd_cotistas|valor_aplicacoes|valor_resgates|valor_carteira|data_referencia|
+--------------+----------+---------------+------------+----------------+--------------+--------------+---------------+
|00017024000153|1137511.23|36.454622000000|           1|            0.00|          0.00|    1132614.16|     2024-10-01|
|00017024000153|1137876.14|36.466316600000|           1|            0.00|          0.00|    1133099.67|     2024-10-02|
|00017024000153|1138271.05|36.478972500000|           1|            0.00|          0.00|    1133598.00|     2024-10-03|
|00017024000153|1138678.78|36.492039300000|           1|            0.00|          0.00|    1134093.71|     2024-10-04|
|00017024000153|1139083.91|36.505022800000|           1|            0.00|          0.00|    1134255.06|     2024-10-07|
|00017024000153|1136373.51|36.5178166000

In [None]:
break

In [56]:
## variação cota dia
(
    df_teste.withColumn('cota_dia_anterior',
                         f.lag(f.col('cota')).over(W.partitionBy(f.col('cnpj_fundo'))
                                                     .orderBy(f.col('data_referencia'))))
        .withColumn("variacao_cota_dia",
            f.when(
                   (f.col("cota_dia_anterior").isNotNull()) & (f.col("cota_dia_anterior") != 0),
                    f.round(((f.col("cota") - f.col("cota_dia_anterior")) / f.col("cota_dia_anterior")) * 100,4)
                    )
                   )
        .withColumn("ano", f.year(f.col("data_referencia")))
        .withColumn("mes",f.month(f.col("data_referencia")))

        .withColumn("last_day_of_month", f.last_day("data_referencia"))

        .withColumn('ultima_cota_mes',
                             f.lag(f.col('cota')).over(W.partitionBy(f.col('cnpj_fundo'))
                                                         .orderBy(f.col('last_day_of_month'))))
        .withColumn("variacao_cota_mes",
            f.when(
                    (f.col("ultima_cota_mes").isNotNull()) & (f.col("ultima_cota_mes") != 0),
                   f.round(((f.col("cota") - f.col("ultima_cota_mes")) / f.col("ultima_cota_mes")) * 100,4)
                )
            )
        .select(
             'cnpj_fundo',
             'patrimonio',
             'cota',
             'qtd_cotistas',
             'valor_aplicacoes',
             'valor_resgates',
             'valor_carteira',
             'data_referencia',
            'variacao_cota_dia',
            'variacao_cota_mes',
            'mes',
            'ano')
             
).show(10,truncate=10)

+----------+----------+----------+------------+----------------+--------------+--------------+---------------+-----------------+-----------------+---+----+
|cnpj_fundo|patrimonio|      cota|qtd_cotistas|valor_aplicacoes|valor_resgates|valor_carteira|data_referencia|variacao_cota_dia|variacao_cota_mes|mes| ano|
+----------+----------+----------+------------+----------------+--------------+--------------+---------------+-----------------+-----------------+---+----+
|0001702...|1137511.23|36.4546...|           1|            0.00|          0.00|    1132614.16|     2024-10-01|             NULL|             NULL| 10|2024|
|0001702...|1137876.14|36.4663...|           1|            0.00|          0.00|    1133099.67|     2024-10-02|           0.0321|           0.0321| 10|2024|
|0001702...|1138271.05|36.4789...|           1|            0.00|          0.00|    1133598.00|     2024-10-03|           0.0347|           0.0347| 10|2024|
|0001702...|1138678.78|36.4920...|           1|            0.00|

-----------------------
Campo: CAPTC_DIA
-----------------------
   Descrição : Captação do dia
   Domínio   : Numérico2

-----------------------
Campo: CNPJ_FUNDO_CLASSE
-----------------------
   Descrição : CNPJ do fundo/classe

-----------------------
Campo: DT_COMPTC
-----------------------
   Descrição : Data de competência do documento
   Domínio   : AAAA-MM-DD
   Tipo Dados: date

-----------------------
Campo: ID_SUBCLASSE
-----------------------
   Descrição : Identificador da subclasse


-----------------------
Campo: NR_COTST
-----------------------
   Descrição : Número de cotistas


-----------------------
Campo: RESG_DIA
-----------------------
   Descrição : Resgate no dia

-----------------------
Campo: TP_FUNDO_CLASSE
-----------------------
   Descrição : Tipo de fundo/classe

-----------------------
Campo: VL_PATRIM_LIQ
-----------------------
   Descrição : Valor do patrimônio líquido
   Domínio   : Numérico
-----------------------
Campo: VL_QUOTA
-----------------------
   Descrição : Valor da cota
   Domínio   : Numérico
   
-----------------------
Campo: VL_TOTAL
-----------------------
   Descrição : Valor total da carteira
   Domínio   : Numérico
