In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

# Spark session for a single local machine, with Delta Lake wired in as both
# the SQL extension and the session catalog.
builder = (
    SparkSession.builder
    .appName("RecreateDeltaTables")
    .master("local[*]")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Relative warehouse location; adjust to an absolute path if needed.
    .config("spark.sql.warehouse.dir", "storage")
)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created (or an existing one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Create the bd_becomex database if it is not there yet.
spark.sql("CREATE DATABASE IF NOT EXISTS bd_becomex")

# Directory scanned for Delta tables: one sub-folder per table.
bd_becomex = "spark-warehouse/bd_becomex.db"

# A sub-folder counts as a table only when it contains a _delta_log entry.
# Sorted because Path.iterdir() yields entries in arbitrary order, which would
# otherwise make the processing order differ between runs.
table_names = sorted(
    entry.name
    for entry in Path(bd_becomex).iterdir()
    if entry.is_dir() and (entry / "_delta_log").exists()
)

# Table names stored in a dictionary under the 'nome' key.
parameters = {'nome': table_names}

def ingest_data_bronze(read_path, table_name, database_name):
    """Copy the Delta data at ``read_path`` into ``database_name.table_name``.

    The data is read in Delta format and saved as a table in overwrite mode
    with the ``overwriteSchema`` option enabled. Any ``Exception`` raised
    along the way is caught and reported on stdout rather than propagated,
    so the function always returns ``None``.

    Args:
        read_path: Location of the Delta data to load.
        table_name: Name of the table to write (also used in the error message).
        database_name: Database that qualifies the target table name.
    """
    try:
        source_df = spark.read.format("delta").load(read_path)
        target_table = f"{database_name}.{table_name}"
        writer = (
            source_df.write
            .format('delta')
            .mode('overwrite')
            .option('overwriteSchema', True)
        )
        writer.saveAsTable(target_table)
    except Exception as error:
        # Best-effort: report the failing table and carry on.
        print(f"❌ tabela {table_name} Falhou - {str(error)}")

# One (read_path, table_name, database_name) tuple per discovered table.
args = [
    (f"{bd_becomex}/{table}", table, "bd_becomex")
    for table in parameters['nome']
]

# Run the ingestions in parallel; leaving the with-block waits for every
# submitted task to finish.
with ThreadPoolExecutor(max_workers=12) as executor:
    for job in args:
        executor.submit(ingest_data_bronze, *job)


In [12]:
import pandas as pd
# Remove the cap on how many columns pandas displays.
pd.options.display.max_columns = None

In [9]:
# Make bd_becomex the session's current database.
spark.sql("use bd_becomex")

DataFrame[]

In [13]:
# Builds the joined import dataset as a Spark DataFrame.
#   - ncm_joined: ncm inner-joined to sh, ncm_ppe, ncm_ppi, ncm_fat_agreg,
#     ncm_cuci, ncm_cgce and ncm_isic on their respective CO_* keys.
#   - fat_import_joined: bd_becomex.import inner-joined to ncm_unidade, urf,
#     paises and via.
#   - fat_export_joined: the same four joins over bd_becomex.export.
# The final SELECT inner-joins fat_import_joined to ncm_joined on CO_NCM.
# NOTE(review): fat_export_joined is defined but never referenced by the
# final SELECT — confirm whether it is still needed.
# NOTE(review): every SELECT * spans joined tables, so each join key column
# (CO_SH6, CO_UNID, ...) appears once per side in the result — confirm the
# duplicated column names are acceptable downstream.
df = spark.sql("""

WITH ncm_joined AS (
    SELECT *
    FROM ncm n
    INNER JOIN sh s ON n.CO_SH6 = s.CO_SH6
    INNER JOIN ncm_ppe np ON n.CO_PPE = np.CO_PPE
    INNER JOIN ncm_ppi npi ON n.CO_PPI = npi.CO_PPI
    INNER JOIN ncm_fat_agreg nfa ON n.CO_FAT_AGREG = nfa.CO_FAT_AGREG
    INNER JOIN ncm_cuci nc ON n.CO_CUCI_ITEM = nc.CO_CUCI_ITEM
    INNER JOIN ncm_cgce ncg ON n.CO_CGCE_N3 = ncg.CO_CGCE_N3
    INNER JOIN ncm_isic ni ON n.CO_ISIC_CLASSE = ni.CO_ISIC_CLASSE
),
fat_import_joined AS (
    SELECT *
    FROM bd_becomex.import fi
    INNER JOIN ncm_unidade nu ON fi.CO_UNID = nu.CO_UNID
    INNER JOIN urf u ON fi.CO_URF = u.CO_URF
    INNER JOIN paises p ON fi.CO_PAIS = p.CO_PAIS
    INNER JOIN via v ON fi.CO_VIA = v.CO_VIA
),
fat_export_joined AS (
    SELECT *
    FROM bd_becomex.export fe
    INNER JOIN ncm_unidade nu ON fe.CO_UNID = nu.CO_UNID
    INNER JOIN urf u ON fe.CO_URF = u.CO_URF
    INNER JOIN paises p ON fe.CO_PAIS = p.CO_PAIS
    INNER JOIN via v ON fe.CO_VIA = v.CO_VIA
)
SELECT *
FROM fat_import_joined fi
INNER JOIN ncm_joined nj ON fi.CO_NCM = nj.CO_NCM


""")
               #.limit(10).toPandas()

In [14]:
# Print the first 20 rows with long cell values truncated (show()'s defaults, spelled out).
df.show(n=20, truncate=True)

[Stage 455:>                                                        (0 + 1) / 1]

+------+------+-------+-------+-------+---------+------+-------+--------+----------+------+--------+---------+-------+----------------+-------+-------+--------------------+-------+-------------+-------------+--------------+-------------+--------------+------+--------------+-------+-------+------+------+------+------------+------------+----------+-------+--------------+-------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+------+--------------------+--------------------+--------------------+------+-------------+--------------+------------+-------------+--------------------+--------------------+--------------------+------+--------------------+--------------------+-----------------+------+--------------------+--------------------+-----------------+------------+----------------+----------------+------------+--------------------+-----------+--------------------+-------------+--------------------+---

                                                                                