In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, round, split, year

# Helper that builds the Spark session
def create_spark_session() -> SparkSession:
    """Return a Hive-enabled Spark session configured for Delta Lake and S3A.

    The session is obtained with ``getOrCreate()``, so an already running
    session is reused instead of a second one being started.

    Returns:
        The active ``SparkSession`` named "ETL Silver - Medallion Architecture".
    """
    # Delta Lake SQL extension/catalog, plus the Hadoop filesystem class used
    # for the `s3a` and `s3minio` schemes (both map to S3AFileSystem).
    settings = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "spark.hadoop.fs.s3minio.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    }

    builder = SparkSession.builder.appName("ETL Silver - Medallion Architecture")
    builder = builder.enableHiveSupport()
    for key, value in settings.items():
        builder = builder.config(key, value)

    return builder.getOrCreate()

# Initialise the Spark session used by the rest of this script
spark = create_spark_session()

# MinIO (S3A) connection settings, handed to the Delta reader/writer as options.
# The credentials can be supplied through the MINIO_ACCESS_KEY and
# MINIO_SECRET_KEY environment variables so that real secrets do not have to be
# written into this file; when a variable is unset, the previously hard-coded
# value is used as the fallback, which keeps the original behaviour.
minio_delta_options = {
    "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "admin"),
    "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "senhasegura"),
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.connection.ssl.enabled": "false",
}

# Bronze-layer location of the 'player' table
bronze_player_path = "s3a://bucket-bronze-zone/soccer/player"

# Load the Bronze Delta table, passing the MinIO settings to the reader as options
df_bronze = spark.read.load(
    bronze_player_path,
    format="delta",
    **minio_delta_options,
)

# Transformations that turn the Bronze 'player' data into the Silver table.

# Start from Bronze without the Airbyte columns (names beginning with "_airbyte")
df_silver = df_bronze.drop(
    *(name for name in df_bronze.columns if name.startswith("_airbyte"))
)

# Rename player_name to name, and height to height_cm so the unit is explicit
df_silver = (
    df_silver
    .withColumnRenamed("player_name", "name")
    .withColumnRenamed("height", "height_cm")
)

# Convert the birthday string to a date: keep the part before the first space
# and parse it as yyyy-MM-dd
df_silver = df_silver.withColumn(
    "birth_date",
    to_date(split(col("birthday"), " ")[0], "yyyy-MM-dd"),
)

# Convert weight from pounds to kilograms, rounded to two decimal places
df_silver = df_silver.withColumn("weight_kg", round(col("weight") * 0.4536, 2))

# The source columns are no longer needed once their replacements exist
df_silver = df_silver.drop("birthday", "weight")

# Add the 'year' column, taken from birth_date, that is used for partitioning
df_silver = df_silver.withColumn("year", year(col("birth_date")))

# Remove rows that contain null values
df_silver = df_silver.na.drop()

# Show the schema after the transformation
print("\nEsquema transformado da tabela 'player' Silver:")
df_silver.printSchema()

# Silver-layer destination of the 'player' table
silver_player_path = "s3a://bucket-silver-zone/soccer/player"

# Write the Silver Delta table partitioned by 'year', overwriting any existing
# data and schema; the MinIO settings are passed to the writer as options
(
    df_silver.write
    .options(overwriteSchema="true", **minio_delta_options)
    .save(
        silver_player_path,
        format="delta",
        mode="overwrite",
        partitionBy="year",
    )
)

print("Tabela 'player' transformada, particionada por 'year' e gravada com sucesso na camada Silver no MinIO!")

# Shut down the Spark session
spark.stop()



Esquema transformado da tabela 'player' Silver:
root
 |-- id: long (nullable = true)
 |-- height_cm: long (nullable = true)
 |-- name: string (nullable = true)
 |-- player_api_id: long (nullable = true)
 |-- player_fifa_api_id: long (nullable = true)
 |-- _ingest_timestamp: timestamp (nullable = true)
 |-- _source_file: string (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- year: integer (nullable = true)

Tabela 'player' transformada, particionada por 'year' e gravada com sucesso na camada Silver no MinIO!
