In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Função para criar sessão Spark com ajuste de parser de data
def create_spark_session() -> SparkSession:
    """Build (or reuse) the Spark session for the Silver ``player_attributes`` job.

    The session is created with Hive support enabled and with the settings
    listed in ``settings`` below: the Delta Lake SQL extension and catalog,
    the S3A file system implementation registered for both the ``s3a`` and
    ``s3minio`` schemes, and the ``LEGACY`` time parser policy.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    # Applied in insertion order, one ``.config(key, value)`` call per entry.
    settings = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        "spark.hadoop.fs.s3minio.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        # Date/time parser adjustment
        "spark.sql.legacy.timeParserPolicy": "LEGACY",
    }

    builder = SparkSession.builder.appName("ETL Silver - player_attributes")
    builder = builder.enableHiveSupport()
    for key, value in settings.items():
        builder = builder.config(key, value)

    return builder.getOrCreate()

# Bronze -> Silver ETL for 'player_attributes':
#   1. read the Bronze Delta table from MinIO,
#   2. drop the Airbyte technical columns, convert 'date' to a DateType
#      column named 'attr_date', and drop rows containing nulls,
#   3. overwrite the Silver Delta table.
# The whole pipeline runs inside try/finally so the Spark session is always
# stopped, even when a read/transform/write step raises.

# Initialise Spark
spark = create_spark_session()

try:
    # MinIO connection settings, passed as options to the Delta reader/writer.
    # NOTE(review): the access/secret keys are hard-coded in source; consider
    # loading them from the environment or a secret store.
    minio_delta_options = {
        "fs.s3a.access.key": "admin",
        "fs.s3a.secret.key": "senhasegura",
        "fs.s3a.endpoint": "http://minio:9000",
        "fs.s3a.path.style.access": "true",
        "fs.s3a.connection.ssl.enabled": "false"
    }

    # Bronze path
    bronze_path = "s3a://bucket-bronze-zone/soccer/player_attributes"

    # Read the Bronze Delta Lake table
    df_bronze = (
        spark.read
        .format("delta")
        .options(**minio_delta_options)
        .load(bronze_path)
    )

    # Transformations:
    df_silver = (
        df_bronze
        # Remove the technical columns generated by Airbyte (_airbyte_*)
        .drop(*[c for c in df_bronze.columns if c.startswith("_airbyte")])
        # Create 'attr_date' by converting 'date' to DateType.
        # No format is passed, so per the PySpark docs to_date follows the
        # cast-to-date rules (equivalent to col("date").cast("date")).
        # NOTE(review): the LEGACY timeParserPolicy presumably only matters
        # for pattern-based parsing and has no effect here - confirm.
        .withColumn("attr_date", to_date(col("date")))
        # Remove the original 'date' column now that it has been converted
        .drop("date")
        # Remove rows with a null in any column; this includes rows whose
        # 'attr_date' is null.
        .dropna()
    )

    # Show the schema after the transformation
    print("\nEsquema transformado da tabela 'player_attributes' Silver:")
    df_silver.printSchema()

    # Silver path
    silver_path = "s3a://bucket-silver-zone/soccer/player_attributes"

    # Write to the Silver Delta Lake table, replacing data and schema
    (
        df_silver.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .options(**minio_delta_options)
        .save(silver_path)
    )

    # Only reached when the write above completed without raising
    print("Tabela 'player_attributes' transformada e gravada com sucesso na camada Silver no MinIO!")
finally:
    # Always release the Spark session, on success or failure
    spark.stop()



Esquema transformado da tabela 'player_attributes' Silver:
root
 |-- id: long (nullable = true)
 |-- curve: long (nullable = true)
 |-- vision: long (nullable = true)
 |-- agility: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- jumping: long (nullable = true)
 |-- marking: long (nullable = true)
 |-- stamina: long (nullable = true)
 |-- volleys: long (nullable = true)
 |-- crossing: long (nullable = true)
 |-- strength: long (nullable = true)
 |-- dribbling: long (nullable = true)
 |-- finishing: long (nullable = true)
 |-- gk_diving: long (nullable = true)
 |-- penalties: long (nullable = true)
 |-- potential: long (nullable = true)
 |-- reactions: long (nullable = true)
 |-- aggression: long (nullable = true)
 |-- gk_kicking: long (nullable = true)
 |-- long_shots: long (nullable = true)
 |-- shot_power: long (nullable = true)
 |-- gk_handling: long (nullable = true)
 |-- gk_reflexes: long (nullable = true)
 |-- positioning: long (nullable = true)
 |-- acceleration