In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, year, explode, split

def create_spark_session() -> SparkSession:
    """Build (or reuse) the Spark session used by this Gold-layer ETL job.

    The session has Hive support enabled, registers the Delta Lake SQL
    extension and catalog, and maps the ``s3a`` and ``s3minio`` URI schemes
    to Hadoop's S3A filesystem implementation.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    s3a_filesystem = "org.apache.hadoop.fs.s3a.S3AFileSystem"
    return (
        SparkSession.builder
        .appName("ETL Gold - Match_Player_Attributes")
        .enableHiveSupport()
        # Delta Lake SQL extension and catalog implementation.
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.hadoop.fs.s3a.impl", s3a_filesystem)
        # An ``fs.<scheme>.impl`` entry must name a Hadoop FileSystem class.
        # This key previously pointed at Delta's catalog class, which is not
        # a FileSystem, so any ``s3minio://`` path would have failed to load.
        .config("spark.hadoop.fs.s3minio.impl", s3a_filesystem)
        .getOrCreate()
    )

# Start (or reuse) the Spark session for this job
spark = create_spark_session()

# MinIO connection settings, passed as options on every Delta read and write.
# The credentials can be supplied through the MINIO_ACCESS_KEY and
# MINIO_SECRET_KEY environment variables so real secrets do not have to be
# committed with the code; the fallbacks keep the original hard-coded values.
minio_delta_options = {
    "fs.s3a.access.key": os.environ.get("MINIO_ACCESS_KEY", "admin"),
    "fs.s3a.secret.key": os.environ.get("MINIO_SECRET_KEY", "senhasegura"),
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.connection.ssl.enabled": "false",
}

# Silver-zone table locations
match_path = "s3a://bucket-silver-zone/soccer/Match"
player_path = "s3a://bucket-silver-zone/soccer/player"
player_attr_path = "s3a://bucket-silver-zone/soccer/player_attributes"


def read_silver_table(path: str):
    """Load the Delta table stored at ``path`` using the MinIO options above."""
    return (
        spark.read
        .format("delta")
        .options(**minio_delta_options)
        .load(path)
    )


# Read the three Silver tables
df_match = read_silver_table(match_path)
df_player = read_silver_table(player_path)
df_player_attr = read_silver_table(player_attr_path)

# Attach a player id to every match row. Despite the "explode" wording of the
# original note, this simplified version only copies home_player_1; the other
# player columns are not used.
df_match_exploded = df_match.withColumn("player_api_id", col("home_player_1"))

# Columns carried forward from each source table
match_columns = [
    col("id").alias("match_id"),
    "season",
    "stage",
    "home_team_goal",
    "away_team_goal",
    "match_date",
    "year",
    "player_api_id",
]
player_attr_columns = ["player_api_id", "overall_rating", "potential", "preferred_foot"]
player_columns = [
    col("id").alias("player_id"),
    "player_api_id",
    col("name").alias("player_name"),
    "birth_date",
    "weight_kg",
    "height_cm",
]

df_match_selected = df_match_exploded.select(match_columns)
df_player_attr_selected = df_player_attr.select(player_attr_columns)
df_player_selected = df_player.select(player_columns)

# Left-join attributes and player details onto the match rows.
# NOTE(review): if player_attributes holds more than one row per
# player_api_id, this join repeats each match once per attribute row - confirm
# against the Silver schema.
df_join = (
    df_match_selected
    .join(df_player_attr_selected, on="player_api_id", how="left")
    .join(df_player_selected, on="player_api_id", how="left")
)

# Drop every row that has a null in any column. This also discards match rows
# that found no partner in the left joins above.
df_final = df_join.na.drop()

# Gold-zone destination
gold_path = "s3a://bucket-gold-zone/soccer/match_player_attributes"

# Overwrite the Gold Delta table, replacing its schema as well as its data.
gold_writer = (
    df_final.write
    .format("delta")
    .mode("overwrite")
    .options(overwriteSchema="true", **minio_delta_options)
)
gold_writer.save(gold_path)

print("Tabela final (JOIN completo de player + player_attributes + match) gravada com sucesso na camada Gold no MinIO!")

# Shut down the Spark session
spark.stop()


Tabela final (JOIN completo de player + player_attributes + match) gravada com sucesso na camada Gold no MinIO!
