In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_extract

# Get the active Spark session, or create one, for the silver-layer job.
spark = (
    SparkSession.builder
    .appName("BreweryDataSilver")
    .getOrCreate()
)

# Input comes from the bronze layer; output goes to the silver layer.
bronze_path = "/datalake/bronze/breweries/spark/"
silver_path = "/datalake/silver/breweries/"

df = spark.read.json(bronze_path)

# --- Cleaning rules, each written as a standalone column expression ---

# A missing brewery type is replaced by the literal "unknown".
brewery_type_expr = when(
    col("brewery_type").isNull(), "unknown"
).otherwise(col("brewery_type"))

# Keep the state only when the whole value is exactly two uppercase letters.
# NOTE(review): regexp_extract returns an empty string when the pattern does
# not match, so any other value (e.g. a full state name) becomes "" here and
# is then used as a partition value below -- confirm this is intended.
state_expr = regexp_extract(col("state"), "^[A-Z]{2}$", 0)

# A missing country is filled with "United States".
country_default_expr = when(
    col("country").isNull(), "United States"
).otherwise(col("country"))

# Standardize country names: known US spellings collapse to "United States".
us_aliases = ["US", "USA", "United States"]
country_standard_expr = when(
    col("country").isin(us_aliases), "United States"
).otherwise(col("country"))

# Apply the rules one after another; order matters because the second
# "country" rule reads the column produced by the first one.
cleaned_df = df
for column_name, expression in (
    ("brewery_type", brewery_type_expr),
    ("state", state_expr),
    ("country", country_default_expr),
    ("country", country_standard_expr),
):
    cleaned_df = cleaned_df.withColumn(column_name, expression)

# Write the result as parquet, partitioned by country and state,
# replacing whatever was previously stored at the target path.
output_path = silver_path + "breweries_cleaned.parquet"
partitioned_writer = cleaned_df.write.mode("overwrite").partitionBy("country", "state")
partitioned_writer.parquet(output_path)

print("Dados transformados e salvos na camada prata")