# Silver Layer - Limpeza e Transformação

Este notebook realiza a limpeza, validação e transformação dos dados brutos.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType
import os

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Silver Layer")
    .getOrCreate()
)

# Load the raw Bronze dataset that the rest of the notebook cleans.
bronze_path = "data/bronze/dados_brutos.parquet"
df = spark.read.parquet(bronze_path)

# count() is a Spark action: it scans the input to report the starting row count.
bronze_rows = df.count()
print(f"Dados Bronze: {bronze_rows} linhas")


## Removendo duplicatas


In [None]:
# Drop rows that are exact duplicates across every column.
df_dedup = df.dropDuplicates()

# Each count() is a separate Spark action; the difference is the number of
# duplicate rows that were discarded.
rows_before = df.count()
rows_after = df_dedup.count()
duplicatas_removidas = rows_before - rows_after
print(f"Duplicatas removidas: {duplicatas_removidas}")

# Per-column null tally: when() yields a non-null literal only for null cells,
# and count() ignores nulls, so each aggregate counts exactly the null cells.
print("\nValores nulos:")
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_dedup.columns
]
df_dedup.select(null_counts).show()


## Tratando valores nulos e padronizando dados


In [None]:
# Replace missing values with explicit placeholders.
# NOTE(review): fillna only applies a replacement whose type matches the
# column, so the integer 0 assumes CustomerID is numeric - confirm against the
# Bronze schema.
df_clean = df_dedup.fillna({
    'CustomerID': 0,
    'Description': 'PRODUTO SEM DESCRICAO'
})

# Parse InvoiceDate into a timestamp. Single-letter fields (M, d, H) accept
# both unpadded and zero-padded values ("12/1/2010 8:26" and
# "12/01/2010 08:26"), whereas "MM/dd/yyyy HH:mm" requires two digits per field
# and fails on unpadded input with the Spark 3 parser.
df_clean = df_clean.withColumn("InvoiceDate", to_timestamp(col("InvoiceDate"), "M/d/yyyy H:mm"))

# Normalize product descriptions: upper-case, then strip surrounding whitespace.
df_clean = df_clean.withColumn("Description", trim(upper(col("Description"))))

# Keep only rows with a non-negative unit price. Rows whose UnitPrice is null
# are dropped as well, because the comparison evaluates to null rather than true.
df_clean = df_clean.filter(col("UnitPrice") >= 0)


## Criando colunas calculadas e salvando Silver Layer


In [None]:
# Flag returns: a negative Quantity marks the row as a return. A null Quantity
# falls through to otherwise() and is flagged False.
df_clean = df_clean.withColumn("eh_devolucao", when(col("Quantity") < 0, True).otherwise(False))

# Line total; negative for returns because Quantity is negative there.
df_clean = df_clean.withColumn("valor_total", col("Quantity") * col("UnitPrice"))

# Record when this Silver run processed the row.
df_clean = df_clean.withColumn("data_processamento", current_timestamp())

output_path = "data/silver/dados_limpos.parquet"

# Derive the directory from output_path so the two locations cannot drift apart.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_clean.write \
    .mode("overwrite") \
    .parquet(output_path)

# Count the rows from the written Parquet instead of calling df_clean.count():
# that would re-run the whole pipeline (read, dedup, transforms) only to print
# a number, while this reports what was actually saved.
saved_rows = spark.read.parquet(output_path).count()
print(f"\nDados Silver salvos: {saved_rows} linhas")

spark.stop()
