# **Silver Layer** - Transformação e Limpeza da Tabela Cities


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, trim, when
from delta import *

# -------------------------------------------
# 2. Read the data from the RAW (Bronze) layer
# -------------------------------------------
# NOTE: `spark` is not created in this cell; it is expected to be provided
# by the notebook runtime.
df_cities_raw = spark.table("Bronze_Data.cities.raw_cities_data")

# -------------------------------------------
# 3. Text normalization and types
# -------------------------------------------
# Cast the key/code columns to string, lower-case them and strip
# surrounding whitespace so that equal values compare equal.
for col_name in ["store_id", "storetype_id", "city_code", "country_id"]:
    df_cities_raw = df_cities_raw.withColumn(col_name, trim(lower(col(col_name).cast("string"))))

# -------------------------------------------
# 4. Data cleaning
# -------------------------------------------
# store_size must be positive: rows whose float-cast value is not > 0 get NULL.
store_size_as_float = col("store_size").cast("float")
df_cities_clean = df_cities_raw.withColumn(
    "store_size",
    when(store_size_as_float > 0, store_size_as_float).otherwise(None),
)

# Drop rows without a usable store_id (primary key). After the trim above a
# whitespace-only id becomes "", which a null check alone would let through,
# so empty strings are rejected as well.
df_cities_clean = df_cities_clean.filter(
    col("store_id").isNotNull() & (col("store_id") != "")
)

# Keep a single row per store_id.
# NOTE(review): dropDuplicates does not guarantee WHICH of the duplicate rows
# survives; if a specific occurrence must be kept, rank the rows with a window
# over an explicit ordering column instead.
df_cities_clean = df_cities_clean.dropDuplicates(["store_id"])

StatementMeta(, ade0afec-cee4-41cf-8ca6-617a65b17a17, 3, Finished, Available, Finished)

In [2]:
from pyspark.sql.functions import col, when

# Mapping of incorrect/corrupted city names to correct ones.
# This dict is the single source of truth: the replacement expression below
# is generated from it, so adding a new correction only requires a new entry.
corrections = {
    'eski?ehir': 'eskisehir',
    '?zmir': 'izmir',
    'adapazar?': 'adapazari',
    'diyarbak?r': 'diyarbakir',
    'sanl?urfa': 'sanliurfa',
}

# Build one when(...).when(...)...otherwise(...) chain from the mapping:
# the first entry starts the chain, every further entry appends a branch.
city_code = col("city_code")
fixed_city_code = None
for corrupted, fixed in corrections.items():
    if fixed_city_code is None:
        fixed_city_code = when(city_code == corrupted, fixed)
    else:
        fixed_city_code = fixed_city_code.when(city_code == corrupted, fixed)

# Values with no entry in the mapping are kept unchanged (an empty mapping
# leaves the column as it is).
fixed_city_code = city_code if fixed_city_code is None else fixed_city_code.otherwise(city_code)

df_cities_clean = df_cities_clean.withColumn("city_code", fixed_city_code)

StatementMeta(, ade0afec-cee4-41cf-8ca6-617a65b17a17, 4, Finished, Available, Finished)

In [3]:
# # -------------------------------------------
# # 5. Visualização final para verificação
# # -------------------------------------------
# df_cities_clean.printSchema()
# df_cities_clean.show(100)

# print("\n✅ Silver Layer - Cities concluída com sucesso.")


StatementMeta(, ade0afec-cee4-41cf-8ca6-617a65b17a17, 5, Finished, Available, Finished)

root
 |-- store_id: string (nullable = true)
 |-- storetype_id: string (nullable = true)
 |-- store_size: float (nullable = true)
 |-- city_id_old: string (nullable = true)
 |-- country_id: string (nullable = true)
 |-- city_code: string (nullable = true)

+--------+------------+----------+-----------+----------+-------------+
|store_id|storetype_id|store_size|city_id_old|country_id|    city_code|
+--------+------------+----------+-----------+----------+-------------+
|   s0002|        st04|      39.0|       C007|    turkey|        adana|
|   s0003|        st03|      17.0|       C014|    turkey|     istanbul|
|   s0005|        st04|      19.0|       C001|    turkey|      denizli|
|   s0007|        st03|      16.0|       C014|    turkey|     istanbul|
|   s0010|        st04|      17.0|       C014|    turkey|     istanbul|
|   s0012|        st04|      28.0|       C005|    turkey|      antalya|
|   s0014|        st03|      14.0|       C014|    turkey|     istanbul|
|   s0015|        st04|

In [4]:
# -------------------------------------------
# 6. Write the cleaned data to the SILVER layer (Delta)
# -------------------------------------------
# Overwrite mode: the target table is replaced on every run.
(
    df_cities_clean.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("Silver_Data.cities.silver_cities_data")
)

StatementMeta(, ade0afec-cee4-41cf-8ca6-617a65b17a17, 6, Finished, Available, Finished)