In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Silver-layer inputs shared by the gold tables built in this notebook.
# df_suicide supplies sex, country_code, region, year and rate_per_100k;
# df_hdi supplies country_code_iso3, country_name, year and hdi.
df_suicide = spark.read.table("silver.data_suicide.who_suicide_data")
df_hdi = spark.read.table("silver.data_suicide.hdi_owid")

In [0]:
# Gold 1 (base): yearly suicide rate per country paired with that country's
# HDI for the same year. Only the aggregate rows (sex == "both") are kept and
# the inner join drops any country/year without a matching HDI record.
gold_suicide_hdi = (
    df_suicide
    .filter(F.col("sex") == "both")  # overall total across sexes
    .join(
        df_hdi,
        (df_suicide.country_code == df_hdi.country_code_iso3)
        & (df_suicide.year == df_hdi.year),
        "inner",
    )
    .select(
        df_suicide.country_code,
        df_hdi.country_name,
        df_suicide.region,
        df_suicide.year,
        df_suicide.rate_per_100k.alias("suicide_rate"),
        df_hdi.hdi,
        # HDI bands. A NULL hdi fails every comparison, so "Low" is assigned
        # only when a value is actually present; with no final otherwise()
        # a missing HDI yields a NULL category instead of being reported as "Low".
        F.when(df_hdi.hdi >= 0.8, "Very High")
         .when(df_hdi.hdi >= 0.7, "High")
         .when(df_hdi.hdi >= 0.55, "Medium")
         .when(df_hdi.hdi.isNotNull(), "Low")
         .alias("hdi_category"),
    )
    # First year of the decade the row belongs to (e.g. 2017 -> 2010).
    .withColumn("decade", (F.col("year") / 10).cast("int") * 10)
)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Per-country window ordered by year. The cast lets the range frame below work
# even if `year` is not stored as an integer type.
window_country = Window.partitionBy("country_code").orderBy(F.col("year").cast("int"))

# Frame containing only the row whose year is exactly (current year - 1).
# A plain lag() would return the previous *row*, which is an older year
# whenever a country has gaps in its series.
window_prev_year = window_country.rangeBetween(-1, -1)

gold_suicide_hdi = (
    gold_suicide_hdi
    # NULL when the country has no record for the preceding calendar year.
    .withColumn("prev_year_rate", F.max("suicide_rate").over(window_prev_year))
    .withColumn(
        "rate_change_pct",
        # Year-over-year change in percent; NULL when there is no previous
        # value or when it is zero (division by zero).
        F.when(
            (F.col("prev_year_rate").isNotNull()) & (F.col("prev_year_rate") != 0),
            (F.col("suicide_rate") - F.col("prev_year_rate")) / F.col("prev_year_rate") * 100
        ).otherwise(F.lit(None))
    )
    .withColumn(
        "trend",
        # +/-5% band around zero counts as stable. The label is left NULL when
        # the change itself is unknown, rather than reporting it as "Stable".
        F.when(F.col("rate_change_pct") > 5, "Increasing")
         .when(F.col("rate_change_pct") < -5, "Decreasing")
         .when(F.col("rate_change_pct").isNotNull(), "Stable")
    )
)

# Persist the enriched table; the schema is created on first run.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold.data_suicide")
(
    gold_suicide_hdi.write
    .mode("overwrite")
    .saveAsTable("gold.data_suicide.suicide_hdi_analysis")
)

In [0]:
# Gold 2: male vs. female suicide rates per country/year, with HDI attached.

# Per-sex rate series. They are built here because nothing earlier in the
# notebook defines them, so the cell would otherwise fail on a fresh kernel.
# NOTE(review): assumes the `sex` column encodes the sexes as "male" and
# "female" (mirroring the "both" value used elsewhere in this notebook) —
# confirm against the silver table.
df_male = (
    df_suicide
    .filter(F.col("sex") == "male")
    .select("country_code", "year", F.col("rate_per_100k").alias("male_rate"))
)
df_female = (
    df_suicide
    .filter(F.col("sex") == "female")
    .select("country_code", "year", F.col("rate_per_100k").alias("female_rate"))
)

# Rename year in df_hdi to avoid ambiguity
df_hdi_renamed = df_hdi.withColumnRenamed("year", "hdi_year")

gold_gender = (
    df_male.join(df_female, ["country_code", "year"], "inner")
    .join(
        df_hdi_renamed.select(
            F.col("country_code_iso3"),
            F.col("country_name"),
            F.col("hdi_year"),
            F.col("hdi")
        ),
        (df_male.country_code == F.col("country_code_iso3")) &
        (df_male.year == F.col("hdi_year")),
        "inner"
    )
    .select(
        F.col("country_code"),
        F.col("country_name"),
        F.col("year"),
        F.col("male_rate"),
        F.col("female_rate"),
        # Male-to-female ratio; NULL when the female rate is zero or missing.
        F.when(
            F.col("female_rate") != 0,
            F.col("male_rate") / F.col("female_rate")
        ).otherwise(None).alias("gender_ratio"),
        (F.col("male_rate") - F.col("female_rate")).alias("gender_gap"),
        F.col("hdi")
    )
    .withColumn(
        "gap_category",
        # "Low Gap" is assigned only when a ratio exists; an undefined ratio
        # (female rate 0 or NULL) yields a NULL category instead of "Low Gap".
        F.when(F.col("gender_ratio") > 4, "Very High Gap")
         .when(F.col("gender_ratio") > 3, "High Gap")
         .when(F.col("gender_ratio") > 2, "Moderate Gap")
         .when(F.col("gender_ratio").isNotNull(), "Low Gap")
    )
)

(
    gold_gender.write
    .mode("overwrite")
    .saveAsTable("gold.data_suicide.gender_disparity_analysis")
)

In [0]:
# ============================================================================
# GOLD 3: RANKING AND REGIONAL COMPARISON
# ============================================================================

# Highest rate gets rank 1. country_code is a secondary sort key so that
# countries with identical rates receive the same rank on every run;
# row_number() over tied rows is otherwise arbitrary.
window_year_region = (
    Window.partitionBy("year", "region")
    .orderBy(F.desc("suicide_rate"), F.asc("country_code"))
)
window_year_global = (
    Window.partitionBy("year")
    .orderBy(F.desc("suicide_rate"), F.asc("country_code"))
)

gold_rankings = (
    df_suicide
    .filter(F.col("sex") == "both")
    # Left join: countries without an HDI record are kept, with a NULL country_name.
    .join(df_hdi.select("country_code_iso3", "country_name", "year"),
          (df_suicide.country_code == df_hdi.country_code_iso3) & 
          (df_suicide.year == df_hdi.year),
          "left")
    .select(
        df_suicide.country_code,
        "country_name",
        df_suicide.region,
        df_suicide.year,
        df_suicide.rate_per_100k.alias("suicide_rate")
    )
    .withColumn("rank_in_region", F.row_number().over(window_year_region))
    .withColumn("rank_global", F.row_number().over(window_year_global))
)

# Compute regional averages per year
regional_avg = (
    gold_rankings
    .groupBy("region", "year")
    .agg(
        F.avg("suicide_rate").alias("regional_avg_rate"),
        F.count("country_code").alias("countries_in_region")
    )
)

# Attach the regional average to every country row and express the country's
# rate as a difference from it (positive = above the regional average).
gold_rankings = gold_rankings.join(
    regional_avg, 
    ["region", "year"], 
    "left"
).withColumn(
    "vs_regional_avg",
    F.col("suicide_rate") - F.col("regional_avg_rate")
)

(
    gold_rankings.write
    .mode("overwrite")
    .saveAsTable("gold.data_suicide.country_rankings")
)

In [0]:
# ============================================================================
# GOLD 4: CORRELATIONS AND FACTORS (drug data integrated when available)
# ============================================================================

# This table is only built once the silver drug-indicators table exists.
# Availability is checked explicitly instead of wrapping the whole cell in a
# blanket `except Exception`, so genuine failures (join/column errors, write
# errors) surface rather than being reported as "table not available".
DRUGS_TABLE = "silver.data_suicide.drug_indicators_wide"

if spark.catalog.tableExists(DRUGS_TABLE):
    df_drugs = spark.table(DRUGS_TABLE)

    gold_correlations = (
        gold_suicide_hdi
        # Left join: keep every suicide/HDI row even without drug indicators.
        .join(df_drugs,
              (gold_suicide_hdi.country_code == df_drugs.country_code) &
              (gold_suicide_hdi.year == df_drugs.year),
              "left")
        .select(
            gold_suicide_hdi.country_code,
            gold_suicide_hdi.country_name,
            gold_suicide_hdi.year,
            gold_suicide_hdi.suicide_rate,
            gold_suicide_hdi.hdi,
            F.col("treatment_coverage"),
            F.col("capacity_index"),
            F.col("pwid_pop")
        )
    )

    (
        gold_correlations.write
        .mode("overwrite")
        .saveAsTable("gold.data_suicide.multifactor_correlation")
    )
    print("✅ Tabela de correlações criada")
else:
    print(f"⚠️ Tabela de drogas não disponível ainda: {DRUGS_TABLE}")

In [0]:
# ============================================================================
# GOLD 5: DASHBOARD AGGREGATIONS - GLOBAL METRICS
# ============================================================================

# Restrict to the combined-sex rows so each country contributes once per year.
both_sexes_rates = df_suicide.where(F.col("sex") == "both")

# One output column per summary statistic of the yearly rate distribution.
yearly_metric_exprs = [
    F.avg("rate_per_100k").alias("global_avg_rate"),
    F.max("rate_per_100k").alias("global_max_rate"),
    F.min("rate_per_100k").alias("global_min_rate"),
    F.stddev("rate_per_100k").alias("global_std_rate"),
    F.count("country_code").alias("countries_reporting"),
]

gold_global_metrics = (
    both_sexes_rates
    .groupBy("year")
    .agg(*yearly_metric_exprs)
    .orderBy("year")
)

gold_global_metrics.write.mode("overwrite").saveAsTable(
    "gold.data_suicide.global_metrics"
)

In [0]:
# ============================================================================
# GOLD 6: SIGNIFICANT CHANGE ANALYSIS (ALERTS)
# ============================================================================

# Flag countries with abrupt changes (>20% in one year or >50% over 5 years).
# The cast lets the range frame below work even if `year` is not stored as an
# integer type.
window_country = Window.partitionBy("country_code").orderBy(F.col("year").cast("int"))

# Frame containing only the row whose year is exactly (current year - 5).
# lag(5) would return the fifth previous *row*, which is more than five years
# back whenever a country has gaps in its series.
window_5y_ago = window_country.rangeBetween(-5, -5)

gold_alerts = (
    gold_suicide_hdi
    # NULL when the country has no record for the year five years earlier.
    .withColumn("rate_5y_ago", F.max("suicide_rate").over(window_5y_ago))
    .withColumn(
        "change_5y_pct",
        # Five-year change in percent; NULL when the baseline is missing or zero.
        F.when(
            (F.col("rate_5y_ago").isNotNull()) & (F.col("rate_5y_ago") != 0),
            (F.col("suicide_rate") - F.col("rate_5y_ago")) / F.col("rate_5y_ago") * 100
        ).otherwise(None)
    )
    # Keep only rows that breach at least one threshold; rate_change_pct is the
    # one-year change already present on gold_suicide_hdi.
    .filter(
        (F.abs(F.col("rate_change_pct")) > 20) |
        (F.abs(F.col("change_5y_pct")) > 50)
    )
    .select(
        "country_code",
        "country_name",
        "year",
        "suicide_rate",
        "rate_change_pct",
        "change_5y_pct",
        "hdi",
        # One-year alerts take precedence over five-year alerts. Every row that
        # passed the filter above matches one of these four branches.
        F.when(F.col("rate_change_pct") > 20, "Sharp Increase (1y)")
         .when(F.col("rate_change_pct") < -20, "Sharp Decrease (1y)")
         .when(F.col("change_5y_pct") > 50, "Major Increase (5y)")
         .when(F.col("change_5y_pct") < -50, "Major Decrease (5y)")
         .alias("alert_type")
    )
)

(
    gold_alerts.write
    .mode("overwrite")
    .saveAsTable("gold.data_suicide.significant_changes_alerts")
)

In [0]:


# ============================================================================
# TABLE DOCUMENTATION
# ============================================================================

# Table name (within gold.data_suicide) -> comment stored on the table.
# multifactor_correlation is not listed because it is only created when the
# drug data is available.
table_comments = {
    "suicide_hdi_analysis":
        "Análise temporal de suicídio correlacionada com HDI, incluindo tendências e categorização",
    "gender_disparity_analysis":
        "Análise comparativa entre taxas de suicídio masculina e feminina, com gaps e ratios",
    "country_rankings":
        "Rankings de países por taxa de suicídio (regional e global) com comparação vs. média regional",
    "global_metrics":
        "Métricas agregadas globais por ano para uso em dashboards executivos",
    "significant_changes_alerts":
        "Alertas de mudanças significativas (>20% em 1 ano ou >50% em 5 anos)",
}

# Issue one COMMENT ON TABLE statement per entry, in the order listed above.
for table_name, description in table_comments.items():
    spark.sql(
        f"\nCOMMENT ON TABLE gold.data_suicide.{table_name} IS\n'{description}'\n"
    )

# Final run summary, printed as a single block.
banner = "=" * 80
summary_lines = [
    banner,
    "✅ CAMADA GOLD CRIADA COM SUCESSO!",
    banner,
    "\nTabelas criadas:",
    "1. gold.data_suicide.suicide_hdi_analysis",
    "2. gold.data_suicide.gender_disparity_analysis",
    "3. gold.data_suicide.country_rankings",
    "4. gold.data_suicide.multifactor_correlation (se dados de drogas disponíveis)",
    "5. gold.data_suicide.global_metrics",
    "6. gold.data_suicide.significant_changes_alerts",
]
print("\n".join(summary_lines))