In [0]:
import requests
import pandas as pd
import pyspark.sql.functions as F

In [0]:
# Fetch the SDGSUICIDE records from the GHO API (server-side filtered to
# TimeDimensionBegin >= 2000-01-01) and land them as parquet on S3.
url = (
    "https://ghoapi.azureedge.net/api/SDGSUICIDE"
    "?$filter=date(TimeDimensionBegin) ge 2000-01-01"
)
response = requests.get(url, timeout=60)
# Fail fast on HTTP errors instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()["value"]
if not data:
    # Abort before the overwrite below can replace the landing data with nothing.
    raise ValueError(f"No records returned by {url}")
raw_df = pd.json_normalize(data)

spark_df = spark.createDataFrame(raw_df)

# Remove columns whose Spark type is "void" before writing
# (presumably because the parquet writer rejects them -- confirm).
columns_to_drop = [col for col, dtype in spark_df.dtypes if dtype == "void"]
spark_df = spark_df.drop(*columns_to_drop)

# Stamp every row with the time of this load.
spark_df = spark_df.withColumn("ingestion_ts", F.current_timestamp())

# Each run fully replaces the previous landing snapshot.
(
    spark_df.write
    .mode("overwrite")
    .parquet("s3://bucketbastet/data-suicide/who")
)

In [0]:
# Make sure the target schema exists before any table is written into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze.data_suicide")

In [0]:
# Read the parquet files from the landing location.
landing_path = "s3://bucketbastet/data-suicide/who"
df_raw = spark.read.parquet(landing_path)

# Drop void-typed columns, if there are any.
void_cols = [name for name, dtype in df_raw.dtypes if dtype == "void"]
if void_cols:
    df_bronze = df_raw.drop(*void_cols)
else:
    df_bronze = df_raw

# Preview the data.
display(df_bronze)

# Report how many data files back the DataFrame that was read.
arquivos_lidos = df_raw.inputFiles()
print("Total de arquivos lidos:", len(arquivos_lidos))

# Persist as the bronze table, replacing any existing contents.
# NOTE(review): "mergeSchema" is combined with overwrite mode here -- confirm
# whether "overwriteSchema" was the intended option.
bronze_writer = df_bronze.write.mode("overwrite").option("mergeSchema", "true")
bronze_writer.saveAsTable("bronze.data_suicide.who_suicide_data")


In [0]:
# Show the column names and types of the bronze table that was just written.
table_description = spark.sql("describe bronze.data_suicide.who_suicide_data")
display(table_description)