In [0]:
import io
import requests
import pandas as pd
from pyspark.sql import functions as F

In [0]:

# Source: Our World in Data (OWID) grapher CSV export for the Human Development Index chart.
csv_url = "https://ourworldindata.org/grapher/human-development-index.csv"  # OWID HDI CSV
# Landing zone: S3 prefix the downloaded data is written to (and read back from) as Parquet.
landing_path_hdi = "s3://bucketbastet/data-suicide/hdi_owid/"               # landing (Parquet)
# Bronze layer: three-part table name later passed to saveAsTable().
bronze_table = "bronze.data_suicide.hdi_owid"                                # bronze table

In [0]:
# Download the HDI CSV; raise_for_status() aborts the cell on any 4xx/5xx response
# instead of letting an error page be parsed as CSV.
resp = requests.get(csv_url, timeout=120)
resp.raise_for_status()

# Parse the raw bytes with an explicit encoding rather than resp.text.
# resp.text decodes with whatever charset requests derives from the HTTP headers; for
# a text/* response without a charset that is ISO-8859-1, which would corrupt
# non-ASCII entity names.
# NOTE(review): assumes the OWID export is UTF-8 — confirm against the live endpoint.
pdf = pd.read_csv(io.BytesIO(resp.content), encoding="utf-8")

# Hand the pandas frame to Spark; column types are inferred from the pandas dtypes.
spark_df = spark.createDataFrame(pdf)

In [0]:
# Remove every column whose Spark dtype is "void".
# NOTE(review): presumably these are all-null columns that cannot be stored in Parquet — confirm.
void_cols = [name for name, dtype in spark_df.dtypes if dtype == "void"]
if void_cols:
    spark_df = spark_df.drop(*void_cols)

# Persist the landing copy as Parquet, replacing whatever is already at that path.
spark_df.write.mode("overwrite").parquet(landing_path_hdi)
print("Landing salvo em:", landing_path_hdi)

In [0]:
# Read the landing Parquet data back from landing_path_hdi.
df_raw = spark.read.parquet(landing_path_hdi)

# How many 'part-*' files Spark read to build this DataFrame.
arquivos_lidos = df_raw.inputFiles()
print("Arquivos lidos (part-*):", len(arquivos_lidos))

# Preview the frame as read, i.e. before any column is renamed below.
# display() is not defined in this file — presumably the notebook runtime's built-in.
display(df_raw)

# Characters that get replaced with '_' in column names.
# NOTE(review): presumably the set the target table format rejects in column names — confirm.
invalid_chars = [' ', ',', ';', '{', '}', '(', ')', '\n', '\t', '=']


def clean_col(col):
    """Return ``col`` with each character listed in ``invalid_chars`` swapped for '_'.

    Characters not in the list are kept as-is, so a name that contains none of
    them comes back unchanged.
    """
    return ''.join('_' if ch in invalid_chars else ch for ch in col)

# Pair every current column name with its cleaned form, then rename only the
# columns whose cleaned name actually differs.
column_renames = [(name, clean_col(name)) for name in df_raw.columns]
for old_name, new_name in column_renames:
    if old_name != new_name:
        df_raw = df_raw.withColumnRenamed(old_name, new_name)



In [0]:
# Save the cleaned frame as a table in the bronze catalog, replacing existing contents.
# NOTE(review): mergeSchema is combined with overwrite mode here — confirm this is the
# intended schema-evolution behaviour for a full reload of the table.
bronze_writer = df_raw.write.mode("overwrite").option("mergeSchema", "true")
bronze_writer.saveAsTable(bronze_table)
print("Bronze gravada em:", bronze_table)