In [0]:
# ======================================================
# 1. Função para extrair dados de API REST e converter em DataFrame Spark
# ======================================================
import requests
import json
from pyspark.sql.functions import col, lit, when

def extract_api_to_df(url, spark, timeout=30):
    """Fetch a JSON payload from a REST endpoint and load it into a Spark DataFrame.

    Args:
        url: Endpoint that is queried with an HTTP GET.
        spark: SparkSession used to parallelize the payload and read it back.
        timeout: Seconds to wait for the server before aborting the request.
            Without a timeout ``requests`` waits indefinitely, which would
            hang the notebook on an unresponsive API.

    Returns:
        The DataFrame produced by ``spark.read.json`` from the response body.

    Raises:
        Exception: If the response status code is anything other than 200.
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Erro ao acessar API {url}: {response.status_code}")

    # The decoded payload is serialized back to a single JSON string and
    # wrapped in a one-element RDD so that Spark's JSON reader can parse it.
    payload = json.dumps(response.json())
    rdd = spark.sparkContext.parallelize([payload])
    return spark.read.json(rdd)

# ======================================================
# 2. REST Countries API
# ======================================================
# NOTE(review): confirm this endpoint still returns the full payload without a
# `fields` query filter; a non-200 answer makes extract_api_to_df raise.
url_countries = "https://restcountries.com/v3.1/all"
df_raw_countries = extract_api_to_df(url_countries, spark)

# Flatten the raw payload into the columns used downstream and derive the
# population density in the same chain.
df_countries = (
    df_raw_countries
    .selectExpr(
        "cca3 as country_code",
        "name.common as country_name",
        "region",
        "subregion",
        "population",
        "area",
        "capital[0] as capital",  # first entry of the capital array
        "languages"
    )
    .withColumn(
        "population_density",
        # Left null when the area is missing or not strictly positive.
        when(
            col("area") > 0,
            col("population") / col("area")
        ).otherwise(lit(None))
    )
)

# Preview of the first rows using the Databricks table renderer.
display(df_countries.limit(10))

# ======================================================
# 3. SpaceX Launchpads API
# ======================================================
url_spacex = "https://api.spacexdata.com/v4/launchpads"
df_raw_spacex = extract_api_to_df(url_spacex, spark)

# Keep only the launchpad attributes used downstream; `id`, `name` and
# `region` are renamed with a `launchpad_` prefix.
df_spacex = df_raw_spacex.selectExpr(
    "id as launchpad_id", "name as launchpad_name",
    "locality", "region as launchpad_region", "full_name",
    "latitude", "longitude",
    "status", "launch_attempts", "launch_successes",
    "timezone",
)

# Preview of the first rows.
display(df_spacex.limit(10))

# ======================================================
# 4. Data integration (SpaceX x Countries)
# ======================================================
# Left join: every launchpad is kept, with country columns attached where the
# launchpad's region equals the country's region.
# NOTE(review): confirm that both APIs use the same vocabulary for `region`;
# if they do not, the country columns will be null for every launchpad. Also
# note that a region shared by several countries yields one output row per
# (launchpad, country) pair.
df_join = df_spacex.join(
    df_countries,
    on=df_spacex["launchpad_region"] == df_countries["region"],
    how="left",
)

# Preview a handful of columns from both sides of the join.
display(
    df_join
    .select("launchpad_name", "launchpad_region", "country_name", "population", "capital")
    .limit(10)
)

# ======================================================
# 5. Persistence on Databricks (Delta Lake or Parquet)
# ======================================================
# Each DataFrame is written in Delta format with overwrite mode to its own
# path under /mnt/datalake.
df_countries.write.save(
    "/mnt/datalake/countries_data", format="delta", mode="overwrite"
)
df_spacex.write.save(
    "/mnt/datalake/spacex_launchpads", format="delta", mode="overwrite"
)
df_join.write.save(
    "/mnt/datalake/spacex_countries_join", format="delta", mode="overwrite"
)
