In [0]:
import json
from pathlib import Path
import pandas as pd

In [0]:

# Expose the raw landing directory as a notebook widget so it can be overridden per run.
# NOTE(review): the default path ends in "tvmze" while the files and table are named
# "tvmaze" — confirm this is the real volume name and not a typo.
RAW_ROOT_WIDGET = "raw_root"
dbutils.widgets.text(RAW_ROOT_WIDGET, "/Volumes/workspace/raw/tvmze", "Raw Root")

# Read the widget value back into Python as a Path for the file discovery cell.
raw_root = Path(dbutils.widgets.get(RAW_ROOT_WIDGET))

In [0]:
%sql
-- WARNING: drops the whole workspace.bronze schema; CASCADE also removes every object
-- inside it, not only the tvmaze table that this notebook rebuilds.
-- NOTE(review): confirm no other bronze tables are expected to survive a run of this notebook.
DROP DATABASE IF EXISTS workspace.bronze CASCADE; 

In [0]:
%sql
-- Create the bronze schema; IF NOT EXISTS makes this a no-op when it is already present.
-- The COMMENT literal is Spanish for "Bronze layer: processed raw data".
CREATE DATABASE IF NOT EXISTS workspace.bronze
COMMENT 'Capa Bronze: datos crudos procesados'

In [0]:
# Remove any previous tvmaze table so the CREATE TABLE cell starts from a clean slate.
# NOTE(review): an earlier cell already drops workspace.bronze with CASCADE, so this
# looks redundant on a top-to-bottom run — confirm before removing.
spark.sql("DROP TABLE IF EXISTS workspace.bronze.tvmaze")

In [0]:

# DDL for the bronze TVMaze table. Column names follow an underscore-joined path
# convention (e.g. embedded_show_network_name).
# NOTE(review): presumably these mirror the flattened TVMaze JSON payload — confirm
# against the raw files.
tvmaze_bronze_ddl = """
CREATE TABLE IF NOT EXISTS workspace.bronze.tvmaze (
  id BIGINT,
  url STRING,
  name STRING,
  season BIGINT,
  number DOUBLE,
  type STRING,
  airdate STRING,
  airtime STRING,
  airstamp STRING,
  runtime DOUBLE,
  image DOUBLE,
  summary STRING,
  rating_average DOUBLE,
  links_self_href STRING,
  links_show_href STRING,
  links_show_name STRING,
  embedded_show_id BIGINT,
  embedded_show_url STRING,
  embedded_show_name STRING,
  embedded_show_type STRING,
  embedded_show_language STRING,
  embedded_show_genres ARRAY<STRING>,
  embedded_show_status STRING,
  embedded_show_runtime DOUBLE,
  embedded_show_averageRuntime DOUBLE,
  embedded_show_premiered STRING,
  embedded_show_ended STRING,
  embedded_show_officialSite STRING,
  embedded_show_schedule_time STRING,
  embedded_show_schedule_days ARRAY<STRING>,
  embedded_show_rating_average DOUBLE,
  embedded_show_weight BIGINT,
  embedded_show_webChannel_id DOUBLE,
  embedded_show_webChannel_name STRING,
  embedded_show_webChannel_country_name STRING,
  embedded_show_webChannel_country_code STRING,
  embedded_show_webChannel_country_timezone STRING,
  embedded_show_webChannel_officialSite STRING,
  embedded_show_dvdCountry_name STRING,
  embedded_show_dvdCountry_code STRING,
  embedded_show_dvdCountry_timezone STRING,
  embedded_show_externals_tvrage DOUBLE,
  embedded_show_externals_thetvdb DOUBLE,
  embedded_show_externals_imdb STRING,
  embedded_show_image_medium STRING,
  embedded_show_image_original STRING,
  embedded_show_summary STRING,
  embedded_show_updated BIGINT,
  embedded_show_links_self_href STRING,
  embedded_show_links_previousepisode_href STRING,
  embedded_show_links_previousepisode_name STRING,
  image_medium STRING,
  image_original STRING,
  embedded_show_links_nextepisode_href STRING,
  embedded_show_links_nextepisode_name STRING,
  embedded_show_network_id DOUBLE,
  embedded_show_network_name STRING,
  embedded_show_network_country_name STRING,
  embedded_show_network_country_code STRING,
  embedded_show_network_country_timezone STRING,
  embedded_show_network_officialSite STRING
)

"""

# Run the DDL; IF NOT EXISTS leaves an existing table untouched.
spark.sql(tvmaze_bronze_ddl)

In [0]:
# Load every raw tvmaze.json dump found under raw_root (at any depth) and flatten the
# records into a single Spark DataFrame whose column names follow the convention used
# by the workspace.bronze.tvmaze table declared earlier in this notebook.
#
# Files are read on the driver with plain Python; sorting keeps the load order
# deterministic across runs.
candidate_files = sorted(raw_root.glob("**/tvmaze.json"))
if not candidate_files:
    # Fail early with a clear message instead of an opaque schema-inference error later.
    raise FileNotFoundError(f"No tvmaze.json files found under {raw_root}")

records: list[dict] = []
for json_file in candidate_files:
    payload = json.loads(json_file.read_text(encoding="utf-8"))
    # Each file is expected to hold a JSON array of records. Extending with a dict
    # would silently add its keys as records, so reject anything that is not a list.
    if not isinstance(payload, list):
        raise TypeError(
            f"{json_file}: expected a JSON array of records, got {type(payload).__name__}"
        )
    records.extend(payload)

if not records:
    raise ValueError(
        f"Found {len(candidate_files)} tvmaze.json file(s) under {raw_root}, "
        "but none contained any records"
    )

df = pd.json_normalize(records)

# json_normalize joins nested keys with "." and keeps leading underscores
# (e.g. "_embedded.show._links.self.href"). The table declares underscore-joined names
# without those prefixes (e.g. "embedded_show_links_self_href"), so rewrite each path
# segment accordingly.
flat_names = [
    "_".join(segment.lstrip("_") for segment in str(column).split("."))
    for column in df.columns
]
if len(set(flat_names)) != len(flat_names):
    duplicated = sorted({name for name in flat_names if flat_names.count(name) > 1})
    raise ValueError(f"Flattened column names collide: {duplicated}")
df.columns = flat_names

df_spark = spark.createDataFrame(df)



In [0]:
# Overwrite mode replaces the table's existing data completely with this run's DataFrame.
# The mergeSchema option is passed through to the Delta writer unchanged.
(
    df_spark.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("workspace.bronze.tvmaze")
)