# Extract raw data

In [67]:
# Welcome to your new notebook
# Type here in the cell editor to add code!
import requests
import time
import logging
from typing import List, Dict, Optional
from tenacity import retry, stop_after_attempt, wait_exponential
from datetime import datetime
from pyspark.sql.functions import max as spark_max, col, lit
from pyspark.sql.types import * 
from delta.tables import DeltaTable

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 69, Finished, Available, Finished, False)

In [68]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

BASE_URL = "https://api.spaceflightnewsapi.net/v4"
RATE_LIMIT_SECONDS = 0.3

ENDPOINTS = [
    "articles",
    "blogs",
    "reports"
]

# Definición de la estructura de datos
spaceflight_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("url", StringType(), True),
    StructField("image_url", StringType(), True),
    StructField("news_site", StringType(), True),
    StructField("summary", StringType(), True),
    StructField("published_at", StringType(), True),
    StructField("updated_at", StringType(), True),
    StructField("featured", BooleanType(), True),
    StructField("launches", ArrayType(StringType()), True),
    StructField("events", ArrayType(StringType()), True)
])

info_schema = StructType([
    StructField("version", StringType(), True),
    StructField("news_sites", ArrayType(StringType()), True)
])

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 70, Finished, Available, Finished, False)

In [69]:
# OBTENER ÚLTIMA FECHA DE EXTRACCIÓN
# ---------------------------------------------------------
def get_last_extraction_date(table_name: str) -> Optional[str]:
    """
    Obtiene la fecha máxima de published_at de la tabla Bronze.
    Retorna None si la tabla no existe o está vacía.
    """
    try:
        df = spark.table(f"bronze_{table_name}")
        last_date = df.agg(spark_max("published_at")).collect()[0][0]
        
        if last_date:
            logging.info(f"Última extracción de {table_name}: {last_date}")
            return last_date
        else:
            logging.info(f"Tabla {table_name} vacía. Extracción completa.")
            return None
    except:
        logging.info(f"Tabla bronze_{table_name} no existe. Extracción completa.")
        return None

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 71, Finished, Available, Finished, False)

In [70]:
# Cliente API
# ---------------------------------------------------------
class SpaceflightAPI:
    """Cliente con soporte para extracción incremental"""
    
    def __init__(self, base_url: str):
        self.base_url = base_url
        self.session = requests.Session()
    
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=2, max=10))
    def _request(self, url: str, params: Optional[Dict] = None) -> Dict:
        response = self.session.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    
    def fetch_paginated(self, endpoint: str, published_at_gt: Optional[str] = None):
        """
        Extrae registros con filtro opcional de fecha.
        
        Args:
            endpoint: articles, blogs, o reports
            published_at_gt: Solo extraer registros con published_at > esta fecha
        """
        records = []
        url = f"{self.base_url}/{endpoint}"
        
        # Parámetros de filtro
        params = {}
        if published_at_gt:
            params['published_at_gt'] = published_at_gt
            logging.info(f"Extracción incremental: {endpoint} desde {published_at_gt}")
        else:
            logging.info(f"Extracción completa: {endpoint}")
        
        # Primera request con params
        data = self._request(url, params=params)
        page = data.get("results", [])
        records.extend(page)
        
        total_available = data.get("count", 0)
        logging.info(f"{endpoint}: {len(page)} registros en página 1 | Total disponible: {total_available}")
        
        # Paginación
        url = data.get("next")
        page_num = 1
        
        while url:
            page_num += 1
            data = self._request(url)
            page = data.get("results", [])
            records.extend(page)
            
            logging.info(f"{endpoint}: {len(page)} registros en página {page_num} | Total extraído: {len(records)}")
            
            url = data.get("next")
            time.sleep(RATE_LIMIT_SECONDS)
        
        logging.info(f"Extracción completa {endpoint}: {len(records)} registros")
        return records
    
    def fetch_info(self) -> Dict:
        logging.info("Extrayendo metadata API")
        return self._request(f"{self.base_url}/info")


StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 72, Finished, Available, Finished, False)

In [71]:
# Normalización estructura requerida
def normalize_record(record: Dict):
    """
    Ajusta el registro al esquema definido
    """
    return {
        "id": record.get("id"),
        "title": record.get("title"),
        "url": record.get("url"),
        "image_url": record.get("image_url"),
        "news_site": record.get("news_site"),
        "summary": record.get("summary"),
        "published_at": record.get("published_at"),
        "updated_at": record.get("updated_at"),
        "featured": record.get("featured"),
        "launches": [str(x) for x in record.get("launches") or []],
        "events": [str(x) for x in record.get("events") or []]
    }

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 73, Finished, Available, Finished, False)

In [72]:
# Deduplicación
# ---------------------------------------------------------
def deduplicate(records):
    """
    Elimina duplicados usando ID como clave primaria.
    """
    unique = {}
    for r in records:
        if r.get("id") not in unique:
            unique[r["id"]] = r

    logging.info(f"Duplicados eliminados: {len(records)-len(unique)}")

    return list(unique.values())

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 74, Finished, Available, Finished, False)

In [73]:
# Función para guardar tablas
NOW = datetime.now()

def save_to_bronze(records, table_name, schema):
    """
    Guarda datos en capa bronze
    """

    if not records:
        logging.warning(f"No hay datos para {table_name}")
        return

    # Convertir a Spark DataFrame
    df = spark.createDataFrame(records, schema=schema)

    # Agregar metadata
    df = df.withColumn("ingestion_date", lit(NOW.strftime('%Y-%m-%d %H:%M:%S')))

    # Verificar si la tabla existe
    table_path = f"bronze_{table_name}"

    try:
        # Tabla existe - hacer MERGE
        delta_table = DeltaTable.forName(spark, table_path)
        
        delta_table.alias("old").merge(
            df.alias("new"),
            "old.id = new.id"  # Condición de match
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        
        print(f"MERGE en {table_path}")
        
    except:
        # Tabla no existe - crear nueva
        df.write.mode("overwrite").format("delta").saveAsTable(table_path)
        print(f"Creada {table_path}")

StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 75, Finished, Available, Finished, False)

In [74]:
# Ejecución
# ---------------------------------------------------------
api = SpaceflightAPI(BASE_URL)

# Obtener últimas fechas de extracción
last_date_articles = get_last_extraction_date("articles")
last_date_blogs = get_last_extraction_date("blogs")
last_date_reports = get_last_extraction_date("reports")

# Extraer solo datos nuevos
articles_raw = api.fetch_paginated("articles", published_at_gt=last_date_articles)
blogs_raw = api.fetch_paginated("blogs", published_at_gt=last_date_blogs)
reports_raw = api.fetch_paginated("reports", published_at_gt=last_date_reports)
info_raw = api.fetch_info()

# Deduplicar y normalizar
articles = deduplicate([normalize_record(x) for x in articles_raw])
blogs = deduplicate([normalize_record(x) for x in blogs_raw])
reports = deduplicate([normalize_record(x) for x in reports_raw])

# Guardar en capa bronze
save_to_bronze(articles, "articles", spaceflight_schema)
save_to_bronze(blogs, "blogs", spaceflight_schema)
save_to_bronze(reports, "reports", spaceflight_schema)

save_to_bronze([info_raw], "api_info", info_schema)


StatementMeta(, 51b2efbe-d492-42c1-b2b3-0b7044649d34, 76, Finished, Available, Finished, False)

2026-02-18 10:28:09,615 | INFO | Última extracción de articles: 2026-02-18T05:01:00Z
2026-02-18 10:28:10,134 | INFO | Última extracción de blogs: 2026-02-17T16:55:18Z
2026-02-18 10:28:10,666 | INFO | Última extracción de reports: 2024-09-07T04:14:32Z
2026-02-18 10:28:10,666 | INFO | Extracción incremental: articles desde 2026-02-18T05:01:00Z
2026-02-18 10:28:12,775 | INFO | articles: 0 registros en página 1 | Total disponible: 0
2026-02-18 10:28:12,776 | INFO | ✅ Extracción completa articles: 0 registros
2026-02-18 10:28:12,776 | INFO | Extracción incremental: blogs desde 2026-02-17T16:55:18Z
2026-02-18 10:28:13,864 | INFO | blogs: 0 registros en página 1 | Total disponible: 0
2026-02-18 10:28:13,864 | INFO | ✅ Extracción completa blogs: 0 registros
2026-02-18 10:28:13,865 | INFO | Extracción incremental: reports desde 2024-09-07T04:14:32Z
2026-02-18 10:28:15,001 | INFO | reports: 0 registros en página 1 | Total disponible: 0
2026-02-18 10:28:15,001 | INFO | ✅ Extracción completa repor

Creada bronze_api_info


In [1]:
# VERIFICACIÓN DE TABLAS DELTA

print("="*70)
print("VERIFICACIÓN DE TABLAS DELTA BRONZE")
print("="*70)

tablas = [
    'bronze_api_info',
    'bronze_articles', 
    'bronze_blogs',
    'bronze_reports'
]

total_registros = 0

for tabla in tablas:
    print(f"\nTabla: {tabla}")
    print("-"*70)
    
    try:
        # Leer tabla Delta
        df = spark.table(tabla)
        count = df.count()
        total_registros += count
        
        print(f" Registros: {count:,}")
        
        if count > 0:
            # Mostrar columnas
            print(f"Columnas ({len(df.columns)}): {', '.join(df.columns[:10])}...")
            
            # Mostrar muestra de datos
            print(f"\nMuestra de datos:")
            df.select('id', 'title', 'published_at', 'news_site').show(3, truncate=50)
            
            # Estadísticas por fecha de ingesta
            if 'ingestion_date' in df.columns or 'year' in df.columns:
                print(f"\nDistribución temporal:")
                if 'ingestion_date' in df.columns:
                    df.groupBy('ingestion_date').count() \
                      .orderBy('ingestion_date', ascending=False) \
                      .show(5)
                elif 'year' in df.columns and 'month' in df.columns:
                    df.groupBy('year', 'month').count() \
                      .orderBy('year', 'month', ascending=False) \
                      .show(5)
        else:
            print(f"Tabla vacía (0 registros)")
            
    except Exception as e:
        print(f"Error: {str(e)}")

print("\n" + "="*70)
print(f"TOTAL REGISTROS EN BRONZE: {total_registros:,}")
print("="*70)

if total_registros == 0:
    print("\nPROBLEMA: Las tablas están vacías")
else:
    print(f"\nDatos encontrados: {total_registros:,} registros")

StatementMeta(, c0d81875-43ae-4df5-991d-6fbf304bdf7e, 3, Finished, Available, Finished, False)

VERIFICACIÓN DE TABLAS DELTA BRONZE

Tabla: bronze_api_info
----------------------------------------------------------------------
   ✅ Registros: 1
Columnas (3): version, news_sites, ingestion_date...

Muestra de datos:
Error: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `id` cannot be resolved. Did you mean one of the following? [`version`, `news_sites`, `ingestion_date`].;
'Project ['id, 'title, 'published_at, 'news_site]
+- SubqueryAlias spark_catalog.chimcobldhq2asrgc5hmapjcd5jmgt15edo62or5cpm6ipr8ehfmoobbclk6utbjckim8ojf.bronze_api_info
   +- Relation spark_catalog.chimcobldhq2asrgc5hmapjcd5jmgt15edo62or5cpm6ipr8ehfmoobbclk6utbjckim8ojf.bronze_api_info[version#756,news_sites#757,ingestion_date#758] parquet


Tabla: bronze_articles
----------------------------------------------------------------------
   ✅ Registros: 32,141
Columnas (12): id, title, url, image_url, news_site, summary, published_at, updated_at, featured, launches...

Muestra de datos