In [1]:
import pandas as pd
from lxml import etree
from pathlib import Path
import random
import csv

In [2]:
def count_safetyreports(xml_file):
    """Return the number of <safetyreport> elements found in ``xml_file``.

    The count is obtained by consuming ``iter_safetyreports``, which parses
    the file incrementally with the same parser options this function used
    before (end events, tag="safetyreport", recover=True, huge_tree=True) and
    releases every element once it has been handed out.  Counting therefore
    no longer keeps all already-parsed reports alive in memory, which the
    previous bare ``iterparse`` loop did.

    Parameters
    ----------
    xml_file : str or path-like
        Path of the XML file to scan.

    Returns
    -------
    int
        Number of <safetyreport> elements seen.
    """
    # The generator clears each element right after it is yielded, so memory
    # use stays bounded regardless of file size.
    return sum(1 for _ in iter_safetyreports(xml_file))

def iter_safetyreports(xml_file):
    """Yield each <safetyreport> element of ``xml_file``, one at a time.

    The file is parsed incrementally on "end" events restricted to the
    ``safetyreport`` tag, with ``recover=True`` and ``huge_tree=True`` passed
    to the parser.  After the consumer is done with an element (i.e. when the
    generator is resumed), the element is cleared and every sibling that
    precedes it is removed from its parent.

    The yielded element is only valid until the next iteration step; callers
    must extract what they need before advancing the generator.
    """
    parsed_reports = etree.iterparse(
        str(xml_file),
        events=("end",),
        tag="safetyreport",
        recover=True,
        huge_tree=True,
    )
    for _event, report in parsed_reports:
        yield report
        # Drop the report's own content first ...
        report.clear()
        # ... then remove the already-processed siblings in front of it.
        container = report.getparent()
        while report.getprevious() is not None:
            del container[0]

def get_text(elem, path):
    """Return the stripped text of the first node matching ``path``, or None.

    Parameters
    ----------
    elem : element
        Any object exposing ``find(path)`` whose result has a ``text``
        attribute (e.g. an lxml / ElementTree element).
    path : str
        Path expression passed unchanged to ``elem.find``.

    Returns
    -------
    str or None
        The node text with surrounding whitespace removed.  ``None`` is
        returned when no node matches, when the node has no text, or when the
        text consists only of whitespace, so callers can rely on a single
        "missing value" marker instead of also having to handle ``""``.
    """
    node = elem.find(path)
    if node is None or not node.text:
        return None
    # A whitespace-only value strips down to "", which is reported as missing.
    return node.text.strip() or None

def reservoir_sample_ids(xml_files, k, seed):
    """Draw a reservoir sample of up to ``k`` safetyreport IDs.

    Every file in ``xml_files`` is streamed once through
    ``iter_safetyreports``.  Reports whose ``safetyreportid`` lookup returns
    None are skipped and not counted.  The first ``k`` IDs fill the
    reservoir; for each later ID a random index below the running count is
    drawn, and the ID replaces the reservoir entry at that index when the
    index is below ``k``.

    Parameters
    ----------
    xml_files : iterable
        Files to read; each item must expose a ``name`` attribute (used for
        the progress message) and be accepted by ``iter_safetyreports``.
    k : int
        Maximum number of IDs to keep.
    seed : hashable
        Seed for the private ``random.Random`` instance, making the draw
        repeatable.

    Returns
    -------
    tuple[list, int]
        The sampled IDs and the number of reports that had an ID.
    """
    rng = random.Random(seed)
    sample = []
    total = 0
    for xml_path in xml_files:
        # Progress message so the user can follow along on screen
        print(f"Leyendo para muestreo: {xml_path.name}...")
        for report in iter_safetyreports(xml_path):
            report_id = get_text(report, "safetyreportid")
            if report_id is not None:
                total += 1
                if len(sample) >= k:
                    # Reservoir is full: maybe overwrite a random slot.
                    slot = rng.randrange(total)
                    if slot < k:
                        sample[slot] = report_id
                else:
                    sample.append(report_id)
    return sample, total



In [3]:
#Trimestre 1 

In [4]:
# Look for the 'Q1' folder relative to the current working directory
xml_dir = Path("Q1")

# Input files: the Quarter 1 XML exports (*_ADR25Q1.xml), in sorted order
xml_arch = sorted(xml_dir.glob("*_ADR25Q1.xml"))

print(f"Directorio de búsqueda: {xml_dir.absolute()}")
print(f"Archivos encontrados: {len(xml_arch)}")

if not xml_arch:
    print("ERROR: No se encontraron archivos XML. Revisa que estén en la carpeta 'Q1'.")
else:
    # CSV outputs are written under Q1/data_processed/q1_2025_sample100k
    out_dir = xml_dir / "data_processed/q1_2025_sample100k"
    out_dir.mkdir(parents=True, exist_ok=True)

    out_reports_csv = out_dir / "q1_reports_100k.csv"
    out_drugs_csv = out_dir / "q1_drugs_100k.csv"
    out_reactions_csv = out_dir / "q1_reactions_100k.csv"

    K = 100_000
    SEED = 202501

    # Step A: first pass over the XML files to draw the sample of report IDs
    print("\n--- Iniciando Muestreo ---")
    sample_ids, n_total = reservoir_sample_ids(xml_arch, K, SEED)
    sample_set = set(sample_ids)  # set gives fast membership tests in step B
    print(f"Total reportes: {n_total}")
    print(f"Tamaño de la muestra: {len(sample_set)}")

    # Step B: second pass, writing the sampled reports to three CSV files
    print("\n--- Generando archivos CSV ---")
    report_fields = ["safetyreportid", "occurcountry", "receiptdate"]
    drug_fields = ["safetyreportid", "medicinalproduct", "drugcharacterization", "activesubstancename"]
    reaction_fields = ["safetyreportid", "reaction_pt", "reaction_meddra_version_pt"]

    with open(out_reports_csv, "w", newline="", encoding="utf-8") as reports_fh, \
         open(out_drugs_csv, "w", newline="", encoding="utf-8") as drugs_fh, \
         open(out_reactions_csv, "w", newline="", encoding="utf-8") as reactions_fh:

        report_writer = csv.DictWriter(reports_fh, fieldnames=report_fields)
        drug_writer = csv.DictWriter(drugs_fh, fieldnames=drug_fields)
        reaction_writer = csv.DictWriter(reactions_fh, fieldnames=reaction_fields)

        for writer in (report_writer, drug_writer, reaction_writer):
            writer.writeheader()

        kept = 0
        for xml_path in xml_arch:
            print(f"Procesando archivo: {xml_path.name}...")
            for report in iter_safetyreports(xml_path):
                report_id = get_text(report, "safetyreportid")

                # Only reports whose ID was drawn in step A are exported.
                if report_id is not None and report_id in sample_set:
                    # One row per report
                    report_row = {
                        "safetyreportid": report_id,
                        "occurcountry": get_text(report, "occurcountry"),
                        "receiptdate": get_text(report, "receiptdate"),
                    }
                    report_writer.writerow(report_row)

                    # One row per reaction; falls back to the LLT field when
                    # the PT field yields nothing, and skips the reaction if
                    # neither is available.
                    for reaction in report.findall(".//reaction"):
                        term = get_text(reaction, "reactionmeddrapt")
                        if not term:
                            term = get_text(reaction, "reactionmeddrallt")
                        if not term:
                            continue
                        reaction_row = {
                            "safetyreportid": report_id,
                            "reaction_pt": term.strip().upper(),
                            "reaction_meddra_version_pt": get_text(reaction, "reactionmeddraversionpt"),
                        }
                        reaction_writer.writerow(reaction_row)

                    # One row per drug; names are upper-cased, missing values
                    # are written as None (an empty CSV cell).
                    for drug in report.findall(".//drug"):
                        product = get_text(drug, "medicinalproduct")
                        substance = get_text(drug, ".//activesubstance/activesubstancename")
                        product_upper = product.strip().upper() if product else None
                        substance_upper = substance.strip().upper() if substance else None
                        drug_row = {
                            "safetyreportid": report_id,
                            "medicinalproduct": product_upper,
                            "drugcharacterization": get_text(drug, "drugcharacterization"),
                            "activesubstancename": substance_upper,
                        }
                        drug_writer.writerow(drug_row)

                    kept += 1

    print(f"\n Se guardaron {kept} reportes en la carpeta: {out_dir}")

Directorio de búsqueda: /home/violeta/Escritorio/Proyecto_FAERS/Q1
Archivos encontrados: 3

--- Iniciando Muestreo ---
Leyendo para muestreo: 1_ADR25Q1.xml...
Leyendo para muestreo: 2_ADR25Q1.xml...
Leyendo para muestreo: 3_ADR25Q1.xml...
Total reportes: 400514
Tamaño de la muestra: 100000

--- Generando archivos CSV ---
Procesando archivo: 1_ADR25Q1.xml...
Procesando archivo: 2_ADR25Q1.xml...
Procesando archivo: 3_ADR25Q1.xml...

 Se guardaron 100000 reportes en la carpeta: Q1/data_processed/q1_2025_sample100k
