In [6]:
import pandas as pd
from lxml import etree
from pathlib import Path
import random
import csv

### Trimestre 1

In [11]:
# Folder that holds the FAERS 2025 Q1 XML files.
xml_dir = Path("faers_xml_2025q1") / "XML"

# Collect every part of the quarter, ordered by path.
xml_arch = sorted(xml_dir.glob("*_ADR25Q1.xml"))
xml_arch

[PosixPath('faers_xml_2025q1/XML/1_ADR25Q1.xml'),
 PosixPath('faers_xml_2025q1/XML/2_ADR25Q1.xml'),
 PosixPath('faers_xml_2025q1/XML/3_ADR25Q1.xml')]

In [12]:
def count_safetyreports(xml_file):
    """Count the <safetyreport> elements in one XML file with a streaming parse.

    Args:
        xml_file: Path (or path-like) of the XML file; it is converted with
            ``str()`` before being handed to ``etree.iterparse``.

    Returns:
        int: number of "end" events received for the ``safetyreport`` tag.
    """
    n = 0
    for _, elem in etree.iterparse(
        str(xml_file),
        events=("end",),
        tag="safetyreport",
        recover=True,
        huge_tree=True,
    ):
        n += 1
        # iterparse keeps building the tree as it goes; release each counted
        # report and the siblings before it so memory stays flat instead of
        # growing with the size of the file.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    return n

# Count the reports of each part and accumulate the total for the quarter.
total = 0
for xf in xml_arch:
    c = count_safetyreports(xf)
    total += c
    print(xf.name, c)

print("\nTOTAL reportes (safetyreport):", total)


1_ADR25Q1.xml 126945
2_ADR25Q1.xml 130665
3_ADR25Q1.xml 142904

TOTAL reportes (safetyreport): 400514


In [13]:
from lxml import etree

def iter_safetyreports(xml_file):
    """Stream the <safetyreport> elements of one XML file, one at a time.

    Each element is yielded on its "end" event. When the caller asks for the
    next one, the element just handed out is cleared and the leading children
    of its parent are deleted until it has no previous sibling, so a yielded
    element must be fully read before the generator is advanced.
    """
    events = etree.iterparse(
        str(xml_file),
        tag="safetyreport",
        events=("end",),
        huge_tree=True,
        recover=True,
    )
    for _event, report in events:
        yield report
        # Runs once the consumer resumes the generator: empty the report,
        # then prune whatever still precedes it under the same parent.
        report.clear()
        while True:
            if report.getprevious() is None:
                break
            parent = report.getparent()
            del parent[0]

def get_text(elem, path):
    """Return the stripped text of the first node matching ``path`` under ``elem``.

    Returns None when nothing matches, when the node has no text, or when the
    text is only whitespace, so callers can detect "missing" with a single
    ``is None`` check.
    """
    node = elem.find(path)
    if node is None or node.text is None:
        return None
    # Whitespace-only text becomes "" after strip(); report it as missing
    # instead of returning an empty string that slips past ``is None`` checks.
    return node.text.strip() or None

# State for the duplicate-ID audit across all parts of the quarter.
seen = set()
dup_count = 0
dup_examples = []   # first (srid, file name) pairs found repeated
MAX_EX = 20

total = 0
for xf in xml_arch:
    for sr in iter_safetyreports(xf):
        total += 1
        srid = get_text(sr, "safetyreportid")
        if srid is None:
            # Reports without an ID count toward the total but are not audited.
            continue
        if srid not in seen:
            seen.add(srid)
            continue
        dup_count += 1
        if len(dup_examples) < MAX_EX:
            dup_examples.append((srid, xf.name))

print("Total de informes de seguridad:", total)
print("IDs únicos:", len(seen))
print("Repetidos detectados:", dup_count)
print("Ejemplos (srid, archivo):", dup_examples)


Total safetyreport (elementos): 400514
IDs únicos: 400514
Repetidos detectados: 0
Ejemplos (srid, archivo): []


### Muestreo para Q1

In [14]:
# Sample size and RNG seed used for the Q1 sample.
K = 100_000
SEED = 202501

# Output folder for the Q1 sample; created (with parents) when missing.
out_dir = Path("data_processed") / "q1_2025_sample100k"
out_dir.mkdir(parents=True, exist_ok=True)

# One CSV per table: reports, drugs and reactions.
out_reports_csv = out_dir.joinpath("q1_reports_100k.csv")
out_drugs_csv = out_dir.joinpath("q1_drugs_100k.csv")
out_reactions_csv = out_dir.joinpath("q1_reactions_100k.csv")


def iter_safetyreports(xml_file):
    """Stream the <safetyreport> elements of one XML file, one at a time.

    Each element is yielded on its "end" event. When the caller asks for the
    next one, the element just handed out is cleared and the leading children
    of its parent are deleted until it has no previous sibling, so a yielded
    element must be fully read before the generator is advanced.
    """
    events = etree.iterparse(
        str(xml_file),
        tag="safetyreport",
        events=("end",),
        huge_tree=True,
        recover=True,
    )
    for _event, report in events:
        yield report
        # Runs once the consumer resumes the generator: empty the report,
        # then prune whatever still precedes it under the same parent.
        report.clear()
        while True:
            if report.getprevious() is None:
                break
            parent = report.getparent()
            del parent[0]

def get_text(elem, tag):
    """Return the stripped text of the first node matching ``tag`` under ``elem``.

    ``tag`` is passed straight to ``elem.find``. Returns None when nothing
    matches, when the node has no text, or when the text is only whitespace,
    so callers can detect "missing" with a single ``is None`` check.
    """
    node = elem.find(tag)
    if node is None or node.text is None:
        return None
    # Whitespace-only text becomes "" after strip(); report it as missing
    # instead of returning an empty string that slips past ``is None`` checks.
    return node.text.strip() or None

def reservoir_sample_ids(xml_files, k, seed):
    """Draw up to ``k`` report IDs uniformly at random in a single pass.

    Args:
        xml_files: iterable of XML file paths, read in the given order.
        k: reservoir capacity.
        seed: seed for the private ``random.Random`` generator.

    Returns:
        tuple: (list of sampled IDs, number of reports that had an ID).
        Reports whose ``safetyreportid`` is missing are skipped and not counted.
    """
    rng = random.Random(seed)
    chosen = []
    counted = 0
    for path in xml_files:
        for report in iter_safetyreports(path):
            report_id = get_text(report, "safetyreportid")
            if report_id is None:
                continue
            counted += 1
            if len(chosen) < k:
                # Reservoir not full yet: keep every ID.
                chosen.append(report_id)
                continue
            # Reservoir full: pick a slot among all IDs seen so far and
            # overwrite only when it falls inside the reservoir.
            slot = rng.randrange(counted)
            if slot < k:
                chosen[slot] = report_id
    return chosen, counted

# First pass: sample K report IDs uniformly across all parts.
sample_ids, n_total = reservoir_sample_ids(xml_arch, K, SEED)
sample_set = set(sample_ids)
# The reservoir must be full and must not contain repeated IDs.
assert len(sample_ids) == K
assert len(sample_set) == K

# Second pass: re-read the XML and write the sampled reports to three CSVs
# (one row per report, per reaction and per drug).
with open(out_reports_csv, "w", newline="", encoding="utf-8") as f_rep, \
     open(out_drugs_csv, "w", newline="", encoding="utf-8") as f_drug, \
     open(out_reactions_csv, "w", newline="", encoding="utf-8") as f_reac:

    rep_w = csv.DictWriter(
        f_rep,
        fieldnames=["safetyreportid", "occurcountry", "receiptdate"],
    )
    drug_w = csv.DictWriter(
        f_drug,
        fieldnames=["safetyreportid", "medicinalproduct", "drugcharacterization", "activesubstancename"],
    )
    reac_w = csv.DictWriter(
        f_reac,
        fieldnames=["safetyreportid", "reaction_pt", "reaction_meddra_version_pt"],
    )
    for writer in (rep_w, drug_w, reac_w):
        writer.writeheader()

    kept = 0
    for xf in xml_arch:
        for sr in iter_safetyreports(xf):
            srid = get_text(sr, "safetyreportid")
            if srid is None or srid not in sample_set:
                continue
            kept += 1

            rep_w.writerow(dict(
                safetyreportid=srid,
                occurcountry=get_text(sr, "occurcountry"),
                receiptdate=get_text(sr, "receiptdate"),
            ))

            for r in sr.findall(".//reaction"):
                # Use reactionmeddrallt when reactionmeddrapt is missing or empty.
                pt = get_text(r, "reactionmeddrapt") or get_text(r, "reactionmeddrallt")
                if not pt:
                    continue
                reac_w.writerow(dict(
                    safetyreportid=srid,
                    reaction_pt=pt.strip().upper(),
                    reaction_meddra_version_pt=get_text(r, "reactionmeddraversionpt"),
                ))

            for d in sr.findall(".//drug"):
                prod = get_text(d, "medicinalproduct")
                active = get_text(d, ".//activesubstance/activesubstancename")
                # Names are upper-cased; missing values stay None.
                drug_w.writerow(dict(
                    safetyreportid=srid,
                    medicinalproduct=prod.strip().upper() if prod else None,
                    drugcharacterization=get_text(d, "drugcharacterization"),
                    activesubstancename=active.strip().upper() if active else None,
                ))

print("Total de informes:", n_total)
print("Tamaño de la muestra (ID únicos):", len(sample_set))
print("Informes extraídos:", kept)
print("Archivos guardados:")
for saved_path in (out_reports_csv, out_drugs_csv, out_reactions_csv):
    print(" -", saved_path)


Total de informes: 400514
Tamaño de la muestra (ID únicos): 100000
Informes extraídos: 100000
Archivos guardados:
 - data_processed/q1_2025_sample100k/q1_reports_100k.csv
 - data_processed/q1_2025_sample100k/q1_drugs_100k.csv
 - data_processed/q1_2025_sample100k/q1_reactions_100k.csv


### Trimestre 2

In [15]:
# Folder that holds the FAERS 2025 Q2 XML files.
xml_dir = Path("faers_xml_2025q2") / "XML"

# Collect every part of the quarter, ordered by path.
xml_arch = sorted(xml_dir.glob("*_ADR25Q2.xml"))
xml_arch

[PosixPath('faers_xml_2025q2/XML/1_ADR25Q2.xml'),
 PosixPath('faers_xml_2025q2/XML/2_ADR25Q2.xml'),
 PosixPath('faers_xml_2025q2/XML/3_ADR25Q2.xml')]

In [16]:
# Count the reports of each part and accumulate the total for the quarter.
total = 0
for xf in xml_arch:
    c = count_safetyreports(xf)
    total += c
    print(xf.name, c)

print("\nTOTAL reportes (safetyreport):", total)


1_ADR25Q2.xml 120735
2_ADR25Q2.xml 133702
3_ADR25Q2.xml 138693

TOTAL reportes (safetyreport): 393130


In [17]:
# Sample size and RNG seed used for the Q2 sample.
# NOTE(review): the seed value is the same one the Q1 cell uses (202501);
# confirm that reusing it for this quarter is intended.
K = 100_000
SEED = 202501

# Output folder for the Q2 sample; created (with parents) when missing.
out_dir = Path("data_processed") / "q2_2025_sample100k"
out_dir.mkdir(parents=True, exist_ok=True)

# One CSV per table: reports, drugs and reactions.
out_reports_csv = out_dir.joinpath("q2_reports_100k.csv")
out_drugs_csv = out_dir.joinpath("q2_drugs_100k.csv")
out_reactions_csv = out_dir.joinpath("q2_reactions_100k.csv")


def iter_safetyreports(xml_file):
    """Stream the <safetyreport> elements of one XML file, one at a time.

    Each element is yielded on its "end" event. When the caller asks for the
    next one, the element just handed out is cleared and the leading children
    of its parent are deleted until it has no previous sibling, so a yielded
    element must be fully read before the generator is advanced.
    """
    events = etree.iterparse(
        str(xml_file),
        tag="safetyreport",
        events=("end",),
        huge_tree=True,
        recover=True,
    )
    for _event, report in events:
        yield report
        # Runs once the consumer resumes the generator: empty the report,
        # then prune whatever still precedes it under the same parent.
        report.clear()
        while True:
            if report.getprevious() is None:
                break
            parent = report.getparent()
            del parent[0]

def get_text(elem, tag):
    """Return the stripped text of the first node matching ``tag`` under ``elem``.

    ``tag`` is passed straight to ``elem.find``. Returns None when nothing
    matches, when the node has no text, or when the text is only whitespace,
    so callers can detect "missing" with a single ``is None`` check.
    """
    node = elem.find(tag)
    if node is None or node.text is None:
        return None
    # Whitespace-only text becomes "" after strip(); report it as missing
    # instead of returning an empty string that slips past ``is None`` checks.
    return node.text.strip() or None

def reservoir_sample_ids(xml_files, k, seed):
    """Draw up to ``k`` report IDs uniformly at random in a single pass.

    Args:
        xml_files: iterable of XML file paths, read in the given order.
        k: reservoir capacity.
        seed: seed for the private ``random.Random`` generator.

    Returns:
        tuple: (list of sampled IDs, number of reports that had an ID).
        Reports whose ``safetyreportid`` is missing are skipped and not counted.
    """
    rng = random.Random(seed)
    chosen = []
    counted = 0
    for path in xml_files:
        for report in iter_safetyreports(path):
            report_id = get_text(report, "safetyreportid")
            if report_id is None:
                continue
            counted += 1
            if len(chosen) < k:
                # Reservoir not full yet: keep every ID.
                chosen.append(report_id)
                continue
            # Reservoir full: pick a slot among all IDs seen so far and
            # overwrite only when it falls inside the reservoir.
            slot = rng.randrange(counted)
            if slot < k:
                chosen[slot] = report_id
    return chosen, counted

# First pass: sample K report IDs uniformly across all parts.
sample_ids, n_total = reservoir_sample_ids(xml_arch, K, SEED)
sample_set = set(sample_ids)
# The reservoir must be full and must not contain repeated IDs.
assert len(sample_ids) == K
assert len(sample_set) == K

# Second pass: re-read the XML and write the sampled reports to three CSVs
# (one row per report, per reaction and per drug).
with open(out_reports_csv, "w", newline="", encoding="utf-8") as f_rep, \
     open(out_drugs_csv, "w", newline="", encoding="utf-8") as f_drug, \
     open(out_reactions_csv, "w", newline="", encoding="utf-8") as f_reac:

    rep_w = csv.DictWriter(
        f_rep,
        fieldnames=["safetyreportid", "occurcountry", "receiptdate"],
    )
    drug_w = csv.DictWriter(
        f_drug,
        fieldnames=["safetyreportid", "medicinalproduct", "drugcharacterization", "activesubstancename"],
    )
    reac_w = csv.DictWriter(
        f_reac,
        fieldnames=["safetyreportid", "reaction_pt", "reaction_meddra_version_pt"],
    )
    for writer in (rep_w, drug_w, reac_w):
        writer.writeheader()

    kept = 0
    for xf in xml_arch:
        for sr in iter_safetyreports(xf):
            srid = get_text(sr, "safetyreportid")
            if srid is None or srid not in sample_set:
                continue
            kept += 1

            rep_w.writerow(dict(
                safetyreportid=srid,
                occurcountry=get_text(sr, "occurcountry"),
                receiptdate=get_text(sr, "receiptdate"),
            ))

            for r in sr.findall(".//reaction"):
                # Use reactionmeddrallt when reactionmeddrapt is missing or empty.
                pt = get_text(r, "reactionmeddrapt") or get_text(r, "reactionmeddrallt")
                if not pt:
                    continue
                reac_w.writerow(dict(
                    safetyreportid=srid,
                    reaction_pt=pt.strip().upper(),
                    reaction_meddra_version_pt=get_text(r, "reactionmeddraversionpt"),
                ))

            for d in sr.findall(".//drug"):
                prod = get_text(d, "medicinalproduct")
                active = get_text(d, ".//activesubstance/activesubstancename")
                # Names are upper-cased; missing values stay None.
                drug_w.writerow(dict(
                    safetyreportid=srid,
                    medicinalproduct=prod.strip().upper() if prod else None,
                    drugcharacterization=get_text(d, "drugcharacterization"),
                    activesubstancename=active.strip().upper() if active else None,
                ))

print("Total de informes:", n_total)
print("Tamaño de la muestra (ID únicos):", len(sample_set))
print("Informes extraídos:", kept)
print("Archivos guardados:")
for saved_path in (out_reports_csv, out_drugs_csv, out_reactions_csv):
    print(" -", saved_path)

Total de informes: 393130
Tamaño de la muestra (ID únicos): 100000
Informes extraídos: 100000
Archivos guardados:
 - data_processed/q2_2025_sample100k/q2_reports_100k.csv
 - data_processed/q2_2025_sample100k/q2_drugs_100k.csv
 - data_processed/q2_2025_sample100k/q2_reactions_100k.csv


### Trimestre 3

In [18]:
# Folder that holds the FAERS 2025 Q3 XML files.
xml_dir = Path("faers_xml_2025q3") / "XML"

# Collect every part of the quarter, ordered by path.
xml_arch = sorted(xml_dir.glob("*_ADR25Q3.xml"))
xml_arch

[PosixPath('faers_xml_2025q3/XML/1_ADR25Q3.xml'),
 PosixPath('faers_xml_2025q3/XML/2_ADR25Q3.xml'),
 PosixPath('faers_xml_2025q3/XML/3_ADR25Q3.xml')]

In [19]:
# Count the reports of each part and accumulate the total for the quarter.
total = 0
for xf in xml_arch:
    c = count_safetyreports(xf)
    total += c
    print(xf.name, c)

print("\nTOTAL reportes (safetyreport):", total)

1_ADR25Q3.xml 142950
2_ADR25Q3.xml 156602
3_ADR25Q3.xml 138960

TOTAL reportes (safetyreport): 438512


In [20]:
from lxml import etree

def iter_safetyreports(xml_file):
    """Stream the <safetyreport> elements of one XML file, one at a time.

    Each element is yielded on its "end" event. When the caller asks for the
    next one, the element just handed out is cleared and the leading children
    of its parent are deleted until it has no previous sibling, so a yielded
    element must be fully read before the generator is advanced.
    """
    events = etree.iterparse(
        str(xml_file),
        tag="safetyreport",
        events=("end",),
        huge_tree=True,
        recover=True,
    )
    for _event, report in events:
        yield report
        # Runs once the consumer resumes the generator: empty the report,
        # then prune whatever still precedes it under the same parent.
        report.clear()
        while True:
            if report.getprevious() is None:
                break
            parent = report.getparent()
            del parent[0]

def get_text(elem, path):
    """Return the stripped text of the first node matching ``path`` under ``elem``.

    Returns None when nothing matches, when the node has no text, or when the
    text is only whitespace, so callers can detect "missing" with a single
    ``is None`` check.
    """
    node = elem.find(path)
    if node is None or node.text is None:
        return None
    # Whitespace-only text becomes "" after strip(); report it as missing
    # instead of returning an empty string that slips past ``is None`` checks.
    return node.text.strip() or None

# State for the duplicate-ID audit across all parts of the quarter.
seen = set()
dup_count = 0
dup_examples = []   # first (srid, file name) pairs found repeated
MAX_EX = 20

total = 0
for xf in xml_arch:
    for sr in iter_safetyreports(xf):
        total += 1
        srid = get_text(sr, "safetyreportid")
        if srid is None:
            # Reports without an ID count toward the total but are not audited.
            continue
        if srid not in seen:
            seen.add(srid)
            continue
        dup_count += 1
        if len(dup_examples) < MAX_EX:
            dup_examples.append((srid, xf.name))

print("Total de informes de seguridad:", total)
print("IDs únicos:", len(seen))
print("Repetidos detectados:", dup_count)
print("Ejemplos (srid, archivo):", dup_examples)

Total de informes de seguridad: 438512
IDs únicos: 438512
Repetidos detectados: 0
Ejemplos (srid, archivo): []


In [None]:
# Sample size and RNG seed used for the Q3 sample.
# NOTE(review): the seed value is the same one the Q1 cell uses (202501);
# confirm that reusing it for this quarter is intended.
K = 100_000
SEED = 202501

# Output folder for the Q3 sample; created (with parents) when missing.
out_dir = Path("data_processed") / "q3_2025_sample100k"
out_dir.mkdir(parents=True, exist_ok=True)

# One CSV per table: reports, drugs and reactions.
out_reports_csv = out_dir.joinpath("q3_reports_100k.csv")
out_drugs_csv = out_dir.joinpath("q3_drugs_100k.csv")
out_reactions_csv = out_dir.joinpath("q3_reactions_100k.csv")


def iter_safetyreports(xml_file):
    """Stream the <safetyreport> elements of one XML file, one at a time.

    Each element is yielded on its "end" event. When the caller asks for the
    next one, the element just handed out is cleared and the leading children
    of its parent are deleted until it has no previous sibling, so a yielded
    element must be fully read before the generator is advanced.
    """
    events = etree.iterparse(
        str(xml_file),
        tag="safetyreport",
        events=("end",),
        huge_tree=True,
        recover=True,
    )
    for _event, report in events:
        yield report
        # Runs once the consumer resumes the generator: empty the report,
        # then prune whatever still precedes it under the same parent.
        report.clear()
        while True:
            if report.getprevious() is None:
                break
            parent = report.getparent()
            del parent[0]

def get_text(elem, tag):
    """Return the stripped text of the first node matching ``tag`` under ``elem``.

    ``tag`` is passed straight to ``elem.find``. Returns None when nothing
    matches, when the node has no text, or when the text is only whitespace,
    so callers can detect "missing" with a single ``is None`` check.
    """
    node = elem.find(tag)
    if node is None or node.text is None:
        return None
    # Whitespace-only text becomes "" after strip(); report it as missing
    # instead of returning an empty string that slips past ``is None`` checks.
    return node.text.strip() or None

def reservoir_sample_ids(xml_files, k, seed):
    """Draw up to ``k`` report IDs uniformly at random in a single pass.

    Args:
        xml_files: iterable of XML file paths, read in the given order.
        k: reservoir capacity.
        seed: seed for the private ``random.Random`` generator.

    Returns:
        tuple: (list of sampled IDs, number of reports that had an ID).
        Reports whose ``safetyreportid`` is missing are skipped and not counted.
    """
    rng = random.Random(seed)
    chosen = []
    counted = 0
    for path in xml_files:
        for report in iter_safetyreports(path):
            report_id = get_text(report, "safetyreportid")
            if report_id is None:
                continue
            counted += 1
            if len(chosen) < k:
                # Reservoir not full yet: keep every ID.
                chosen.append(report_id)
                continue
            # Reservoir full: pick a slot among all IDs seen so far and
            # overwrite only when it falls inside the reservoir.
            slot = rng.randrange(counted)
            if slot < k:
                chosen[slot] = report_id
    return chosen, counted

# First pass: sample K report IDs uniformly across all parts.
sample_ids, n_total = reservoir_sample_ids(xml_arch, K, SEED)
sample_set = set(sample_ids)
# The reservoir must be full and must not contain repeated IDs.
assert len(sample_ids) == K
assert len(sample_set) == K

# Second pass: re-read the XML and write the sampled reports to three CSVs
# (one row per report, per reaction and per drug).
with open(out_reports_csv, "w", newline="", encoding="utf-8") as f_rep, \
     open(out_drugs_csv, "w", newline="", encoding="utf-8") as f_drug, \
     open(out_reactions_csv, "w", newline="", encoding="utf-8") as f_reac:

    rep_w = csv.DictWriter(
        f_rep,
        fieldnames=["safetyreportid", "occurcountry", "receiptdate"],
    )
    drug_w = csv.DictWriter(
        f_drug,
        fieldnames=["safetyreportid", "medicinalproduct", "drugcharacterization", "activesubstancename"],
    )
    reac_w = csv.DictWriter(
        f_reac,
        fieldnames=["safetyreportid", "reaction_pt", "reaction_meddra_version_pt"],
    )
    for writer in (rep_w, drug_w, reac_w):
        writer.writeheader()

    kept = 0
    for xf in xml_arch:
        for sr in iter_safetyreports(xf):
            srid = get_text(sr, "safetyreportid")
            if srid is None or srid not in sample_set:
                continue
            kept += 1

            rep_w.writerow(dict(
                safetyreportid=srid,
                occurcountry=get_text(sr, "occurcountry"),
                receiptdate=get_text(sr, "receiptdate"),
            ))

            for r in sr.findall(".//reaction"):
                # Use reactionmeddrallt when reactionmeddrapt is missing or empty.
                pt = get_text(r, "reactionmeddrapt") or get_text(r, "reactionmeddrallt")
                if not pt:
                    continue
                reac_w.writerow(dict(
                    safetyreportid=srid,
                    reaction_pt=pt.strip().upper(),
                    reaction_meddra_version_pt=get_text(r, "reactionmeddraversionpt"),
                ))

            for d in sr.findall(".//drug"):
                prod = get_text(d, "medicinalproduct")
                active = get_text(d, ".//activesubstance/activesubstancename")
                # Names are upper-cased; missing values stay None.
                drug_w.writerow(dict(
                    safetyreportid=srid,
                    medicinalproduct=prod.strip().upper() if prod else None,
                    drugcharacterization=get_text(d, "drugcharacterization"),
                    activesubstancename=active.strip().upper() if active else None,
                ))

print("Total de informes:", n_total)
print("Tamaño de la muestra (ID únicos):", len(sample_set))
print("Informes extraídos:", kept)
print("Archivos guardados:")
for saved_path in (out_reports_csv, out_drugs_csv, out_reactions_csv):
    print(" -", saved_path)