# 01 — Exploración de Papers
Análisis exploratorio de los 20 artículos académicos sobre crimen organizado y gobernanza criminal en América Latina.

In [None]:
# Notebook bootstrap: resolve the project root, make it importable, and load
# environment variables before the analysis cells run.
import json
import sys
from pathlib import Path

# ROOT is the parent of the current working directory; putting it first on
# sys.path lets later cells import project packages such as `src`.
ROOT = Path("..").resolve()
sys.path.insert(0, str(ROOT))

import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Read environment variables from the .env file at the project root.
load_dotenv(dotenv_path=ROOT / ".env")
print("Setup OK")

In [None]:
# Load the paper metadata and summarise each record as one DataFrame row.
with open(ROOT / "papers" / "papers.json", encoding="utf-8") as f:
    data = json.load(f)
papers = data["papers"]

# `p.get(key) or []` is used instead of `p.get(key, [])` so that a key that is
# present but null in the JSON counts as an empty list rather than making
# len() raise TypeError.
df = pd.DataFrame([
    {
        "id": p["id"],
        "title": p["title"],
        "year": p.get("year"),
        "venue": p.get("venue", ""),
        "num_authors": len(p.get("authors") or []),
        "num_topics": len(p.get("topics") or []),
        "has_abstract": bool(p.get("abstract")),
    }
    for p in papers
])
print(f"Total papers: {len(df)}")
# Kept as the final expression so the notebook renders the preview.
df.head()

In [None]:
# Bar chart of how many papers were published in each year.
# value_counts() tallies papers per year; sort_index() orders the bars
# chronologically instead of by frequency.
year_counts = df["year"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 4))
year_counts.plot.bar(ax=ax, color="steelblue")
ax.set(
    title="Distribución de Papers por Año",
    xlabel="Año",
    ylabel="Número de papers",
)
fig.tight_layout()
plt.show()

In [None]:
from collections import Counter

# Count how often each topic appears across all papers; `or []` tolerates
# papers whose "topics" field is missing or null.
all_topics = [t for p in papers for t in (p.get("topics") or [])]
top_topics = Counter(all_topics).most_common(15)

if top_topics:
    labels, counts = zip(*top_topics)
    fig, ax = plt.subplots(figsize=(12, 5))
    # barh draws the first entry at the bottom, so reverse the ranking to put
    # the most frequent topic at the top of the chart.
    ax.barh(labels[::-1], counts[::-1], color="coral")
    ax.set_title("Top 15 Temas más frecuentes")
    ax.set_xlabel("Frecuencia")
    plt.tight_layout()
    plt.show()
else:
    # With no topics at all, zip(*top_topics) yields nothing and the tuple
    # unpacking would raise ValueError, so skip the chart instead.
    print("No topics found in the papers metadata.")

In [None]:
from src.ingestion import load_papers

# Load the ingested papers and tabulate their extracted text size.
# Each item is indexed by "title", "num_chars" and "num_pages" below, so
# load_papers() must return mappings that carry those keys.
all_papers = load_papers(verbose=True)
# Titles are truncated to 50 characters to keep the table readable.
lengths = [(p["title"][:50], p["num_chars"], p["num_pages"]) for p in all_papers]
df_text = pd.DataFrame(lengths, columns=["title", "num_chars", "num_pages"])
# clip(lower=1) avoids dividing by zero for papers reporting zero pages.
df_text["chars_per_page"] = df_text["num_chars"] // df_text["num_pages"].clip(lower=1)
# Index the column by its real name: the previous chr(39)-wrapped key was the
# literal string "'num_chars'" (quotes included) and raised KeyError.
print(f"Total chars: {df_text['num_chars'].sum():,}")
# Kept as the final expression so the notebook renders the sorted table.
df_text.sort_values("num_chars", ascending=False)