In [None]:
'''
This script downloads publicly available documents from the Colombian JEP "Macrocasos"
(Case 01–11) section and extracts readable text.

How it works:
    1. Visits each Case page (caso01.html through caso11.html)
    2. Saves HTML pages and any linked PDFs locally
    3. Extracts text from each file into plain .txt files
    4. Creates a summary CSV index of everything collected
    5. Merges all text files into one big searchable document

Outputs:
    jep_all_macrocasos.csv         → index of all URLs and files
    data/html/                     → saved webpage copies
    data/pdfs/                     → downloaded PDFs
    data/texts/                    → extracted text files
    data/all_macrocasos_text.txt   → combined text corpus
'''

# -----------------------------
# 1. Import necessary libraries
# -----------------------------
import os # working with file paths
import re # cleaning/formatting file names
import time # pausing between downloads
import csv
import requests # downloading pages and PDFs from the web
from bs4 import BeautifulSoup # extracting text and links from HTML
from urllib.parse import urljoin, urlparse
from pdfminer.high_level import extract_text

# -----------------------------
# 2. Configuration
# -----------------------------
# Site root; only links underneath it are followed by the crawler.
BASE = "https://www.jep.gov.co"
# Page for Case 01, the first of the Macrocasos pages.
START = BASE + "/macrocasos/caso01.html"
# Host name of the site, derived from BASE.
DOMAIN = urlparse(BASE).netloc
# Sent with every request so the site can see who is crawling.
HEADERS = {"User-Agent": "RRP_Crawler/1.0 (contact via Harvard University)"}

# Output folders: raw HTML copies, downloaded PDFs, extracted plain text.
for _folder in ("data/html", "data/pdfs", "data/texts"):
    os.makedirs(_folder, exist_ok=True)

# Shared crawl state, filled in while the crawler runs.
visited = set()  # every URL that has been attempted, so nothing is crawled twice
results = []  # one dict per saved document; becomes the rows of the summary CSV


# -------------------------------------------------------
# 3. Helper functions
# -------------------------------------------------------

def fetch(url):
    """Request ``url`` with GET and hand back the response.

    The request carries the module-level ``HEADERS`` and gives up after a
    20-second timeout. Any exception raised while requesting, or by
    ``raise_for_status()`` for an HTTP error status, is printed and
    swallowed so one bad link does not stop the whole crawl.

    Returns:
        The ``requests.Response`` on success, otherwise ``None``.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=20)
        response.raise_for_status()
    except Exception as err:
        print("Error fetching:", url, err)
        return None
    else:
        return response

def save_text_file(path, text):
    """Write ``text`` to ``path`` as UTF-8, creating parent folders if needed.

    Used for both the saved HTML pages and the extracted plain text.
    An existing file at ``path`` is overwritten.

    Args:
        path: Destination file path. May be a bare filename, in which case
            the file is written to the current working directory.
        text: The string to write.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the folder when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

def parse_html(url, html):
    """Pull the readable text and the outgoing links out of an HTML page.

    Args:
        url: Address the page was fetched from; relative links are
            resolved against it.
        html: The page markup as a string.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the content of every
        paragraph, h1-h3 heading and list item joined by single spaces;
        ``links`` is a list with the absolute URL of every ``<a href>``.
    """
    soup = BeautifulSoup(html, "lxml")

    text_tags = ["p", "h1", "h2", "h3", "li"]
    fragments = [node.get_text(" ", strip=True) for node in soup.find_all(text_tags)]
    text = " ".join(fragments)

    links = []
    for anchor in soup.find_all("a", href=True):
        links.append(urljoin(url, anchor["href"]))

    return text, links

def process_pdf(url):
    """Download a PDF (unless already saved) and extract its text.

    The PDF is stored under ``data/pdfs`` using the last path segment of
    ``url`` as its name, and the extracted text under ``data/texts`` with a
    ``.txt`` extension. On success an entry describing both files is
    appended to the module-level ``results`` list.

    Args:
        url: Address of the PDF.

    Returns:
        Always ``None``. Download and extraction failures are printed and
        the document is skipped.
    """
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # A URL whose path ends in "/" has no file name; joining it would
        # produce the directory path itself.
        print("Skipping PDF with no file name in URL:", url)
        return None

    # Always give the text file a ".txt" suffix. Swap the extension only when
    # it is a PDF extension in any letter case (".PDF" included); otherwise
    # append, so PDFs served from extension-less URLs still end up as ".txt"
    # and are picked up by later steps that look for "*.txt".
    stem, ext = os.path.splitext(filename)
    text_name = (stem if ext.lower() == ".pdf" else filename) + ".txt"

    pdf_path = os.path.join("data/pdfs", filename)
    text_path = os.path.join("data/texts", text_name)

    # Skip downloading if the file already exists
    if not os.path.exists(pdf_path):
        print("Downloading PDF:", filename)
        r = fetch(url)
        if not (r and r.content):
            return None
        with open(pdf_path, "wb") as f:
            f.write(r.content)
        # Pause after each download to go easy on the server.
        time.sleep(1.0)

    # Try to extract text from the PDF; a failure skips only this document.
    try:
        text = extract_text(pdf_path)
        save_text_file(text_path, text)
        results.append({"url": url, "type": "pdf", "file": pdf_path, "text_file": text_path})
        print(f"Extracted PDF: {filename}")
    except Exception as e:
        print("PDF extract failed:", filename, e)
    return None

def crawl(url, depth=0, max_depth=2):
    """Recursively explore pages starting from ``url``.

    HTML pages are saved (raw markup and extracted text), recorded in the
    module-level ``results`` list, and their same-site links are followed.
    PDFs are handed to ``process_pdf``. Anything else is ignored.

    Args:
        url: Address to visit.
        depth: How far down the link chain this call is (0 = start page).
        max_depth: Deepest link level to follow, to keep the crawl bounded.
    """
    # "page.html#section" is the same document as "page.html"; drop the
    # fragment so each page is fetched and saved only once.
    url = url.split("#", 1)[0]

    if url in visited or depth > max_depth:
        return
    visited.add(url)

    print(f"Crawling ({depth}): {url}")

    # A URL whose path ends in ".pdf" goes straight to process_pdf, which
    # performs its own download; fetching it here as well would transfer
    # every PDF twice.
    if urlparse(url).path.lower().endswith(".pdf"):
        process_pdf(url)
        return

    resp = fetch(url)
    if not resp:
        return

    # Check file type. If PDF, handle. If not HTML, skip
    content_type = resp.headers.get("Content-Type", "").lower()
    if "pdf" in content_type:
        process_pdf(url)
        return

    if "html" not in content_type:
        return

    # Save HTML text
    text, links = parse_html(url, resp.text)
    slug = re.sub(r"\W+", "_", url.replace(BASE, "").strip("/"))[:80]
    html_path = os.path.join("data/html", f"{slug or 'index'}.html")
    text_path = os.path.join("data/texts", f"{slug or 'index'}.txt")

    save_text_file(html_path, resp.text)
    save_text_file(text_path, text)

    results.append({"url": url, "type": "html", "file": html_path, "text_file": text_path})
    print(f"Saved HTML: {url}")

    # Crawl sublinks, staying on the target site. The host must match
    # exactly: a substring test would also accept look-alike hosts such as
    # "www.jep.gov.co.example.com".
    for link in links:
        parsed = urlparse(link)
        if parsed.netloc == DOMAIN and link.startswith(BASE):
            crawl(link, depth + 1, max_depth)

# -------------------------------------------------------
# 4. Main code: when the script is executed
# -------------------------------------------------------
"""
    Coordinates the full process:
        - Loops through Cases 01–11
        - Runs the crawler
        - Saves a summary CSV
        - Merges all text files into one document for future searching
"""

def main():
    """Run the whole pipeline: crawl, write the index, merge the texts.

    Steps:
        1. Crawl the eleven case pages (caso01.html .. caso11.html),
           following links up to two levels deep.
        2. Write every entry of ``results`` to ``jep_all_macrocasos.csv``.
        3. Concatenate every ``.txt`` file found under ``data/texts`` into
           ``data/all_macrocasos_text.txt``, each preceded by a header line
           carrying its file name.
    """
    # Build the start URLs for cases 01 through 11.
    case_urls = []
    for number in range(1, 12):
        case_urls.append(f"https://www.jep.gov.co/macrocasos/caso{number:02d}.html")

    for case_url in case_urls:
        print(f"\n=== Crawling {case_url} ===")
        crawl(case_url, depth=0, max_depth=2)

    # Summary index; utf-8-sig adds a byte-order mark at the start of the file.
    index_columns = ["url", "type", "file", "text_file"]
    with open("jep_all_macrocasos.csv", "w", newline="", encoding="utf-8-sig") as index_file:
        index_writer = csv.DictWriter(index_file, fieldnames=index_columns)
        index_writer.writeheader()
        for record in results:
            index_writer.writerow(record)

    print("\nCrawl complete!")
    print(f"Visited {len(visited)} URLs; saved {len(results)} files.")
    print("Results written to jep_all_macrocasos.csv")

    # Combined corpus: one header line per source file, then its contents.
    merged_path = "data/all_macrocasos_text.txt"
    with open(merged_path, "w", encoding="utf-8") as out:
        for folder, _subdirs, names in os.walk("data/texts"):
            for name in sorted(names):
                if not name.endswith(".txt"):
                    continue
                with open(os.path.join(folder, name), encoding="utf-8") as part:
                    out.write(f"\n\n===== {name} =====\n")
                    out.write(part.read())
    print(f"\nðŸ§¾ Merged all text files into {merged_path}")

# Start the full pipeline when this code is executed directly; importing it
# as a module does not trigger a crawl.
if __name__ == "__main__":
    main()

# Visited 173 URLs; saved 129 files.

  self.log.error("KeyboardInterrupt caught in kernel.")


In [None]:
"""
Keyword finder: this uses the sexual violence keyword list to scan the text files,
and puts into a spreadsheet the ones that mention any of the keywords. The goal is to help
others who know Spanish quickly locate documents that may contain relevant subject matter.

Process:
    - Normalizes accents (so "violación" and "violacion" both match)
    - Uses lowercase comparison for consistency
    - Lists which keywords appear in each file
    - Saves results to keyword_hits.csv with a short text snippet so you can look where it's found
"""

import os
import csv
import unicodedata

# Keyword list, accents removed for normalization
# Search terms, written lowercase and without accents so they can be compared
# directly against text that has been passed through normalize().
# Matching is by substring, so a short term also hits longer words and phrases
# that contain it (e.g. "abuso" matches any text containing "abuso sexual").
# The order here is the order in which hits are listed in the output CSV.
KEYWORDS = [
    "violencia sexual",
    "abuso sexual",
    "penetracion",
    "desnuda",
    "abusaron",
    "acoso",
    "relaciones sexuales",
    "enamoradita",
    "acceso carnal violento",
    "prostituta",
    "prostitucion",
    "actos sexuales",
    "violacion sexual",
    "abuso",
    "abusada",
    "abusado"
]

# Folder whose .txt files are scanned for keywords.
INPUT_FOLDER = "data/texts"
# Spreadsheet that receives one row per file with at least one keyword hit.
OUTPUT_CSV = "keyword_hits.csv"

# Normalization function
def normalize(text):
    """Return ``text`` lowercased with all accent marks stripped.

    Example: 'Violación' -> 'violacion'.
    """
    # NFKD splits an accented letter into its base letter plus a separate
    # combining mark; dropping the marks leaves the plain letters.
    decomposed = unicodedata.normalize("NFKD", text.lower())
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)

# Scan text files
# One dict per file that mentions at least one keyword.
rows = []

# Characters of context kept on each side of the first keyword match.
SNIPPET_RADIUS = 250

# Sorted so the CSV rows come out in the same order on every run
# (os.listdir returns names in arbitrary order).
for file in sorted(os.listdir(INPUT_FOLDER)):
    if not file.endswith(".txt"):
        continue
    path = os.path.join(INPUT_FOLDER, file)

    # Read and normalize the file text. Runs of whitespace (including line
    # breaks) are collapsed to single spaces so that a multi-word keyword
    # still matches when the extracted text wraps in the middle of the phrase.
    with open(path, encoding="utf-8") as f:
        text = " ".join(normalize(f.read()).split())

    # Find which keywords appear (substring match on the normalized text)
    hits = [kw for kw in KEYWORDS if kw in text]
    if not hits:
        continue

    # Show the text around the earliest match, so a reader can see the
    # keyword in context instead of the unrelated opening of the document.
    first_hit = min(text.find(kw) for kw in hits)
    snippet_start = max(0, first_hit - SNIPPET_RADIUS)
    snippet = text[snippet_start:first_hit + SNIPPET_RADIUS]

    rows.append({
        "file": file,
        "keywords_found": ", ".join(hits),
        "snippet": snippet
    })

# Save matches to CSV
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.DictWriter(f, fieldnames=["file", "keywords_found", "snippet"])
    writer.writeheader()
    writer.writerows(rows)

print(f"Found {len(rows)} files containing at least one keyword.")
print(f"Results saved to {OUTPUT_CSV}")


Found 32 files containing at least one keyword.
Results saved to keyword_hits.csv
