In [None]:
# !pip install requests beautifulsoup4 pandas

B.L.
"noche_niebla_scraping.ipynb_v1" scrapes data from the Noche y Niebla database (https://base.nocheyniebla.org/casos).

**How it works**

The script automatically queries the database for cases in a given date range.

Default range: 1 December 1999 - 31 December 2024
(You can change this in the overall_start / overall_end variables).
To avoid exceeding the server’s row limit (≈2000 cases per query), the script splits the scraping into ~2-month intervals, then merges the results.

**Output**

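By default, the rows from all intervals are collected in memory and written once, at the end of the run, to a single CSV file: "nocheyniebla_casos_1999_2024.csv"
Alternatively, you can save one CSV per ~2-month interval in casos_quarterly_csvs/ (named "<start>_<end>.csv") by setting: APPEND_TO_SINGLE_CSV = False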

**Data format**

The output preserves the original table headers from the website:

Fecha del hecho

Ubicaciones

P. Responsables

Tipificación
Víctimas
Descripción
Acciones

**Reproducibility**

To reproduce results, follow the step-by-step comments inside the notebook.

The script uses:

requests + BeautifulSoup for scraping
pandas for data storage/export
datetime for date range handling

In [1]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search endpoint of the Noche y Niebla case database.
base_url = "https://base.nocheyniebla.org/casos"

# Query-string fields sent with every request; they mirror the site's search
# filter form. The date-range fields and the page number are NOT listed here:
# the scraping loop fills them in for each window/page.
# Kept as ordered pairs so the encoded query string keeps this exact order.
# NOTE(review): the "inc_*" = "1" flags presumably ask the server to include
# the corresponding columns, and pais_id "170" presumably selects Colombia —
# confirm against the site's filter form.
query_params = dict([
    ("filtro[apellidos]", ""),
    ("filtro[departamento_id]", ""),
    ("filtro[descripcion]", ""),
    ("filtro[inc_fecha]", "1"),
    ("filtro[inc_memo]", "1"),
    ("filtro[inc_presponsables]", "1"),
    ("filtro[inc_tipificacion]", "1"),
    ("filtro[inc_ubicaciones]", "1"),
    ("filtro[inc_victimas]", "1"),
    ("filtro[municipio_id]", ""),
    ("filtro[nombres]", ""),
    ("filtro[orden]", "fecha"),
    ("filtro[pais_id]", "170"),
    ("filtro[presponsable_id][]", ""),
    ("filtro[q]", ""),
    ("filtro[rangoedad_id]", ""),
    ("filtro[sectorsocial_id]", ""),
    ("filtro[sexo]", ""),
    ("filtro[tviolencia_id]", "Z"),
])

# --- SETTINGS ---
# Output mode.
#   True  -> rows from every window are collected in ALL_ROWS and written to
#            one combined CSV after the last window.
#   False -> each window is written to its own CSV inside `output_dir`
#            (handy for automation; merge the files afterwards).
APPEND_TO_SINGLE_CSV = True

# First and last day of the whole period to scrape: datetime(year, month, day).
overall_start = datetime(year=1999, month=12, day=1)
overall_end = datetime(year=2024, month=12, day=31)

# Length of a single query window. Despite the name, this is 60 days
# (roughly 2 months), chosen to avoid exceeding the limit of 2000 cases.
three_months = timedelta(days=60)

# Folder for the per-window CSV files; created up front in either output mode.
output_dir = 'casos_quarterly_csvs'
os.makedirs(output_dir, exist_ok=True)

# Run-wide accumulators filled by the scraping loop:
# ALL_ROWS collects data rows, HEADERS holds the column names.
ALL_ROWS, HEADERS = [], []

# --- MAIN LOOP ---
# Walks the overall period in consecutive windows of `three_months` (60 days,
# despite its name). For each window it requests result pages 1, 2, 3, ...
# until a page yields no data rows, then either adds the window's rows to
# ALL_ROWS or writes them to a per-window CSV, depending on APPEND_TO_SINGLE_CSV.

MAX_FETCH_ATTEMPTS = 3      # tries per page before the current window is abandoned
RETRY_BACKOFF_SECONDS = 5   # wait before retry k is k * this many seconds
failed_windows = []         # (fechaini, fechafin, page) for every page that could not be fetched

current_start = overall_start

# A single Session lets the underlying connection be reused across all requests.
with requests.Session() as http:
    # `<=` rather than `<`: if the previous window ended the day before
    # overall_end, the final day must still be queried as a one-day window.
    while current_start <= overall_end:
        # End of this window: 60 days later, capped at the overall end date.
        current_end = min(current_start + three_months, overall_end)
        print(f"\nProcessing {current_start.date()} to {current_end.date()}...")

        # Date filters are derived automatically; change the range in the settings, not here.
        fechaini = current_start.strftime('%Y-%m-%d')
        fechafin = current_end.strftime('%Y-%m-%d')
        query_params['filtro[fechaini]'] = fechaini
        query_params['filtro[fechafin]'] = fechafin

        all_rows = []   # data rows collected for this window
        headers = []    # column names, read from the first page that has a table
        page_number = 1

        while True:
            query_params['pagina'] = str(page_number)

            # Fetch the page, retrying transient failures so that one network
            # hiccup does not silently drop the remainder of the window.
            response = None
            for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
                try:
                    response = http.get(base_url, params=query_params, timeout=60)
                    response.raise_for_status()
                    break
                except requests.RequestException as e:
                    response = None
                    print(f"Failed to fetch page {page_number} "
                          f"(attempt {attempt}/{MAX_FETCH_ATTEMPTS}): {e}")
                    if attempt < MAX_FETCH_ATTEMPTS:
                        time.sleep(RETRY_BACKOFF_SECONDS * attempt)

            if response is None:
                # Give up on this window but remember it, so the gap is reported at the end.
                failed_windows.append((fechaini, fechafin, page_number))
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table')

            if not table:
                print(f"No table found on page {page_number}, moving to next date range.")
                break

            if not headers:
                # Prefer the header row inside <thead>; otherwise use the table's first row.
                thead = table.find('thead')
                header_row = (thead.find('tr') if thead else None) or table.find('tr')
                if header_row is not None:
                    headers = [th.text.strip() for th in header_row.find_all('th')]
                if headers and not HEADERS:
                    # The first header row seen names the columns of the combined dataset.
                    HEADERS = headers.copy()

            # Skip the first <tr> (header row); rows without <td> cells are ignored.
            page_rows = []
            for tr in table.find_all('tr')[1:]:
                row = [td.text.strip() for td in tr.find_all('td')]
                if row:
                    page_rows.append(row)
            all_rows.extend(page_rows)
            print(f"Scraped {len(page_rows)} rows from {current_start.date()}–{current_end.date()} (page {page_number})")

            if not page_rows:
                # An empty page marks the end of this window's results.
                break
            page_number += 1

        # Save results for this window.
        if headers and all_rows:
            if APPEND_TO_SINGLE_CSV:
                ALL_ROWS.extend(all_rows)
                print(f"Quarter {fechaini}_{fechafin} appended to global dataset.")
            else:
                # One CSV per window, named after its date range.
                filename = os.path.join(output_dir, f"{fechaini}_{fechafin}.csv")
                df = pd.DataFrame(all_rows, columns=headers)
                df.to_csv(filename, index=False)
                print(f"Quarterly data saved to {filename}")
        else:
            print(f"No data found for {fechaini}–{fechafin}")

        # The next window starts the day after this one ends, so windows never overlap.
        current_start = current_end + timedelta(days=1)

# Make incomplete windows visible instead of leaving silent gaps in the dataset.
if failed_windows:
    print(f"\nWARNING: {len(failed_windows)} window(s) are incomplete because a page could not be fetched:")
    for failed_ini, failed_fin, failed_page in failed_windows:
        print(f"  {failed_ini} to {failed_fin}: stopped at page {failed_page}")

# Final step: in single-file mode, and only if rows and column names were
# actually collected, write everything to one CSV. In every other case just
# report that processing finished.
# (Edit the file name in both lines below to save under a different name.)
write_combined = bool(APPEND_TO_SINGLE_CSV and ALL_ROWS and HEADERS)

if not write_combined:
    print("\nAll periods processed.")
else:
    combined_df = pd.DataFrame(data=ALL_ROWS, columns=HEADERS)
    combined_df.to_csv('nocheyniebla_casos_1999_2024.csv', index=False)
    print("\nAll periods saved to 'nocheyniebla_casos_1999_2024.csv'.")

print("\nComplete!")



Processing 1999-12-01 to 2000-01-30...
Scraped 20 rows from 1999-12-01–2000-01-30 (page 1)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 2)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 3)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 4)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 5)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 6)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 7)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 8)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 9)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 10)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 11)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 12)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 13)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 14)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 15)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 16)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 17)
Scraped 20 rows from 1999-12-01–2000-01-30 (page 18)
Scraped 20 rows