<img src="https://industrial.uniandes.edu.co/sites/default/files/imagenes/uniandeslogo.png" alt="Universidad de los Andes" style="float: right; width: 300px; height: auto;">

# Scraping Rutas del Conflicto massacres data

Author: Juan Diego Heredia Niño

Email: jd.heredian@uniandes.edu.co

Date: Nov 2025

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import yaml  # To read YAML configuration files
from pathlib import Path  # For cross-platform file path handling
import time

In [None]:
# Load directory paths from the configuration file.
# The file is opened explicitly as UTF-8 so that directory names containing
# non-ASCII characters decode identically on every platform instead of
# depending on the locale's default encoding.
with open('paths.yml', 'r', encoding='utf-8') as file:
    paths = yaml.safe_load(file)  # Parse the YAML document into plain Python objects

# Create Path objects for each directory listed under the 'data' key
raw = Path(paths['data']['raw'])  # Directory with raw data
temp = Path(paths['data']['temp'])  # Directory with temporary processed data
processed = Path(paths['data']['processed'])  # Directory with final processed data

In [None]:
# --- Set up the browser ---
url = "https://rutasdelconflicto.com/masacres"

opts = Options()
# Run Chrome without a visible window; drop this flag to watch the browser.
opts.add_argument("--headless")

driver = webdriver.Chrome(options=opts)

# Explicit-wait helper bound to this driver, with a 15-second timeout.
wait = WebDriverWait(driver, timeout=15)

driver.get(url)

In [None]:
# --- Open the "Fichas" tab ---
fichas_locator = (
    By.XPATH,
    '//*[@id="block-system-main"]/div[1]/div[4]/ul/li[2]',
)

# Block until the tab element is clickable, then click it.
fichas_btn = wait.until(EC.element_to_be_clickable(fichas_locator))
fichas_btn.click()

# Fixed pause after the click; presumably gives the tab content time to
# render before the cards are queried -- TODO confirm.
time.sleep(3)

In [None]:
# --- Extract every visible card ---
# Each card links to a detail page ("ficha"). The page is opened in a new tab,
# scraped, and the tab is closed again so the listing window is left untouched.
CARDS_XPATH = '//*[@id="menu1"]/div/div/div'

cards = driver.find_elements(By.XPATH, CARDS_XPATH)
print(f"{len(cards)} cartas encontradas")

data = []

# Remember the listing window explicitly so cleanup never has to guess which
# handle it is and can never close it by mistake.
main_window = driver.current_window_handle

for i in range(len(cards)):
    try:
        # Re-query the cards on every pass so a stale reference is never
        # reused. This happens inside the try so that a list that shrank
        # since the first query is reported for this card instead of
        # aborting the whole run with an IndexError.
        cards = driver.find_elements(By.XPATH, CARDS_XPATH)
        card = cards[i]

        # Open the detail page in a new tab
        link = card.find_element(By.XPATH, './/*[@id="card"]/a').get_attribute('href')
        driver.execute_script("window.open(arguments[0]);", link)
        driver.switch_to.window(driver.window_handles[-1])

        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="ficha"]/div')))

        # --- Main fields (positional paragraphs inside the #ficha container) ---
        municipio = driver.find_element(By.XPATH, '//*[@id="ficha"]/div/p[1]').text
        grupo_armado = driver.find_element(By.XPATH, '//*[@id="ficha"]/div/p[3]').text
        fecha = driver.find_element(By.XPATH, '//*[@id="ficha"]/div/p[4]').text

        # --- Victim list ---
        # Best effort: any failure here yields an empty list rather than
        # discarding the whole record.
        try:
            victimas_html = driver.find_element(
                By.XPATH, '//*[@id="ficha"]/div/div[2]'
            ).get_attribute("innerHTML")
            # Entries are split on <br>; blank fragments are dropped.
            listado_victimas = [
                v.strip() for v in victimas_html.split("<br>") if v.strip()
            ]
        except Exception:
            listado_victimas = []

        # --- Full text of the article ---
        # Best effort as well: fall back to an empty string on any failure.
        try:
            cuerpo_elem = driver.find_element(
                By.XPATH, '//*[starts-with(@id,"node-")]/div[2]/div/div[2]'
            )
            texto_noticia = cuerpo_elem.text
        except Exception:
            texto_noticia = ""

        # --- Store the record ---
        data.append({
            "municipio": municipio,
            "grupo_armado": grupo_armado,
            "fecha": fecha,
            "listado_victimas": listado_victimas,
            "texto_noticia": texto_noticia,
            "url": link,
        })

    except Exception as e:
        # Log and move on to the next card; the finally clause below still
        # runs before the loop continues.
        print(f"Error en carta {i}: {e}")
        continue

    finally:
        # Close every tab except the listing, whichever window happens to be
        # focused when we get here, then return focus to the listing.
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)

    # Reached only after a successful scrape: short pause before the next card.
    time.sleep(2)

In [None]:
# --- Save results ---
# The browser is shut down in a finally clause so that a failure while building
# or writing the DataFrame does not leave the Chrome process running.
try:
    df = pd.DataFrame(data)

    # Make sure the target folder exists; to_parquet does not create it.
    output_dir = raw / "rutas_del_conflicto"
    output_dir.mkdir(parents=True, exist_ok=True)

    df.to_parquet(output_dir / "massacres.parquet", index=False)
finally:
    driver.quit()