<img src="https://industrial.uniandes.edu.co/sites/default/files/imagenes/uniandeslogo.png" alt="Universidad de los Andes" style="float: right; width: 300px; height: auto;">

# Cleaning Indepaz Massacres data

Author: Juan Diego Heredia Niño

Email: jd.heredian@uniandes.edu.co

Date: Nov 2025

In [1]:
# This cell imports the core libraries required for data processing and configuration management
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import yaml  # To read YAML configuration files
from pathlib import Path  # For cross-platform file path handling
import yaml  # To read YAML configuration files
from pathlib import Path  # For cross-platform file path handling
from rapidfuzz import fuzz, process  # Fuzzy string matching algorithms
# The rapidfuzz library is used for fuzzy string matching between municipality names

In [2]:
# The configuration file lists the project's directory layout, so every notebook
# and script resolves its data locations from a single place.
# The encoding is pinned to UTF-8 instead of relying on the platform default,
# which could otherwise mis-decode non-ASCII characters in the configured paths.
with open('paths.yml', 'r', encoding='utf-8') as file:
    paths = yaml.safe_load(file)  # Parse the YAML document into nested dicts

# One Path object is created per pipeline stage from the 'data' section:
#   raw       -> original unprocessed data from external sources
#   temp      -> intermediate processing outputs
#   processed -> final cleaned datasets ready for analysis
raw = Path(paths['data']['raw'])
temp = Path(paths['data']['temp'])
processed = Path(paths['data']['processed'])

In [3]:
# This function standardizes column headers from scraped HTML tables
# Web-scraped data often has inconsistent header formats that require normalization
def fix_headers(df):
    """Return a copy of one scraped table with standardized column headers.

    Steps:
      1. If the columns are just 0..n-1, the real header is the first data
         row, so it is promoted to the header and removed from the data.
      2. Header names are stripped and lowercased.
      3. Spanish headers are renamed to short English names, and the
         row-number column ('#' or 'N°') is dropped.

    Args:
        df: DataFrame holding a single scraped table. It is not modified.

    Returns:
        A new DataFrame without the numbering column, where 'fecha',
        'departamento', 'municipio' and '# de víctimas' (when present) are
        renamed to 'date', 'dep', 'old_mun' and 'qty'.

    Raises:
        KeyError: if the table has no numbering column ('#' or 'N°').
    """
    if list(df.columns) == list(range(df.shape[1])):
        # Purely positional column labels: the first row carries the names
        header = df.iloc[0]
        df = df.iloc[1:].copy()  # Exclude the header row from the data
        df.columns = header
    else:
        # Work on a copy so the caller's frame is never mutated
        df = df.copy()

    # str() guards against non-string header cells (e.g. numbers or NaN)
    df.columns = [str(col).strip().lower() for col in df.columns]

    # The 'n°' -> '#' rename is applied to every table, including those whose
    # header was promoted from the first row, so the drop below always finds
    # the numbering column under a single name.
    return (
        df
        .rename(
            columns={
                'n°': '#',  # Alternative label of the row-number column
                '# de víctimas': 'qty',  # Number of victims
                'fecha': 'date',  # Date of the massacre
                'departamento': 'dep',  # Department (first-level administrative division)
                'municipio': 'old_mun',  # Municipality (second-level administrative division)
            }
        )
        .drop(columns='#')  # The row number column is not needed for analysis
    )

In [4]:
# Builds the normalized "department municipality" key used for name matching
def make_key(df):
    """Return a Series of normalized matching keys, one per row of ``df``.

    The key is the 'dep' and 'mun' values joined by a single space after:
    replacing missing values with '', removing '.' and ',' characters,
    lowercasing, stripping accents (NFKD decomposition followed by dropping
    every non-ASCII character) and trimming surrounding whitespace.

    Args:
        df: DataFrame with string columns 'dep' and 'mun'.

    Returns:
        A Series of key strings sharing the index of ``df``.
    """
    # Missing names become empty strings so the join below cannot fail
    names = df[['dep', 'mun']].fillna('')
    # Dots and commas only add spelling noise
    names = names.replace(r'[.,]', '', regex=True)

    # One string per row: "<dep> <mun>"
    joined = names.agg(' '.join, axis=1)
    lowered = joined.str.lower()

    # Decompose accented characters, then drop everything outside ASCII
    decomposed = lowered.str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.str.decode('utf-8')

    return ascii_text.str.strip()

In [5]:
# This cell performs the main data extraction and cleaning from the Indepaz website
# Every HTML table found on the report page is read and its headers are
# standardized with fix_headers before the tables are stacked together
list_of_dfs = [
    fix_headers(table)
    for table in pd.read_html(
        "https://indepaz.org.co/informe-de-masacres-en-colombia-durante-el-2020-2021/comment-page-4/"
    )
]

# All tables are combined into a single dataset with a fresh 0..n-1 index
df_massacres = pd.concat(list_of_dfs, ignore_index=True)

# Date formats in the source data are inconsistent and require standardization
dates = df_massacres['date'].astype(str).str.strip()
# Dates written as d/m/yy get their two-digit year prefixed with "20"
# (they are assumed to be in the 2000s). The mask is computed once and reused.
two_digit_year = dates.str.match(r'\d{1,2}/\d{1,2}/\d{2}$')
dates.loc[two_digit_year] = (
    dates.loc[two_digit_year]
    .str.replace(r'(\d{1,2}/\d{1,2}/)(\d{2})$', r'\g<1>20\2', regex=True)
)

# Dates are parsed day-first; values that cannot be parsed become NaT
df_massacres['date'] = pd.to_datetime(dates, dayfirst=True, errors='coerce')
# Each date is truncated to the first day of its month (monthly frequency)
# and stored as a plain date object
df_massacres['date'] = df_massacres['date'].dt.to_period('M').dt.to_timestamp().dt.date

# Only records with a numeric victim count are retained.
# The integer column is built from the already-parsed numeric values rather
# than from the raw column, so numeric strings such as '3.0' that pass the
# filter cannot make the integer conversion fail afterwards.
qty_numeric = pd.to_numeric(df_massacres['qty'], errors='coerce')
mask = qty_numeric.notna()
df_massacres = df_massacres[mask].copy()
df_massacres['qty'] = qty_numeric[mask].astype(int)

# Geographic names are converted to title case and stripped of accents
# (NFKD decomposition, then every non-ASCII character is dropped)
# to facilitate matching with official codes
df_massacres[['dep', 'old_mun']] = (
    df_massacres[['dep', 'old_mun']]
    .apply(
        lambda col: (
            col
            .str.title()  # Convert to title case
            .str.normalize('NFKD')  # Unicode decomposition
            .str.encode('ascii', errors='ignore')  # Drop accents / non-ASCII
            .str.decode('utf-8')  # Back to text
        )
    )
)

In [6]:
# This cell resolves the municipality names of the massacre records to official
# DIVIPOLA municipality codes through fuzzy matching.
# Output of the cell: df_mun, a de-duplicated table with the columns
# dep / old_mun / cod_mun that the next cell merges back into df_massacres.
#
# Some entries list multiple municipalities, which are split into separate records
df_massacres_mun = (
    df_massacres[['dep','old_mun']]
    .assign(
        # Multiple municipalities in a single entry are separated by commas or the conjunction "Y"
        # (upper-case because the names were title-cased in the previous cell)
        mun=df_massacres['old_mun']
        .str.replace(r'\s+Y\s+', ',', regex=True)  # Replace 'Y' (and) with comma separator
        .str.split(r'\s*,\s*')  # Split on commas to create lists of municipalities
    )
    .explode('mun')  # Create separate rows for each municipality
    .reset_index(drop=True)  # Fresh 0..n-1 index; it is used below as 'mass_idx'
)

# The DIVIPOLA dataset contains official municipality codes from Colombia's national statistics agency
# These codes are necessary for linking with other official datasets
# NOTE(review): the code below relies on this table having 'dep', 'mun' and
# 'cod_mun' columns — confirm against the script that produces the parquet file.
df_divipola = pd.read_parquet(temp / 'dane'/ 'divipola' / 'divipola.parquet') 

# Standardized keys are created for both datasets to enable fuzzy matching
df_massacres_mun['match_key'] = make_key(df_massacres_mun)
df_divipola['match_key'] = make_key(df_divipola)

# The list of official municipality keys serves as the reference for matching
# Its order follows df_divipola's rows, so a list position maps back to a row
choices = df_divipola['match_key'].tolist()

# Fuzzy string matching is used because exact matching would miss valid entries due to spelling variations
# The token sort ratio scorer is used so that word order differences do not lower the score
matches = []
for idx, key in df_massacres_mun['match_key'].items():
    # Each massacre municipality is matched to the most similar DIVIPOLA entry;
    # 'pos' is the position of the best match inside 'choices'
    match, score, pos = process.extractOne(key, choices, scorer=fuzz.token_sort_ratio)
    # A threshold of 85 percent similarity balances precision and recall
    # Lower thresholds would increase false matches, while higher thresholds would miss valid matches
    if score >= 85:  # adjust the threshold as needed
        matches.append({
            'mass_idx': idx,  # Row label in df_massacres_mun
            'divi_idx': df_divipola.index[pos],  # Row label in df_divipola
            'score': score
        })

# The matches are converted to a dataframe for merging
matches = pd.DataFrame(matches)

# Both datasets are merged based on the fuzzy matching results
# The first merge is a left join, so every massacre row is kept.
# The second merge does not pass `how`, so pandas' default inner join applies:
# rows without a match above the threshold (missing 'divi_idx') are not carried
# into `result`.
# Column names shared by both tables (e.g. dep, mun, match_key) receive the
# '_mass' / '_divi' suffixes.
result = (
    df_massacres_mun
    .merge(matches, left_index=True, right_on='mass_idx', how='left')
    .merge(df_divipola, left_on='divi_idx', right_index=True, suffixes=('_mass', '_divi'))
)

# Manual corrections are applied for cases where fuzzy matching produces incorrect results
# Cali is the capital of Valle del Cauca and was incorrectly matched to Calima
# Only rows matching every condition get the corrected code; all others keep theirs
result['cod_mun'] = np.where(
    result['dep_mass'].isin(['Valle Del Cauca']) &
    result['dep_divi'].isin(['Valle Del Cauca']) &
    result['mun_mass'].isin(['Cali']) &
    result['mun_divi'].isin(['Calima']) &
    (result['score'] < 100), '76001', result['cod_mun']  # Official code for Cali
)

# Cucuta is the capital of Norte de Santander and was incorrectly matched to Cucutilla
result['cod_mun'] = np.where(
    result['dep_mass'].isin(['Norte De Santander']) &
    result['dep_divi'].isin(['Norte De Santander']) &
    result['mun_mass'].isin(['Cucuta']) &
    result['mun_divi'].isin(['Cucutilla']) &
    (result['score'] < 100), '54001', result['cod_mun']  # Official code for Cucuta
)

# The final municipality mapping is extracted for merging back to the main dataset
# An 'old_mun' entry that listed several municipalities keeps one row per
# distinct code it resolved to
df_mun = (
        result
        [['dep_mass', 'old_mun','cod_mun']]
        .drop_duplicates()  # Each distinct (dep, old_mun, cod_mun) combination appears only once
        .rename(columns={'dep_mass':'dep'})
        .reset_index(drop=True)
    )

In [7]:
# The municipality codes are merged back into the main massacre dataset
# A left join keeps every massacre record; records whose (dep, old_mun) pair is
# absent from df_mun get a missing cod_mun
df_massacres = df_massacres.merge(
    df_mun,
    on=['dep', 'old_mun'],
    how='left'
)

# Some municipalities require manual coding due to name ambiguities or variations
# These cases were identified during data validation and represent edge cases
# The dictionary maps (department, municipality) pairs, spelled as they appear
# in df_massacres, to their official DIVIPOLA codes
codigos_divipola = {
    ('Valle Del Cauca', 'Buga'): '76111',  # Guadalajara de Buga
    ('Narino', 'Magui Payan'): '52427',  # Alternative spelling of Magüí Payán
    ('Cundinamarca', 'Mesitas Del Colegio'): '25245',  # El Colegio
    ('Bogota', 'Bogota'): '11001',  # Capital district
    ('Cauca', 'Piendamo'): '19548',  # Alternative spelling of Piendamó
    ('Narino', 'Tumaco'): '52835',  # San Andrés de Tumaco
    ('Narino', 'Colon Genova'): '52203',  # Colón
    ('Bolivar', 'Cartagena'): '13001',  # Cartagena de Indias
    ('Bogota', 'Bogota  Sumapaz'): '11001',  # Sumapaz is a rural locality within Bogotá
    ('Narino', 'Andes-Sotomayor'): '52418',  # Los Andes-Sotomayor
    ('Tolima', 'Rio Blanco'): '73616',  # Ríoblanco
    ('Putumayo', 'Leguizamo'): '86573',  # Puerto Leguízamo
}

# The codes are applied to the corresponding records in the dataset,
# overwriting whatever code (or missing value) the fuzzy match produced
for (dep, mun), codigo in codigos_divipola.items():
    mask = (df_massacres['dep'] == dep) & (df_massacres['old_mun'] == mun)
    df_massacres.loc[mask, 'cod_mun'] = codigo

# Verification printout. It lists every distinct (dep, old_mun, cod_mun) row
# whose code is one of the manual codes, so it also shows rows that already
# received that code from the fuzzy match, not only the rows updated above.
print("Registros actualizados:")
print(
    df_massacres[df_massacres['cod_mun'].isin(codigos_divipola.values())]
    [['dep', 'old_mun', 'cod_mun']]
    .drop_duplicates()
)

# The final dataset is aggregated by month and municipality
# Multiple massacres in the same municipality-month are summed to get total victims
# Rows with a missing date or cod_mun are left out, because groupby excludes
# missing group keys by default
df_massacres = df_massacres.groupby(['date', 'cod_mun'])['qty'].sum().reset_index()

# The cleaned data is saved in parquet format for efficient storage and retrieval
# The target directory is created first so the write does not fail on a fresh
# checkout where temp/indepaz/masacres does not exist yet
output_path = temp / 'indepaz' / 'masacres' / 'massacres.parquet'
output_path.parent.mkdir(parents=True, exist_ok=True)
df_massacres.to_parquet(output_path, index=False)

Registros actualizados:
                 dep           old_mun cod_mun
5    Valle Del Cauca              Buga   76111
12       Bogota D.C.       Bogota D.C.   11001
28            Narino       Magui Payan   52427
46      Cundinamarca        El Colegio   25245
167           Bogota            Bogota   11001
195            Cauca          Piendamo   19548
211       Bogota D.C        Bogota D.C   11001
212           Narino            Tumaco   52835
250           Narino      Colon Genova   52203
260         Putumayo  Puerto Leguizamo   86573
274          Bolivar         Cartagena   13001
354           Bogota   Bogota  Sumapaz   11001
357           Narino   Andes-Sotomayor   52418
402           Tolima        Rio Blanco   73616
434         Putumayo         Leguizamo   86573
