In [1]:
import pandas as pd
import re
import numpy as np
import yaml  # To read YAML configuration files
from pathlib import Path  # For cross-platform file path handling


# Read the project's directory layout from the YAML configuration file
with open('paths.yml', 'r') as config_stream:
    paths = yaml.safe_load(config_stream)

# Wrap each configured data directory in a Path object:
#   raw       -> raw data
#   temp      -> temporary processed data
#   processed -> final processed data
data_dirs = paths['data']
raw, temp, processed = (Path(data_dirs[name]) for name in ('raw', 'temp', 'processed'))

In [4]:
# RDC source: rename its Spanish column names to date / cod_mun / qty
rdc_column_names = {'fecha': 'date', 'codigo_municipio': 'cod_mun', 'victimas': 'qty'}
rdc_path = temp / 'rdc' / 'massacres.parquet'
df_rdc = pd.read_parquet(rdc_path).rename(columns=rdc_column_names)

# Indepaz source: loaded as stored (note the 'masacres' directory spelling)
indepaz_path = temp / 'indepaz' / 'masacres' / 'massacres.parquet'
df_indepaz = pd.read_parquet(indepaz_path)

In [5]:
# Preview the renamed RDC frame; cod_mun and qty render as floats (e.g. 5031.0, 9.0)
df_rdc

Unnamed: 0,date,cod_mun,qty
0,1982-08-01,5031.0,9.0
1,1982-09-01,5591.0,5.0
2,1983-08-01,5604.0,20.0
3,1984-06-01,19022.0,4.0
4,1985-05-01,19355.0,4.0
...,...,...,...
729,2013-05-01,76497.0,4.0
730,2013-11-01,76001.0,9.0
731,2014-10-01,76001.0,8.0
732,2014-12-01,5031.0,7.0


In [7]:
# Preview the Indepaz frame; cod_mun renders as zero-padded 5-character codes (e.g. 05154)
df_indepaz

Unnamed: 0,date,cod_mun,qty
0,2020-01-01,05154,3
1,2020-01-01,05642,3
2,2020-01-01,05790,5
3,2020-01-01,54810,3
4,2020-01-01,76364,5
...,...,...,...
505,2025-11-01,47189,3
506,2025-11-01,76243,4
507,2025-12-01,19821,3
508,2025-12-01,20614,3


In [None]:
# Combine both sources into one table with a single row per (date, municipality).
#
# Municipality codes are normalized to 5-character zero-padded strings BEFORE
# aggregating. In the rendered previews df_rdc carries them as floats
# (e.g. 5031.0) while df_indepaz carries them as strings (e.g. '05154'), so
# grouping on the raw column would treat the same municipality as two distinct
# keys and leave duplicate (date, mun_code) rows once the codes are normalized.
massacres_stacked = (
    pd.concat([df_rdc, df_indepaz], ignore_index=True)
    # groupby silently discards rows with a missing key; drop them explicitly
    # so the float -> int cast below cannot fail on NaN.
    .dropna(subset=['date', 'cod_mun'])
)

df_massacres = (
    massacres_stacked
    .assign(
        mun_code=massacres_stacked['cod_mun']
        .astype(float).astype(int).astype(str).str.zfill(5)
    )
    .groupby(['date', 'mun_code'])['qty']
    .sum()
    .reset_index()
)

# Constant code attached to every massacre row
df_massacres['crime_code'] = '05'

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
massacres_dir = temp / 'massacres'
massacres_dir.mkdir(parents=True, exist_ok=True)
df_massacres.to_parquet(massacres_dir / 'massacres.parquet', index=False)

In [19]:
# Data-quality summary for the combined massacres table
record_count = len(df_massacres)
first_date = df_massacres['date'].min()
last_date = df_massacres['date'].max()
municipality_count = df_massacres['mun_code'].nunique()
victim_total = df_massacres['qty'].sum()
null_count = df_massacres.isnull().sum().sum()
negative_count = (df_massacres['qty'] < 0).sum()
duplicate_count = df_massacres.duplicated(subset=['date', 'mun_code']).sum()

report_lines = [
    "MASSACRES - Data Quality Checks:",
    f"  Total records: {record_count:,}",
    f"  Date range: {first_date} to {last_date}",
    f"  Unique municipalities: {municipality_count}",
    f"  Total victims: {victim_total:,}",
    f"  Null values: {null_count}",
    f"  Negative quantities: {negative_count}",
    f"  Duplicates: {duplicate_count}",
]
print("\n".join(report_lines))
print()

MASSACRES - Data Quality Checks:
  Total records: 1,244
  Date range: 1982-08-01 to 2025-12-01
  Unique municipalities: 557
  Total victims: 7,763.99
  Null values: 0
  Negative quantities: 0
  Duplicates: 0

