<img src="https://industrial.uniandes.edu.co/sites/default/files/imagenes/uniandeslogo.png" alt="Universidad de los Andes" style="float: right; width: 300px; height: auto;">

# Stats for IDB 1st Seminar

Autor: Juan Diego Heredia Niño 

Email: jd.heredian@uniandes.edu.co

Fecha: Octubre 2025

In [1]:
import pandas as pd
import numpy as np
import yaml
from pathlib import Path

In [147]:
# Load the project's folder layout from the YAML configuration file.
with open('paths.yml', 'r') as file:
    paths = yaml.safe_load(file)

# Data folders, taken from the 'data' section of the configuration.
raw, temp, processed = (
    Path(paths['data'][stage]) for stage in ('raw', 'temp', 'processed')
)

# Output folder for exported tables, taken from the 'outputs' section.
tables = Path(paths['outputs']['tables'])

In [None]:
def load_mindef_quarterly(path, date_col, count_col, indicator, weight):
    """Load one event workbook and aggregate it to municipality-quarter totals.

    Parameters
    ----------
    path : path-like
        Excel workbook to read.
    date_col : str
        Name of the datetime column holding the event date (accessed through ``.dt``).
    count_col : str
        Name of the column holding the per-record count; it is renamed to ``indicator``.
    indicator : str
        Output column name for the summed counts.
    weight : float
        Constant multiplied into every record's count to build ``casos_ponderados``.

    Returns
    -------
    pandas.DataFrame
        One row per (cod_mun, año, trimestre) with the summed
        ``casos_ponderados`` and ``indicator`` columns.
    """
    events = pd.read_excel(path)
    events['año'] = events[date_col].dt.year
    events['trimestre'] = events[date_col].dt.quarter
    events['casos_ponderados'] = weight * events[count_col]
    events = events.rename(columns={'COD_MUNI': 'cod_mun', count_col: indicator})
    # groupby drops records whose key (municipality or date) is missing (pandas default).
    return (
        events
        .groupby(['cod_mun', 'año', 'trimestre'])[['casos_ponderados', indicator]]
        .sum()
        .reset_index()
    )


mindef_dir = raw / 'old' / 'mindef'

# NOTE(review): the per-indicator weights below are hard-coded; presumably they are the
# component weights of the composite violence index — confirm their source.
# NOTE: the date and count column names differ between workbooks
# ('FECHA HECHOS' vs 'FECHA HECHO', 'VÍCTIMAS' vs 'VICTIMAS').

# Extortion
df_md_ext = load_mindef_quarterly(mindef_dir / 'EXTORSIÓN.xlsx', 'FECHA HECHOS', 'CANTIDAD', 'extorsion', 0.1031)

# Homicides
df_md_hom = load_mindef_quarterly(mindef_dir / 'HOMICIDIO.xlsx', 'FECHA HECHO', 'VÍCTIMAS', 'homicidios', 0.1704)

# Massacres
df_md_mas = load_mindef_quarterly(mindef_dir / 'MASACRES.xlsx', 'FECHA HECHO', 'VICTIMAS', 'masacres', 0.4484)

# Kidnappings
df_md_sec = load_mindef_quarterly(mindef_dir / 'SECUESTRO.xlsx', 'FECHA HECHO', 'CANTIDAD', 'secuestrados', 0.1435)

# Terrorism
df_md_terr = load_mindef_quarterly(mindef_dir / 'TERRORISMO.xlsx', 'FECHA HECHO', 'CANTIDAD', 'terrorismo', 0.1345)

# Descriptive statistics of the aggregated municipality-quarter counts,
# one row per indicator (row order is the order of this list).
indicator_panels = [
    (df_md_terr, 'terrorismo'),
    (df_md_ext, 'extorsion'),
    (df_md_hom, 'homicidios'),
    (df_md_mas, 'masacres'),
    (df_md_sec, 'secuestrados'),
]

df_1 = (
    pd.concat([panel[[column]].describe().T for panel, column in indicator_panels])
    [['count', 'mean', 'std', 'min', 'max']]
    .rename(columns={
        'count': 'Number of Observations',
        'mean': 'Average',
        'std': 'Standard Deviation',
        'min': 'Minimum',
        'max': 'Maximum'})
    .rename(index={
        'terrorismo': 'Terrorism',
        'extorsion': 'Extortion',
        'homicidios': 'Homicides',
        'masacres': 'Massacres',
        'secuestrados': 'Kidnappings'
    })
    .round(1)
)

Unnamed: 0,cod_mun,año,trimestre,casos_ponderados
0,5001,1996,1,7.2503
1,5001,1996,2,5.5506
2,5001,1996,3,4.8331
3,5001,1996,4,3.4747
4,5001,1997,1,1.435


In [None]:
def flag_atypical_quarters(panel, indicator, n_lags=8):
    """Flag rows whose indicator exceeds the mean (+1 or +2 std) of its previous rows.

    Parameters
    ----------
    panel : pandas.DataFrame
        Frame with ``cod_mun``, ``año``, ``trimestre`` and ``indicator`` columns.
    indicator : str
        Name of the column to evaluate.
    n_lags : int, default 8
        Number of previous rows (within each ``cod_mun``) used as the reference window.

    Returns
    -------
    pandas.DataFrame
        ``panel`` sorted by municipality and time, restricted to rows that have a
        complete window, with two 0/1 columns added:
        ``'atipico 1'`` (value > mean + std) and ``'atipico 2'`` (value > mean + 2*std).
        The temporary lag columns are removed.
    """
    ordered = panel.sort_values(by=['cod_mun', 'año', 'trimestre'])
    lag_cols = [f'{indicator}_{lag}' for lag in range(1, n_lags + 1)]

    # NOTE(review): shift() moves by ROWS, not by calendar quarters. When a municipality
    # has no row for some quarter, the "lag" silently skips that gap (e.g. a 2011 Q2 row
    # taking 2010 Q3 as its first lag). Confirm this is intended, or fill the missing
    # quarters with zeros before computing lags.
    for lag, col in enumerate(lag_cols, start=1):
        ordered[col] = ordered.groupby('cod_mun')[indicator].shift(lag)

    # Rows without a full window have NaN lags and are discarded.
    complete = ordered.dropna()

    window = complete[lag_cols]
    window_mean = window.mean(axis=1)
    window_std = window.std(axis=1)

    return (
        complete
        .assign(**{
            'atipico 1': (complete[indicator] > window_mean + window_std).astype(int),
            'atipico 2': (complete[indicator] > window_mean + 2 * window_std).astype(int),
        })
        .drop(columns=lag_cols)
    )


# NOTE: each frame is rebound to its trimmed version, so re-running this cell without
# re-running the loading cell discards a further window of rows per municipality.
df_md_terr = flag_atypical_quarters(df_md_terr, 'terrorismo')
df_md_ext = flag_atypical_quarters(df_md_ext, 'extorsion')
df_md_hom = flag_atypical_quarters(df_md_hom, 'homicidios')
df_md_mas = flag_atypical_quarters(df_md_mas, 'masacres')
df_md_sec = flag_atypical_quarters(df_md_sec, 'secuestrados')

# Share of flagged municipality-quarters per indicator, expressed in percent.
# NOTE(review): the labels say "Homicide" for every indicator and "Proportion" although
# the values are scaled by 100. They are kept unchanged because the CVAI table (df_3)
# uses the same labels and the final export aligns on them — rename both together.
flagged_panels = [
    ('Terrorism', df_md_terr),
    ('Extortion', df_md_ext),
    ('Homicides', df_md_hom),
    ('Massacres', df_md_mas),
    ('Kidnappings', df_md_sec),
]

df_2 = (
    pd.DataFrame(
        [[panel['atipico 1'].mean(), panel['atipico 2'].mean()] for _, panel in flagged_panels],
        index=[label for label, _ in flagged_panels],
        columns=[
            'Proportion of Atypical Homicide Municipal Quarters (std)',
            'Proportion of Atypical Homicide Municipal Quarters (2x std)',
        ],
    )
    .round(4)
    .mul(100)
)


Unnamed: 0,cod_mun,año,trimestre,casos_ponderados,terrorismo,terrorismo_1,terrorismo_2,terrorismo_3,terrorismo_4,terrorismo_5,terrorismo_6,terrorismo_7,terrorismo_8
0,5001,2010,1,0.538,4,,,,,,,,
1,5001,2010,2,0.4035,3,4.0,,,,,,,
2,5001,2010,3,0.269,2,3.0,4.0,,,,,,
3,5001,2011,2,0.538,4,2.0,3.0,4.0,,,,,
4,5001,2011,3,0.538,4,4.0,2.0,3.0,4.0,,,,


In [None]:
# Load the composite-index panel, keeping only the columns used below.
# NOTE(review): iacv_1..iacv_4 come from the parquet file; presumably they are the
# 1-4 quarter lags of casos_ponderados — confirm against the upstream notebook.
iacv_source_cols = ['año', 'trimestre', 'cod_mun', 'casos_ponderados', 'iacv_1', 'iacv_2', 'iacv_3', 'iacv_4']
df_iacv = (
    pd.read_parquet(temp / 'old' / 'preliminary' / 'df_raw.parquet')[iacv_source_cols]
    .sort_values(by=['cod_mun', 'año', 'trimestre'])
)

# Extend the window with lags 5 to 8, shifted within each municipality.
for lag in range(5, 9):
    df_iacv[f'iacv_{lag}'] = df_iacv.groupby('cod_mun')['casos_ponderados'].shift(lag)

# Keep only rows with no missing values (i.e. with a complete 8-lag window).
df_iacv = df_iacv.dropna()

# Row-wise mean and standard deviation of the 8 lags, computed once.
iacv_lag_cols = [f'iacv_{k}' for k in range(1, 9)]
iacv_window = df_iacv[iacv_lag_cols]
iacv_window_mean = iacv_window.mean(axis=1)
iacv_window_std = iacv_window.std(axis=1)

# 0/1 flags: value above mean + 1 std ('atipico 1') or mean + 2 std ('atipico 2').
df_iacv = (
    df_iacv
    .assign(**{
        'atipico 1': (df_iacv['casos_ponderados'] > iacv_window_mean + iacv_window_std).astype(int),
        'atipico 2': (df_iacv['casos_ponderados'] > iacv_window_mean + 2 * iacv_window_std).astype(int),
    })
    .drop(columns=iacv_lag_cols)
)

# Descriptive statistics of the composite index (single 'CVAI' row).
cvai_stats = (
    df_iacv[['casos_ponderados']]
    .describe()
    .T
    [['count', 'mean', 'std', 'min', 'max']]
    .rename(
        columns={
            'count': 'Number of Observations',
            'mean': 'Average',
            'std': 'Standard Deviation',
            'min': 'Minimum',
            'max': 'Maximum'},
        index={'casos_ponderados': 'CVAI'},
    )
    .round(1)
)

# Share of flagged rows, scaled to percent; labels match the per-indicator table.
cvai_atypical = pd.DataFrame(
    {
        'Proportion of Atypical Homicide Municipal Quarters (std)': [df_iacv['atipico 1'].mean()],
        'Proportion of Atypical Homicide Municipal Quarters (2x std)': [df_iacv['atipico 2'].mean()],
    },
    index=['CVAI'],
).round(4).map(lambda x: x*100)

df_3 = pd.concat([cvai_stats, cvai_atypical], axis=1)

Unnamed: 0,año,trimestre,cod_mun,casos_ponderados,atipico 1,atipico 2
8,2008,1,5001,1.693561,1,1
9,2008,2,5001,1.620068,1,0
10,2008,3,5001,1.900776,1,1
11,2008,4,5001,1.884151,1,0
12,2009,1,5001,1.938437,1,0


In [148]:
# Put the per-indicator descriptive stats (df_1) and atypical shares (df_2) side by side,
# then append the composite-index row (df_3) underneath.
stats_by_indicator = pd.concat([df_1, df_2], axis=1)
idb_stats = pd.concat([stats_by_indicator, df_3])

# to_excel does not create missing folders, so make sure the target exists first.
idb_stats_dir = tables / 'old_idb_stats'
idb_stats_dir.mkdir(parents=True, exist_ok=True)

idb_stats.to_excel(idb_stats_dir / 'idb_stats.xlsx')