<img src="https://industrial.uniandes.edu.co/sites/default/files/imagenes/uniandeslogo.png" alt="Universidad de los Andes" style="float: right; width: 300px; height: auto;">

# Cleaning divipola - Municipality codes

Author: Juan Diego Heredia Niño

Email: jd.heredian@uniandes.edu.co

Date: Nov 2025

In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import yaml  # To read YAML configuration files
from pathlib import Path  # For cross-platform file path handling

In [2]:
# Load directory paths from configuration file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale default (e.g. cp1252 on Windows),
# which could corrupt directory names containing non-ASCII characters.
with open('paths.yml', 'r', encoding='utf-8') as file:
    paths = yaml.safe_load(file)  # Read and parse YAML file

# Create Path objects for each directory listed under the 'data' key
raw = Path(paths['data']['raw'])  # Directory with raw data
temp = Path(paths['data']['temp'])  # Directory with temporary processed data
processed = Path(paths['data']['processed'])  # Directory with final processed data

In [None]:
# Read columns A-D of the DIVIPOLA municipality workbook, taking the column
# labels from the row at zero-based position 9.
df_divipola = pd.read_excel(
    raw / 'dane' / 'divipola' / 'DIVIPOLA_Municipios.xlsx',
    header=9,
    usecols="A:D",
)

# Discard rows with any missing cell, then skip the first surviving row
# (presumably a sub-header line under the column titles - verify against the
# workbook) and renumber the index from zero.
df_divipola = df_divipola.dropna().iloc[1:, :].reset_index(drop=True)

# Store every value as text and give the columns short names.
# NOTE(review): the cod_* columns look like codes and dep/mun like names -
# confirm against the workbook layout.
df_divipola = df_divipola.astype('str').rename(
    columns={
        'Departamento': 'cod_dep',
        'Unnamed: 1': 'dep',
        'Municipio': 'cod_mun',
        'Unnamed: 3': 'mun',
    }
)

# Left-pad the code columns with zeros to a fixed width:
# 2 characters for cod_dep and 5 characters for cod_mun.
df_divipola = df_divipola.assign(
    cod_dep=lambda frame: frame['cod_dep'].str.zfill(2),
    cod_mun=lambda frame: frame['cod_mun'].str.zfill(5),
)

# Standardize the two name columns: title-case the text, decompose accented
# characters (NFKD), then round-trip through ASCII while ignoring anything that
# cannot be encoded, which drops the combining accent marks and any other
# non-ASCII character.
for name_col in ['dep', 'mun']:
    titled = df_divipola[name_col].str.title()
    decomposed = titled.str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    df_divipola[name_col] = ascii_bytes.str.decode('utf-8')

# Save the cleaned table (names first, then codes) as a Parquet file in the
# temporary data directory, without writing the DataFrame index.
divipola_out = temp / 'dane' / 'divipola' / 'divipola.parquet'

# Create the destination folder if it is missing; to_parquet does not create
# parent directories, so a fresh checkout or cleaned temp directory would
# otherwise make the write fail.
divipola_out.parent.mkdir(parents=True, exist_ok=True)

df_divipola[['dep','mun','cod_dep','cod_mun']].to_parquet(
    divipola_out,
    index=False
)