In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

print("--- Sprint 2: The Factor Factory ---")

try:
    df_master = pd.read_csv('../data/iris_master_dataset_v1.csv')
    print("Master dataset loaded successfully.")
except FileNotFoundError:
    print("ERROR: Master dataset not found.")

factor_cols = [
    'ubigeo', 'g_factor_tasa_paralizacion', 'g_factor_ratio_sobretiempo_promedio',
    'g_factor_ratio_sobrecosto_promedio', 's_factor_total_muertes',
    's_factor_edad_prom_muerte', 's_factor_tasa_prevenibles'
]
df_factors = df_master[factor_cols].copy()

# --- Imputación de Nulos (Sintaxis Corregida) ---
print("\nConsolidating factors and imputing nulls with median values...")
for col in df_factors.columns:
    if col != 'ubigeo':
        median_value = df_factors[col].median()
        # LA CORRECCIÓN ESTÁ AQUÍ: Usamos reasignación en lugar de inplace=True
        df_factors[col] = df_factors[col].fillna(median_value)
        
print("Nulls imputed. Verification of nulls:", df_factors.isna().sum().sum())

--- Sprint 2: The Factor Factory ---
Master dataset loaded successfully.

Consolidating factors and imputing nulls with median values...
Nulls imputed. Verification of nulls: 0


In [4]:
# --- Normalización de Factores (Scaling) ---
scaler = MinMaxScaler()

# Guardamos la columna ubigeo y los nombres de las columnas de factores
ubigeo_col = df_factors['ubigeo']
factor_cols_only = [col for col in df_factors.columns if col != 'ubigeo']

# Creamos un nuevo DataFrame escalado
df_scaled = df_factors.copy()
df_scaled[factor_cols_only] = scaler.fit_transform(df_scaled[factor_cols_only])

print("\nAll factors normalized to a [0, 1] scale.")

# --- Ajuste de Direccionalidad del Riesgo ---
# Actualmente, para 's_factor_edad_prom_muerte', un valor alto (cercano a 1) significa
# una edad de muerte alta, lo cual es MENOS riesgo. Debemos invertirlo.
# Fórmula de inversión: 1 - valor_escalado
df_scaled['s_factor_edad_prom_muerte'] = 1 - df_scaled['s_factor_edad_prom_muerte']

print("Risk directionality adjusted for 's_factor_edad_prom_muerte'.")
print("Now, a higher value means higher risk for all factors.")

display(df_scaled.describe().T)


All factors normalized to a [0, 1] scale.
Risk directionality adjusted for 's_factor_edad_prom_muerte'.
Now, a higher value means higher risk for all factors.


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
g_factor_tasa_paralizacion,2017.0,0.046545,0.067504,0.0,0.0,0.030303,0.063291,1.0
g_factor_ratio_sobretiempo_promedio,2017.0,0.057362,0.06784,0.0,0.03273,0.04437,0.062688,1.0
g_factor_ratio_sobrecosto_promedio,2017.0,0.002985,0.034944,0.0,0.0,0.000191,0.00078,1.0
s_factor_total_muertes,2017.0,0.016258,0.057933,0.0,0.001762,0.003693,0.007793,1.0
s_factor_edad_prom_muerte,2017.0,0.406015,0.101706,0.0,0.349575,0.392862,0.441886,1.0
s_factor_tasa_prevenibles,2017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# --- Construcción del Índice IRIS (v1.0) ---

# 1. Definimos las columnas para cada macro-factor
g_cols = [col for col in df_scaled.columns if 'g_factor' in col]
s_cols = [col for col in df_scaled.columns if 's_factor' in col]

# 2. Calculamos el G-Score y el S-Score como el promedio simple de sus componentes
df_scaled['g_score'] = df_scaled[g_cols].mean(axis=1)
df_scaled['s_score'] = df_scaled[s_cols].mean(axis=1)

# 3. Definimos los pesos para la primera versión del índice
# Empezaremos con una ponderación equitativa.
weights = {'g_score': 0.5, 's_score': 0.5}

# 4. Calculamos el score IRIS final
df_scaled['iris_score'] = (df_scaled['g_score'] * weights['g_score']) + \
                          (df_scaled['s_score'] * weights['s_score'])
                          
print("\nG-Score, S-Score, and final IRIS Score calculated successfully.")

# --- Auditoría Final: Top 10 Distritos con Mayor Riesgo ---
print("\n--- Top 10 Districts with Highest IRIS Risk Score ---")

# Fusionamos los scores con los nombres originales para mayor claridad
df_final_scores = pd.merge(df_master[['ubigeo']], df_scaled, on='ubigeo', how='left')

display(df_final_scores.sort_values(by='iris_score', ascending=False).head(10))


G-Score, S-Score, and final IRIS Score calculated successfully.

--- Top 10 Districts with Highest IRIS Risk Score ---


Unnamed: 0,ubigeo,g_factor_tasa_paralizacion,g_factor_ratio_sobretiempo_promedio,g_factor_ratio_sobrecosto_promedio,s_factor_total_muertes,s_factor_edad_prom_muerte,s_factor_tasa_prevenibles,g_score,s_score,iris_score
1514,LIMA_LIMA_SAN JUAN DE LURIGANCHO,0.018182,0.030002,0.000218,1.0,0.421559,0.0,0.016134,0.473853,0.244993
1738,PIURA_PIURA_LA ARENA,0.008547,1.0,0.0,0.031815,0.426068,0.0,0.336182,0.152628,0.244405
621,CAJAMARCA_CELENDIN_OXAMARCA,0.046512,0.991103,0.00028,0.003456,0.400852,0.0,0.345965,0.134769,0.240367
229,ANCASH_RECUAY_ACCOMARCA,1.0,0.04437,0.0,0.003693,0.392862,0.0,0.348123,0.132185,0.240154
984,HUANCAVELICA_TAYACAJA_DANIEL ALOMIA ROBLES,1.0,0.035255,0.0,0.003693,0.392862,0.0,0.345085,0.132185,0.238635
1606,LORETO_PUTUMAYO_YAGUAS,0.1,0.219295,0.199144,6.8e-05,0.8753,0.0,0.172813,0.291789,0.232301
1708,PIURA_HUANCABAMBA_CANCHAQUE,0.06383,0.968386,0.000658,0.010131,0.345995,0.0,0.344291,0.118709,0.2315
1298,LA LIBERTAD_PACASMAYO_GUADALUPE,0.0,0.904007,0.00012,0.048214,0.375667,0.0,0.301376,0.141294,0.221335
1727,PIURA_PAITA_ARENAL,0.0,0.039513,1.0,0.000712,0.274035,0.0,0.346504,0.091582,0.219043
1612,LORETO_REQUENA_MAQUIA,0.0,0.049818,0.447489,0.001288,0.808522,0.0,0.165769,0.269937,0.217853


In [6]:
# --- Exportar los Scores Finales ---
scores_path = '../data/iris_scores_v1.csv'
df_final_scores.to_csv(scores_path, index=False)

print("\n----------------------------------------------------")
print("✅  SPRINT 2 COMPLETED SUCCESSFULLY!")
print(f"Final scores dataset '{scores_path}' has been exported.")


----------------------------------------------------
✅  SPRINT 2 COMPLETED SUCCESSFULLY!
Final scores dataset '../data/iris_scores_v1.csv' has been exported.
