## 5. Modelo de Predicción de Performance

Desarrollamos modelos para predecir el rating de performance de los empleados e identificar los factores clave de éxito.

In [None]:
# Performance-rating model: trains several candidate models, compares them,
# plots the evaluation and persists the best one. Runs only when a performance
# target column was registered in `available_targets`.
if 'performance' in available_targets:
    performance_col = available_targets['performance']

    print(" MODELO DE PREDICCIÓN DE PERFORMANCE")
    print("=" * 50)

    # Target variable: coerce to numeric and drop the rows that cannot be parsed
    y_perf = pd.to_numeric(df_ml[performance_col].copy(), errors='coerce')
    y_perf = y_perf.dropna()

    # Restrict X to the rows that still have a target value
    valid_indices = y_perf.index
    X_perf = X.loc[valid_indices].copy()

    print(f" Distribución de Performance Rating:")
    perf_counts = y_perf.value_counts().sort_index()
    for rating, count in perf_counts.items():
        percentage = (count / len(y_perf)) * 100
        print(f" • Rating {rating}: {count:,} empleados ({percentage:.1f}%)")

    print(f"\n Estadísticas de Performance:")
    print(f" • Promedio: {y_perf.mean():.2f}")
    print(f" • Mediana: {y_perf.median():.2f}")
    print(f" • Desviación Estándar: {y_perf.std():.2f}")
    print(f" • Rango: {y_perf.min():.0f} - {y_perf.max():.0f}")

    # Heuristic: few distinct target values -> classification, otherwise regression
    unique_values = y_perf.nunique()
    if unique_values <= 10:
        problem_type = "classification"
        print(f"\n Tipo de problema: CLASIFICACIÓN ({unique_values} clases)")
    else:
        problem_type = "regression"
        print(f"\n Tipo de problema: REGRESIÓN (variable continua)")

    # Train/test split
    X_train_perf, X_test_perf, y_train_perf, y_test_perf = train_test_split(
        X_perf, y_perf, test_size=0.2, random_state=42
    )

    print(f"\n División de datos:")
    print(f" • Entrenamiento: {X_train_perf.shape[0]:,} muestras")
    print(f" • Prueba: {X_test_perf.shape[0]:,} muestras")

    # Feature scaling (fit on the training split only)
    scaler_perf = StandardScaler()
    X_train_perf_scaled = scaler_perf.fit_transform(X_train_perf)
    X_test_perf_scaled = scaler_perf.transform(X_test_perf)

    # Models that are trained and evaluated on the scaled matrices
    scaled_model_names = {'Linear Regression', 'Logistic Regression', 'Neural Network'}

    def evaluate_classification(y_true, y_pred, y_pred_proba=None):
        """Return accuracy and weighted precision/recall/F1 for class predictions.

        `y_pred_proba` is accepted for call-site compatibility but is not used.
        """
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted'),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1': f1_score(y_true, y_pred, average='weighted'),
        }

    def evaluate_regression(y_true, y_pred):
        """Return MSE, MAE, RMSE and R² for continuous predictions."""
        mse = mean_squared_error(y_true, y_pred)
        return {
            'mse': mse,
            'mae': mean_absolute_error(y_true, y_pred),
            'rmse': np.sqrt(mse),
            'r2': r2_score(y_true, y_pred),
        }

    # Candidate models for the detected problem type
    if problem_type == "classification":
        from sklearn.preprocessing import LabelEncoder

        # Classifiers are fit on contiguous 0..K-1 codes: XGBoost rejects label
        # sets such as 1..5. Predictions are decoded back to the original
        # ratings before evaluation, so `y_test_perf` and the stored
        # predictions stay in the original label space.
        label_encoder_perf = LabelEncoder().fit(y_train_perf)
        y_train_fit = label_encoder_perf.transform(y_train_perf)

        models_perf = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss'),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, multi_class='ovr'),
            'Neural Network': MLPClassifier(random_state=42, max_iter=500, hidden_layer_sizes=(100, 50)),
        }
    else:  # regression
        label_encoder_perf = None
        y_train_fit = y_train_perf

        models_perf = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBRegressor(random_state=42),
            'Linear Regression': LinearRegression(),
            'Neural Network': MLPRegressor(random_state=42, max_iter=500, hidden_layer_sizes=(100, 50)),
        }

    # Train and evaluate every candidate
    results_perf = {}

    print(f"\n ENTRENAMIENTO DE MODELOS DE PERFORMANCE:")

    for name, model in models_perf.items():
        print(f"\n Entrenando {name}...")

        if name in scaled_model_names:
            X_fit, X_eval = X_train_perf_scaled, X_test_perf_scaled
        else:
            X_fit, X_eval = X_train_perf, X_test_perf

        model.fit(X_fit, y_train_fit)
        y_pred = model.predict(X_eval)

        if problem_type == "classification":
            # Decode the 0..K-1 codes back to the original rating values
            y_pred = label_encoder_perf.inverse_transform(np.asarray(y_pred).astype(int))

            # Class probabilities are optional; columns follow label_encoder_perf.classes_
            try:
                y_pred_proba = model.predict_proba(X_eval)
            except (AttributeError, NotImplementedError):
                y_pred_proba = None

            metrics = evaluate_classification(y_test_perf, y_pred, y_pred_proba)

            results_perf[name] = {
                'model': model,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                **metrics
            }

            print(f" Accuracy: {metrics['accuracy']:.4f}")
            print(f" F1-Score: {metrics['f1']:.4f}")

        else:  # regression
            metrics = evaluate_regression(y_test_perf, y_pred)

            results_perf[name] = {
                'model': model,
                'predictions': y_pred,
                **metrics
            }

            print(f" R²: {metrics['r2']:.4f}")
            print(f" RMSE: {metrics['rmse']:.4f}")
            print(f" MAE: {metrics['mae']:.4f}")

    # Model comparison table, best model first
    print(f"\n COMPARACIÓN DE MODELOS DE PERFORMANCE:")

    model_names = list(results_perf)
    if problem_type == "classification":
        comparison_perf = pd.DataFrame({
            'Modelo': model_names,
            'Accuracy': [results_perf[m]['accuracy'] for m in model_names],
            'Precision': [results_perf[m]['precision'] for m in model_names],
            'Recall': [results_perf[m]['recall'] for m in model_names],
            'F1-Score': [results_perf[m]['f1'] for m in model_names]
        }).round(4)

        comparison_perf = comparison_perf.sort_values('F1-Score', ascending=False)
        best_metric = 'F1-Score'

    else:  # regression
        comparison_perf = pd.DataFrame({
            'Modelo': model_names,
            'R²': [results_perf[m]['r2'] for m in model_names],
            'RMSE': [results_perf[m]['rmse'] for m in model_names],
            'MAE': [results_perf[m]['mae'] for m in model_names],
            'MSE': [results_perf[m]['mse'] for m in model_names]
        }).round(4)

        comparison_perf = comparison_perf.sort_values('R²', ascending=False)
        best_metric = 'R²'

    display(comparison_perf)

    # Best model = first row of the sorted comparison
    best_model_name_perf = comparison_perf.iloc[0]['Modelo']
    best_model_perf = results_perf[best_model_name_perf]['model']

    print(f"\n MEJOR MODELO: {best_model_name_perf}")
    print(f" {best_metric}: {comparison_perf.iloc[0][best_metric]:.4f}")

    # Evaluation figure (2x2 grid)
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f' Evaluación del Modelo de Predicción de Performance ({problem_type.title()})',
                 fontsize=16, fontweight='bold')

    # 1. Headline metric per model
    metric_to_plot = best_metric
    x_pos = np.arange(len(comparison_perf))

    axes[0, 0].bar(x_pos, comparison_perf[metric_to_plot], alpha=0.8,
                   color=plt.cm.viridis(np.linspace(0, 1, len(comparison_perf))))
    axes[0, 0].set_title(f'Comparación de {metric_to_plot}')
    axes[0, 0].set_xticks(x_pos)
    axes[0, 0].set_xticklabels(comparison_perf['Modelo'], rotation=45)
    axes[0, 0].set_ylabel(metric_to_plot)

    # Value labels on top of the bars
    for j, v in enumerate(comparison_perf[metric_to_plot]):
        axes[0, 0].text(j, v + abs(v)*0.01, f'{v:.3f}', ha='center', va='bottom', fontweight='bold')

    # 2. Predictions vs. actual values for the best model
    best_predictions = results_perf[best_model_name_perf]['predictions']

    if problem_type == "classification":
        # Confusion matrix
        cm = confusion_matrix(y_test_perf, best_predictions)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 1])
        axes[0, 1].set_title('Matriz de Confusión')
        axes[0, 1].set_xlabel('Predicción')
        axes[0, 1].set_ylabel('Real')

    else:
        # Scatter plot with the identity line as reference
        axes[0, 1].scatter(y_test_perf, best_predictions, alpha=0.6)
        axes[0, 1].plot([y_test_perf.min(), y_test_perf.max()],
                        [y_test_perf.min(), y_test_perf.max()], 'r--', lw=2)
        axes[0, 1].set_xlabel('Performance Real')
        axes[0, 1].set_ylabel('Performance Predicha')
        axes[0, 1].set_title('Predicciones vs Reales')
        axes[0, 1].grid(True, alpha=0.3)

    # 3. Error distribution (bottom row)
    if problem_type == "regression":
        residuals = y_test_perf - best_predictions
        axes[1, 0].hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[1, 0].set_title('Distribución de Residuales')
        axes[1, 0].set_xlabel('Error (Real - Predicho)')
        axes[1, 0].set_ylabel('Frecuencia')
        axes[1, 0].axvline(0, color='red', linestyle='--', alpha=0.8)

        # Q-Q plot of the residuals against a normal distribution
        from scipy import stats
        stats.probplot(residuals, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Q-Q Plot de Residuales')

    else:
        # Predicted-class distribution per true class. Only two panels are
        # available, so only the first two classes are drawn (one per panel);
        # drawing more would overwrite the second panel.
        unique_classes = sorted(y_test_perf.unique())
        for ax, cls in zip((axes[1, 0], axes[1, 1]), unique_classes):
            class_mask = (y_test_perf == cls).to_numpy()
            class_predictions = best_predictions[class_mask]

            pred_counts = pd.Series(class_predictions).value_counts().sort_index()
            ax.bar(pred_counts.index, pred_counts.values, alpha=0.7)
            ax.set_title(f'Predicciones para Clase Real {cls}')
            ax.set_xlabel('Clase Predicha')
            ax.set_ylabel('Frecuencia')

    plt.tight_layout()
    plt.show()

    # Feature importance of the best model (tree-based models only)
    if hasattr(best_model_perf, 'feature_importances_'):
        print(f"\n IMPORTANCIA DE FEATURES ({best_model_name_perf}):")

        feature_importance = pd.DataFrame({
            'feature': X_perf.columns,
            'importance': best_model_perf.feature_importances_
        }).sort_values('importance', ascending=False)

        # Top 15 most important features
        top_features = feature_importance.head(15)
        display(top_features)

        plt.figure(figsize=(10, 8))
        plt.barh(range(len(top_features)), top_features['importance'], alpha=0.8)
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Importancia')
        plt.title(f'Top 15 Features Más Importantes - {best_model_name_perf}')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

    # Persist the best model together with everything needed to use it
    model_path = '../artifacts/models/'
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(best_model_perf, f'{model_path}best_performance_model.pkl')
    joblib.dump(scaler_perf, f'{model_path}performance_scaler.pkl')
    if label_encoder_perf is not None:
        # Needed to map the classifier's 0..K-1 outputs back to rating values
        joblib.dump(label_encoder_perf, f'{model_path}performance_label_encoder.pkl')

    # Model metadata
    model_metadata = {
        'model_type': problem_type,
        'best_model': best_model_name_perf,
        'performance_metric': best_metric,
        'performance_value': float(comparison_perf.iloc[0][best_metric]),
        'features_used': list(X_perf.columns),
        'target_column': performance_col,
        'training_samples': len(X_train_perf),
        'test_samples': len(X_test_perf),
        # Whether inputs must go through performance_scaler.pkl before predict()
        'requires_scaling': best_model_name_perf in scaled_model_names,
        # Original rating value for each encoded class (None for regression)
        'class_labels': (
            [float(c) for c in label_encoder_perf.classes_]
            if label_encoder_perf is not None else None
        ),
    }

    import json
    with open(f'{model_path}performance_model_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(model_metadata, f, indent=2)

    print(f"\n Modelo de performance guardado en: {model_path}best_performance_model.pkl")
    print(f" Metadatos guardados en: {model_path}performance_model_metadata.json")

else:
    print("️ Variable objetivo de performance no encontrada en el dataset")
    print(" Variables disponibles:", list(df_ml.columns))

## 6. Análisis de Clustering y Segmentación

Implementamos algoritmos de clustering para identificar grupos naturales de empleados y patrones ocultos en los datos.

In [None]:
print(" ANÁLISIS DE CLUSTERING Y SEGMENTACIÓN")
print("=" * 50)

# Clustering input: numeric features only
X_clustering = X.select_dtypes(include=[np.number]).copy()

# Columns without a single observed value have no median to impute with;
# drop them so no NaN reaches the scaler / PCA.
X_clustering = X_clustering.dropna(axis=1, how='all')

# Impute the remaining missing values with the column median
X_clustering = X_clustering.fillna(X_clustering.median())

# Standardise the features before clustering
scaler_clustering = StandardScaler()
X_clustering_scaled = scaler_clustering.fit_transform(X_clustering)

print(f" Datos para clustering:")
print(f" • Muestras: {X_clustering_scaled.shape[0]:,}")
print(f" • Features: {X_clustering_scaled.shape[1]}")

# Dimensionality reduction (full PCA; truncated below)
print(f"\n Reducción de dimensionalidad con PCA:")
pca = PCA()
X_pca = pca.fit_transform(X_clustering_scaled)

# Number of leading components needed to reach 80% / 95% cumulative variance
cumvar = np.cumsum(pca.explained_variance_ratio_)
n_components_95 = int(np.flatnonzero(cumvar >= 0.95)[0]) + 1
n_components_80 = int(np.flatnonzero(cumvar >= 0.80)[0]) + 1

print(f" • Componentes para 80% varianza: {n_components_80}")
print(f" • Componentes para 95% varianza: {n_components_95}")

# Clustering runs on the components that explain 80% of the variance
X_pca_reduced = X_pca[:, :n_components_80]

# 1. CHOOSING THE NUMBER OF CLUSTERS
print(f"\n DETERMINACIÓN DEL NÚMERO ÓPTIMO DE CLUSTERS:")

from sklearn.metrics import silhouette_score

# Upper bound: 15 clusters or ~1% of the samples, but never below 2 so that
# small datasets still get a non-empty search range.
max_k = max(2, min(15, len(X_clustering_scaled) // 100))
k_range = range(2, max_k + 1)
inertias = []
silhouette_scores = []

print(f" Evaluando de 2 a {max_k} clusters...")

# Silhouette is quadratic in the number of samples, so large datasets are
# scored on a fixed subsample. It is drawn once, with a seed, so every k is
# compared on the same points and reruns are reproducible.
if len(X_pca_reduced) > 10000:
    sample_size = 5000
    sample_indices = np.random.default_rng(42).choice(
        len(X_pca_reduced), sample_size, replace=False
    )
    X_sample = X_pca_reduced[sample_indices]
else:
    X_sample = None

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca_reduced)
    inertias.append(kmeans.inertia_)

    if X_sample is not None:
        sil_score = silhouette_score(X_sample, kmeans.predict(X_sample))
    else:
        sil_score = silhouette_score(X_pca_reduced, kmeans.labels_)

    silhouette_scores.append(sil_score)

    if k <= 10:  # Only print the first few values of k
        print(f" k={k}: Inertia={kmeans.inertia_:.0f}, Silhouette={sil_score:.3f}")

# Best k according to the silhouette score
best_sil_idx = int(np.argmax(silhouette_scores))
optimal_k_silhouette = k_range[best_sil_idx]

# Elbow method: k with the largest second difference of the inertia curve.
# second_diff[i] is centred on inertias[i + 1], which belongs to k_range[i + 1].
if len(inertias) >= 3:
    second_diff = np.diff(inertias, n=2)
    elbow_idx = int(k_range[int(np.argmax(second_diff)) + 1])
else:
    # Too few points to estimate curvature; fall back to the silhouette choice
    elbow_idx = int(optimal_k_silhouette)

print(f"\n NÚMERO ÓPTIMO DE CLUSTERS:")
print(f" • Método del Codo: {elbow_idx} clusters")
print(f" • Mejor Silhouette: {optimal_k_silhouette} clusters (score: {max(silhouette_scores):.3f})")

# The silhouette-based choice is the one used downstream
optimal_k = optimal_k_silhouette

# 2. APPLY DIFFERENT CLUSTERING ALGORITHMS
print(f"\n APLICANDO ALGORITMOS DE CLUSTERING:")

# NOTE(review): scikit-learn's KMeans appears to default to init='k-means++',
# in which case the two K-Means entries are the same configuration - confirm
# whether 'K-Means' was meant to use init='random'.
clustering_algorithms = {
    'K-Means': KMeans(n_clusters=optimal_k, random_state=42, n_init=10),
    'K-Means++': KMeans(n_clusters=optimal_k, random_state=42, init='k-means++', n_init=10),
    'Agglomerative': AgglomerativeClustering(n_clusters=optimal_k)
}

# DBSCAN with a data-driven eps (skipped for very large datasets)
from sklearn.neighbors import NearestNeighbors
if len(X_pca_reduced) <= 20000:
    neighbors = NearestNeighbors(n_neighbors=4)
    neighbors_fit = neighbors.fit(X_pca_reduced)
    distances, indices = neighbors_fit.kneighbors(X_pca_reduced)

    # eps = 90th percentile of the distance in the last neighbour column
    # (np.percentile does not need the values to be pre-sorted)
    eps = np.percentile(distances[:, 3], 90)
    clustering_algorithms['DBSCAN'] = DBSCAN(eps=eps, min_samples=4)

clustering_results = {}

for name, algorithm in clustering_algorithms.items():
    print(f"\n Aplicando {name}...")

    try:
        labels = algorithm.fit_predict(X_pca_reduced)

        if name == 'DBSCAN':
            # Label -1 marks noise and does not count as a cluster
            n_noise = int(np.sum(labels == -1))
            n_clusters = len(set(labels)) - (1 if n_noise else 0)
        else:
            n_noise = None
            n_clusters = len(set(labels))

        # Silhouette needs at least two clusters. Skip the algorithm instead
        # of recording a score left over from a previous iteration.
        if n_clusters < 2:
            print(f" ️ {name}: Solo {n_clusters} clusters encontrados")
            continue

        if name == 'DBSCAN':
            print(f" Clusters encontrados: {n_clusters}")
            print(f" Puntos de ruido: {n_noise}")
        else:
            print(f" Clusters: {n_clusters}")

        # Quality metric (for DBSCAN the noise label takes part as its own group)
        sil_score = silhouette_score(X_pca_reduced, labels)
        print(f" Silhouette Score: {sil_score:.3f}")

        clustering_results[name] = {
            'algorithm': algorithm,
            'labels': labels,
            'n_clusters': n_clusters,
            'silhouette': sil_score
        }

    except Exception as e:
        # Best effort: one failing algorithm must not abort the comparison
        print(f" Error en {name}: {str(e)}")
# 3. SELECT THE BEST ALGORITHM (highest silhouette score)
if clustering_results:
    best_algorithm = max(clustering_results,
                         key=lambda algo: clustering_results[algo]['silhouette'])

    best_labels = clustering_results[best_algorithm]['labels']
    best_silhouette = clustering_results[best_algorithm]['silhouette']

    print(f"\n MEJOR ALGORITMO: {best_algorithm}")
    print(f" Silhouette Score: {best_silhouette:.3f}")
    print(f" Número de clusters: {clustering_results[best_algorithm]['n_clusters']}")

    # 4. CLUSTER VISUALISATION
    print(f"\n VISUALIZACIÓN DE CLUSTERS:")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f' Análisis de Clustering - {best_algorithm}', fontsize=16, fontweight='bold')

    # 4.1 Elbow curve
    axes[0, 0].plot(k_range, inertias, 'bo-', linewidth=2, markersize=6)
    axes[0, 0].axvline(x=elbow_idx, color='red', linestyle='--', alpha=0.8, label=f'Óptimo: {elbow_idx}')
    axes[0, 0].set_xlabel('Número de Clusters (k)')
    axes[0, 0].set_ylabel('Inertia (WCSS)')
    axes[0, 0].set_title('Método del Codo')
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].legend()

    # 4.2 Silhouette scores per k
    axes[0, 1].plot(k_range, silhouette_scores, 'go-', linewidth=2, markersize=6)
    axes[0, 1].axvline(x=optimal_k_silhouette, color='red', linestyle='--', alpha=0.8,
                       label=f'Óptimo: {optimal_k_silhouette}')
    axes[0, 1].set_xlabel('Número de Clusters (k)')
    axes[0, 1].set_ylabel('Silhouette Score')
    axes[0, 1].set_title('Análisis de Silhouette')
    axes[0, 1].grid(True, alpha=0.3)
    axes[0, 1].legend()

    # 4.3 Clusters projected on the first two principal components
    pca_2d = PCA(n_components=2)
    X_pca_2d = pca_2d.fit_transform(X_clustering_scaled)

    # One colour per label, keyed by label, so the scatter plot and the bar
    # chart below always agree. Noise (-1, DBSCAN only) is drawn in black.
    unique_labels = sorted(set(best_labels))
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
    label_colors = dict(zip(unique_labels, colors))
    if -1 in label_colors:
        label_colors[-1] = (0.0, 0.0, 0.0, 1.0)

    for label in unique_labels:
        is_noise = label == -1
        mask = best_labels == label
        axes[1, 0].scatter(
            X_pca_2d[mask, 0], X_pca_2d[mask, 1],
            c=[label_colors[label]],
            marker='x' if is_noise else 'o',
            alpha=0.5 if is_noise else 0.7,
            s=30,
            label='Ruido' if is_noise else f'Cluster {label}',
        )

    axes[1, 0].set_xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]:.1%} varianza)')
    axes[1, 0].set_ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]:.1%} varianza)')
    axes[1, 0].set_title('Clusters en Espacio PCA (2D)')
    axes[1, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[1, 0].grid(True, alpha=0.3)

    # 4.4 Cluster sizes
    cluster_sizes = pd.Series(best_labels).value_counts().sort_index()

    bars = axes[1, 1].bar(range(len(cluster_sizes)), cluster_sizes.values,
                          alpha=0.8,
                          color=[label_colors[lbl] for lbl in cluster_sizes.index])
    axes[1, 1].set_xlabel('Cluster')
    axes[1, 1].set_ylabel('Número de Empleados')
    axes[1, 1].set_title('Distribución de Tamaños de Clusters')
    axes[1, 1].set_xticks(range(len(cluster_sizes)))
    axes[1, 1].set_xticklabels([f'C{i}' if i >= 0 else 'Ruido' for i in cluster_sizes.index])

    # Value labels on top of the bars
    for bar, value in zip(bars, cluster_sizes.values):
        height = bar.get_height()
        axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                        f'{value:,}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # 5. CLUSTER CHARACTERISATION
    print(f"\n CARACTERIZACIÓN DE CLUSTERS:")

    # Attach the cluster labels to the original dataset.
    # NOTE(review): assumes df_ml has the same rows, in the same order, as X - confirm.
    df_clustered = df_ml.copy()
    df_clustered['cluster'] = best_labels

    cluster_analysis = {}

    for cluster_id in sorted(set(best_labels)):
        if cluster_id == -1:  # Skip DBSCAN noise
            continue

        cluster_mask = best_labels == cluster_id
        cluster_data = df_clustered[cluster_mask]

        print(f"\n CLUSTER {cluster_id} ({cluster_mask.sum():,} empleados):")

        cluster_stats = {}

        # Numeric summaries (age under any of its known names, plus salary)
        for col in ['Age', 'age', 'edad'] + [available_targets.get('salary', '')]:
            if col in cluster_data.columns and not cluster_data[col].isnull().all():
                mean_val = cluster_data[col].mean()
                median_val = cluster_data[col].median()
                cluster_stats[col] = {'mean': mean_val, 'median': median_val}
                print(f" • {col}: μ={mean_val:.1f}, mediana={median_val:.1f}")

        # Most frequent value of the categorical columns
        for col in ['Department', 'departamento', 'Gender', 'genero']:
            if col in cluster_data.columns:
                mode_val = cluster_data[col].mode()
                if not mode_val.empty:
                    mode_pct = (cluster_data[col] == mode_val.iloc[0]).mean() * 100
                    cluster_stats[col] = {'mode': mode_val.iloc[0], 'mode_pct': mode_pct}
                    print(f" • {col}: {mode_val.iloc[0]} ({mode_pct:.1f}%)")

        cluster_analysis[cluster_id] = cluster_stats

    # 6. SAVE RESULTS
    print(f"\n GUARDANDO RESULTADOS DE CLUSTERING:")

    model_path = '../artifacts/models/'
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(clustering_results[best_algorithm]['algorithm'], f'{model_path}best_clustering_model.pkl')
    joblib.dump(scaler_clustering, f'{model_path}clustering_scaler.pkl')
    # Full PCA; only the first `pca_components_used` components fed the clustering
    joblib.dump(pca, f'{model_path}clustering_pca.pkl')

    # Cluster labels keyed by the index of the clustered rows, so the file can
    # be joined back to the source data even when that index is not 0..n-1
    cluster_df = pd.DataFrame({
        'index': X_clustering.index,
        'cluster': best_labels
    })
    cluster_df.to_csv(f'{model_path}cluster_labels.csv', index=False)

    # Metadata
    clustering_metadata = {
        'best_algorithm': best_algorithm,
        'n_clusters': clustering_results[best_algorithm]['n_clusters'],
        'silhouette_score': float(best_silhouette),
        'optimal_k_elbow': int(elbow_idx),
        'optimal_k_silhouette': int(optimal_k_silhouette),
        'pca_components_used': int(n_components_80),
        'variance_explained': float(cumvar[n_components_80-1]),
        'features_used': list(X_clustering.columns),
        'cluster_sizes': {int(k): int(v) for k, v in cluster_sizes.items()}
    }

    import json
    with open(f'{model_path}clustering_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(clustering_metadata, f, indent=2)

    print(f" Modelo guardado: {model_path}best_clustering_model.pkl")
    print(f" Etiquetas guardadas: {model_path}cluster_labels.csv")
    print(f" Metadatos guardados: {model_path}clustering_metadata.json")

else:
    print(" No se pudieron generar clusters válidos")

## 7. Sistema de Recomendaciones y Optimización

Desarrollamos sistemas inteligentes para recomendaciones salariales, asignación de roles y optimización de decisiones de RR.HH.

In [None]:
print(" SISTEMA DE RECOMENDACIONES Y OPTIMIZACIÓN")
print("=" * 50)

# 1. SALARY RECOMMENDATION SYSTEM
if 'salary' in available_targets:
    salary_col = available_targets['salary']

    print(f"\n SISTEMA DE RECOMENDACIÓN SALARIAL:")

    from sklearn.neighbors import NearestNeighbors

    # Employees with a known salary
    salary_data = df_ml.dropna(subset=[salary_col])
    X_salary = X.loc[salary_data.index]
    y_salary = salary_data[salary_col]

    print(f" Datos disponibles: {len(X_salary):,} empleados con información salarial")

    # Nearest-neighbour index over the standardised features
    scaler_salary = StandardScaler()
    X_salary_scaled = scaler_salary.fit_transform(X_salary)

    nn_salary = NearestNeighbors(n_neighbors=10, metric='euclidean')
    nn_salary.fit(X_salary_scaled)

    def recommend_salary(employee_features, method='percentile', exclude_self=False):
        """
        Recommend a salary for an employee based on the most similar employees.

        Parameters:
        - employee_features: 1-D array with the employee's feature values, in
          the column order of X_salary
        - method: 'percentile', 'mean'; any other value uses the median
        - exclude_self: set to True when the employee is one of the rows the
          neighbour index was fitted on, so their own salary (the closest
          match, at distance 0) does not leak into the recommendation

        Returns:
        - dict with the recommended salary, the suggested range and statistics
          of the neighbours used
        """
        # Scale with named columns, matching how the scaler was fitted
        employee_frame = pd.DataFrame([employee_features], columns=X_salary.columns)
        employee_scaled = scaler_salary.transform(employee_frame)

        # Ask for one extra neighbour when the closest one will be discarded;
        # never ask for more neighbours than there are fitted samples.
        n_query = nn_salary.n_neighbors + (1 if exclude_self else 0)
        n_query = min(n_query, len(y_salary))
        distances, indices = nn_salary.kneighbors(employee_scaled, n_neighbors=n_query)
        neighbor_distances, neighbor_indices = distances[0], indices[0]
        if exclude_self and len(neighbor_indices) > 1:
            neighbor_distances = neighbor_distances[1:]
            neighbor_indices = neighbor_indices[1:]

        similar_salaries = y_salary.iloc[neighbor_indices]

        if method == 'percentile':
            # Recommend the 60th percentile, with the 50th-75th as range
            recommended_min = similar_salaries.quantile(0.5)
            recommended_max = similar_salaries.quantile(0.75)
            recommended_salary = similar_salaries.quantile(0.6)
        elif method == 'mean':
            recommended_salary = similar_salaries.mean()
            recommended_min = recommended_salary * 0.9
            recommended_max = recommended_salary * 1.1
        else:  # median
            recommended_salary = similar_salaries.median()
            recommended_min = similar_salaries.quantile(0.25)
            recommended_max = similar_salaries.quantile(0.75)

        return {
            'recommended_salary': recommended_salary,
            'salary_range_min': recommended_min,
            'salary_range_max': recommended_max,
            'similar_employees': len(similar_salaries),
            'similar_salaries_mean': similar_salaries.mean(),
            'similar_salaries_std': similar_salaries.std(),
            # Smaller average distance to the neighbours -> higher confidence
            'confidence_score': 1 / (1 + neighbor_distances.mean())
        }

    # Example recommendation for a random employee
    if len(X_salary) > 0:
        sample_idx = np.random.choice(len(X_salary))
        sample_features = X_salary.iloc[sample_idx].values
        actual_salary = y_salary.iloc[sample_idx]

        # The sampled employee is part of the fitted index
        recommendation = recommend_salary(sample_features, exclude_self=True)

        print(f"\n EJEMPLO DE RECOMENDACIÓN:")
        print(f" • Salario actual: ₹{actual_salary:,.0f}")
        print(f" • Salario recomendado: ₹{recommendation['recommended_salary']:,.0f}")
        print(f" • Rango sugerido: ₹{recommendation['salary_range_min']:,.0f} - ₹{recommendation['salary_range_max']:,.0f}")
        print(f" • Empleados similares: {recommendation['similar_employees']}")
        print(f" • Confianza: {recommendation['confidence_score']:.3f}")

        # Salary gap analysis
        salary_gap = actual_salary - recommendation['recommended_salary']
        gap_percentage = (salary_gap / recommendation['recommended_salary']) * 100

        if abs(gap_percentage) > 15:
            status = "️ REVISAR" if gap_percentage > 15 else " SUBVALORADO"
            print(f" • Análisis: {status} (diferencia: {gap_percentage:+.1f}%)")
        else:
            print(f" • Análisis: APROPIADO (diferencia: {gap_percentage:+.1f}%)")

# 2. ROLE / DEPARTMENT RECOMMENDATION SYSTEM
print(f"\n SISTEMA DE RECOMENDACIÓN DE ROLES:")

# First department column that exists in the dataset, if any
dept_cols = ['Department', 'departamento', 'department']
dept_col = next((col for col in dept_cols if col in df_ml.columns), None)

if dept_col:
    print(f" Analizando asignación departamental desde: {dept_col}")

    dept_data = df_ml.dropna(subset=[dept_col])
    X_dept = X.loc[dept_data.index]
    y_dept = dept_data[dept_col]

    from sklearn.ensemble import RandomForestClassifier

    # A stratified split needs at least two rows per department; otherwise
    # fall back to a plain random split instead of failing.
    stratify_labels = y_dept if y_dept.value_counts().min() >= 2 else None
    X_train_dept, X_test_dept, y_train_dept, y_test_dept = train_test_split(
        X_dept, y_dept, test_size=0.2, random_state=42, stratify=stratify_labels
    )

    dept_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    dept_classifier.fit(X_train_dept, y_train_dept)

    dept_accuracy = dept_classifier.score(X_test_dept, y_test_dept)
    print(f" Precisión del modelo de asignación: {dept_accuracy:.3f}")

    def recommend_department(employee_features, top_n=3):
        """
        Recommend departments for an employee.

        Parameters:
        - employee_features: 1-D array with the employee's feature values, in
          the column order of X_dept
        - top_n: number of departments to return

        Returns:
        - list of (department, probability) tuples, most likely first
        """
        employee_frame = pd.DataFrame([employee_features], columns=X_dept.columns)
        probabilities = dept_classifier.predict_proba(employee_frame)[0]
        departments = dept_classifier.classes_

        dept_probs = sorted(zip(departments, probabilities),
                            key=lambda pair: pair[1], reverse=True)
        return dept_probs[:top_n]

    # Example department recommendation for a random employee
    if len(X_dept) > 0:
        sample_idx = np.random.choice(len(X_dept))
        sample_features = X_dept.iloc[sample_idx].values
        actual_dept = y_dept.iloc[sample_idx]

        recommendations = recommend_department(sample_features)

        print(f"\n EJEMPLO DE RECOMENDACIÓN DEPARTAMENTAL:")
        print(f" • Departamento actual: {actual_dept}")
        print(f" • Recomendaciones:")

        for i, (dept, prob) in enumerate(recommendations, 1):
            indicator = "" if dept == actual_dept else ""
            print(f" {i}. {indicator} {dept}: {prob:.3f} ({prob*100:.1f}%)")

# 3. HR DECISION OPTIMISATION
print(f"\n OPTIMIZACIÓN DE DECISIONES DE RR.HH.:")


def calculate_employee_score(employee_data):
    """
    Compute an overall employee score from several factors.

    Parameters:
    - employee_data: one row of df_ml (a Series indexed by column name)

    Returns:
    - dict with 'scores' (individual scores), 'integral_score' (weighted
      average of the available scores), 'priority' (text label) and
      'recommendations' (list of text recommendations)
    """
    scores = {}
    recommendations = []

    # Retention score, from the attrition model when an earlier cell trained one
    if 'attrition' in available_targets and 'results_attr' in globals():
        try:
            features = employee_data[ml_features].values.reshape(1, -1)
            if best_model_name in ['Logistic Regression', 'Neural Network']:
                risk_prob = results_attr[best_model_name]['model'].predict_proba(
                    scaler_attr.transform(features)
                )[0][1]
            else:
                risk_prob = results_attr[best_model_name]['model'].predict_proba(features)[0][1]

            retention_score = 1 - risk_prob  # Inverted: high = likely to stay
            scores['retention_risk'] = risk_prob
            scores['retention_score'] = retention_score

            if risk_prob > 0.7:
                recommendations.append(" ALTO RIESGO DE RENUNCIA - Intervención inmediata")
            elif risk_prob > 0.4:
                recommendations.append("️ RIESGO MODERADO - Monitoreo y acciones preventivas")

        except Exception:
            # Best effort: stay neutral when the attrition model cannot be applied
            scores['retention_score'] = 0.5

    # Performance score, min-max normalised to 0-1 over the whole dataset
    if 'performance' in available_targets:
        perf_col = available_targets['performance']
        if perf_col in employee_data.index:
            performance_raw = pd.to_numeric(employee_data[perf_col], errors='coerce')
            if pd.notna(performance_raw):
                perf_values = pd.to_numeric(df_ml[perf_col], errors='coerce')
                perf_min = perf_values.min()
                perf_range = perf_values.max() - perf_min
                # When every rating is identical there is nothing to
                # normalise against, so the score is left out.
                if pd.notna(perf_range) and perf_range > 0:
                    performance_score = (performance_raw - perf_min) / perf_range
                    scores['performance_score'] = performance_score

                    if performance_score > 0.8:
                        recommendations.append("⭐ ALTO RENDIMIENTO - Considerar promoción/reconocimiento")
                    elif performance_score < 0.3:
                        recommendations.append(" BAJO RENDIMIENTO - Plan de desarrollo requerido")

    # Salary score (equity against similar employees)
    if 'salary' in available_targets:
        salary_col = available_targets['salary']
        if salary_col in employee_data.index:
            current_salary = employee_data[salary_col]
            if pd.notna(current_salary):
                try:
                    features = employee_data[ml_features].values
                    # Rows with a salary are part of the fitted neighbour index
                    salary_rec = recommend_salary(features, exclude_self=True)

                    salary_ratio = current_salary / salary_rec['recommended_salary']
                    scores['salary_equity'] = min(salary_ratio, 2.0)  # Capped at 2.0

                    if salary_ratio < 0.8:
                        recommendations.append(" SALARIO BAJO - Considerar ajuste salarial")
                    elif salary_ratio > 1.3:
                        recommendations.append(" SALARIO ALTO - Revisar justificación")

                except Exception:
                    # Best effort: stay neutral when no recommendation is available
                    scores['salary_equity'] = 1.0

    # Overall score: weighted average over whichever scores are available
    score_weights = {
        'retention_score': 0.4,
        'performance_score': 0.4,
        'salary_equity': 0.2
    }

    available_weights = {
        score_name: weight
        for score_name, weight in score_weights.items()
        if score_name in scores
    }
    total_weight = sum(available_weights.values())
    total_score = sum(scores[score_name] * weight
                      for score_name, weight in available_weights.items())

    integral_score = total_score / total_weight if total_weight > 0 else 0.5
    scores['integral_score'] = integral_score

    # Overall label
    if integral_score > 0.8:
        priority = " EMPLEADO ESTRELLA"
    elif integral_score > 0.6:
        priority = " EMPLEADO VALIOSO"
    elif integral_score > 0.4:
        priority = " EMPLEADO PROMEDIO"
    else:
        priority = "️ EMPLEADO EN RIESGO"

    return {
        'scores': scores,
        'integral_score': integral_score,
        'priority': priority,
        'recommendations': recommendations
    }


# Example scoring of a random employee
if len(df_ml) > 0:
    sample_employee = df_ml.iloc[np.random.choice(len(df_ml))]
    employee_analysis = calculate_employee_score(sample_employee)

    print(f"\n EJEMPLO DE ANÁLISIS INTEGRAL:")
    print(f" • Score Integral: {employee_analysis['integral_score']:.3f}")
    print(f" • Prioridad: {employee_analysis['priority']}")

    print(f"\n Scores Detallados:")
    for score_name, score_value in employee_analysis['scores'].items():
        print(f" • {score_name}: {score_value:.3f}")

    if employee_analysis['recommendations']:
        print(f"\n Recomendaciones:")
        for rec in employee_analysis['recommendations']:
            print(f" • {rec}")

# 4. SAVE THE RECOMMENDATION SYSTEM
print(f"\n GUARDANDO SISTEMA DE RECOMENDACIONES:")

model_path = '../artifacts/models/'
os.makedirs(model_path, exist_ok=True)

if 'salary' in available_targets:
    joblib.dump(nn_salary, f'{model_path}salary_recommender.pkl')
    joblib.dump(scaler_salary, f'{model_path}salary_scaler.pkl')
    # The neighbour index only returns row positions; the salaries they point
    # to are needed to turn neighbours into a recommendation after reloading.
    joblib.dump(y_salary, f'{model_path}salary_targets.pkl')
    print(f" Sistema de recomendación salarial guardado")

if dept_col:
    joblib.dump(dept_classifier, f'{model_path}department_recommender.pkl')
    print(f" Sistema de recomendación departamental guardado")

# Metadata describing which components were saved
recommender_metadata = {
    'system_components': {
        'salary_recommender': 'salary' in available_targets,
        'department_recommender': dept_col is not None,
        'employee_scoring': True
    },
    'features_used': list(X.columns),
    'target_columns': available_targets,
    'last_updated': datetime.now().isoformat()
}

import json
with open(f'{model_path}recommender_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(recommender_metadata, f, indent=2)

print(f" Metadatos del sistema guardados")
print(f"\n Sistema de recomendaciones completado!")

## 8. Interpretabilidad de Modelos con SHAP

Analizamos la interpretabilidad de nuestros modelos usando SHAP (SHapley Additive exPlanations) para entender qué factores influyen en las predicciones.

In [None]:
print(" INTERPRETABILIDAD DE MODELOS CON SHAP")
print("=" * 50)

# Models explained with the tree-specific explainer; every other model falls
# back to the model-agnostic KernelExplainer on scaled inputs.
TREE_MODEL_NAMES = ('Random Forest', 'XGBoost')

# Seeded generator so the SHAP sub-sample (and the resulting report) is
# reproducible between runs.
shap_rng = np.random.default_rng(42)


def _select_class_shap(values, class_index=1):
    """Return the 2-D (samples, features) SHAP matrix for a single class.

    Accepts both shapes a classifier explainer may produce: a list with one
    matrix per class, or a single 3-D array whose last axis is the class.
    A 2-D input is returned unchanged.
    """
    if isinstance(values, list):
        return values[class_index]
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, class_index]
    return values


def _as_plot_values(values):
    """Convert a 3-D SHAP array into the per-class list form used for plots."""
    if isinstance(values, list):
        return values
    values = np.asarray(values)
    if values.ndim == 3:
        return [values[:, :, k] for k in range(values.shape[2])]
    return values


def _mean_abs_shap(values):
    """Return one mean |SHAP| value per feature, averaged over any classes."""
    if isinstance(values, list):
        return np.mean([np.abs(v).mean(axis=0) for v in values], axis=0)
    values = np.asarray(values)
    if values.ndim == 3:
        return np.abs(values).mean(axis=(0, 2))
    return np.abs(values).mean(axis=0)


def _sample_positions(n_rows, max_rows):
    """Return row positions to explain: all rows, or a random subset."""
    if max_rows < n_rows:
        return shap_rng.choice(n_rows, max_rows, replace=False)
    return np.arange(n_rows)


# Initialise SHAP's notebook JavaScript support.
shap.initjs()

# Top-10 tables (most important first) filled in by the sections below.
# They start as None so a stale value from an earlier run is never reused.
attr_top_features = None
perf_top_features = None

# 1. INTERPRETABILITY OF THE ATTRITION MODEL
if 'attrition' in available_targets and 'results_attr' in globals():
    print(f"\n ANÁLISIS SHAP - MODELO DE ATTRITION:")
    print(f" Modelo: {best_model_name}")

    try:
        # Explain at most 1000 test rows to bound time and memory.
        attr_positions = _sample_positions(len(X_test_attr), min(1000, len(X_test_attr)))
        X_shap_attr = X_test_attr.iloc[attr_positions]
        y_shap_attr = y_test_attr.iloc[attr_positions]
        feature_names = list(X_test_attr.columns)

        print(f" Analizando {len(X_shap_attr)} muestras para SHAP")

        # Build the explainer that matches the model family.
        if best_model_name in TREE_MODEL_NAMES:
            explainer_attr = shap.TreeExplainer(best_model_attr)
        else:  # Logistic Regression, Neural Network: trained on scaled data
            X_shap_attr_scaled = scaler_attr.transform(X_shap_attr)
            explainer_attr = shap.KernelExplainer(
                best_model_attr.predict_proba,
                shap.sample(X_shap_attr_scaled, 100)  # background dataset
            )
            X_shap_attr = X_shap_attr_scaled

        print(f" Calculando SHAP values...")
        # Keep only the positive class (1 = resignation).
        shap_values_attr = _select_class_shap(explainer_attr.shap_values(X_shap_attr))
        print(f" SHAP values calculados")

        # SHAP visualisations
        fig, axes = plt.subplots(2, 2, figsize=(20, 16))
        fig.suptitle(' Interpretabilidad del Modelo de Attrition - SHAP Analysis',
                     fontsize=16, fontweight='bold')

        # Panel 1: global importance (bar summary)
        plt.subplot(2, 2, 1)
        shap.summary_plot(shap_values_attr, X_shap_attr,
                          feature_names=feature_names,
                          plot_type="bar", show=False, max_display=15)
        plt.title('Importancia de Features (SHAP)')

        # Panel 2: distribution of per-sample impact
        plt.subplot(2, 2, 2)
        shap.summary_plot(shap_values_attr, X_shap_attr,
                          feature_names=feature_names,
                          show=False, max_display=15)
        plt.title('Distribución de Impacto por Feature')

        # Panel 3: the model's own importance, when it exposes one
        if hasattr(best_model_attr, 'feature_importances_'):
            plt.subplot(2, 2, 3)
            feature_importance_attr = pd.DataFrame({
                'feature': X_test_attr.columns,
                'importance': best_model_attr.feature_importances_
            }).sort_values('importance', ascending=True).tail(15)

            plt.barh(range(len(feature_importance_attr)), feature_importance_attr['importance'])
            plt.yticks(range(len(feature_importance_attr)), feature_importance_attr['feature'])
            plt.xlabel('Importancia (Modelo Nativo)')
            plt.title('Feature Importance - Modelo Nativo')
            plt.grid(True, alpha=0.3)
        else:
            # No native importance: hide the otherwise empty frame.
            axes[1, 0].axis('off')

        # Panel 4: mean |SHAP| per feature. Ascending order puts the most
        # important feature at the top of the horizontal bar chart.
        plt.subplot(2, 2, 4)
        mean_shap_values = _mean_abs_shap(shap_values_attr)
        shap_importance = pd.DataFrame({
            'feature': feature_names,
            'mean_shap': mean_shap_values
        }).sort_values('mean_shap', ascending=True).tail(15)

        plt.barh(range(len(shap_importance)), shap_importance['mean_shap'])
        plt.yticks(range(len(shap_importance)), shap_importance['feature'])
        plt.xlabel('SHAP Value Promedio (Valor Absoluto)')
        plt.title('Importancia Promedio - SHAP')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Ranked list, most important first (rank 1 = largest mean |SHAP|).
        print(f"\n TOP 10 FEATURES MÁS IMPORTANTES (SHAP):")
        top_shap_features = shap_importance.tail(10).iloc[::-1]
        attr_top_features = top_shap_features
        for i, (_, row) in enumerate(top_shap_features.iterrows(), 1):
            print(f" {i:2d}. {row['feature']}: {row['mean_shap']:.4f}")

        # Individual explanation example
        print(f"\n EJEMPLO DE EXPLICACIÓN INDIVIDUAL:")

        # Pick the highest-risk employee among the explained rows.
        # `sample_pos` indexes the SHAP matrix; `high_risk_idx` is that row's
        # position in the test set. Keeping both means the SHAP row and the
        # feature row always describe the same employee.
        # NOTE(review): assumes 'probabilities' is aligned with X_test_attr rows.
        test_probabilities = np.asarray(results_attr[best_model_name]['probabilities'])
        sample_pos = int(np.argmax(test_probabilities[attr_positions]))
        high_risk_idx = int(attr_positions[sample_pos])

        sample_employee = X_test_attr.iloc[high_risk_idx]
        actual_outcome = y_test_attr.iloc[high_risk_idx]
        predicted_prob = test_probabilities[high_risk_idx]

        print(f" • Empleado: Índice {high_risk_idx}")
        print(f" • Resultado real: {'Renuncia' if actual_outcome == 1 else 'Retención'}")
        print(f" • Probabilidad predicha de renuncia: {predicted_prob:.3f}")

        # SHAP contributions next to the employee's original (unscaled) values.
        employee_shap = shap_values_attr[sample_pos]
        employee_features = sample_employee

        feature_contributions = pd.DataFrame({
            'feature': feature_names,
            'shap_value': employee_shap,
            'feature_value': employee_features.values
        }).sort_values('shap_value', key=abs, ascending=False)

        print(f"\n Top 5 factores que influyen en la predicción:")
        for i, (_, row) in enumerate(feature_contributions.head().iterrows(), 1):
            direction = "aumenta" if row['shap_value'] > 0 else "disminuye"
            print(f" {i}. {row['feature']}: {row['shap_value']:+.4f} ({direction} riesgo)")
            print(f" Valor: {row['feature_value']:.3f}")

    except Exception as exc:
        # Best-effort analysis: report the failure and carry on with the cell.
        print(f" Error en análisis SHAP de attrition: {exc}")
        print(f" Tip: SHAP puede requerir mucha memoria. Intente con menos muestras.")

# 2. INTERPRETABILITY OF THE PERFORMANCE MODEL
if 'performance' in available_targets and 'results_perf' in globals():
    print(f"\n ANÁLISIS SHAP - MODELO DE PERFORMANCE:")
    print(f" Modelo: {best_model_name_perf}")

    try:
        # A smaller sample (500 rows) is used for the performance model.
        perf_positions = _sample_positions(len(X_test_perf), min(500, len(X_test_perf)))
        X_shap_perf = X_test_perf.iloc[perf_positions]
        y_shap_perf = y_test_perf.iloc[perf_positions]
        feature_names_perf = list(X_test_perf.columns)

        print(f" Analizando {len(X_shap_perf)} muestras para SHAP")

        if best_model_name_perf in TREE_MODEL_NAMES:
            explainer_perf = shap.TreeExplainer(best_model_perf)
            shap_values_perf = explainer_perf.shap_values(X_shap_perf)
        else:  # linear / neural models: trained on scaled data
            X_shap_perf_scaled = scaler_perf.transform(X_shap_perf)
            explainer_perf = shap.KernelExplainer(
                best_model_perf.predict,
                shap.sample(X_shap_perf_scaled, 50)
            )
            shap_values_perf = explainer_perf.shap_values(X_shap_perf_scaled)
            X_shap_perf = X_shap_perf_scaled

        print(f" SHAP values calculados para performance")

        # A multiclass model yields one SHAP matrix per class; convert to the
        # list form before plotting.
        plot_values_perf = _as_plot_values(shap_values_perf)

        plt.figure(figsize=(15, 10))

        plt.subplot(2, 2, 1)
        shap.summary_plot(plot_values_perf, X_shap_perf,
                          feature_names=feature_names_perf,
                          plot_type="bar", show=False, max_display=12)
        plt.title('Features más Importantes para Performance')

        plt.subplot(2, 2, 2)
        shap.summary_plot(plot_values_perf, X_shap_perf,
                          feature_names=feature_names_perf,
                          show=False, max_display=12)
        plt.title('Distribución de Impacto en Performance')

        plt.suptitle(' Interpretabilidad del Modelo de Performance - SHAP', fontsize=14)
        plt.tight_layout()
        plt.show()

        print(f"\n FACTORES CLAVE PARA ALTO PERFORMANCE:")

        # Per-feature importance, averaged over classes when there are several.
        mean_shap_perf = _mean_abs_shap(shap_values_perf)
        perf_importance = pd.DataFrame({
            'feature': feature_names_perf,
            'importance': mean_shap_perf
        }).sort_values('importance', ascending=False)
        perf_top_features = perf_importance.head(10)

        for i, (_, row) in enumerate(perf_importance.head(8).iterrows(), 1):
            print(f" {i}. {row['feature']}: {row['importance']:.4f}")

    except Exception as exc:
        print(f" Error en análisis SHAP de performance: {exc}")

# 3. INSIGHTS AND RECOMMENDATIONS DERIVED FROM INTERPRETABILITY
print(f"\n INSIGHTS DE INTERPRETABILIDAD:")

insights = []

# Compare the two top-10 lists. Ranked lists (not bare sets) keep the printed
# order deterministic and let "[:3]" mean "the three most important".
if attr_top_features is not None and perf_top_features is not None:
    attr_ranked = list(attr_top_features['feature'])
    perf_ranked = list(perf_top_features['feature'])
    attr_names = set(attr_ranked)
    perf_names = set(perf_ranked)

    common_features = [name for name in attr_ranked if name in perf_names]
    attr_specific = [name for name in attr_ranked if name not in perf_names]
    perf_specific = [name for name in perf_ranked if name not in attr_names]

    if common_features:
        insights.append(f" Features importantes para ambos (attrition y performance): {', '.join(common_features)}")

    if attr_specific:
        insights.append(f" Factores específicos de renuncia: {', '.join(attr_specific[:3])}...")

    if perf_specific:
        insights.append(f"⭐ Factores específicos de performance: {', '.join(perf_specific[:3])}...")

# Strategic recommendations
strategic_recommendations = [
    " Monitorear regularmente las features más importantes identificadas por SHAP",
    " Desarrollar políticas específicas para los factores de alto impacto",
    " Crear alertas automáticas basadas en cambios en features críticas",
    " Realizar análisis SHAP individuales para empleados clave",
    " Entrenar a managers en los factores identificados como críticos"
]

print(f"\n RECOMENDACIONES ESTRATÉGICAS:")
for insight in insights[:3]:  # show at most three insights
    print(f" • {insight}")

print(f"\n ACCIONES RECOMENDADAS:")
for i, rec in enumerate(strategic_recommendations, 1):
    print(f" {i}. {rec}")

# Persist the interpretability results
print(f"\n GUARDANDO ANÁLISIS DE INTERPRETABILIDAD:")

model_path = '../artifacts/models/'
# Create the directory so this cell can be run on its own.
os.makedirs(model_path, exist_ok=True)

interpretability_results = {
    'timestamp': datetime.now().isoformat(),
    'models_analyzed': [],
    'insights': insights,
    'recommendations': strategic_recommendations
}

if attr_top_features is not None:
    interpretability_results['attrition_top_features'] = attr_top_features.to_dict('records')
    interpretability_results['models_analyzed'].append('attrition')

if perf_top_features is not None:
    interpretability_results['performance_top_features'] = perf_top_features.to_dict('records')
    interpretability_results['models_analyzed'].append('performance')

with open(f'{model_path}interpretability_analysis.json', 'w') as f:
    json.dump(interpretability_results, f, indent=2)

print(f" Análisis de interpretabilidad guardado")
print(f" Archivo: {model_path}interpretability_analysis.json")

print(f"\n Análisis de interpretabilidad completado!")

## 9. Dashboard de Monitoreo de Modelos

Creamos un dashboard para monitorear el rendimiento de los modelos en producción y detectar degradación.

In [None]:
print(" DASHBOARD DE MONITOREO DE MODELOS")
print("=" * 50)


def create_model_monitoring_dashboard():
    """Collect the metrics of the trained models and raise threshold alerts.

    Reads the notebook-level results (`results_attr`, `results_perf`,
    `clustering_results`) when they exist, prints a short report for each,
    and appends an alert for every metric below its hard-coded threshold.

    Returns:
        dict with keys 'timestamp' (ISO string), 'models' (metrics per model
        name), 'alerts' (list of alert messages) and 'recommendations'
        (left empty by this function).
    """
    monitoring_data = {
        'timestamp': datetime.now().isoformat(),
        'models': {},
        'alerts': [],
        'recommendations': []
    }

    def raise_alert(alert):
        """Record an alert in the result and echo it to the console."""
        monitoring_data['alerts'].append(alert)
        print(f" {alert}")

    print(f" MÉTRICAS DE RENDIMIENTO DE MODELOS:")

    # 1. Attrition model
    if 'results_attr' in globals():
        print(f"\n Modelo de Attrition - {best_model_name}:")

        attr_result = results_attr[best_model_name]
        # Cast to plain float / int so the dict is JSON-serialisable.
        attr_metrics = {
            'model_name': best_model_name,
            'accuracy': float(attr_result['accuracy']),
            'auc': float(attr_result['auc']),
            'f1_score': float(attr_result['f1']),
            'precision': float(attr_result['precision']),
            'recall': float(attr_result['recall']),
            'training_samples': len(X_train_attr),
            'test_samples': len(X_test_attr)
        }
        monitoring_data['models']['attrition'] = attr_metrics

        print(f" • Accuracy: {attr_metrics['accuracy']:.4f}")
        print(f" • AUC: {attr_metrics['auc']:.4f}")
        print(f" • F1-Score: {attr_metrics['f1_score']:.4f}")

        if attr_metrics['auc'] < 0.7:
            raise_alert("️ ALERTA: AUC del modelo de attrition por debajo del umbral (0.7)")

        if attr_metrics['accuracy'] < 0.75:
            raise_alert("️ ALERTA: Accuracy del modelo de attrition por debajo del umbral (0.75)")

    # 2. Performance model (classification or regression, per `problem_type`)
    if 'results_perf' in globals():
        print(f"\n Modelo de Performance - {best_model_name_perf}:")

        perf_result = results_perf[best_model_name_perf]
        perf_metrics = {
            'model_name': best_model_name_perf,
            'model_type': problem_type,
            'training_samples': len(X_train_perf),
            'test_samples': len(X_test_perf)
        }

        if problem_type == "classification":
            perf_metrics.update({
                'accuracy': float(perf_result['accuracy']),
                'f1_score': float(perf_result['f1']),
                'precision': float(perf_result['precision']),
                'recall': float(perf_result['recall'])
            })

            print(f" • Accuracy: {perf_metrics['accuracy']:.4f}")
            print(f" • F1-Score: {perf_metrics['f1_score']:.4f}")

            if perf_metrics['accuracy'] < 0.7:
                raise_alert("️ ALERTA: Accuracy del modelo de performance por debajo del umbral (0.7)")

        else:  # regression
            perf_metrics.update({
                'r2_score': float(perf_result['r2']),
                'rmse': float(perf_result['rmse']),
                'mae': float(perf_result['mae'])
            })

            print(f" • R² Score: {perf_metrics['r2_score']:.4f}")
            print(f" • RMSE: {perf_metrics['rmse']:.4f}")
            print(f" • MAE: {perf_metrics['mae']:.4f}")

            if perf_metrics['r2_score'] < 0.5:
                raise_alert("️ ALERTA: R² del modelo de performance por debajo del umbral (0.5)")

        monitoring_data['models']['performance'] = perf_metrics

    # 3. Clustering system
    if 'clustering_results' in globals():
        print(f"\n Sistema de Clustering - {best_algorithm}:")

        clustering_result = clustering_results[best_algorithm]
        clustering_metrics = {
            'algorithm': best_algorithm,
            'n_clusters': int(clustering_result['n_clusters']),
            'silhouette_score': float(clustering_result['silhouette']),
            'samples_clustered': len(best_labels)
        }
        monitoring_data['models']['clustering'] = clustering_metrics

        print(f" • Algoritmo: {clustering_metrics['algorithm']}")
        print(f" • Número de clusters: {clustering_metrics['n_clusters']}")
        print(f" • Silhouette Score: {clustering_metrics['silhouette_score']:.4f}")

        if clustering_metrics['silhouette_score'] < 0.3:
            raise_alert("️ ALERTA: Silhouette score del clustering por debajo del umbral (0.3)")

    return monitoring_data


def _traffic_light(value, good, fair):
    """Map a score to 'green' / 'orange' / 'red' given two lower bounds."""
    if value >= good:
        return 'green'
    if value >= fair:
        return 'orange'
    return 'red'


def _label_bars(ax, heights, labels):
    """Write one bold label just above each bar of a bar chart."""
    for position, (height, label) in enumerate(zip(heights, labels)):
        ax.text(position, height + 0.02, label, ha='center', va='bottom', fontweight='bold')


# Build the monitoring data
monitoring_dashboard = create_model_monitoring_dashboard()
dashboard_models = monitoring_dashboard['models']
dashboard_alerts = monitoring_dashboard['alerts']

# 4. DASHBOARD VISUALISATION
print(f"\n VISUALIZACIÓN DEL DASHBOARD:")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle(' Dashboard de Monitoreo de Modelos ML - TalentPulse', fontsize=16, fontweight='bold')

# Attrition metrics
attr_ax = axes[0, 0]
if 'attrition' in dashboard_models:
    attr_data = dashboard_models['attrition']

    metrics = ['accuracy', 'auc', 'f1_score', 'precision', 'recall']
    values = [attr_data[m] for m in metrics]

    attr_ax.bar(metrics, values, alpha=0.8,
                color=[_traffic_light(v, 0.7, 0.6) for v in values])
    attr_ax.set_title('Métricas - Modelo Attrition')
    attr_ax.set_ylabel('Score')
    attr_ax.set_ylim(0, 1)
    attr_ax.tick_params(axis='x', rotation=45)

    # Threshold line
    attr_ax.axhline(y=0.7, color='red', linestyle='--', alpha=0.7, label='Umbral mínimo')
    attr_ax.legend()

    _label_bars(attr_ax, values, [f'{v:.3f}' for v in values])
else:
    # No attrition model: hide the otherwise empty frame.
    attr_ax.axis('off')

# Performance metrics
perf_ax = axes[0, 1]
if 'performance' in dashboard_models:
    perf_data = dashboard_models['performance']

    if perf_data['model_type'] == 'classification':
        metrics = ['accuracy', 'f1_score', 'precision', 'recall']
        values = [perf_data[m] for m in metrics]

        perf_ax.bar(metrics, values, alpha=0.8, color='blue')
        perf_ax.set_title('Métricas - Modelo Performance (Clasificación)')
        perf_ax.set_ylabel('Score')
        perf_ax.set_ylim(0, 1)
        perf_ax.tick_params(axis='x', rotation=45)

        _label_bars(perf_ax, values, [f'{v:.3f}' for v in values])

    else:  # regression
        r2 = perf_data['r2_score']
        rmse = perf_data['rmse']
        mae = perf_data['mae']

        # Squash RMSE / MAE into (0, 1] with a reciprocal so that "higher is
        # better" holds for every bar. The /1000 scale is a rough heuristic.
        rmse_norm = 1 / (1 + rmse / 1000)
        mae_norm = 1 / (1 + mae / 1000)

        metrics = ['R²', 'RMSE (norm)', 'MAE (norm)']
        values = [r2, rmse_norm, mae_norm]

        perf_ax.bar(metrics, values, alpha=0.8, color='purple')
        perf_ax.set_title('Métricas - Modelo Performance (Regresión)')
        perf_ax.set_ylabel('Score')
        # R² can be negative; extend the axis so such a bar stays visible.
        perf_ax.set_ylim(min(0, r2 - 0.1), 1)

        # Bars show normalised heights, but labels show the original values.
        _label_bars(perf_ax, values, [f'{r2:.3f}', f'{rmse:.1f}', f'{mae:.1f}'])
else:
    perf_ax.axis('off')

# Clustering metrics
clust_ax = axes[0, 2]
if 'clustering' in dashboard_models:
    clust_data = dashboard_models['clustering']

    sil_score = clust_data['silhouette_score']
    n_clusters = clust_data['n_clusters']

    clust_ax.bar(['Silhouette Score'], [sil_score], alpha=0.8,
                 color=_traffic_light(sil_score, 0.3, 0.2))
    clust_ax.set_title(f'Clustering - {clust_data["algorithm"]}')
    clust_ax.set_ylabel('Silhouette Score')
    clust_ax.set_ylim(0, 1)
    clust_ax.text(0, sil_score + 0.02, f'{sil_score:.3f}', ha='center', va='bottom', fontweight='bold')

    # Threshold line
    clust_ax.axhline(y=0.3, color='red', linestyle='--', alpha=0.7, label='Umbral mínimo')
    clust_ax.legend()

    # Extra information box
    clust_ax.text(0, 0.1, f'Clusters: {n_clusters}\nMuestras: {clust_data["samples_clustered"]:,}',
                  ha='center', va='center', bbox=dict(boxstyle="round", facecolor='lightblue', alpha=0.7))
else:
    clust_ax.axis('off')

# Overall system status: healthy with no alerts, warnings for 1-2, critical above.
axes[1, 0].axis('off')
if not dashboard_alerts:
    system_status = "🟢 SALUDABLE"
elif len(dashboard_alerts) <= 2:
    system_status = "🟡 ADVERTENCIAS"
else:
    system_status = " CRÍTICO"

system_info = f"""
 ESTADO DEL SISTEMA: {system_status}

 Modelos Monitoreados: {len(monitoring_dashboard['models'])}
️ Alertas Activas: {len(monitoring_dashboard['alerts'])}
 Última Actualización: {datetime.now().strftime('%H:%M:%S')}

 Modelos Disponibles:
"""

for model_name in dashboard_models:
    system_info += f" {model_name.title()}\n"

axes[1, 0].text(0.1, 0.9, system_info, transform=axes[1, 0].transAxes,
                fontsize=11, verticalalignment='top',
                bbox=dict(boxstyle="round,pad=0.5",
                          facecolor="lightgreen" if not dashboard_alerts else "lightyellow",
                          alpha=0.7))

# Active alerts (at most five are shown)
axes[1, 1].axis('off')
if dashboard_alerts:
    alerts_text = " ALERTAS ACTIVAS:\n\n"
    for i, alert in enumerate(dashboard_alerts[:5], 1):
        alerts_text += f"{i}. {alert}\n\n"
else:
    alerts_text = " NO HAY ALERTAS ACTIVAS\n\nTodos los modelos están funcionando dentro de los parámetros esperados."

axes[1, 1].text(0.1, 0.9, alerts_text, transform=axes[1, 1].transAxes,
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle="round,pad=0.5",
                          facecolor="lightcoral" if dashboard_alerts else "lightgreen",
                          alpha=0.7))

# Maintenance recommendations
axes[1, 2].axis('off')
maintenance_recommendations = [
    " Reentrenar modelos mensualmente",
    " Monitorear drift de datos",
    " Actualizar métricas baseline",
    " Validar nuevas features",
    " Ajustar umbrales según contexto"
]

rec_text = " RECOMENDACIONES DE MANTENIMIENTO:\n\n"
for i, rec in enumerate(maintenance_recommendations, 1):
    rec_text += f"{i}. {rec}\n"

axes[1, 2].text(0.1, 0.9, rec_text, transform=axes[1, 2].transAxes,
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.7))

plt.tight_layout()
plt.show()

# 5. PERSIST THE MONITORING DASHBOARD
print(f"\n GUARDANDO DASHBOARD DE MONITOREO:")

model_path = '../artifacts/models/'
os.makedirs(model_path, exist_ok=True)

# Save the monitoring metrics
with open(f'{model_path}model_monitoring_dashboard.json', 'w', encoding='utf-8') as f:
    json.dump(monitoring_dashboard, f, indent=2)

# Source of a standalone helper module. Imports sit at module level because
# `create_alert` also needs `datetime`; importing it only inside
# `evaluate_model_health` would make `create_alert` fail with NameError.
# NOTE(review): the model paths below are relative to the process that imports
# the generated module, not to this notebook ('../artifacts/models/') - confirm.
monitoring_function = '''
import json
from datetime import datetime

import joblib


def evaluate_model_health():
    """Función para evaluar automáticamente la salud de los modelos"""
    model_files = {
        'attrition': 'artifacts/models/best_attrition_model.pkl',
        'performance': 'artifacts/models/best_performance_model.pkl',
    }

    # Evaluar salud de cada modelo
    health_status = {
        'timestamp': datetime.now().isoformat(),
        'overall_health': 'healthy',
        'models': {},
        'alerts': []
    }

    # Cargar modelos
    models = {}
    for model_name, model_file in model_files.items():
        try:
            models[model_name] = joblib.load(model_file)
            health_status['models'][model_name] = {'loaded': True}
        except Exception as exc:
            health_status['models'][model_name] = {'loaded': False, 'error': str(exc)}

    # Aquí agregar lógica de evaluación específica
    # ...

    return health_status


def create_alert(model_name, metric_name, current_value, threshold):
    """Crear alerta cuando una métrica está por debajo del umbral"""
    return {
        'timestamp': datetime.now().isoformat(),
        'model': model_name,
        'metric': metric_name,
        'current_value': current_value,
        'threshold': threshold,
        'severity': 'high' if current_value < threshold * 0.8 else 'medium'
    }
'''

# Write as UTF-8: the source contains accented characters, and Python reads
# module files as UTF-8 by default.
with open(f'{model_path}monitoring_functions.py', 'w', encoding='utf-8') as f:
    f.write(monitoring_function)

print(f" Dashboard guardado: {model_path}model_monitoring_dashboard.json")
print(f" Funciones de monitoreo: {model_path}monitoring_functions.py")

# Final monitoring summary
print(f"\n RESUMEN DEL MONITOREO:")
print(f" • Modelos monitoreados: {len(monitoring_dashboard['models'])}")
print(f" • Alertas activas: {len(monitoring_dashboard['alerts'])}")
print(f" • Estado general: {system_status}")

if dashboard_alerts:
    print(f"\n️ ALERTAS QUE REQUIEREN ATENCIÓN:")
    for alert in dashboard_alerts:
        print(f" • {alert}")
else:
    print(f"\n Todos los modelos funcionan correctamente")

print(f"\n Dashboard de monitoreo completado!")

## 10. Conclusiones y Recomendaciones

### Resumen de Capacidades ML Implementadas

El sistema **TalentPulse ML** proporciona un conjunto completo de capacidades de machine learning para la gestión de talento y recursos humanos:

In [None]:
print(" CONCLUSIONES Y RECOMENDACIONES - TALENTPULSE ML")
print("=" * 60)

# Implemented capabilities, grouped by category (category -> list of items).
capabilities_summary = {
    " Modelos Predictivos": [
        " Predicción de Attrition (riesgo de abandono)",
        " Predicción de Performance (rendimiento laboral)",
        " Algoritmos: Random Forest, XGBoost, Neural Networks, Logistic Regression",
    ],
    " Análisis de Segmentación": [
        " Clustering de empleados (K-Means, DBSCAN, Agglomerative)",
        " Reducción de dimensionalidad con PCA",
        " Identificación de perfiles de empleados",
    ],
    " Sistemas de Recomendación": [
        " Optimización salarial inteligente",
        " Recomendación de departamentos",
        " Scoring de empleados para promociones",
    ],
    " Interpretabilidad": [
        " Análisis SHAP para explicabilidad de modelos",
        " Identificación de factores clave",
        " Insights accionables para RRHH",
    ],
    " Monitoreo y Producción": [
        " Dashboard de monitoreo en tiempo real",
        " Sistema de alertas automáticas",
        " Persistencia de modelos y metadata",
    ],
}

# "\n" (not "\\n") so each heading is preceded by a blank line rather than a
# literal backslash-n.
print("\n CAPACIDADES IMPLEMENTADAS:")
for category, features in capabilities_summary.items():
    print(f"\n{category}:")
    for feature in features:
        print(f" {feature}")

# Performance metrics achieved by each model trained earlier in the notebook.
# Every section is skipped when its results object was never created.
# "\n" (not "\\n") is used so headings are preceded by a blank line instead of
# a literal backslash-n.
print("\n MÉTRICAS DE RENDIMIENTO ALCANZADAS:")

if 'results_attr' in globals():
    attr_summary = results_attr[best_model_name]
    print(f"\n Modelo de Attrition ({best_model_name}):")
    for metric_label, metric_key in (
        ("Accuracy", 'accuracy'),
        ("AUC-ROC", 'auc'),
        ("F1-Score", 'f1'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
    ):
        print(f" • {metric_label}: {attr_summary[metric_key]:.4f}")

if 'results_perf' in globals():
    perf_summary = results_perf[best_model_name_perf]
    print(f"\n Modelo de Performance ({best_model_name_perf}):")
    # The metric set depends on whether the target was treated as classes
    # or as a continuous value.
    if problem_type == "classification":
        perf_metric_labels = (
            ("Accuracy", 'accuracy'),
            ("F1-Score", 'f1'),
            ("Precision", 'precision'),
            ("Recall", 'recall'),
        )
    else:
        perf_metric_labels = (
            ("R² Score", 'r2'),
            ("RMSE", 'rmse'),
            ("MAE", 'mae'),
        )
    for metric_label, metric_key in perf_metric_labels:
        print(f" • {metric_label}: {perf_summary[metric_key]:.4f}")

if 'clustering_results' in globals():
    clustering_summary = clustering_results[best_algorithm]
    print(f"\n Clustering ({best_algorithm}):")
    print(f" • Silhouette Score: {clustering_summary['silhouette']:.4f}")
    print(f" • Número de Clusters: {clustering_summary['n_clusters']}")

# Strategic recommendations, grouped by area (area -> ordered list of actions).
strategic_recommendations = {
    " Implementación en Producción": [
        "Desplegar modelos usando APIs REST con Flask/FastAPI",
        "Implementar pipeline de CI/CD para actualizaciones automáticas",
        "Configurar monitoreo continuo de drift de datos",
        "Establecer reentrenamiento automático mensual/trimestral",
    ],
    " Gestión de Datos": [
        "Implementar data warehouse para datos históricos",
        "Crear pipeline ETL automatizado para nuevos datos",
        "Establecer políticas de calidad de datos",
        "Implementar versionado de datasets",
    ],
    " Seguridad y Compliance": [
        "Implementar encriptación para datos sensibles",
        "Establecer controles de acceso basados en roles",
        "Documentar procesos para auditorías",
        "Cumplir con regulaciones de privacidad (GDPR, etc.)",
    ],
    " Adopción Organizacional": [
        "Capacitar al equipo de RRHH en interpretación de resultados",
        "Crear dashboards ejecutivos para la toma de decisiones",
        "Establecer workflows para acciones basadas en predicciones",
        "Implementar feedback loops para mejora continua",
    ],
    " Investigación y Desarrollo": [
        "Explorar técnicas de deep learning para patrones complejos",
        "Implementar modelos de series temporales para tendencias",
        "Desarrollar sistemas de recomendación más sofisticados",
        "Investigar técnicas de fairness y bias mitigation",
    ],
}

# "\n" (not "\\n") so headings are preceded by a blank line instead of a
# literal backslash-n.
print("\n RECOMENDACIONES ESTRATÉGICAS:")
for category, recommendations in strategic_recommendations.items():
    print(f"\n{category}:")
    for i, rec in enumerate(recommendations, 1):
        print(f" {i}. {rec}")

# Technical next steps, printed as a numbered list.
next_steps = [
    " Optimizar hiperparámetros con técnicas avanzadas (Optuna, Bayesian Optimization)",
    " Implementar ensemble methods para mejorar robustez",
    " Desarrollar modelos específicos por departamento/región",
    " Crear A/B testing framework para validación de modelos",
    " Implementar detección de anomalías para casos especiales",
    " Desarrollar interfaz web interactiva para usuarios finales",
    " Crear aplicación móvil para managers y empleados",
    " Implementar chatbot para consultas sobre predicciones",
]

# "\n" (not "\\n") so the heading is preceded by a blank line instead of a
# literal backslash-n.
print("\n PRÓXIMOS PASOS TÉCNICOS:")
for i, step in enumerate(next_steps, 1):
    print(f" {i}. {step}")

# ROI and business value: expected benefits grouped by category.
business_value = {
    " Beneficios Económicos": [
        "Reducción del 25-40% en costos de rotación de personal",
        "Aumento del 15-30% en productividad por mejor asignación",
        "Ahorro de tiempo en procesos de selección y evaluación",
        "Optimización de presupuestos salariales",
    ],
    " Beneficios Operacionales": [
        "Decisiones basadas en datos objetivos",
        "Identificación temprana de riesgos de attrition",
        "Mejora en la satisfacción y retención de empleados",
        "Procesos de RRHH más eficientes y automatizados",
    ],
    " Beneficios Estratégicos": [
        "Ventaja competitiva en gestión de talento",
        "Cultura organizacional data-driven",
        "Capacidad de planificación predictiva",
        "Mejor employer branding y atracción de talento",
    ],
}

# Real newline escapes ("\n") separate the sections; the doubled backslash
# used before printed a literal backslash + n in front of every header.
print("\n VALOR EMPRESARIAL ESPERADO:")
for category, benefits in business_value.items():
    print(f"\n{category}:")
    for benefit in benefits:
        print(f" • {benefit}")

# Success metrics (KPIs), printed below as a numbered list.
success_metrics = [
    " Reducción de tasa de attrition (objetivo: -20% en 12 meses)",
    " Aumento en employee satisfaction score",
    "⏱️ Reducción en tiempo de hiring (objetivo: -30%)",
    " ROI del sistema ML (objetivo: 300% en 18 meses)",
    " Precision de predicciones (objetivo: >85%)",
    " Adopción por parte de managers (objetivo: >80%)",
]

# "\n" (single backslash) yields the intended blank line before the header;
# the doubled backslash used before printed a literal backslash + n.
print("\n MÉTRICAS DE ÉXITO (KPIs):")
for i, metric in enumerate(success_metrics, 1):
    print(f" {i}. {metric}")

# Limitations and caveats, printed below as a numbered list.
limitations = [
    "️ Calidad de predicciones depende de la calidad de datos de entrada",
    " Modelos requieren reentrenamiento regular para mantener accuracy",
    " Necesidad de interpretación humana para decisiones finales",
    " Cumplimiento de regulaciones de privacidad y ética",
    " Sesgos potenciales en datos históricos que pueden perpetuarse",
    " Dependencia de infraestructura técnica estable",
]

# "\n" (single backslash) yields the intended blank line before the header;
# the doubled backslash used before printed a literal backslash + n.
print("\n️ LIMITACIONES Y CONSIDERACIONES:")
for i, limitation in enumerate(limitations, 1):
    print(f" {i}. {limitation}")

# Final executive summary.
# The summary literal and the closing prints had been collapsed onto a single
# line with JSON-style escapes (backslash-quote, backslash-n), which is not
# valid Python. They are restored here as a regular triple-quoted string and
# ordinary statements. The string has no placeholders, so it is a plain
# (non-f) literal.
print("\n" + "=" * 60)
print(" RESUMEN EJECUTIVO:")
print("=" * 60)

summary_text = """
 SISTEMA ML COMPLETO: TalentPulse cuenta ahora con un ecosistema completo 
 de machine learning para gestión de talento, incluyendo predicción, 
 segmentación, recomendaciones e interpretabilidad.

 MODELOS OPERATIVOS: Todos los modelos han sido entrenados, validados 
 y están listos para despliegue en producción con métricas de 
 rendimiento superiores a los umbrales mínimos.

 IMPACTO EMPRESARIAL: El sistema promete generar valor significativo 
 a través de decisiones más inteligentes, reducción de costos 
 y mejora en la gestión de talento.

 INFRAESTRUCTURA ROBUSTA: Incluye monitoreo automático, alertas, 
 persistencia de modelos y capacidades de escalamiento.

 READY FOR DEPLOYMENT: El sistema está preparado para implementación 
 en producción con las recomendaciones técnicas y de negocio incluidas.
"""

print(summary_text)

print("\n ¡TALENTPULSE ML ESTÁ LISTO PARA TRANSFORMAR LA GESTIÓN DE RRHH!")
print("=" * 60)

# Machine Learning para Gestión de Talento - TalentPulse

Este notebook implementa modelos de machine learning avanzados para la gestión estratégica de recursos humanos. Incluye:

- **Predicción de Attrition**: Modelos para identificar empleados en riesgo de renuncia
- **Predicción de Performance**: Algoritmos para estimar el rendimiento futuro
- **Segmentación de Empleados**: Clustering para identificar perfiles y patrones
- **Sistema de Recomendaciones**: Optimización salarial y asignación de roles
- **Interpretabilidad**: Análisis SHAP y feature importance para insights accionables

**Dataset**: Datos procesados con feature engineering completo  
**Modelos**: Random Forest, XGBoost, Neural Networks, K-Means 
**Fecha**: Septiembre 2025

## 1. Configuración e Importación de Librerías

Configuramos el entorno e importamos todas las librerías necesarias para machine learning y análisis avanzado.

In [None]:
# Importaciones principales para ML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import joblib
import os

# Scikit-learn para ML
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import (
 accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve,
 confusion_matrix, classification_report, mean_squared_error, mean_absolute_error, r2_score
)

# XGBoost for the advanced models; installed on the fly when it is missing.
try:
    import xgboost as xgb
    print(" XGBoost disponible")
except ImportError:
    print("️ XGBoost no disponible - instalando...")
    import subprocess
    import sys

    # Install through the interpreter that runs this kernel. A bare "pip"
    # found on PATH can belong to a different environment, in which case the
    # install succeeds but the import below still fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    import xgboost as xgb

# SHAP for model interpretability; installed on the fly when it is missing.
try:
    import shap
    print(" SHAP disponible")
except ImportError:
    print("️ SHAP no disponible - instalando...")
    import subprocess
    import sys

    # Install through the interpreter that runs this kernel. A bare "pip"
    # found on PATH can belong to a different environment, in which case the
    # install succeeds but the import below still fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "shap"])
    import shap

# Configuración de visualizaciones
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

# Configuración de pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Suprimir advertencias
warnings.filterwarnings('ignore')

# Configuración de matplotlib para ML
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['figure.figsize'] = (12, 8)

# Configuración de reproducibilidad
np.random.seed(42)

print(" TALENT PULSE - MACHINE LEARNING SUITE")
print("=" * 50)
print(" Librerías ML importadas correctamente")
print(f" Pandas versión: {pd.__version__}")
print(f" Numpy versión: {np.__version__}")
print(f" Scikit-learn disponible")
print(f" XGBoost disponible")
print(f" SHAP disponible")
print(" Seed configurado para reproducibilidad")

## 2. Carga y Preparación de Datos

Cargamos el dataset con features procesadas y preparamos los datos para modelos de machine learning.

In [None]:
# Load the dataset, trying the most processed file first and falling back to
# less processed ones. Each entry is (csv path, status message, source label).
dataset_candidates = (
    (
        '../Data/processed/hr_with_features.csv',
        " Dataset con features cargado exitosamente",
        "Con features completas",
    ),
    (
        '../Data/processed/clean_hr.csv',
        "️ Dataset básico cargado - generando features",
        "Básico + features generadas",
    ),
    (
        '../Data/raw/HR_Data_MNC_Data Science Lovers.csv',
        "️ Dataset raw cargado - procesamiento completo requerido",
        "Raw + procesamiento completo",
    ),
)

last_position = len(dataset_candidates) - 1
for position, (csv_path, status_message, source_label) in enumerate(dataset_candidates):
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Only a missing file triggers the fallback; when even the last
        # resort (raw data) is missing, the error propagates to the caller.
        if position == last_position:
            raise
        continue
    print(status_message)
    data_source = source_label
    break

# General dataset information: row count, column count and memory footprint.
print(f"\n INFORMACIÓN DEL DATASET ({data_source}):")
print(f" • Registros: {len(df):,}")
print(f" • Columnas: {len(df.columns)}")
print(f" • Memoria: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Show the data structure.
print("\n ESTRUCTURA DE DATOS:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Detect which target variables exist in the loaded dataset.
print(f"\n VERIFICACIÓN DE VARIABLES OBJETIVO:")

# Candidate column names for each target, in order of preference.
target_columns = {
    'attrition': ['Attrition', 'attrition', 'left_company', 'renuncia'],
    'performance': ['Performance_Rating', 'performance_rating', 'rating_performance'],
    'salary': ['Salary_INR', 'salario_inr', 'salary']
}

# Maps target type -> first candidate column actually present in df.
available_targets = {}
for target_kind, candidate_names in target_columns.items():
    matched_column = next(
        (candidate for candidate in candidate_names if candidate in df.columns),
        None,
    )
    if matched_column is None:
        print(f" {target_kind}: No encontrada")
        continue
    available_targets[target_kind] = matched_column
    print(f" {target_kind}: {matched_column}")

# Preview the first rows.
print(f"\n PRIMERAS 3 FILAS:")
display(df.head(3))

## 3. Preprocesamiento y Feature Engineering

Preparamos los datos para machine learning con encoding, scaling y creación de features adicionales.

In [None]:
# Work on a copy so the loaded frame stays untouched during preprocessing.
df_ml = df.copy()

print(" PREPROCESAMIENTO DE DATOS PARA ML")
print("=" * 50)

# 1. Split the columns by dtype: numeric vs. object (string-like).
numeric_frame = df_ml.select_dtypes(include=[np.number])
object_frame = df_ml.select_dtypes(include=['object'])
numeric_columns = list(numeric_frame.columns)
categorical_columns = list(object_frame.columns)

numeric_total = len(numeric_columns)
categorical_total = len(categorical_columns)
print(f" Variables numéricas: {numeric_total}")
print(f" Variables categóricas: {categorical_total}")

# 2. Missing-value handling: report the gaps per column, then impute them.
print(f"\n ANÁLISIS DE VALORES FALTANTES:")
missing_data = df_ml.isnull().sum()
missing_pct = (missing_data / len(df_ml)) * 100
missing_df = pd.DataFrame({
    'Columna': missing_data.index,
    'Valores_Faltantes': missing_data.values,
    'Porcentaje': missing_pct.values
}).sort_values('Porcentaje', ascending=False)

# Keep only the columns that actually have missing values.
missing_df_filtered = missing_df[missing_df['Valores_Faltantes'] > 0]
if not missing_df_filtered.empty:
    display(missing_df_filtered.head(10))

    # Imputation: median for numeric columns, mode for everything else.
    for col in missing_df_filtered['Columna']:
        if col in numeric_columns:
            fill_value = df_ml[col].median()
        else:
            modes = df_ml[col].mode()
            if modes.empty:
                # A column with no valid value has no mode; indexing [0] on
                # the empty result would raise, so the column is left as is.
                print(f" {col}: sin valores válidos - no se imputa")
                continue
            fill_value = modes.iloc[0]
        # Assign the result back instead of calling fillna(inplace=True) on
        # df_ml[col]: the in-place call on a column selection is deprecated
        # chained assignment and does not update df_ml under Copy-on-Write.
        df_ml[col] = df_ml[col].fillna(fill_value)

    print(f" Valores faltantes imputados")
else:
    print(f" No hay valores faltantes")

# 3. Advanced feature engineering on top of age, salary and tenure columns.
print(f"\n️ FEATURE ENGINEERING AVANZADO:")

# Candidate column names for each source attribute, in order of preference.
age_cols = ['Age', 'age', 'edad']
salary_cols = ['Salary_INR', 'salario_inr', 'salary']
tenure_cols = ['antiguedad_anos', 'years_company', 'tenure']

# First candidate present in df_ml for each attribute, or None when absent.
age_col = next((candidate for candidate in age_cols if candidate in df_ml.columns), None)
salary_col = next((candidate for candidate in salary_cols if candidate in df_ml.columns), None)
tenure_col = next((candidate for candidate in tenure_cols if candidate in df_ml.columns), None)

# Demographic feature: age binned into generation-style groups.
if age_col:
    age_bin_edges = [0, 25, 35, 45, 55, 100]
    age_bin_labels = ['Gen_Z', 'Millennial_Young', 'Millennial_Senior', 'Gen_X', 'Boomer']
    df_ml['age_group'] = pd.cut(df_ml[age_col], bins=age_bin_edges, labels=age_bin_labels)
    print(f" Grupos etarios creados desde {age_col}")

# Salary features: decile rank (1-10), z-score and a top-quartile flag.
if salary_col:
    salary_values = df_ml[salary_col]
    top_quartile_cutoff = salary_values.quantile(0.75)
    df_ml['salary_percentile'] = pd.qcut(salary_values, q=10, labels=False) + 1
    df_ml['salary_zscore'] = (salary_values - salary_values.mean()) / salary_values.std()
    df_ml['high_salary'] = (salary_values > top_quartile_cutoff).astype(int)
    print(f" Features salariales creadas desde {salary_col}")

# Tenure features: seniority bucket and an early-career (<= 2) flag.
if tenure_col:
    tenure_values = df_ml[tenure_col]
    tenure_bin_edges = [-1, 1, 3, 7, 15, 50]
    tenure_bin_labels = ['Nuevo', 'Junior', 'Intermedio', 'Senior', 'Veterano']
    df_ml['tenure_group'] = pd.cut(tenure_values, bins=tenure_bin_edges, labels=tenure_bin_labels)
    df_ml['early_career'] = (tenure_values <= 2).astype(int)
    print(f" Features de antigüedad creadas desde {tenure_col}")

# 4. Categorical encoding
print(f"\n ENCODING DE VARIABLES CATEGÓRICAS:")

# Identifier and target columns must never reach the feature matrix.
id_columns = ['Employee_ID', 'employee_id', 'ID', 'id']
target_values = list(available_targets.values())
excluded_sources = set(id_columns) | set(target_values)

# Fitted encoders by source column, and the encoded columns usable as features.
label_encoders = {}
encoded_features = []

# Encoded columns derived from an identifier or target column. They are still
# added to df_ml, but are kept out of the feature list: feeding an encoded
# copy of the target to a model would leak the answer.
leaky_features = set()

# NOTE(review): categorical_columns only lists object-dtype columns found
# before feature engineering, so category-dtype columns added later (for
# example binned groups) are not encoded here - confirm this is intended.
for col in categorical_columns:
    if col not in df_ml.columns:  # The column may no longer exist
        continue

    unique_values = df_ml[col].nunique()

    if unique_values == 2:
        # Binary variables: label encoding
        le = LabelEncoder()
        df_ml[f'{col}_encoded'] = le.fit_transform(df_ml[col].astype(str))
        label_encoders[col] = le
        new_columns = [f'{col}_encoded']
        print(f" ️ {col}: Label Encoding (2 valores)")

    elif unique_values <= 10:
        # Low-cardinality variables: one-hot encoding
        dummies = pd.get_dummies(df_ml[col], prefix=col, dummy_na=False)
        df_ml = pd.concat([df_ml, dummies], axis=1)
        new_columns = dummies.columns.tolist()
        print(f" {col}: One-Hot Encoding ({unique_values} valores)")

    else:
        # High-cardinality variables: label encoding
        le = LabelEncoder()
        df_ml[f'{col}_encoded'] = le.fit_transform(df_ml[col].astype(str))
        label_encoders[col] = le
        new_columns = [f'{col}_encoded']
        print(f" {col}: Label Encoding ({unique_values} valores únicos)")

    if col in excluded_sources:
        leaky_features.update(new_columns)
    else:
        encoded_features.extend(new_columns)

# 5. Feature selection for ML
print(f"\n SELECCIÓN DE FEATURES PARA ML:")

excluded_features = excluded_sources | leaky_features

# Numeric features, skipping identifiers, targets, their encoded copies and
# unnamed index columns.
ml_features = [
    col
    for col in df_ml.select_dtypes(include=[np.number]).columns
    if col not in excluded_features and not col.lower().startswith('unnamed')
]

# Add the encoded features. Label-encoded columns are numeric and were already
# picked up above, so duplicates are dropped (first occurrence wins) to keep
# the columns of X unique.
ml_features.extend(encoded_features)
ml_features = list(dict.fromkeys(ml_features))

print(f" Total features para ML: {len(ml_features)}")
print(f" Features seleccionadas: {ml_features[:10]}..." if len(ml_features) > 10 else f" Features: {ml_features}")

# Build the final feature matrix.
X = df_ml[ml_features].copy()

print(f"\n PREPROCESAMIENTO COMPLETADO")
print(f" • Dataset ML: {X.shape[0]:,} filas x {X.shape[1]} features")
print(f" • Memoria: {X.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

## 4. Modelo de Predicción de Attrition

Desarrollamos modelos para predecir la probabilidad de renuncia de empleados usando múltiples algoritmos.

In [None]:
# Attrition model: train several classifiers on X, compare them on a held-out
# split, plot the evaluation and persist the best model (ranked by AUC).
# The whole cell is skipped with a message when no attrition target exists.
if 'attrition' in available_targets:
    attrition_col = available_targets['attrition']

    print(" MODELO DE PREDICCIÓN DE ATTRITION")
    print("=" * 50)

    # Prepare the target variable
    y_attr = df_ml[attrition_col].copy()

    # Convert textual labels to 0/1 when needed
    if y_attr.dtype == 'object':
        # Same labels as before, with the duplicated 'No' / 'NO' keys removed.
        attrition_mapping = {'Yes': 1, 'No': 0, 'yes': 1, 'no': 0, 'YES': 1, 'NO': 0,
                             'True': 1, 'False': 0, 'true': 1, 'false': 0,
                             'Sí': 1, 'SI': 1, '1': 1, '0': 0}
        # Match case-insensitively and ignore surrounding whitespace so that
        # variants such as "TRUE", "Si" or " yes " are recognised as well.
        normalized_mapping = {key.lower(): value for key, value in attrition_mapping.items()}
        normalized_labels = y_attr.astype(str).str.strip().str.lower()
        # Unmapped values fall back to the original value, so numeric strings
        # are still converted by astype(int) below and anything else still
        # raises there.
        y_attr = normalized_labels.map(normalized_mapping).fillna(y_attr)

    y_attr = y_attr.astype(int)

    print(f" Distribución de Attrition:")
    attrition_counts = y_attr.value_counts()
    print(f" • Retenidos (0): {attrition_counts.get(0, 0):,} ({attrition_counts.get(0, 0)/len(y_attr)*100:.1f}%)")
    print(f" • Renuncias (1): {attrition_counts.get(1, 0):,} ({attrition_counts.get(1, 0)/len(y_attr)*100:.1f}%)")

    # Check class balance
    minority_class_pct = min(attrition_counts) / len(y_attr) * 100
    if minority_class_pct < 10:
        print(f"️ Dataset desbalanceado - clase minoritaria: {minority_class_pct:.1f}%")

    # Train/test split, stratified on the target
    X_train_attr, X_test_attr, y_train_attr, y_test_attr = train_test_split(
        X, y_attr, test_size=0.2, random_state=42, stratify=y_attr
    )

    print(f"\n División de datos:")
    print(f" • Entrenamiento: {X_train_attr.shape[0]:,} muestras")
    print(f" • Prueba: {X_test_attr.shape[0]:,} muestras")

    # Feature scaling: fitted on the training split only
    scaler_attr = StandardScaler()
    X_train_attr_scaled = scaler_attr.fit_transform(X_train_attr)
    X_test_attr_scaled = scaler_attr.transform(X_test_attr)

    # Models to compare
    models_attr = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Neural Network': MLPClassifier(random_state=42, max_iter=500)
    }

    # Models that are trained and evaluated on the scaled matrices.
    scaled_model_names = ('Logistic Regression', 'Neural Network')

    # Train and evaluate every model
    results_attr = {}

    print(f"\n ENTRENAMIENTO DE MODELOS:")

    for name, model in models_attr.items():
        print(f"\n Entrenando {name}...")

        # Pick the matching train/test matrices once, then fit and predict.
        if name in scaled_model_names:
            train_matrix, test_matrix = X_train_attr_scaled, X_test_attr_scaled
        else:
            train_matrix, test_matrix = X_train_attr, X_test_attr

        model.fit(train_matrix, y_train_attr)
        y_pred = model.predict(test_matrix)
        y_pred_proba = model.predict_proba(test_matrix)[:, 1]

        # Compute metrics
        accuracy = accuracy_score(y_test_attr, y_pred)
        precision = precision_score(y_test_attr, y_pred, average='weighted')
        recall = recall_score(y_test_attr, y_pred, average='weighted')
        f1 = f1_score(y_test_attr, y_pred, average='weighted')
        auc = roc_auc_score(y_test_attr, y_pred_proba)

        results_attr[name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc,
            'predictions': y_pred,
            'probabilities': y_pred_proba
        }

        print(f" Accuracy: {accuracy:.4f}")
        print(f" AUC: {auc:.4f}")
        print(f" F1-Score: {f1:.4f}")

    # Model comparison table, best AUC first
    print(f"\n COMPARACIÓN DE MODELOS DE ATTRITION:")
    comparison_attr = pd.DataFrame({
        'Modelo': list(results_attr.keys()),
        'Accuracy': [results_attr[m]['accuracy'] for m in results_attr.keys()],
        'Precision': [results_attr[m]['precision'] for m in results_attr.keys()],
        'Recall': [results_attr[m]['recall'] for m in results_attr.keys()],
        'F1-Score': [results_attr[m]['f1'] for m in results_attr.keys()],
        'AUC': [results_attr[m]['auc'] for m in results_attr.keys()]
    }).round(4)

    comparison_attr = comparison_attr.sort_values('AUC', ascending=False)
    display(comparison_attr)

    # Best model = first row after sorting by AUC
    best_model_name = comparison_attr.iloc[0]['Modelo']
    best_model_attr = results_attr[best_model_name]['model']

    print(f"\n MEJOR MODELO: {best_model_name}")
    print(f" AUC: {results_attr[best_model_name]['auc']:.4f}")
    print(f" F1-Score: {results_attr[best_model_name]['f1']:.4f}")

    # Result visualisation
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(' Evaluación del Modelo de Predicción de Attrition', fontsize=16, fontweight='bold')

    # 1. Metric comparison: one bar chart per metric on the top row
    x_pos = np.arange(len(comparison_attr))

    for i, metric in enumerate(['AUC', 'F1-Score']):
        axes[0, i].bar(x_pos, comparison_attr[metric], alpha=0.8, color=plt.cm.viridis(np.linspace(0, 1, len(comparison_attr))))
        axes[0, i].set_title(f'Comparación de {metric}')
        axes[0, i].set_xticks(x_pos)
        axes[0, i].set_xticklabels(comparison_attr['Modelo'], rotation=45)
        axes[0, i].set_ylabel(metric)

        # Write the value on top of each bar
        for j, v in enumerate(comparison_attr[metric]):
            axes[0, i].text(j, v + 0.01, f'{v:.3f}', ha='center', va='bottom', fontweight='bold')

    # 2. ROC curve of the best model
    fpr, tpr, _ = roc_curve(y_test_attr, results_attr[best_model_name]['probabilities'])
    axes[1, 0].plot(fpr, tpr, linewidth=2, label=f'{best_model_name} (AUC = {results_attr[best_model_name]["auc"]:.3f})')
    axes[1, 0].plot([0, 1], [0, 1], 'k--', alpha=0.5)
    axes[1, 0].set_xlabel('Tasa de Falsos Positivos')
    axes[1, 0].set_ylabel('Tasa de Verdaderos Positivos')
    axes[1, 0].set_title('Curva ROC')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)

    # 3. Confusion matrix of the best model
    cm = confusion_matrix(y_test_attr, results_attr[best_model_name]['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 1],
                xticklabels=['Retenido', 'Renuncia'], yticklabels=['Retenido', 'Renuncia'])
    axes[1, 1].set_title('Matriz de Confusión')
    axes[1, 1].set_xlabel('Predicción')
    axes[1, 1].set_ylabel('Real')

    plt.tight_layout()
    plt.show()

    # Persist the best model together with the scaler.
    # NOTE(review): only the models listed in scaled_model_names were trained
    # on scaled inputs; the saved files do not record whether the best model
    # needs the scaler at inference time - confirm how consumers decide.
    model_path = '../artifacts/models/'
    os.makedirs(model_path, exist_ok=True)

    joblib.dump(best_model_attr, f'{model_path}best_attrition_model.pkl')
    joblib.dump(scaler_attr, f'{model_path}attrition_scaler.pkl')

    print(f"\n Modelo guardado en: {model_path}best_attrition_model.pkl")

else:
    print("️ Variable objetivo de attrition no encontrada en el dataset")
    print(" Variables disponibles:", list(df_ml.columns))