## ANÁLISIS DE DATOS EXTREMOS (OUTLIERS) Y PCA (PRINCIPAL COMPONENT ANALYSIS)

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.covariance import MinCovDet
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.stats import anderson
import seaborn as sns

# Cargar el dataset ajustado
df = pd.read_csv('https://raw.githubusercontent.com/ringoquimico/Portfolio/refs/heads/main/Data%20Sources/call_center_data.csv', sep=';', quotechar='"')

# Codificar variables categóricas
categorical_vars = ['channel', 'classification', 'resolved_in_sla', 'first_touch_resolution', 'csat_rated_group_name', 'issue_classification']
df_encoded = pd.get_dummies(df, columns=categorical_vars, drop_first=False)



In [46]:
df.columns

Index(['case_id', 'date', 'start_of_week', 'start_of_month',
       'translated_comments', 'sentiment', 'sentiment_rate', 'channel',
       'resolution_time_min', 'csat_rating_received', 'classification',
       'resolved_in_sla', 'first_touch_resolution', 'group_name_history',
       'groups', 'total_groups', 'csat_rated_group_name',
       'issue_classification'],
      dtype='object')

In [47]:
df_encoded.columns

Index(['case_id', 'date', 'start_of_week', 'start_of_month',
       'translated_comments', 'sentiment', 'sentiment_rate',
       'resolution_time_min', 'csat_rating_received', 'group_name_history',
       'groups', 'total_groups', 'channel_chatbot', 'channel_email',
       'channel_phone', 'classification_DETRACTOR', 'classification_PROMOTER',
       'resolved_in_sla_0', 'resolved_in_sla_1', 'first_touch_resolution_0',
       'first_touch_resolution_1', 'csat_rated_group_name_Grupo A',
       'csat_rated_group_name_Grupo B', 'csat_rated_group_name_Grupo C',
       'csat_rated_group_name_Grupo D', 'csat_rated_group_name_Grupo E',
       'csat_rated_group_name_Grupo F', 'csat_rated_group_name_Grupo G',
       'csat_rated_group_name_Grupo H', 'csat_rated_group_name_Grupo I',
       'csat_rated_group_name_Grupo J', 'issue_classification_Account Setup',
       'issue_classification_Balance Inquiry',
       'issue_classification_Card Issues', 'issue_classification_Fees Inquiry',
       'issu

### OUTLIERS Y PCA

In [34]:

# Definir variables numéricas para PCA (incluye diferencia para detectar tiempos muertos)
df_clean = df_encoded.copy()
df_clean['dead_time'] = df_clean['resolution_time_min'] - df_clean['aht']
var_list = [
    'resolution_time_min', 'csat_rating_received', 'sentiment_rate', 'total_groups',
    'aht', 'talk_time', 'hold_time', 'wrap_up_time', 'dead_time',
    'channel_phone', 'channel_email', 'channel_chatbot' , 'resolved_in_sla_1', 'resolved_in_sla_0', 'first_touch_resolution_True'
]
id_columns = ['case_id']

# Eliminar NaN y preparar datos
df_clean = df_clean.dropna(subset=id_columns + var_list)
X = df_clean[var_list].values

# Verificar el número de muestras
print(f"Número de muestras después de eliminar NaN: {X.shape[0]}")
n_samples = X.shape[0]
n_outliers_expected = int(n_samples * 0.1)
print(f"Número esperado de outliers (10% de contaminación): {n_outliers_expected}")

# Estandarizar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Verificar normalidad con Anderson-Darling
print("\nPrueba de normalidad (Anderson-Darling) por variable:")
is_gaussian = []
for i, var in enumerate(var_list):
    result = anderson(X_scaled[:, i])
    is_gaussian_var = result.statistic < result.critical_values[2]
    is_gaussian.append(is_gaussian_var)
    print(f"{var}: statistic = {result.statistic:.4f}, critical value (5%) = {result.critical_values[2]:.4f}, Gaussian = {is_gaussian_var}")

# Decidir método
gaussian_count = sum(is_gaussian)
total_vars = len(var_list)
print(f"\nPorcentaje de variables gaussianas: {(gaussian_count / total_vars) * 100:.1f}%")
method = "MinCovDet" if (gaussian_count / total_vars) < 0.5 else "EllipticEnvelope"
print(f"Método sugerido: {method}")

# Reducir a 2D con PCA
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_scaled)

# Loadings
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
print("\nContribuciones de las variables a las Componentes Principales (loadings):")
for i, var in enumerate(var_list):
    print(f"{var}: PC1 = {loadings[i, 0]:.4f}, PC2 = {loadings[i, 1]:.4f}")
print(f"Porcentaje de varianza explicada: PC1 = {pca.explained_variance_ratio_[0] * 100:.2f}%, PC2 = {pca.explained_variance_ratio_[1] * 100:.2f}%")

pc1_dominant = var_list[np.argmax(np.abs(loadings[:, 0]))]
pc2_dominant = var_list[np.argmax(np.abs(loadings[:, 1]))]
print(f"\nPC1 está dominada principalmente por: {pc1_dominant}")
print(f"PC2 está dominada principalmente por: {pc2_dominant}")

# Detección de outliers con MinCovDet
model = MinCovDet(random_state=42).fit(X_2d)
mahal_distances = model.mahalanobis(X_2d)
threshold = np.percentile(mahal_distances, 100 * (1 - 0.1))
outlier_indices = np.where(mahal_distances > threshold)[0]

# Tukey
q1_x = np.percentile(X_2d[:, 0], 25)
q3_x = np.percentile(X_2d[:, 0], 75)
iqr_x = q3_x - q1_x
lower_bound_x = q1_x - 1.5 * iqr_x
upper_bound_x = q3_x + 1.5 * iqr_x

q1_y = np.percentile(X_2d[:, 1], 25)
q3_y = np.percentile(X_2d[:, 1], 75)
iqr_y = q3_y - q1_y
lower_bound_y = q1_y - 1.5 * iqr_y
upper_bound_y = q3_y + 1.5 * iqr_y

tukey_outliers_x = (X_2d[:, 0] < lower_bound_x) | (X_2d[:, 0] > upper_bound_x)
tukey_outliers_y = (X_2d[:, 1] < lower_bound_y) | (X_2d[:, 1] > upper_bound_y)
tukey_outlier_indices = np.where(tukey_outliers_x | tukey_outliers_y)[0]

outlier_types_tukey = []
for idx in tukey_outlier_indices:
    x_val = X_2d[idx, 0]
    y_val = X_2d[idx, 1]
    if x_val > upper_bound_x or y_val > upper_bound_y:
        outlier_types_tukey.append("Superior")
    elif x_val < lower_bound_x or y_val < lower_bound_y:
        outlier_types_tukey.append("Inferior")
    else:
        outlier_types_tukey.append("Unknown")

# Preparar DataFrames
df_clean = df_clean.reset_index(drop=True)
df_clean['Key'] = df_clean['case_id'].astype(str)
df_outliers = df_clean.iloc[outlier_indices].copy()
df_outliers['Tipo_Outlier'] = ['Superior' if mahal_distances[idx] > threshold else 'Inferior' for idx in outlier_indices]
df_outliers['Metodo_Seleccionado'] = 'MinCovDet'

df_outliers_tukey = df_clean.iloc[tukey_outlier_indices].copy()
df_outliers_tukey['Tipo_Outlier'] = outlier_types_tukey
df_outliers_tukey['Metodo_Seleccionado'] = 'Tukey'

# Visualización
plt.figure(figsize=(12, 8))
inliers = X_2d[mahal_distances <= threshold]
outliers_method = X_2d[outlier_indices]
outliers_tukey = X_2d[tukey_outlier_indices]
plt.scatter(inliers[:, 0], inliers[:, 1], color='black', label='Inliers')
plt.scatter(outliers_method[:, 0], outliers_method[:, 1], color='red', label='Outliers (MinCovDet)')
plt.scatter(outliers_tukey[:, 0], outliers_tukey[:, 1], color='blue', label='Outliers (Tukey)', alpha=0.5)
plt.title(f"Outlier Detection for Low CSAT Scores (PCA 2D)\nVarianza explicada: PC1 {pca.explained_variance_ratio_[0]*100:.2f}%, PC2 {pca.explained_variance_ratio_[1]*100:.2f}%")
plt.xlabel(f'PC1 (Dominada por {pc1_dominant})')
plt.ylabel(f'PC2 (Dominada por {pc2_dominant})')
plt.legend()
plt.show()

# Heatmap de correlaciones
plt.figure(figsize=(12, 10))
correlation_matrix = df_clean[var_list].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap for CSAT Variables')
plt.show()

# Exportar resultados
df_outliers.to_csv('csat_outliers_adjusted.csv', index=False)
df_outliers_tukey.to_csv('csat_outliers_tukey_adjusted.csv', index=False)
print("\nArchivos 'csat_outliers_adjusted.csv' y 'csat_outliers_tukey_adjusted.csv' creados.")

KeyError: 'aht'

### ANÁLISIS DE RESULTADOS

In [None]:

# Cargar datos
df_clean = pd.read_csv('call_center_data_adjusted.csv', sep=';')
var_list = [
    'resolution_time_min', 'csat_rating_received', 'sentiment_rate', 'total_groups',
    'aht', 'talk_time', 'hold_time', 'wrap_up_time', 'dead_time',
    'channel_phone', 'channel_email', 'resolved_in_sla_True', 'first_touch_resolution_True'
]
id_columns = ['case_id']

# Codificar variables categóricas
df_encoded = pd.get_dummies(df_clean, columns=['channel', 'classification', 'resolved_in_sla', 'first_touch_resolution', 'csat_rated_group_name', 'issue_classification'], drop_first=True)
df_clean = df_encoded.dropna(subset=id_columns + var_list).reset_index(drop=True)
df_clean['dead_time'] = df_clean['resolution_time_min'] - df_clean['aht']
X = df_clean[var_list].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_scaled)
df_outliers = pd.read_csv('csat_outliers_adjusted.csv')
outlier_indices = df_outliers.index

# Paso 1: Validar causas de CSAT scores bajos
print("\n=== Paso 1: Validar causas de CSAT scores bajos ===")
low_csat_outliers = df_outliers[df_outliers['csat_rating_received'] < 3]
print("\nOutliers con CSAT scores bajos (< 3):")
print(low_csat_outliers[['Key', 'csat_rating_received', 'resolution_time_min', 'sentiment_rate', 'total_groups', 'dead_time', 'translated_comments']].head(5))

correlation_with_csat = df_clean[var_list].corr()['csat_rating_received'].sort_values(ascending=False)
print("\nCorrelación de csat_rating_received con otras variables:")
print(correlation_with_csat)
main_cause = correlation_with_csat.index[1]
print(f"\nVariable más correlacionada con CSAT scores bajos (posible causa): {main_cause}")

# Paso 2: Investigar causas secundarias con más componentes
print("\n=== Paso 2: Investigar causas secundarias con más componentes ===")
pca_extended = PCA(n_components=0.8)
X_pca = pca_extended.fit_transform(X_scaled)
print(f"Número de componentes para el 80% de varianza: {pca_extended.n_components_}")
print(f"Varianza total explicada: {sum(pca_extended.explained_variance_ratio_)*100:.2f}%")

loadings_pc3 = pca_extended.components_[2]
print("\nLoadings de PC3:")
for i, var in enumerate(var_list):
    print(f"{var}: {loadings_pc3[i]:.4f}")
pc3_dominant = var_list[np.argmax(np.abs(loadings_pc3))]
print(f"PC3 está dominada principalmente por: {pc3_dominant}")

q1_pc3 = np.percentile(X_pca[:, 2], 25)
q3_pc3 = np.percentile(X_pca[:, 2], 75)
iqr_pc3 = q3_pc3 - q1_pc3
lower_bound_pc3 = q1_pc3 - 1.5 * iqr_pc3
upper_bound_pc3 = q3_pc3 + 1.5 * iqr_pc3
outliers_pc3 = (X_pca[:, 2] < lower_bound_pc3) | (X_pca[:, 2] > upper_bound_pc3)
outliers_pc3_indices = np.where(outliers_pc3)[0]
print(f"\nNúmero de outliers en PC3: {len(outliers_pc3_indices)}")

df_outliers_pc3 = df_clean.iloc[outliers_pc3_indices].copy()
df_outliers_pc3['Tipo_Outlier'] = np.where(X_pca[outliers_pc3_indices, 2] > upper_bound_pc3, 'Superior', 'Inferior')
print("\nOutliers en PC3 (primeras filas):")
print(df_outliers_pc3[['Key', pc3_dominant, 'csat_rating_received', 'translated_comments']].head())

# Paso 3: Segmentar por PC2
print("\n=== Paso 3: Segmentar outliers por PC2 ===")
pc2_threshold_high = np.percentile(X_2d[:, 1], 75)
pc2_threshold_low = np.percentile(X_2d[:, 1], 25)

high_impact = df_outliers[X_2d[outlier_indices, 1] > pc2_threshold_high]
low_impact = df_outliers[X_2d[outlier_indices, 1] < pc2_threshold_low]

print("\nOutliers con alta influencia de PC2 (posiblemente canales problemáticos):")
print(high_impact[['Key', 'csat_rating_received', pc2_dominant, 'translated_comments']].head())
print("\nOutliers con baja influencia de PC2:")
print(low_impact[['Key', 'csat_rating_received', pc2_dominant, 'translated_comments']].head())

avg_csat_high = high_impact['csat_rating_received'].mean()
avg_csat_low = low_impact['csat_rating_received'].mean()
print(f"\nCSAT promedio en casos de alta influencia (PC2): {avg_csat_high:.2f}")
print(f"CSAT promedio en casos de baja influencia (PC2): {avg_csat_low:.2f}")

# Paso 4: Estrategias para mejorar CSAT
print("\n=== Paso 4: Estrategias para mejorar CSAT scores ===")
print("Conclusión basada en datos:")
print(f"- La variable más correlacionada con CSAT scores bajos es: {main_cause}.")
print(f"- PC3, dominada por {pc3_dominant}, sugiere causas secundarias (como tiempos muertos si es 'dead_time').")
print(f"- Los casos con alta influencia de PC2 (probablemente {pc2_dominant}) tienen un CSAT promedio de {avg_csat_high:.2f}, vs. {avg_csat_low:.2f} en baja influencia.")

print("\nRecomendaciones:")
print(f"1. Prioriza reducir {main_cause}: Si es 'resolution_time_min' o 'dead_time', optimiza los procesos para eliminar tiempos muertos; si es 'total_groups', minimiza transferencias.")
print(f"2. Aborda causas secundarias: Analiza outliers en PC3 ({pc3_dominant}) para problemas como 'hold_time' o excesivo 'dead_time'.")
print(f"3. Enfócate en canales de alta influencia: Si {pc2_dominant} es 'channel_phone', mejora el soporte telefónico.")
print("4. Monitorea casos con CSAT bajo: Usa los archivos CSV para seguimiento y analiza los 'translated_comments' para identificar patrones cualitativos.")