# Modelado

### Importación de librerías

In [6]:
import pandas as pd
import numpy as np

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

### Carga de datos

In [12]:
# Load the cleaned complaints dataset. low_memory=False disables pandas'
# chunked parsing so column dtypes are inferred from the whole file.
df = pd.read_csv(
    "../data/processed/reclamos_clean.csv",
    low_memory=False,
)

In [14]:
# Preview the first rows to check the loaded columns and values.
df.head()

Unnamed: 0,ID_PERIODO,DE_TIPO_INSTITUCION,DE_MEDIO_PRESENTACION,DE_MEDIO_RECEPCION,FE_PRESEN_RECLA,DE_SERVICIO,DE_COMPETENCIA,DE_CLASIF_1,DE_ESTADO_RECLAMO,DE_ETAPA_RECLAMO,DESCRIPCION,MES,DIA_SEMANA,AÑO,DIA_MES,ES_FIN_SEMANA,AÑO_ID_PERIODO,MES_ID_PERIODO
0,202101,IAFAS,Físico,Libro de Reclamaciones Físico,2020-11-27,Desconocido,Si,Negar o demora en otorgar la cobertura en salud,Resuelto,Resultado y Notificación,Sin descripción,11,4,2020,27,0,2021,1
1,202101,IAFAS,Físico,Libro de Reclamaciones Físico,2021-01-04,Desconocido,Si,Cobrar indebidamente,Resuelto,Resultado y Notificación,Me cobraron por una vacuna que estaba incluida...,1,0,2021,4,0,2021,1
2,202101,IAFAS,Físico,Libro de Reclamaciones Físico,2021-01-06,Desconocido,Si,Otros relativos a las IAFAS,Concluido,Archivo y Custodia del Expediente,Las ambulancias de EsSalud no llegaron a tiemp...,1,2,2021,6,0,2021,1
3,202101,IAFAS,Físico,Libro de Reclamaciones Físico,2021-01-06,Desconocido,Si,Otros relativos a las IAFAS,Concluido,Archivo y Custodia del Expediente,La atención que brindan las IAFAS no ha sido d...,1,2,2021,6,0,2021,1
4,202101,IAFAS,Físico,Libro de Reclamaciones Físico,2021-01-08,Desconocido,Si,Otros relativos a las IAFAS,En trámite,Evaluación e investigación,He tenido problemas para acceder a mis servici...,1,4,2021,8,0,2021,1


### Variables categóricas

In [17]:
# Categorical columns handed to the one-hot encoder of the preprocessing step.
categorical_cols_onehot = [
    'DE_MEDIO_PRESENTACION',
    'DE_MEDIO_RECEPCION',
    'DE_ESTADO_RECLAMO',
    'DE_ETAPA_RECLAMO',
    'DE_SERVICIO',
]

In [19]:
# Categorical columns handed to the ordinal encoder of the preprocessing step.
categorical_cols_ordinal = [
    'DE_TIPO_INSTITUCION',
    'DE_COMPETENCIA',
]

In [31]:
# Name of the free-text column that is vectorised with TF-IDF.
text_col = 'DESCRIPCION'

In [33]:
# 6. Dividir en train y test

In [35]:
# Feature matrix: every predictor used by the model. The target column
# DE_CLASIF_1 is intentionally not part of this selection.
X = df[[
    # categorical descriptors of the complaint
    'DE_TIPO_INSTITUCION',
    'DE_MEDIO_PRESENTACION',
    'DE_MEDIO_RECEPCION',
    'DE_SERVICIO',
    'DE_COMPETENCIA',
    'DE_ESTADO_RECLAMO',
    'DE_ETAPA_RECLAMO',
    # free text
    'DESCRIPCION',
    # numeric calendar features
    'DIA_MES',
    'MES',
    'DIA_SEMANA',
    'ES_FIN_SEMANA',
    'AÑO_ID_PERIODO',
    'MES_ID_PERIODO',
]]

In [37]:
# Target variable: the complaint classification label.
y = df['DE_CLASIF_1']

In [39]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [55]:
import nltk

In [56]:
# Fetch the NLTK stop-word corpus (reported as up-to-date when already present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
from nltk.corpus import stopwords

In [59]:
# Spanish stop-word list, passed to the TF-IDF vectorizer as `stop_words`.
stop_words_spanish = stopwords.words('spanish')

In [63]:
# 8. Build the preprocessing step.
#   - onehot:  one-hot encodes the nominal columns; a category not seen during
#              fit is ignored (encoded as all zeros) instead of raising.
#   - ordinal: integer-codes the remaining categorical columns; a category not
#              seen during fit is mapped to -1 instead of raising at predict time.
#   - text:    TF-IDF over unigrams and bigrams of the description, without
#              Spanish stop words, ignoring terms found in fewer than 5
#              documents and keeping at most 3000 features.
# Columns not listed in any transformer (the numeric ones) pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols_onehot),
        (
            'ordinal',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_cols_ordinal,
        ),
        (
            'text',
            TfidfVectorizer(
                stop_words=stop_words_spanish,
                max_features=3000,
                ngram_range=(1, 2),
                min_df=5,
            ),
            text_col,
        ),
    ],
    remainder='passthrough',
)

In [65]:
# Baseline model: the preprocessing step followed by a 100-tree random forest
# with a fixed seed.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(random_state=42, n_estimators=100),
        ),
    ]
)

In [67]:
# Fit the preprocessing step and the classifier on the training split.
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [69]:
# Accuracy on both splits; a large gap between them points to overfitting.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Accuracy en Train:", train_accuracy)
print("Accuracy en Test:", test_accuracy)

Accuracy en Train: 0.9974329610870718
Accuracy en Test: 0.9587739793408756


In [70]:
# Predicted class label for every row of the held-out split.
y_pred = model.predict(X_test)

In [71]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, y_pred))

                                                                                  precision    recall  f1-score   support

                                                                               -       0.99      1.00      1.00     12654
                                                            Cobrar indebidamente       1.00      1.00      1.00      2119
                      Demorar la gestión de la carta de garantía y/o reembolsos.       0.73      0.65      0.68      1055
                                Negar  o demora en otorgar la cobertura en salud       0.71      0.68      0.69      1821
                       Negar atención para el trámite de registro o acreditación       0.99      0.99      0.99       189
                     Negar el otorgamiento de prestaciones económicas o sociales       1.00      0.94      0.97        47
                                     Negar la acreditación de usuario asegurado.       0.78      0.83      0.81      2121
                       

In [22]:
# Separate the predictors from the target column DE_CLASIF_1.
# NOTE(review): df_clean is not created in the cells visible here — confirm it
# is defined before this cell runs on a fresh kernel.
X = df_clean.drop(columns=['DE_CLASIF_1'])
y = df_clean['DE_CLASIF_1']

In [25]:
# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [26]:
# Report (rows, columns) of each split, one per line.
print(
    f"Tamaño entrenamiento: {X_train.shape}",
    f"Tamaño prueba: {X_test.shape}",
    sep="\n",
)

Tamaño entrenamiento: (130111, 16)
Tamaño prueba: (32528, 16)


In [27]:
# Low-cardinality categorical columns (one-hot encoded further down).
cat_bajo_cardinal = [
    'DE_TIPO_ADMINISTRADO',
    'DE_TIPO_INSTITUCION',
    'DE_MEDIO_PRESENTACION',
    'DE_MEDIO_RECEPCION',
    'DE_SERVICIO',
    'DE_COMPETENCIA',
]

# High-cardinality code columns (ordinal encoded further down).
cat_alto_cardinal = [
    'CO_ADMIN_DECLA',
    'CO_ADMIN_SUCE',
    'CO_UGIPRESS',
]

# Numeric calendar / period columns.
numericas = [
    'MES',
    'DIA_SEMANA',
    'AÑO',
    'DIA_MES',
    'PERIODO_AÑO',
    'PERIODO_MES',
]

In [28]:
def _present_in_train(columns):
    """Return the entries of `columns` that are columns of X_train, order kept."""
    return [name for name in columns if name in X_train.columns]


# Drop any configured column name that the training frame does not contain.
cat_bajo_cardinal = _present_in_train(cat_bajo_cardinal)
cat_alto_cardinal = _present_in_train(cat_alto_cardinal)
numericas = _present_in_train(numericas)

In [33]:
# Cast the low-cardinality categorical columns of `df` to string, announcing
# each column that was not already of 'object' dtype.
# NOTE(review): this touches `df`, not X_train / X_test — confirm that is intended.
for col in cat_bajo_cardinal:
    if df[col].dtype == 'object':
        continue
    print(f"{col} no es tipo object. Convertir a string.")
    df[col] = df[col].astype(str)

In [34]:
# Same cast for the high-cardinality categorical columns of `df`.
# NOTE(review): this touches `df`, not X_train / X_test — confirm that is intended.
for col in cat_alto_cardinal:
    if df[col].dtype == 'object':
        continue
    print(f"{col} no es tipo object. Convertir a string.")
    df[col] = df[col].astype(str)

In [35]:
# Silently force every categorical column of `df` (low and high cardinality)
# that is not already of 'object' dtype to string.
# NOTE(review): this touches `df`, not X_train / X_test — confirm that is intended.
for col in cat_bajo_cardinal + cat_alto_cardinal:
    if df[col].dtype == 'object':
        continue
    df[col] = df[col].astype(str)


In [51]:
# Show the dtype of every categorical column in the training split.
# NOTE(review): the recorded run raised NameError for cat_bajo_cardinal, so this
# cell was apparently executed before the column lists were defined — confirm
# the cell order under Restart & Run All.
print(X_train[cat_bajo_cardinal + cat_alto_cardinal].dtypes)


NameError: name 'cat_bajo_cardinal' is not defined

In [37]:
def _as_str_keep_missing(frame, columns):
    """Return a copy of `frame` whose `columns` hold strings, with missing values kept missing.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to convert; it is not modified.
    columns : list of str
        Column names to cast to string.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns converted.
    """
    result = frame.copy()
    for col in columns:
        # A plain astype(str) turns NaN into the literal text 'nan', which the
        # SimpleImputer(fill_value='missing') steps can no longer recognise as
        # missing. Restoring NaN where the source was null keeps them imputable.
        result[col] = result[col].astype(str).where(result[col].notna())
    return result


# Work on copies instead of assigning into the frames returned by
# train_test_split, which keeps the cell idempotent.
_categorical_cols = cat_bajo_cardinal + cat_alto_cardinal
X_train = _as_str_keep_missing(X_train, _categorical_cols)
X_test = _as_str_keep_missing(X_test, _categorical_cols)


In [38]:
# Accumulates the (name, transformer, columns) entries for the ColumnTransformer.
transformadores = []

In [39]:
# Text branch, added only when the description column exists: TF-IDF over
# unigrams and bigrams, Spanish stop words removed, terms found in fewer than
# 5 documents ignored, at most 3000 features.
if 'DESCRIPCION' in X_train.columns:
    transformadores.append((
        'text',
        TfidfVectorizer(
            stop_words=stop_words_spanish,
            max_features=3000,
            ngram_range=(1, 2),
            min_df=5,
        ),
        'DESCRIPCION',
    ))

In [40]:
# Low-cardinality branch, added only when the list is non-empty: fill missing
# values with the constant 'missing', then one-hot encode into a dense array,
# ignoring categories that were not seen during fit.
if cat_bajo_cardinal:
    transformadores.append((
        'cat_low',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ]),
        cat_bajo_cardinal,
    ))

In [41]:
# High-cardinality branch, added only when the list is non-empty: fill missing
# values with the constant 'missing', then integer-code the categories, mapping
# any category not seen during fit to -1.
if cat_alto_cardinal:
    transformadores.append((
        'cat_high',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            (
                'ordinal',
                OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ),
        ]),
        cat_alto_cardinal,
    ))

In [42]:
# Numeric branch, added only when the list is non-empty: replace missing values
# with the column median; no scaling is applied.
if numericas:
    transformadores.append((
        'num',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
        ]),
        numericas,
    ))

In [43]:
# Combine the collected branches; any column not claimed by one is dropped.
preprocessor = ColumnTransformer(
    transformers=transformadores,
    remainder='drop',
)

In [44]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


In [52]:
# XGBoost
from sklearn.preprocessing import LabelEncoder

# Integer-code the target labels (y): the encoder learns the classes from the
# training labels and the same mapping is then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# XGBoost pipeline: the preprocessing step followed by a multiclass gradient
# boosted classifier.
# `use_label_encoder` is no longer passed: it is a deprecated XGBClassifier
# argument that current XGBoost releases do not use. The labels are
# integer-coded separately with LabelEncoder (y_train_encoded).
# NOTE(review): a later cell rebinds `pipeline` to a random-forest pipeline —
# confirm which one is meant to be active when fitting.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='multi:softprob',  # per-class probabilities for multiclass problems
        eval_metric='mlogloss',      # multiclass log loss
        random_state=42,
        n_jobs=-1                    # use all available cores
    ))
])

In [45]:
# Random-forest pipeline: the preprocessing step followed by a 100-tree forest.
# class_weight='balanced' weights classes inversely to their frequency.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                class_weight='balanced',
            ),
        ),
    ]
)

In [47]:
# Fit the pipeline on the training split, printing a message before and after.
print("Entrenando modelo...")
# Alternative call, presumably for the XGBoost pipeline, which needs the
# integer-coded labels:
# pipeline.fit(X_train, y_train_encoded)
pipeline.fit(X_train, y_train)
print("Modelo entrenado.")

Entrenando modelo...
Modelo entrenado.


In [46]:
# Predict the class of every row in the held-out split.
# NOTE(review): the recorded run raised NotFittedError, so this cell was
# apparently executed before pipeline.fit — confirm the cell order under
# Restart & Run All.
y_pred = pipeline.predict(X_test)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [68]:
# XGBoost: predictions on the held-out split.
# NOTE(review): the name suggests integer-coded predictions, which presumably
# requires `pipeline` to be the one fitted on y_train_encoded — confirm.
y_pred_encoded = pipeline.predict(X_test)

In [70]:
# Decode the predictions: map the integer codes back to the original class names.
y_pred = label_encoder.inverse_transform(y_pred_encoded)

In [72]:
# Print a heading, then per-class precision, recall, F1 and support on the held-out split.
print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))


Reporte de clasificación:
                                                                                  precision    recall  f1-score   support

                                                                               -       1.00      1.00      1.00     12654
                                                            Cobrar indebidamente       1.00      1.00      1.00      2119
                      Demorar la gestión de la carta de garantía y/o reembolsos.       0.86      0.87      0.86      1055
                                Negar  o demora en otorgar la cobertura en salud       0.79      0.80      0.79      1821
                       Negar atención para el trámite de registro o acreditación       1.00      1.00      1.00       189
                     Negar el otorgamiento de prestaciones económicas o sociales       1.00      1.00      1.00        47
                                     Negar la acreditación de usuario asegurado.       0.88      0.87      0.88      2

In [None]:
# Confusion matrix of the held-out split as an annotated heatmap.
# The label list is built from both y_test and y_pred and passed explicitly to
# confusion_matrix, so matrix rows/columns and tick labels always line up.
# Taking the ticks from y_test alone would misalign the axes if a class showed
# up only in the predictions.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',          # integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Matriz de Confusión')
ax.set_ylabel('Real')
ax.set_xlabel('Predicho')
fig.tight_layout()
plt.show()

In [80]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.0

In [82]:
# Neural network: target preparation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.utils import to_categorical

# Label encoding: class names -> integer codes, learned from the training labels.
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.transform(y_test)

# One-hot encoding. The width is fixed to the number of classes known to the
# encoder; without num_classes, to_categorical infers it from the largest code
# in each array, so train and test matrices could end up with different widths.
n_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train_int, num_classes=n_classes)
y_test_cat = to_categorical(y_test_int, num_classes=n_classes)


In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

# Feed-forward classifier: 128 ReLU units, 30% dropout, 64 ReLU units, then a
# softmax layer with one unit per class known to label_encoder.
# The input width is declared with an explicit Input layer; passing input_dim
# to the first Dense layer triggers a Keras 3 UserWarning.
# NOTE(review): X_train.shape[1] is the column count of X_train as it stands;
# if the network is meant to train on preprocessed features, the width must
# match that matrix instead — confirm.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [88]:
# Inspect the column dtypes of the training split.
print(X_train.dtypes)


ID_PERIODO                        int64
CO_ADMIN_DECLA                    int64
CO_UGIPRESS                       int64
DE_TIPO_INSTITUCION              object
CO_ADMIN_SUCE                     int64
DE_MEDIO_PRESENTACION            object
CO_UNICO_RECLAMO                 object
DE_MEDIO_RECEPCION               object
FE_PRESEN_RECLA          datetime64[ns]
DE_SERVICIO                      object
DE_COMPETENCIA                   object
AÑO                               int32
MES                               int32
DESCRIPCION                      object
DIA_SEMANA                        int32
DIA_MES                           int32
dtype: object


In [90]:
# Compile for multiclass classification: categorical cross-entropy (matching the
# one-hot targets), Adam optimizer, accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 20 epochs in batches of 32, with 20% of the training data held out
# for validation.
# NOTE(review): the recorded run of this cell failed with
# `ValueError: Invalid dtype: object`. X_train presumably has to be turned into
# a numeric matrix (e.g. through the preprocessor) before it is passed here, and
# the model's input width must then match that matrix — confirm.
model.fit(X_train, y_train_cat, epochs=20, batch_size=32, validation_split=0.2)


ValueError: Invalid dtype: object

In [None]:
from sklearn.metrics import classification_report

# Predictions: class probabilities -> index of the most likely class ->
# original class name.
y_pred_prob = model.predict(X_test)
y_pred_int = np.argmax(y_pred_prob, axis=1)
y_pred = label_encoder.inverse_transform(y_pred_int)

# Report: per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, y_pred))
