In [1]:
import pandas as pd
import os


# Ejemplo para cargar breast cancer (ajustá según dataset real)
os.chdir(os.path.dirname(os.path.abspath("__file__")) + "/../notebooks")
df = pd.read_csv("../datasets/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data", header=None)




In [2]:
df.head()
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       699 non-null    int64 
 1   1       699 non-null    int64 
 2   2       699 non-null    int64 
 3   3       699 non-null    int64 
 4   4       699 non-null    int64 
 5   5       699 non-null    int64 
 6   6       699 non-null    object
 7   7       699 non-null    int64 
 8   8       699 non-null    int64 
 9   9       699 non-null    int64 
 10  10      699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

In [3]:
# Renombrar columnas para facilitar el trabajo
df.columns = [f"col_{i}" for i in range(df.shape[1])]

# Verificar valores únicos de la columna con tipo 'object'
print(df['col_6'].unique())

# Convertir columna 6 a numérica (manejo de valores faltantes como '?')
df['col_6'] = pd.to_numeric(df['col_6'], errors='coerce')

# Eliminar filas con valores faltantes (podés imputar si preferís)
df.dropna(inplace=True)

# Separar características y etiqueta
X = df.drop(columns='col_10')
y = df['col_10']

# Ver distribución de clases
print(y.value_counts())


['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
col_10
2    444
4    239
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

# Asegurar carpetas
os.makedirs("../figures", exist_ok=True)
os.makedirs("../resultados", exist_ok=True)

# Escalar datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Técnicas de sobremuestreo
tecnicas = {
    "SMOTE": SMOTE(random_state=42),
    "ADASYN": ADASYN(random_state=42),
    "BorderlineSMOTE": BorderlineSMOTE(random_state=42)
}

# Modelos a evaluar
modelos = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier()
}

resultados = []

# Iterar por cada modelo y técnica
for nombre_modelo, modelo in modelos.items():
    for nombre_tecnica, sampler in tecnicas.items():
        # Aplicar sobremuestreo
        X_res, y_res = sampler.fit_resample(X_scaled, y)

        # Separar en train y test
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

        # Entrenar modelo
        modelo.fit(X_train, y_train)
        y_pred = modelo.predict(X_test)

        # Evaluar
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)

        # Guardar heatmap
        plt.figure(figsize=(5,4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title(f"{nombre_modelo} + {nombre_tecnica}")
        plt.xlabel("Predicción")
        plt.ylabel("Real")
        plt.tight_layout()
        plt.savefig(f"../figures/{nombre_modelo}_{nombre_tecnica}_heatmap.png")
        plt.close()

        # Guardar métricas dinámicamente
        labels = list(map(str, sorted(y.unique())))
        entry = {
            "Modelo": nombre_modelo,
            "Técnica": nombre_tecnica,
            "Accuracy": report.get("accuracy", 0)
        }

        for label in labels:
            if label in report:
                entry[f"Precision ({label})"] = report[label]["precision"]
                entry[f"Recall ({label})"] = report[label]["recall"]
                entry[f"F1-score ({label})"] = report[label]["f1-score"]
            else:
                entry[f"Precision ({label})"] = None
                entry[f"Recall ({label})"] = None
                entry[f"F1-score ({label})"] = None

        resultados.append(entry)

# Guardar métricas en CSV
df_resultados = pd.DataFrame(resultados)
df_resultados.to_csv("../resultados/resultados_baseline.csv", index=False)
df_resultados


Unnamed: 0,Modelo,Técnica,Accuracy,Precision (2),Recall (2),F1-score (2),Precision (4),Recall (4),F1-score (4)
0,RandomForest,SMOTE,0.985019,0.984375,0.984375,0.984375,0.985612,0.985612,0.985612
1,RandomForest,ADASYN,0.981132,1.0,0.96124,0.980237,0.964539,1.0,0.981949
2,RandomForest,BorderlineSMOTE,0.992509,1.0,0.984375,0.992126,0.985816,1.0,0.992857
3,LogisticRegression,SMOTE,0.970037,0.954545,0.984375,0.969231,0.985185,0.956835,0.970803
4,LogisticRegression,ADASYN,0.984906,1.0,0.968992,0.984252,0.971429,1.0,0.985507
5,LogisticRegression,BorderlineSMOTE,0.985019,1.0,0.96875,0.984127,0.972028,1.0,0.985816
6,KNN,SMOTE,0.981273,0.976744,0.984375,0.980545,0.985507,0.978417,0.981949
7,KNN,ADASYN,0.981132,1.0,0.96124,0.980237,0.964539,1.0,0.981949
8,KNN,BorderlineSMOTE,0.988764,1.0,0.976562,0.988142,0.978873,1.0,0.989324
