In [None]:
# Comentario. Importar librerías
import joblib
import pandas as pd
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

class LimpiarComillas(BaseEstimator, TransformerMixin):
    """
    • Limpia comillas simples y espacios en headers y celdas.
    • Quita espacios internos SÓLO en valores numéricos.
    • Convierte strings numéricos a float (NaN si falla) cuando cast_numeric=True.
    """

    def __init__(self, cast_numeric: bool = True):
        self.cast_numeric = cast_numeric

    @staticmethod
    def _clean_cell(val, cast_numeric):
        if not isinstance(val, str):
            return val
        v = val.strip(" '")
        if re.fullmatch(r"[0-9\.,\s]+", v):
            v_num = v.replace(" ", "")
            if cast_numeric:
                if "," in v_num and "." not in v_num:
                    v_num = v_num.replace(",", ".")
                return pd.to_numeric(v_num, errors="coerce")
            return v_num
        return v

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Limpiar nombres de columnas
        cols_raw = X.columns.astype(str).str.strip(" '")
        seen, cols_clean = {}, []
        for c in cols_raw:
            cnt = seen.get(c, 0)
            cols_clean.append(f"{c}_{cnt}" if cnt else c)
            seen[c] = cnt + 1
        X.columns = cols_clean

        # Limpiar celdas en columnas object o string
        obj_cols = X.select_dtypes(include=["object", "string"]).columns
        X[obj_cols] = X[obj_cols].applymap(
            lambda v: self._clean_cell(v, self.cast_numeric)
        )
        return X

class ConvertirObjectAString(BaseEstimator, TransformerMixin):
    """
    Convierte columnas de tipo 'object' o 'string' a tipo 'string' nativo de pandas.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.select_dtypes(include=["object", "string"]).columns:
            X[col] = X[col].astype("string")
        return X

# Pipeline básico de limpieza
pipeline_red = Pipeline([
    ("convertir_a_str", ConvertirObjectAString()),
    ("strip", LimpiarComillas())
])

def crear_pipeline_completo(X):
    """
    Construye un Pipeline completo con:
    - Limpieza de comillas y conversión de objetos a string (pipeline_red)
    - Escalado de columnas numéricas
    - Codificación one-hot de columnas categóricas (salida sparse para ahorro de memoria)
    """
    # Identificar columnas categóricas y numéricas
    columnas_categoricas = X.select_dtypes(include=["object", "string"]).columns.tolist()
    columnas_numericas = X.select_dtypes(include=["number"]).columns.tolist()

    # ColumnTransformer con salida sparse
    preprocesador = ColumnTransformer([
        ("num", StandardScaler(), columnas_numericas),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True, dtype=np.float32), columnas_categoricas)
    ], sparse_threshold=0.0)

    pipeline_completo = Pipeline([
        ("limpieza", pipeline_red),
        ("prepro", preprocesador)
    ])
    return pipeline_completo


# Comentario. Guardar a disco
joblib.dump(pipeline_red, "pipeline_red.pkl")


['pipeline_red.pkl']

In [None]:
import os, joblib, pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scikeras.wrappers import KerasClassifier
from keras import layers, models
import joblib, pandas as pd, numpy as np
from transformadores_red import LimpiarComillas      

pipeline_red = joblib.load('pipeLine_red.pkl')
df = pd.read_csv(
    "Muetra_Taller3.csv",
    header=None,
    quotechar="'",
    skipinitialspace=True,
    dtype=str
)




df_procesado = pipeline_red.transform(df)         
df_procesado.head()

datos = df_procesado.to_numpy()
df_procesado.head()

df_procesado.info()

  X[obj_cols] = X[obj_cols].applymap(


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101656 entries, 0 to 101655
Data columns (total 43 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       101656 non-null  string
 1   1       101656 non-null  string
 2   2       101656 non-null  string
 3   3       101656 non-null  string
 4   4       101656 non-null  string
 5   5       101656 non-null  string
 6   6       101656 non-null  string
 7   7       101656 non-null  string
 8   8       101656 non-null  string
 9   9       101656 non-null  string
 10  10      101656 non-null  string
 11  11      101656 non-null  string
 12  12      101656 non-null  string
 13  13      101656 non-null  string
 14  14      101656 non-null  string
 15  15      101656 non-null  string
 16  16      101656 non-null  string
 17  17      101656 non-null  string
 18  18      101656 non-null  string
 19  19      101656 non-null  string
 20  20      101656 non-null  string
 21  21      101656 non-null  string
 

In [None]:
# Paso 1: Importar librerías y cargar datos
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
from keras.models import Sequential
from keras.layers import Dense
import importlib
import transformadores_red
importlib.reload(transformadores_red)
from transformadores_red import crear_pipeline_completo
import numpy as np

# —— Carga y limpieza básica —— 
df_procesado = pd.read_csv("Muetra_Taller3.csv", encoding="utf-8")
# Si tus columnas tienen comillas o espacios extra:
df_procesado.columns = (
    df_procesado.columns
        .str.strip()
        .str.replace("'", "", regex=False)
)

# Separar columnas
ids         = df_procesado.iloc[:, 0]     # primera columna: ID
X           = df_procesado.iloc[:, 1:-1]  # de la 2ª a la penúltima: features
y           = df_procesado.iloc[:, -1]    # última columna: etiqueta

# Paso 3: Codificar variable objetivo
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Mapeo de clases:", dict(zip(le.classes_, le.transform(le.classes_))))

#Crear pipeline y transformar X
pipeline_completo = crear_pipeline_completo(X)
X_procesado       = pipeline_completo.fit_transform(X)

# Paso 5: Convertir a arrays NumPy
X_array = np.asarray(X_procesado, dtype="float64")
y_array = np.asarray(y_encoded,  dtype="float64")


Mapeo de clases: {" 'Ataque'": np.int64(0), " 'Normal'": np.int64(1)}


  )


MemoryError: Unable to allocate 6.62 GiB for an array with shape (101655, 17481) and data type float32

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

# Definir arquitectura mejorada
model = Sequential()
model.add(Dense(124, activation='relu', input_dim=X_array.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compilar con optimizador ajustado
optimizer = Adam(learning_rate=0.0005)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Callback para detener si no mejora

# Entrenar el modelo con validación y early stopping
model.fit(X_array, y_array,
          epochs=20,
          batch_size=32,
          validation_split=0.2,
          verbose=1)

# Guardar el modelo
model.save("modelo_ANN.keras")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9869 - loss: 0.5659 - val_accuracy: 0.9896 - val_loss: 0.2976
Epoch 2/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9902 - loss: 0.2490 - val_accuracy: 0.9896 - val_loss: 0.1470
Epoch 3/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9898 - loss: 0.1277 - val_accuracy: 0.9896 - val_loss: 0.0877
Epoch 4/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9906 - loss: 0.0777 - val_accuracy: 0.9896 - val_loss: 0.0659
Epoch 5/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9897 - loss: 0.0635 - val_accuracy: 0.9896 - val_loss: 0.0593
Epoch 6/20
[1m2542/2542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9903 - loss: 0.0561 - val_accuracy: 0.9896 - val_loss: 0.0580
Epoch 7/20
[1m

In [None]:
#importar librerías necesarias
import pandas as pd
import numpy as np
import joblib
from keras.models import load_model
from transformadores_red import crear_pipeline_completo, pipeline_red

#cargar los datos de evaluación
df_nuevo = pd.read_csv("Muetra_Taller3_Evaluacion.csv",
                       header=None,
                       quotechar="'",
                       skipinitialspace=True,
                       dtype=str)

#aplicar pipeline de limpieza (comillas, espacios, tipos)
df_limpio = pipeline_red.transform(df_nuevo)

# seleccionar las columnas de entrada (sin ID y sin target)
X_eval = df_limpio.iloc[:, 1:]  # columnas de la 1 a la última

#econstruir el pipeline completo con las columnas del set limpio
pipeline_completo = crear_pipeline_completo(X_eval)

#Paentrenar el pipeline SOLO con los datos originales 
df_entrenamiento = pd.read_csv("Muetra_Taller3.csv",
                                header=None,
                                quotechar="'",
                                skipinitialspace=True,
                                dtype=str)

df_entrenamiento_limpio = pipeline_red.transform(df_entrenamiento)
X_train = df_entrenamiento_limpio.iloc[:, 1:-1]  # sin ID, sin target

pipeline_completo.fit(X_train)  #aquí se entrena el preprocesador!

# transformar los datos de evaluación
X_eval_array = pipeline_completo.transform(X_eval)

modelo = load_model("modelo_ANN.keras")

predicciones = modelo.predict(X_eval_array, verbose=0).flatten()

#guardar 150 IDs más altos
ids = df_nuevo.iloc[:, 0]
top_150_indices = np.argsort(predicciones)[-150:][::-1]
top_150_ids = ids.iloc[top_150_indices]

with open("inspeccion.txt", "w") as f:
    f.write(",".join(map(str, top_150_ids.tolist())))

print("✅ Evaluación completada y archivo 'inspeccion.txt' generado.")


  X[obj_cols] = X[obj_cols].applymap(
  X[obj_cols] = X[obj_cols].applymap(
  X[obj_cols] = X[obj_cols].applymap(
  X[obj_cols] = X[obj_cols].applymap(


✅ Evaluación completada y archivo 'inspeccion.txt' generado.
