In [1]:
!pip install kagglehub[pandas-datasets] --quiet

In [2]:
# @title Descargar el dataset y obtener la ruta local

import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import numpy as np


# Esto descarga el dataset y te devuelve una ruta local
dataset_dir = kagglehub.dataset_download("yasserh/titanic-dataset")
dataset_dir

Using Colab cache for faster access to the 'titanic-dataset' dataset.


'/kaggle/input/titanic-dataset'

In [3]:
# @title Ver qué archivos trae y elegir el file_path correcto
import os
for root, dirs, files in os.walk(dataset_dir):
    for f in files:
        print(os.path.join(root, f))


/kaggle/input/titanic-dataset/Titanic-Dataset.csv


In [4]:
# @title Cargar el archivo

file_path = "Titanic-Dataset.csv"  # <-- cámbialo EXACTO como aparece en el listado

df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "yasserh/titanic-dataset",
    file_path
)

df.head()


  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'titanic-dataset' dataset.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# @title Ver las columnas del dataset

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
#  @title Identificar el TARGET (y)

df["Survived"].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,549
1,342


In [7]:
#  @title Definir X e y correctamente

target = "Survived"

drop_cols = ["PassengerId", "Name", "Ticket", "Cabin"]

X = df.drop(columns=[target] + drop_cols)
y = df[target].astype(int)

X.head(), y.head()


(   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
 0       3    male  22.0      1      0   7.2500        S
 1       1  female  38.0      1      0  71.2833        C
 2       3  female  26.0      0      0   7.9250        S
 3       1  female  35.0      1      0  53.1000        S
 4       3    male  35.0      0      0   8.0500        S,
 0    0
 1    1
 2    1
 3    1
 4    0
 Name: Survived, dtype: int64)

In [8]:
#  @title Entender QUÉ tipo de datos hay en X

X.dtypes

Unnamed: 0,0
Pclass,int64
Sex,object
Age,float64
SibSp,int64
Parch,int64
Fare,float64
Embarked,object


In [9]:
#  @title Separar columnas numéricas y categóricas

num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

num_cols, cat_cols


(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], ['Sex', 'Embarked'])

In [10]:
#  @title Split Train-Test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y
)

X_train.shape, X_test.shape


((712, 7), (179, 7))

In [11]:
# @title Pipeline numérico

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


In [12]:
# @title Pipeline categórico

from sklearn.preprocessing import OneHotEncoder

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])


In [13]:
# @title Unir todo con ColumnTransformer

from sklearn.compose import ColumnTransformer

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)


In [14]:
# @title Ejecucion Pipeline

X_train_p = preprocess.fit_transform(X_train)
X_test_p  = preprocess.transform(X_test)


## Baseline model

In [15]:
# @title Regresion Logistica como Modelo Base

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

baseline = LogisticRegression(max_iter=5000)
baseline.fit(X_train_p, y_train)

In [16]:
proba_base = baseline.predict_proba(X_test_p)[:, 1]

In [17]:
pred_base = (proba_base >= 0.5).astype(int)

In [18]:
print("AUC baseline:", roc_auc_score(y_test, proba_base))
print(classification_report(y_test, pred_base))

AUC baseline: 0.8383399209486166
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       110
           1       0.75      0.68      0.71        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



## Construcción de la red neuronal (MLP)

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [20]:
# @title Convertir X_train_p y X_test_p a tensores

X_train_nn = X_train_p.toarray() if hasattr(X_train_p, "toarray") else np.array(X_train_p)
X_test_nn  = X_test_p.toarray()  if hasattr(X_test_p, "toarray")  else np.array(X_test_p)

y_train_np = y_train.values if hasattr(y_train, "values") else np.array(y_train)
y_test_np  = y_test.values  if hasattr(y_test, "values")  else np.array(y_test)

# Tensores float32 para X, float32 para y (porque usaremos BCEWithLogitsLoss)
X_train_t = torch.tensor(X_train_nn, dtype=torch.float32)
X_test_t  = torch.tensor(X_test_nn, dtype=torch.float32)
y_train_t = torch.tensor(y_train_np, dtype=torch.float32).view(-1, 1)
y_test_t  = torch.tensor(y_test_np, dtype=torch.float32).view(-1, 1)

X_train_t.shape, y_train_t.shape

(torch.Size([712, 10]), torch.Size([712, 1]))

In [21]:
# @title DataLoader (mini-batches)

train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [22]:
# @title Elegir dispositivo

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [26]:
# @title Definir la red (MLP)

class MLP(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1)  # logits
        )

    def forward(self, x):
        return self.net(x)

n_features = X_train_t.shape[1]
model = MLP(n_features).to(device)
model


MLP(
  (net): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [27]:
# @title Loss + optimizer

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [29]:
# @title Funciones de entrenamiento y evaluación (con AUC)

@torch.no_grad()
def predict_proba(model, X_tensor, batch_size=512):
    model.eval()
    probs = []
    dl = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=False)
    for (xb,) in dl:
        xb = xb.to(device)
        logits = model(xb)
        p = torch.sigmoid(logits).detach().cpu().numpy().ravel()
        probs.append(p)
    return np.concatenate(probs)

def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

In [30]:
# @title Entrenar con Early Stopping por AUC

max_epochs = 300
patience = 20
best_auc = -np.inf
best_state = None
pat = 0

# Split interno de validación (desde train)
from sklearn.model_selection import train_test_split
idx = np.arange(len(X_train_nn))
train_idx, val_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=y_train_np)

X_tr = torch.tensor(X_train_nn[train_idx], dtype=torch.float32)
y_tr = torch.tensor(y_train_np[train_idx], dtype=torch.float32).view(-1,1)

X_val = torch.tensor(X_train_nn[val_idx], dtype=torch.float32)
y_val = torch.tensor(y_train_np[val_idx], dtype=torch.float32).view(-1,1)

train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=32, shuffle=True)

for epoch in range(1, max_epochs + 1):
    loss = train_one_epoch(model, train_loader)

    val_proba = predict_proba(model, X_val)
    val_auc = roc_auc_score(y_val.numpy().ravel(), val_proba)

    if val_auc > best_auc + 1e-4:
        best_auc = val_auc
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
    else:
        pat += 1

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} | loss {loss:.4f} | val_auc {val_auc:.4f} | best {best_auc:.4f} | pat {pat}")

    if pat >= patience:
        print(f"Early stopping en epoch {epoch} (best val_auc={best_auc:.4f})")
        break

# Restaurar mejores pesos
model.load_state_dict(best_state)
model.to(device)


Epoch   1 | loss 0.6915 | val_auc 0.7831 | best 0.7831 | pat 0
Epoch  10 | loss 0.4534 | val_auc 0.9017 | best 0.9017 | pat 0
Epoch  20 | loss 0.4239 | val_auc 0.9091 | best 0.9093 | pat 1
Epoch  30 | loss 0.4070 | val_auc 0.9012 | best 0.9103 | pat 3
Epoch  40 | loss 0.3910 | val_auc 0.9089 | best 0.9103 | pat 13
Early stopping en epoch 47 (best val_auc=0.9103)


MLP(
  (net): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [31]:
# @title Evaluación final en TEST

test_proba = predict_proba(model, X_test_t)
test_auc = roc_auc_score(y_test_np, test_proba)

test_pred = (test_proba >= 0.5).astype(int)

print("AUC PyTorch:", test_auc)
print("Confusion matrix:\n", confusion_matrix(y_test_np, test_pred))
print(classification_report(y_test_np, test_pred))


AUC PyTorch: 0.8245059288537548
Confusion matrix:
 [[99 11]
 [26 43]]
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       110
           1       0.80      0.62      0.70        69

    accuracy                           0.79       179
   macro avg       0.79      0.76      0.77       179
weighted avg       0.79      0.79      0.79       179



In [32]:
# @title Ajuste de umbral

def eval_threshold(t):
    p = (test_proba >= t).astype(int)
    print("threshold =", t)
    print(confusion_matrix(y_test_np, p))
    print(classification_report(y_test_np, p))

eval_threshold(0.5)
eval_threshold(0.4)
eval_threshold(0.3)


threshold = 0.5
[[99 11]
 [26 43]]
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       110
           1       0.80      0.62      0.70        69

    accuracy                           0.79       179
   macro avg       0.79      0.76      0.77       179
weighted avg       0.79      0.79      0.79       179

threshold = 0.4
[[89 21]
 [20 49]]
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       110
           1       0.70      0.71      0.71        69

    accuracy                           0.77       179
   macro avg       0.76      0.76      0.76       179
weighted avg       0.77      0.77      0.77       179

threshold = 0.3
[[86 24]
 [16 53]]
              precision    recall  f1-score   support

           0       0.84      0.78      0.81       110
           1       0.69      0.77      0.73        69

    accuracy                           0.78       179
   macro avg       0.77

```
BCEWithLogitsLoss(pos_weight=...)
```



In [34]:
# y_train_np debe ser 0/1 (numpy array)
pos = (y_train_np == 1).sum()
neg = (y_train_np == 0).sum()

pos_weight_value = neg / pos
pos_weight = torch.tensor([pos_weight_value], dtype=torch.float32).to(device)

pos, neg, pos_weight_value


(np.int64(273), np.int64(439), np.float64(1.6080586080586081))

In [39]:
class MLP(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1)  # logits
        )

    def forward(self, x):
        return self.net(x)

n_features = X_train_t.shape[1]
model = MLP(n_features).to(device)

In [40]:
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [41]:
idx = np.arange(len(X_train_nn))
train_idx, val_idx = train_test_split(
    idx, test_size=0.2, random_state=42, stratify=y_train_np
)

X_tr = torch.tensor(X_train_nn[train_idx], dtype=torch.float32)
y_tr = torch.tensor(y_train_np[train_idx], dtype=torch.float32).view(-1, 1)

X_val = torch.tensor(X_train_nn[val_idx], dtype=torch.float32)
y_val = torch.tensor(y_train_np[val_idx], dtype=torch.float32).view(-1, 1)

train_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=32, shuffle=True)

@torch.no_grad()
def predict_proba(model, X_tensor, batch_size=512):
    model.eval()
    probs = []
    dl = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=False)
    for (xb,) in dl:
        xb = xb.to(device)
        logits = model(xb)
        p = torch.sigmoid(logits).detach().cpu().numpy().ravel()
        probs.append(p)
    return np.concatenate(probs)

def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

# Early stopping
max_epochs = 300
patience = 20
best_auc = -np.inf
best_state = None
pat = 0

for epoch in range(1, max_epochs + 1):
    loss = train_one_epoch(model, train_loader)

    val_proba = predict_proba(model, X_val)
    val_auc = roc_auc_score(y_val.numpy().ravel(), val_proba)

    if val_auc > best_auc + 1e-4:
        best_auc = val_auc
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
    else:
        pat += 1

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} | loss {loss:.4f} | val_auc {val_auc:.4f} | best {best_auc:.4f} | pat {pat}")

    if pat >= patience:
        print(f"Early stopping en epoch {epoch} (best val_auc={best_auc:.4f})")
        break

model.load_state_dict(best_state)
model.to(device)


Epoch   1 | loss 0.8416 | val_auc 0.8031 | best 0.8031 | pat 0
Epoch  10 | loss 0.5714 | val_auc 0.9052 | best 0.9052 | pat 0
Epoch  20 | loss 0.5430 | val_auc 0.9110 | best 0.9124 | pat 2
Epoch  30 | loss 0.5068 | val_auc 0.9099 | best 0.9124 | pat 12
Early stopping en epoch 38 (best val_auc=0.9124)


MLP(
  (net): Sequential(
    (0): Linear(in_features=10, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [42]:
test_proba = predict_proba(model, X_test_t)
test_auc = roc_auc_score(y_test_np, test_proba)
test_pred = (test_proba >= 0.5).astype(int)

print("AUC PyTorch + pos_weight:", test_auc)
print("Confusion matrix:\n", confusion_matrix(y_test_np, test_pred))
print(classification_report(y_test_np, test_pred))


AUC PyTorch + pos_weight: 0.8299077733860342
Confusion matrix:
 [[88 22]
 [18 51]]
              precision    recall  f1-score   support

           0       0.83      0.80      0.81       110
           1       0.70      0.74      0.72        69

    accuracy                           0.78       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [43]:
def eval_threshold(t):
    p = (test_proba >= t).astype(int)
    print("threshold =", t)
    print(confusion_matrix(y_test_np, p))
    print(classification_report(y_test_np, p))

for t in [0.5, 0.45, 0.4, 0.35, 0.3]:
    eval_threshold(t)


threshold = 0.5
[[88 22]
 [18 51]]
              precision    recall  f1-score   support

           0       0.83      0.80      0.81       110
           1       0.70      0.74      0.72        69

    accuracy                           0.78       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

threshold = 0.45
[[87 23]
 [16 53]]
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       110
           1       0.70      0.77      0.73        69

    accuracy                           0.78       179
   macro avg       0.77      0.78      0.77       179
weighted avg       0.79      0.78      0.78       179

threshold = 0.4
[[84 26]
 [15 54]]
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       110
           1       0.68      0.78      0.72        69

    accuracy                           0.77       179
   macro avg       0.7