In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch.nn as nn

In [2]:
train_df = pd.read_csv("train_semi_supervised.csv")
test_df = pd.read_csv("test_semi_supervised.csv")

In [None]:
feature_cols = train_df.columns[:-1]    # todas menos la última
label_col    = train_df.columns[-1]     # última columna
test_feature_cols = test_df.columns[1:]  # todas menos 'id'


# Separar labeled y unlabeled
df_labeled   = train_df[train_df[label_col].notna()].copy()
df_unlabeled = train_df[train_df[label_col].isna()].copy()

# division en training y test
train_df, val_df = train_test_split(
    df_labeled,
    test_size=0.2,
    random_state=42,
    stratify=df_labeled[label_col]   # estratificación por clase
)

# Convertir label a int (solo los labeled)
df_labeled[label_col] = df_labeled[label_col].astype(int)

train_df.shape, df_labeled.shape, df_unlabeled.shape, val_df.shape

((35023, 188), (43779, 188), (43775, 188), (8756, 188))

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class ECGSupervisedDataset(Dataset):
    def __init__(self, df, feature_cols, label_col):
        # Guardamos los datos ya como numpy para que __getitem__ sea rápido
        self.X = df[feature_cols].to_numpy(dtype=np.float32)
        self.y = df[label_col].to_numpy(dtype=np.int64)  # para CrossEntropyLoss

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = torch.from_numpy(self.X[idx])              # tensor float32
        y = torch.tensor(self.y[idx], dtype=torch.long)  # tensor long (clase)
        return x, y


In [5]:
class ECGUnlabeledDataset(Dataset):
    def __init__(self, df, feature_cols):
        self.X = df[feature_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = torch.from_numpy(self.X[idx])
        return x


In [None]:
class ECGTestDataset(Dataset):
    def __init__(self, df):
        self.df = df

        # primera columna = id
        self.ids = df.iloc[:, 0].to_numpy()

        # features = todas excepto id y label
        self.X = df.iloc[:, 1:-1].to_numpy(dtype=np.float32)

        # revisar si la última columna es el label
        # si contiene solo 0,1,2,3,4 asumimos labels reales
        last_col = df.iloc[:, -1]

        if last_col.dropna().isin([0,1,2,3,4]).all():
            self.has_labels = True
            self.y = last_col.to_numpy(dtype=np.int64)
        else:
            self.has_labels = False
            self.y = None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x  = torch.from_numpy(self.X[idx])
        id_ = int(self.ids[idx])

        if self.has_labels:
            y = int(self.y[idx])
            return x, id_, y

        return x, id_


In [None]:
# Datasets
train_dataset      = ECGSupervisedDataset(df_labeled, feature_cols, label_col)
val_dataset   = ECGSupervisedDataset(val_df, feature_cols, label_col)
unlabeled_dataset  = ECGUnlabeledDataset(df_unlabeled, feature_cols)
test_dataset       = ECGTestDataset(test_df)

# DataLoaders
batch_size = 256  # ajustar según GPU/CPU

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)

unlabeled_loader = DataLoader(
    unlabeled_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)


In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [12]:

class ECGBetterCNN(nn.Module):
    def __init__(self, input_length=187, num_classes=5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=5, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(),

            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),   # 187 -> ~93

            nn.Conv1d(64, 128, kernel_size=5, padding=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),   # ~93 -> ~46
        )

        # tamaño después de las pools: 187 // 4 ≈ 46
        conv_out_len = input_length // 4

        self.classifier = nn.Sequential(
            nn.Linear(128 * conv_out_len, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # x: (batch, 187) -> (batch, 1, 187)
        x = x.unsqueeze(1)
        x = self.features(x)          # (batch, 128, L')
        x = x.flatten(1)              # (batch, 128*L')
        x = self.classifier(x)        # (batch, num_classes)
        return x


num_features = len(feature_cols)   # debería ser 187
num_classes  = train_df[label_col].nunique()

model = ECGBetterCNN(input_length=num_features, num_classes=num_classes).to(device)


In [13]:
class_counts = train_df[label_col].value_counts().sort_index()
num_samples = class_counts.sum()
num_classes = len(class_counts)

weights_np = num_samples / (num_classes * class_counts.to_numpy())
print("class counts:\n", class_counts)
print("class weights:\n", weights_np)

class_weights_tensor = torch.tensor(weights_np, dtype=torch.float32).to(device)

class counts:
 187
0.0    28989
1.0      889
2.0     2315
3.0      257
4.0     2573
Name: count, dtype: int64
class weights:
 [ 0.24162958  7.8791901   3.02574514 27.25525292  2.72234745]


In [14]:
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == yb).sum().item()
        total_samples += xb.size(0)

    avg_loss = total_loss / total_samples
    avg_acc  = total_correct / total_samples
    return avg_loss, avg_acc


@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        logits = model(xb)
        loss = criterion(logits, yb)

        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == yb).sum().item()
        total_samples += xb.size(0)

    avg_loss = total_loss / total_samples
    avg_acc  = total_correct / total_samples
    return avg_loss, avg_acc


In [15]:
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)


torch.Size([256, 187]) torch.Size([256])


In [16]:
num_epochs = 20

for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc     = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")


Epoch 01 | train_loss=0.5655 acc=0.7320 | val_loss=0.3915 acc=0.9038
Epoch 02 | train_loss=0.3560 acc=0.8416 | val_loss=0.3054 acc=0.8199
Epoch 03 | train_loss=0.2920 acc=0.8645 | val_loss=0.2148 acc=0.9467
Epoch 04 | train_loss=0.2263 acc=0.8882 | val_loss=0.1745 acc=0.8926
Epoch 05 | train_loss=0.2087 acc=0.8965 | val_loss=0.1631 acc=0.9487
Epoch 06 | train_loss=0.1872 acc=0.9022 | val_loss=0.1189 acc=0.9577
Epoch 07 | train_loss=0.1680 acc=0.9146 | val_loss=0.1131 acc=0.9195
Epoch 08 | train_loss=0.1539 acc=0.9189 | val_loss=0.0924 acc=0.9560
Epoch 09 | train_loss=0.1152 acc=0.9371 | val_loss=0.0804 acc=0.9547
Epoch 10 | train_loss=0.1059 acc=0.9373 | val_loss=0.0779 acc=0.9717
Epoch 11 | train_loss=0.1029 acc=0.9418 | val_loss=0.0679 acc=0.9721
Epoch 12 | train_loss=0.0873 acc=0.9493 | val_loss=0.0521 acc=0.9632
Epoch 13 | train_loss=0.0763 acc=0.9533 | val_loss=0.0556 acc=0.9684
Epoch 14 | train_loss=0.0915 acc=0.9512 | val_loss=0.0460 acc=0.9643
Epoch 15 | train_loss=0.0698 acc=0

In [17]:
import torch.nn.functional as F
import numpy as np

@torch.no_grad()
def generate_pseudo_labels(model, unlabeled_loader, device, threshold=0.9):
    model.eval()
    all_max_probs = []
    all_preds = []

    for xb in unlabeled_loader:
        xb = xb.to(device)
        logits = model(xb)
        probs = F.softmax(logits, dim=1)
        max_probs, preds = probs.max(dim=1)

        all_max_probs.append(max_probs.cpu())
        all_preds.append(preds.cpu())

    all_max_probs = torch.cat(all_max_probs).numpy()
    all_preds = torch.cat(all_preds).numpy()

    mask = all_max_probs >= threshold

    print(f"Total unlabeled: {len(all_max_probs)}")
    print(f"Pseudo-labels con conf >= {threshold}: {mask.sum()}")

    return all_preds, all_max_probs, mask


In [18]:
all_preds, all_max_probs, mask = generate_pseudo_labels(
    model, unlabeled_loader, device, threshold=0.9
)


Total unlabeled: 43775
Pseudo-labels con conf >= 0.9: 40392


In [19]:
# Filas confiables del unlabeled
df_pseudo = df_unlabeled.loc[mask].copy()
df_pseudo[label_col] = all_preds[mask]  # asignamos la clase predicha
df_pseudo["is_pseudo"] = True           # opcional, para debug

print("df_pseudo shape:", df_pseudo.shape)

# Marcamos los reales (opcional también)
train_df["is_pseudo"] = False

# Nuevo train con reales + pseudo
train_df_self = pd.concat([train_df, df_pseudo], ignore_index=True)
print("Antes self-training:", train_df.shape)
print("Después self-training:", train_df_self.shape)


df_pseudo shape: (40392, 189)
Antes self-training: (35023, 189)
Después self-training: (75415, 189)


In [20]:
df_unlabeled_remaining = df_unlabeled.loc[~mask].reset_index(drop=True)


In [21]:
class_counts = train_df_self[label_col].value_counts().sort_index()
num_samples = class_counts.sum()
num_classes = len(class_counts)

weights = num_samples / (num_classes * class_counts.to_numpy())
print("class weights (self-training):", weights)

class_weights_tensor = torch.tensor(weights, dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)


class weights (self-training): [ 0.24302333  7.92590646  2.92760093 26.98211091  2.62907443]


In [None]:
num_epochs_self = 10

for epoch in range(1, num_epochs_self + 1):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc     = evaluate(model, val_loader, criterion, device)

    print(f"[SELF] Epoch {epoch:02d} | "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f}")


[SELF] Epoch 01 | train_loss=0.0510 acc=0.9698 | val_loss=0.0473 acc=0.9790
[SELF] Epoch 02 | train_loss=0.0512 acc=0.9709 | val_loss=0.0334 acc=0.9786
[SELF] Epoch 03 | train_loss=0.0721 acc=0.9639 | val_loss=0.0379 acc=0.9744
[SELF] Epoch 04 | train_loss=0.0583 acc=0.9695 | val_loss=0.0270 acc=0.9880
[SELF] Epoch 05 | train_loss=0.0615 acc=0.9698 | val_loss=0.0300 acc=0.9794
[SELF] Epoch 06 | train_loss=0.0525 acc=0.9726 | val_loss=0.0299 acc=0.9733
[SELF] Epoch 07 | train_loss=0.0391 acc=0.9772 | val_loss=0.0239 acc=0.9863
[SELF] Epoch 08 | train_loss=0.0313 acc=0.9813 | val_loss=0.0195 acc=0.9892
[SELF] Epoch 09 | train_loss=0.0344 acc=0.9811 | val_loss=0.0349 acc=0.9823
[SELF] Epoch 10 | train_loss=0.0353 acc=0.9803 | val_loss=0.0208 acc=0.9861


In [None]:
import torch.nn.functional as F
import numpy as np

model.eval()
all_ids = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        # soportar (x, id) o (x, id, y_real)
        if len(batch) == 3:
            xb, ids, _ = batch
        else:
            xb, ids = batch

        xb = xb.to(device)
        logits = model(xb)
        probs = F.softmax(logits, dim=1)
        preds = probs.argmax(dim=1)

        all_ids.append(ids.numpy())
        all_preds.append(preds.cpu().numpy())

all_ids   = np.concatenate(all_ids)
all_preds = np.concatenate(all_preds).astype(int)


In [24]:
submission = pd.DataFrame({
    "Id": np.arange(len(all_preds)),   # 0,1,2,...,N-1
    "Label": all_preds
})

print(submission.head())
submission.to_csv("submission.csv", index=False)

   Id  Label
0   0      0
1   1      0
2   2      0
3   3      0
4   4      0
