## Load model from checkpoint

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import time
from datetime import datetime
import os
import torch.nn as nn


# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [2]:
CHECKPOINT_PATH = "checkpoints_2025-07-31_21-51-51/autoencoder_epoch500.pt"  # Example checkpoint file (.pt) that load_checkpoint() reads; not a directory


In [3]:
def load_checkpoint(path, model, optimizer=None, scheduler=None, device=None):
    """Restore training state from a checkpoint file written with ``torch.save``.

    The checkpoint is expected to be a dict containing ``'model_state_dict'``
    and, when an optimizer is passed, ``'optimizer_state_dict'``. The keys
    ``'scheduler_state_dict'``, ``'epoch'``, ``'train_losses'`` and
    ``'test_losses'`` are optional.

    Args:
        path: Location of the checkpoint file.
        model: Module whose parameters are overwritten in place.
        optimizer: Optimizer to restore, or ``None`` to skip optimizer state
            (e.g. when the model is loaded only for inference).
        scheduler: LR scheduler to restore, or ``None`` to skip it. Its state
            is also skipped when the checkpoint has no scheduler entry.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        ``(next_epoch, train_losses, test_losses)``. ``next_epoch`` is the
        stored epoch index plus one, i.e. the epoch to resume at. When the
        file does not exist, ``(0, [], [])`` is returned and nothing is
        modified.

    Raises:
        KeyError: If a required state-dict entry is missing from the checkpoint.
    """
    if not os.path.exists(path):
        print(f"❌ Checkpoint not found: {path}")
        return 0, [], []

    print(f"🔄 Loading checkpoint from: {path}")
    # NOTE: torch.load is pickle-based; only load checkpoints from trusted sources.
    checkpoint = torch.load(path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])

    # Optimizer/scheduler are optional so the same helper also serves
    # inference-only loads, where neither object exists.
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if scheduler is not None and 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    # The stored value is the index of the last finished epoch; resume one past it.
    epoch = checkpoint.get('epoch', 0) + 1
    # Return copies so callers can append without touching the loaded dict.
    train_losses = list(checkpoint.get('train_losses', []))
    test_losses = list(checkpoint.get('test_losses', []))

    print(f"✅ Resumed from epoch {epoch}")
    return epoch, train_losses, test_losses

In [4]:
class TitanicAutoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    Encoder: ``input_dim -> hidden_dim -> latent_dim`` with a ReLU in between.
    Decoder: ``latent_dim -> hidden_dim -> input_dim`` with a ReLU in between.

    Args:
        input_dim: Number of features per input row.
        latent_dim: Size of the embedding produced by the encoder.
        hidden_dim: Width of the hidden layer in both halves. Defaults to 16,
            the value the layers were previously hard-coded to, so existing
            checkpoints keep loading without changes.
    """

    def __init__(self, input_dim, latent_dim=4, hidden_dim=16):
        super().__init__()
        # Attribute names and layer order determine the state-dict keys
        # (encoder.0.*, encoder.2.*, decoder.0.*, decoder.2.*).
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        z = self.encode(x)         # compressed embedding
        x_recon = self.decoder(z)  # reconstructed input
        return x_recon

    def encode(self, x):
        """Return only the latent embedding of ``x``."""
        return self.encoder(x)

In [5]:
# Layer sizes for the autoencoder. load_checkpoint() copies saved weights into
# this model, so these presumably have to match the checkpoint — TODO confirm
# against the 'input_dim' / 'latent_dim' entries stored in it.
INPUT_DIM = 10
LATENT_DIM = 8

model = TitanicAutoencoder(input_dim=INPUT_DIM, latent_dim=LATENT_DIM)
model = model.to(device)

# Optimizer and plateau scheduler are built fresh here; their state is
# overwritten when the checkpoint is loaded.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    patience=5,
)



In [6]:
# Restore model / optimizer / scheduler state in place and pick up the epoch
# to resume at together with the stored loss histories.
start_epoch, train_losses, test_losses = load_checkpoint(
    path=CHECKPOINT_PATH,
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
)

🔄 Loading checkpoint from: checkpoints_2025-07-31_21-51-51/autoencoder_epoch500.pt
✅ Resumed from epoch 500


## Load training data

In [7]:
# Load seaborn's "titanic" example dataset into a DataFrame.
# NOTE(review): presumably needs network access or a local seaborn cache — confirm.
df = sns.load_dataset("titanic")

In [8]:
# Split the raw (unprocessed) table into random train / test subsets.
FRACTION = 0.8  # share of rows that goes to the training subset

# A fixed random_state makes the sampled rows reproducible across runs.
train_df = df.sample(frac=FRACTION, random_state=42)
# Every row whose index was not sampled above forms the test subset.
test_df = df.drop(index=train_df.index)

# Report the shapes of both subsets.
print(f"Shape of training DataFrame: {train_df.shape}")
print(f"Shape of test DataFrame: {test_df.shape}")

# Persist both subsets as CSV files (without the index column).
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

Shape of training DataFrame: (713, 15)
Shape of test DataFrame: (178, 15)


In [9]:
# Drop the columns with missing values (age, deck, embarked, embark_town) and
# the 'survived' column. Note: 'alive' is NOT dropped here — it stays in the
# table and is label-encoded as a feature further down.
df_nomissing = df.drop(
    columns=['age', 'deck', 'embarked', 'embark_town', 'survived'],
)

In [10]:
# Standardize the numeric features with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fit on the full table, i.e. before the
# train/test split below, so test rows influence the scaling statistics.
# Left as-is because the loaded checkpoint may have been trained with exactly
# this preprocessing — confirm before changing.
from sklearn.preprocessing import StandardScaler

num_cols = ["fare", "sibsp", "parch"]
scaler = StandardScaler()
scaler.fit(df_nomissing[num_cols])
df_nomissing[num_cols] = scaler.transform(df_nomissing[num_cols])

In [11]:
# Convert categorical features to integer codes using label encoding.
from sklearn.preprocessing import LabelEncoder

# pclass is numeric in the raw data but is treated as categorical here.
cat_cols = ["pclass", "sex", "class", "who", "adult_male", "alive", "alone"]

# One encoder per column, kept so the integer codes can be mapped back later.
label_encoders = {col: LabelEncoder() for col in cat_cols}

for col, le in label_encoders.items():
    df_nomissing[col] = le.fit_transform(df_nomissing[col])

In [12]:
# Split the preprocessed table into random train / test subsets.
FRACTION = 0.8  # share of rows that goes to the training subset

# A fixed random_state makes the sampled rows reproducible across runs.
train_df_nomissing = df_nomissing.sample(frac=FRACTION, random_state=42)
# Every row whose index was not sampled above forms the test subset.
test_df_nomissing = df_nomissing.drop(index=train_df_nomissing.index)

# Report the shapes of both subsets.
print(f"Shape of training DataFrame: {train_df_nomissing.shape}")
print(f"Shape of test DataFrame: {test_df_nomissing.shape}")

Shape of training DataFrame: (713, 10)
Shape of test DataFrame: (178, 10)


In [13]:
# Plain float32 arrays of the preprocessed splits, ready to be wrapped in tensors.
X_train = train_df_nomissing.to_numpy().astype("float32")
X_test = test_df_nomissing.to_numpy().astype("float32")

## PyTorch dataset for feeding data to the training loop

In [14]:
from torch.utils.data import Dataset, DataLoader

class TitanicAutoencoderDataset(Dataset):
    """Dataset that serves every feature row as both the input and the target."""

    def __init__(self, X):
        # Convert the feature matrix to a float32 tensor once, up front.
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        # Number of rows available.
        return len(self.X)

    def __getitem__(self, idx):
        # An autoencoder reconstructs its own input, so the target is the row itself.
        row = self.X[idx]
        return row, row

# Wrap both splits in the dataset class defined above.
train_ds, test_ds = (
    TitanicAutoencoderDataset(split) for split in (X_train, X_test)
)

## Get GPU utilization

In [15]:
import subprocess

def get_gpu_utilization(gpu_index=0):
    """Query ``nvidia-smi`` for GPU utilization and used memory.

    ``nvidia-smi`` prints one CSV row per GPU, so the output is parsed row by
    row and the row for ``gpu_index`` is returned. Parsing the whole output as
    a single row would fail on hosts with more than one GPU.

    Args:
        gpu_index: Row of the ``nvidia-smi`` output to report. Defaults to the
            first GPU listed.

    Returns:
        ``(gpu_util, mem_used)`` as integers, in the units ``nvidia-smi``
        reports for ``utilization.gpu`` and ``memory.used`` with ``nounits``.

    Raises:
        FileNotFoundError: If the ``nvidia-smi`` executable is not available.
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with an error.
        RuntimeError: If ``nvidia-smi`` produced no output rows.
        IndexError: If ``gpu_index`` is beyond the rows reported.
        ValueError: If the selected row is not two integer fields.
    """
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used',
         '--format=csv,nounits,noheader'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True,  # surface a failing nvidia-smi instead of parsing empty output
    )
    rows = [row for row in result.stdout.splitlines() if row.strip()]
    if not rows:
        raise RuntimeError("nvidia-smi returned no GPU rows")

    # int() tolerates the space nvidia-smi puts after the comma.
    gpu_util, mem_used = (int(field) for field in rows[gpu_index].split(','))
    return gpu_util, mem_used

## Resume training from the checkpoint

In [16]:
# Timestamped run name so every resumed run gets its own log and checkpoint directories.
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
LOG_DIR = f"runs-checkpoint/autoencoder_{RUN_NAME}"  # TensorBoard event files for this run
CHECKPOINT_DIR = f"ckpt_checkpoints_{RUN_NAME}"  # checkpoints saved by the training loop

# Create exactly the directories that are used below, so the directory that
# is created and the directory that is logged to cannot drift apart.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

BATCH_SIZE = 256  # Adjust batch size as needed
WORKERS = 1  # Number of workers for DataLoader, adjust based on your system

writer = SummaryWriter(log_dir=LOG_DIR)

In [17]:
# Reconstruction loss: mean squared error averaged over all elements
# ("mean" is the default reduction, spelled out here for clarity).
loss_fn = nn.MSELoss(reduction="mean")

In [18]:
def plot_loss_curve(train_losses, test_losses):
    """Draw train and test loss on one set of axes and return the figure.

    Both sequences are plotted against their list index, so the x axis is the
    position in the history that was passed in.
    """
    fig, ax = plt.subplots(figsize=(8, 5))

    curves = (
        (train_losses, "Train Loss", 'o'),
        (test_losses, "Test Loss", 'x'),
    )
    for values, label, marker in curves:
        ax.plot(values, label=label, marker=marker)

    ax.set(xlabel="Epoch", ylabel="Loss", title="Train vs Test Loss")
    ax.legend()
    ax.grid(True)
    return fig

In [19]:
# Training loop — resumes at `start_epoch` with the model / optimizer /
# scheduler state restored by load_checkpoint().
# Prepare training and test datasets
train_ds = TitanicAutoencoderDataset(X_train)
test_ds = TitanicAutoencoderDataset(X_test)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=WORKERS, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, num_workers=WORKERS)

# Track losses. Continue the histories returned by load_checkpoint() instead of
# resetting them, so the loss plot and every checkpoint saved below cover the
# whole run rather than only the epochs since the resume.
train_losses = list(train_losses)
test_losses = list(test_losses)
# Start from the best test loss already recorded so the first resumed epoch is
# not automatically saved as a "new best"; inf when there is no history.
best_loss = min(test_losses, default=float('inf'))

# Fixed single-row input for the encoder-activation histogram (loop-invariant).
activation_sample = torch.tensor(X_train[:1], dtype=torch.float32).to(device)
# Cleared after the first failed nvidia-smi query so it is not retried every epoch.
gpu_query_enabled = True

num_epochs = 1500
try:
    for epoch in range(start_epoch, num_epochs):
        start_time = time.time()
        model.train()
        train_loss = 0.0

        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)

            recon = model(x)
            loss = loss_fn(recon, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        end_time = time.time()
        epoch_duration = end_time - start_time  # covers the training pass only

        # GPU metrics
        gpu_alloc = torch.cuda.memory_allocated() / 1024**2  # Convert to MB
        gpu_reserved = torch.cuda.memory_reserved() / 1024**2  # Convert to MB
        writer.add_scalar("GPU/Memory_Allocated_MB", gpu_alloc, epoch)
        writer.add_scalar("GPU/Memory_Reserved_MB", gpu_reserved, epoch)

        # Best-effort nvidia-smi metrics: a failure must not stop training, but
        # it is reported once and `except Exception` lets KeyboardInterrupt through.
        if gpu_query_enabled:
            try:
                gpu_util, mem_used = get_gpu_utilization()
                writer.add_scalar("GPU/Utilization_%", gpu_util, epoch)
                writer.add_scalar("GPU/Memory_Used_MB", mem_used, epoch)
            except Exception as exc:
                gpu_query_enabled = False
                print(f"nvidia-smi metrics disabled: {exc!r}")

        # Evaluate on test set
        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for x, y in test_loader:
                x = x.to(device)
                y = y.to(device)
                recon = model(x)
                loss = loss_fn(recon, y)
                test_loss += loss.item()

        avg_test_loss = test_loss / len(test_loader)
        test_losses.append(avg_test_loss)

        # ✅ Log scalar losses
        writer.add_scalar("Loss/train", avg_train_loss, epoch)
        writer.add_scalar("Loss/test", avg_test_loss, epoch)
        writer.add_figure("Loss Overlap Curve", plot_loss_curve(train_losses, test_losses), global_step=epoch)

        # ✅ Log weights and gradients
        for name, param in model.named_parameters():
            writer.add_histogram(f"Weights/{name}", param, epoch)
            if param.grad is not None:
                writer.add_histogram(f"Gradients/{name}", param.grad, epoch)

        # ✅ Log activation from encoder
        with torch.no_grad():
            encoded = model.encoder(activation_sample)
        writer.add_histogram("Activations/EncoderOutput", encoded, epoch)

        current_lr = optimizer.param_groups[0]['lr']
        writer.add_scalar("LR", current_lr, epoch)

        print(f"Epoch {epoch+1:2d} | Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f} | LR: {current_lr:.6f} | Time: {epoch_duration:.2f} sec")

        # Save only the best model based on test loss
        if avg_test_loss < best_loss:
            best_loss = avg_test_loss
            print(f"New best model found at epoch {epoch+1}, saving checkpoint...")

            # Save the model state. 'epoch' is the 0-based index of the epoch
            # just finished; the file name uses the 1-based number.
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'train_losses': train_losses,
                'test_losses': test_losses,
                'input_dim': X_train.shape[1],           # ✅ Save architecture args
                'latent_dim': LATENT_DIM
                }, f"{CHECKPOINT_DIR}/autoencoder_epoch{epoch+1}.pt")

        scheduler.step(avg_test_loss)  # Adjust learning rate based on test loss

    # ✅ After training — log embeddings to projector
    with torch.no_grad():
        sample_input = torch.tensor(X_test[:500], dtype=torch.float32).to(device)
        latent_vectors = model.encoder(sample_input)
        # Labels are positions within the test slice.
        metadata = [f"Passenger {i}" for i in range(sample_input.shape[0])]
        writer.add_embedding(latent_vectors, metadata=metadata, tag="LatentEmbeddings", global_step=num_epochs)
finally:
    # Close the writer even when training is interrupted or fails.
    writer.close()

Epoch 501 | Train Loss: 0.0028 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.17 sec
New best model found at epoch 501, saving checkpoint...
Epoch 502 | Train Loss: 0.0028 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 502, saving checkpoint...
Epoch 503 | Train Loss: 0.0028 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 503, saving checkpoint...
Epoch 504 | Train Loss: 0.0028 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 504, saving checkpoint...
Epoch 505 | Train Loss: 0.0027 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 505, saving checkpoint...
Epoch 506 | Train Loss: 0.0028 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 506, saving checkpoint...
Epoch 507 | Train Loss: 0.0027 | Test Loss: 0.0025 | LR: 0.001000 | Time: 0.05 sec
New best model found at epoch 507, saving checkpoint...
Epoch 508 | Train Loss: 0.0