In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import time
import datetime
import csv
import config
from torch.utils.data import random_split
from importnb import Notebook
with Notebook():
    from jetbot_dataset import *

In [2]:
# --- Diffusion Helpers ---
def linear_beta_schedule(timesteps, beta_start, beta_end):
    """Return `timesteps` noise variances spaced evenly from `beta_start` to `beta_end`.

    Both endpoints are included. The result is a 1-D tensor in torch's
    default floating-point dtype.
    """
    schedule = torch.linspace(start=beta_start, end=beta_end, steps=timesteps)
    return schedule

def cosine_beta_schedule(timesteps, s=0.008):
    """Cosine noise schedule.

    The cumulative signal level follows a squared cosine over
    `timesteps + 1` grid points; betas are recovered from the ratio of
    consecutive cumulative values and clipped to [0, 0.999].

    Args:
        timesteps: Number of diffusion steps (length of the returned tensor).
        s: Small offset added inside the cosine.

    Returns:
        1-D float32 tensor of length `timesteps`.

    The intermediate math runs in float64 for precision, but the result is
    cast to float32 so it has the same dtype as `linear_beta_schedule`.
    Returning float64 would promote every `coefficient * image` product to
    float64 and break the float32 model that consumes the noisy images.
    """
    steps = timesteps + 1
    x = torch.linspace(0, timesteps, steps, dtype=torch.float64)
    alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * np.pi * 0.5) ** 2
    # Normalise so that the cumulative product starts at exactly 1.
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    # The final ratio approaches 0, so the last beta would be ~1 without clipping.
    return torch.clip(betas, 0, 0.999).to(torch.float32)

def get_index_from_list(vals, t, x_shape):
    """Look up `vals[t]` per batch element and shape it for broadcasting.

    Args:
        vals: 1-D tensor of per-timestep values.
        t: 1-D integer tensor of timestep indices, one per batch element.
        x_shape: Shape of the tensor the result will be broadcast against;
            only its rank is used.

    Returns:
        Tensor of shape (len(t), 1, ..., 1) with the same rank as `x_shape`,
        placed on `t`'s device.
    """
    picked = torch.gather(vals, -1, t)
    broadcast_shape = [t.shape[0]] + [1] * (len(x_shape) - 1)
    return picked.reshape(broadcast_shape).to(t.device)

def forward_diffusion_sample(x_0, t, betas, alphas_cumprod, device="cpu"):
    """Noise `x_0` to timestep `t` in closed form.

    Computes sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps with
    eps drawn from a standard normal of the same shape as `x_0`.

    Args:
        x_0: Clean input tensor.
        t: 1-D integer tensor of timestep indices, one per batch element.
        betas: Accepted for signature compatibility; not read here.
        alphas_cumprod: 1-D tensor of cumulative alpha products.
        device: Accepted for signature compatibility; not read here.

    Returns:
        Tuple `(x_t, eps)` of the noised tensor and the noise that was added.
    """
    eps = torch.randn_like(x_0)
    signal_scale = get_index_from_list(alphas_cumprod.sqrt(), t, x_0.shape)
    noise_scale = get_index_from_list((1. - alphas_cumprod).sqrt(), t, x_0.shape)
    x_t = signal_scale * x_0 + noise_scale * eps
    return x_t, eps

# --- U-Net Model ---
class SinusoidalPositionEmbeddings(nn.Module):
    """Fixed sinusoidal embedding of a scalar timestep.

    Maps a 1-D tensor of timesteps to a (batch, dim) tensor whose first
    half holds sines and second half cosines at geometrically spaced
    frequencies. For odd `dim`, one trailing zero column pads the width.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, time):
        """Embed `time` (shape (batch,)) into a (batch, self.dim) tensor."""
        half_dim = self.dim // 2
        # With dim 2 or 3 there is a single frequency, so `half_dim - 1` is 0.
        # Dividing by it gives inf and then exp(0 * -inf) = NaN; clamping the
        # denominator to 1 keeps that single frequency at exp(0) = 1.
        log_step = np.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, device=time.device) * -log_step)
        angles = time[:, None] * freqs[None, :]
        embeddings = torch.cat((angles.sin(), angles.cos()), dim=-1)
        # sin/cos halves give 2 * half_dim columns; pad one zero column for odd dims.
        if self.dim % 2 == 1:
            embeddings = F.pad(embeddings, (0, 1))
        return embeddings

class Block(nn.Module):
    """Two-convolution U-Net stage followed by a 2x resampling layer.

    In down mode (`up=False`) the first convolution takes `in_ch` channels
    and the stage ends with a stride-2 convolution. In up mode the first
    convolution takes `2 * in_ch` channels (features concatenated with a
    skip connection) and the stage ends with a stride-2 transposed
    convolution. A linear projection of the conditioning embedding is added
    to the feature map between the two convolutions.
    """

    def __init__(self, in_ch, out_ch, time_emb_dim, up=False):
        super().__init__()
        # Projects the conditioning vector to one bias per output channel.
        self.time_mlp = nn.Linear(time_emb_dim, out_ch)

        first_conv_in = 2 * in_ch if up else in_ch
        resample_layer = nn.ConvTranspose2d if up else nn.Conv2d
        self.conv1 = nn.Conv2d(first_conv_in, out_ch, 3, padding=1)
        # kernel 4, stride 2, padding 1: halves (down) or doubles (up) H and W.
        self.transform = resample_layer(out_ch, out_ch, 4, 2, 1)

        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        self.bnorm1 = nn.BatchNorm2d(out_ch)
        self.bnorm2 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU()

    def forward(self, x, t_emb):
        """Run the stage on feature map `x` with pre-computed embedding `t_emb`."""
        features = self.conv1(x)
        features = self.bnorm1(self.relu(features))

        # Project the embedding and append two singleton axes so it
        # broadcasts over the spatial dimensions.
        cond = self.relu(self.time_mlp(t_emb))
        cond = cond.unsqueeze(-1).unsqueeze(-1)
        features = features + cond

        features = self.conv2(features)
        features = self.bnorm2(self.relu(features))
        return self.transform(features)
        
class SimpleUNet(nn.Module):
    """Noise-prediction U-Net conditioned on timestep, action and previous frames.

    The noisy target frame is concatenated channel-wise with the previous
    frames, passed through three down blocks and three up blocks with skip
    connections, and projected back to `image_channels` by a 1x1 convolution.
    The conditioning vector given to every block is the learned timestep
    embedding with the scalar action appended.
    """

    def __init__(self, image_channels=3, time_emb_dim=32, num_prev_frames=4):
        super().__init__()

        down_channels = (128, 256, 512, 512)
        up_channels = (512, 512, 256, 128)

        action_dim = 1  # single scalar action appended to the time embedding
        stacked_channels = image_channels * (num_prev_frames + 1)  # noisy frame + history
        cond_dim = time_emb_dim + action_dim

        # Sinusoidal features followed by a learned projection.
        self.time_mlp = nn.Sequential(
            SinusoidalPositionEmbeddings(time_emb_dim),
            nn.Linear(time_emb_dim, time_emb_dim),
            nn.ReLU(),
        )

        # Stem convolution over the stacked frames.
        self.conv0 = nn.Conv2d(stacked_channels, down_channels[0], 3, padding=1)

        # Encoder: consecutive channel pairs of `down_channels`.
        self.downs = nn.ModuleList([
            Block(c_in, c_out, cond_dim)
            for c_in, c_out in zip(down_channels[:-1], down_channels[1:])
        ])

        # Decoder: consecutive channel pairs of `up_channels`; each block
        # expects its input concatenated with the matching encoder output.
        self.ups = nn.ModuleList([
            Block(c_in, c_out, cond_dim, up=True)
            for c_in, c_out in zip(up_channels[:-1], up_channels[1:])
        ])

        # 1x1 projection back to image channels (the predicted noise).
        self.output = nn.Conv2d(up_channels[-1], image_channels, 1)

    def forward(self, x, timestep, action, prev_frames):
        """Predict the noise contained in `x`.

        Args:
            x: Noisy next frame, concatenated with `prev_frames` on dim 1.
            timestep: 1-D tensor of diffusion timesteps, one per batch element.
            action: Per-sample action; a 1-D tensor gets a trailing axis
                added. `None` is replaced by a zero column.
            prev_frames: Previous frames stacked along the channel axis.

        Returns:
            Output of the final 1x1 convolution (the noise prediction).
        """
        stacked = torch.cat([x, prev_frames], dim=1)

        # Build the conditioning vector: time embedding + one action column.
        t_emb = self.time_mlp(timestep)
        if action is None:
            action_col = torch.zeros(t_emb.shape[0], 1, device=t_emb.device)
        else:
            action_col = action.float()
            if action_col.dim() == 1:
                action_col = action_col.unsqueeze(1)
        cond = torch.cat([t_emb, action_col], dim=1)

        # Encoder: keep each block's output for the matching decoder block.
        h = self.conv0(stacked)
        skips = []
        for down_block in self.downs:
            h = down_block(h, cond)
            skips.append(h)

        # Decoder: consume the skips in reverse (last in, first out).
        for up_block in self.ups:
            h = up_block(torch.cat((h, skips.pop()), dim=1), cond)

        return self.output(h)

# --- Training Loop ---
def _save_checkpoint(path, epoch, model, optimizer, loss):
    """Write a resumable checkpoint (epoch, model/optimizer state, loss) to `path`."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)


def _to_pil_rgb(frame):
    """Convert a (C, H, W) tensor scaled to [-1, 1] into an 8-bit RGB PIL image."""
    frame = (frame.clamp(-1, 1) + 1) / 2
    return transforms.ToPILImage()((frame * 255).type(torch.uint8)).convert("RGB")


def _save_sample_image(model, dataset, betas, alphas_cumprod, device, use_fp16,
                       num_prev_frames, out_path):
    """Denoise one random example and save [previous frames | real frame | prediction]."""
    # Derived locally instead of relying on notebook-level globals.
    alphas = 1. - betas
    num_timesteps = betas.shape[0]

    with torch.no_grad():
        random_idx = torch.randint(0, len(dataset), (1,)).item()
        real_current_frame, action, real_prev_frames = dataset[random_idx]
        real_current_frame = real_current_frame.unsqueeze(0).to(device)
        real_prev_frames = real_prev_frames.unsqueeze(0).to(device)
        action = action.to(device)

        # Start from the real frame noised to the last timestep.
        t_sample = torch.tensor([num_timesteps - 1], device=device, dtype=torch.long)
        x, _ = forward_diffusion_sample(real_current_frame, t_sample, betas, alphas_cumprod, device)

        # Reverse process; the loop runs from the last timestep down to 1.
        for i in reversed(range(1, num_timesteps)):
            t = torch.full((1,), i, device=device, dtype=torch.long)
            with torch.cuda.amp.autocast(enabled=use_fp16):
                predicted_noise = model(x, t, action, real_prev_frames)

            alpha = alphas[t][:, None, None, None]
            alpha_hat = alphas_cumprod[t][:, None, None, None]
            beta = betas[t][:, None, None, None]

            # No fresh noise is injected on the final iteration.
            noise = torch.randn_like(x) if i > 1 else torch.zeros_like(x)
            x = 1 / torch.sqrt(alpha) * (x - ((1 - alpha) / (torch.sqrt(1 - alpha_hat))) * predicted_noise) + torch.sqrt(beta) * noise

    tiles = [_to_pil_rgb(real_prev_frames[0, (i * 3):(i + 1) * 3, :, :]) for i in range(num_prev_frames)]
    tiles.append(_to_pil_rgb(real_current_frame[0]))
    tiles.append(_to_pil_rgb(x[0]))

    canvas = Image.new('RGB', (len(tiles) * config.IMAGE_SIZE, config.IMAGE_SIZE))
    for position, tile in enumerate(tiles):
        canvas.paste(tile, (position * config.IMAGE_SIZE, 0))
    canvas.save(out_path)


def _save_loss_plots(loss_epochs, all_losses, segment_start, start_epoch,
                     last_plot_epoch, formatted_time, out_path):
    """Save a two-panel figure: loss for the whole run and loss since the last plot."""
    fig, axes = plt.subplots(1, 2, figsize=(16, 5))

    # Panel 1: every recorded epoch of this run, against its own epoch number.
    axes[0].plot(loss_epochs, all_losses)
    axes[0].set_xlabel("Epoch")
    axes[0].set_ylabel("Loss")
    axes[0].set_title(f"Loss Since Start (Epoch {start_epoch+1}, Time: {formatted_time})")
    axes[0].grid(True)

    # Panel 2: only the entries recorded after the previous plot.
    segment_epochs = loss_epochs[segment_start:]
    segment_losses = all_losses[segment_start:]
    if segment_losses:
        axes[1].plot(segment_epochs, segment_losses)
        axes[1].set_xlabel("Epoch")
        axes[1].set_ylabel("Loss")
        axes[1].set_title(f"Loss Since Epoch {last_plot_epoch + 1}")
    else:
        axes[1].set_title(f"Loss Since Epoch {last_plot_epoch + 1} (No new data)")
    axes[1].grid(True)

    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)


def train(model, dataloader, optimizer, betas, alphas_cumprod, start_epoch, num_epochs,
          device, save_every, sample_every, checkpoint_dir, sample_dir, plot_dir,
          plot_every, use_fp16, accumulation_steps, num_prev_frames,
          early_stopping_patience, early_stopping_percentage, min_epochs):
    """
    Trains the diffusion model with early stopping and best model saving/deletion.

    Args:
        model: Network called as `model(x_noisy, t, actions, prev_frames)`.
        dataloader: Yields `(images, actions, prev_frames)` batches. Its
            `.dataset` attribute supplies the example used for preview samples.
        optimizer: Optimizer over the model's parameters.
        betas: 1-D noise schedule; its length is the number of diffusion steps.
        alphas_cumprod: Cumulative product of `1 - betas`.
        start_epoch: Zero-based epoch to start from (non-zero when resuming).
        num_epochs: Exclusive upper bound on the zero-based epoch index.
        device: Device that batches are moved to.
        save_every / sample_every / plot_every: Periods, in epochs, of the
            periodic checkpoint, the preview sample and the loss plot.
        checkpoint_dir / sample_dir / plot_dir: Output directories (must exist).
        use_fp16: Enables autocast and gradient scaling.
        accumulation_steps: Micro-batches accumulated per optimizer step.
        num_prev_frames: Number of history frames shown in the preview image.
        early_stopping_patience: Epochs without a new best moving average
            before the stop test is evaluated; `None` disables best-model
            saving and early stopping.
        early_stopping_percentage: Stop when the relative improvement of the
            moving average (in percent) drops below this value.
        min_epochs: Best-model tracking starts after this many epochs.

    Returns:
        List with the average loss of every epoch that produced a finite loss.
    """
    all_losses = []
    loss_epochs = []  # 1-based epoch number for each entry of all_losses
    start_time = time.time()
    last_plot_epoch = start_epoch - 1
    losses_at_last_plot = 0  # len(all_losses) when the previous plot was drawn
    best_loss = float('inf')
    best_epoch = start_epoch
    epochs_without_improvement = 0
    moving_avg_window = 10
    moving_avg_losses = []
    previous_best_model_path = None  # Keep track of the previous best model's path

    num_timesteps = betas.shape[0]
    report_cuda_memory = torch.device(device).type == "cuda" and torch.cuda.is_available()

    # Defined up front so the final checkpoint still works when no epoch runs
    # (for example when resuming with start_epoch >= num_epochs).
    epoch = start_epoch - 1
    avg_epoch_loss = float('nan')

    scaler = torch.cuda.amp.GradScaler(enabled=use_fp16)

    for epoch in range(start_epoch, num_epochs):
        model.train()
        epoch_losses = []
        optimizer.zero_grad()
        pending_micro_batches = 0  # backward passes not yet applied by an optimizer step
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for step, (images, actions, prev_frames) in enumerate(pbar):
            images = images.to(device)
            actions = actions.to(device)
            prev_frames = prev_frames.to(device)
            t = torch.randint(0, num_timesteps, (images.shape[0],), device=device).long()

            with torch.cuda.amp.autocast(enabled=use_fp16):
                x_noisy, noise = forward_diffusion_sample(images, t, betas, alphas_cumprod, device)
                predicted_noise = model(x_noisy, t, actions, prev_frames)
                loss = F.mse_loss(noise, predicted_noise)
                loss = loss / accumulation_steps

            scaler.scale(loss).backward()
            pending_micro_batches += 1

            if pending_micro_batches == accumulation_steps:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                pending_micro_batches = 0

            # One host sync per step; undo the accumulation scaling for logging.
            loss_value = loss.item() * accumulation_steps
            epoch_losses.append(loss_value)
            pbar.set_postfix({"Loss": loss_value})

        # Apply gradients left over when the number of batches is not a
        # multiple of accumulation_steps. An explicit counter is used because
        # inspecting `.grad` depends on the zero_grad(set_to_none=...) default.
        if pending_micro_batches > 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        avg_epoch_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
        if np.isnan(avg_epoch_loss):
            print(f"Warning: NaN loss detected for epoch {epoch+1}. Skipping update/plot.")
            continue
        all_losses.append(avg_epoch_loss)
        loss_epochs.append(epoch + 1)

        moving_avg_losses.append(avg_epoch_loss)
        if len(moving_avg_losses) > moving_avg_window:
            moving_avg_losses.pop(0)
        current_moving_avg = sum(moving_avg_losses) / len(moving_avg_losses)

        if (epoch + 1) % save_every == 0:
            _save_checkpoint(os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}.pth"),
                             epoch + 1, model, optimizer, avg_epoch_loss)
            print(f"Saved model checkpoint at epoch {epoch+1}")

        if (epoch + 1) % sample_every == 0:
            model.eval()
            _save_sample_image(model, dataloader.dataset, betas, alphas_cumprod, device,
                               use_fp16, num_prev_frames,
                               os.path.join(sample_dir, f"sample_epoch_{epoch+1}.png"))
            print(f"Saved sample image at epoch {epoch+1}")
            model.train()

            # CUDA memory counters only exist for CUDA devices.
            if report_cuda_memory:
                print(f"Epoch {epoch+1}, Step {step}:")
                print(f"  Mem Allocated: {torch.cuda.memory_allocated(device) / 1024**2:.2f} MB")
                print(f"  Max Mem Allocated: {torch.cuda.max_memory_allocated(device) / 1024**2:.2f} MB")
                print(f"  Mem Reserved: {torch.cuda.memory_reserved(device) / 1024**2:.2f} MB")
                print(f"  Max Mem Reserved: {torch.cuda.max_memory_reserved(device) / 1024**2:.2f} MB")

        if (epoch + 1) % plot_every == 0:
            elapsed_time = time.time() - start_time
            formatted_time = str(datetime.timedelta(seconds=elapsed_time))
            _save_loss_plots(loss_epochs, all_losses, losses_at_last_plot, start_epoch,
                             last_plot_epoch, formatted_time,
                             os.path.join(plot_dir, f"loss_plot_epoch_{epoch+1}.png"))
            print(f"Epoch {epoch+1}: Avg Loss = {avg_epoch_loss:.6f}, Moving Avg = {current_moving_avg:.6f}, Time = {formatted_time}")

            last_plot_epoch = epoch
            losses_at_last_plot = len(all_losses)

        # --- Early Stopping (Dynamic Threshold) and Best Model Saving/Deletion---
        if early_stopping_patience is not None and epoch + 1 > min_epochs:
            if current_moving_avg < best_loss:
                best_loss = current_moving_avg
                best_epoch = epoch + 1
                epochs_without_improvement = 0

                # Save the new best model first, then remove the one it supersedes.
                new_best_model_path = os.path.join(checkpoint_dir, f"model_best_epoch_{best_epoch}.pth")
                _save_checkpoint(new_best_model_path, best_epoch, model, optimizer, best_loss)
                print(f"Saved best model at epoch {best_epoch} with loss {best_loss}")

                if previous_best_model_path and os.path.exists(previous_best_model_path):
                    os.remove(previous_best_model_path)
                    print(f"Deleted previous best model: {previous_best_model_path}")
                previous_best_model_path = new_best_model_path

            else:
                epochs_without_improvement += 1

            # The relative-improvement test needs a full window; evaluating it
            # earlier would read `improvement` before it has been assigned.
            if (epochs_without_improvement >= early_stopping_patience
                    and len(moving_avg_losses) == moving_avg_window):
                prev_moving_avg = sum(moving_avg_losses[:-1]) / (moving_avg_window - 1)
                improvement = (prev_moving_avg - current_moving_avg) / prev_moving_avg * 100
                if improvement < early_stopping_percentage:
                    print(f"Early stopping triggered at epoch {epoch+1}.  Improvement: {improvement:.2f}%")
                    break

    end_time = time.time()
    total_time = end_time - start_time
    formatted_time = str(datetime.timedelta(seconds=total_time))
    print(f"Total training time: {formatted_time}")

    _save_checkpoint(os.path.join(checkpoint_dir, "model_last.pth"),
                     epoch + 1, model, optimizer, avg_epoch_loss)
    print(f"Saved last model at epoch {epoch+1} with loss {avg_epoch_loss}")

    return all_losses
    

In [None]:
if __name__ == "__main__":
    # --- Output directories ---
    # Checkpoints, samples and plots are written with plain open/save calls,
    # which fail if the target directory is missing.
    for out_dir in (config.CHECKPOINT_DIR, config.SAMPLE_DIR, config.PLOT_DIR):
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # --- Data Transforms ---
    # Resize, convert to tensor, then map each channel from [0, 1] to [-1, 1].
    transform = transforms.Compose([
        transforms.Resize((config.IMAGE_SIZE, config.IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # --- Create Dataset and DataLoader ---
    dataset = JetbotDataset(config.CSV_PATH, config.DATA_DIR, config.IMAGE_SIZE, config.NUM_PREV_FRAMES, transform=transform)

    # Reuse a previously saved split when there is one so runs stay comparable.
    train_dataset, test_dataset = load_train_test_split(dataset, config.SPLIT_DATASET_FILENAME)

    if train_dataset is None or test_dataset is None:
        print("Dataset split file not found, creating a new split...")
        train_size = int(0.8 * len(dataset))  # 80% for training
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

        save_existing_split(train_dataset, test_dataset, config.SPLIT_DATASET_FILENAME)
    else:
        print("Loaded existing dataset split.")

    train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False) # Batch size 1 for easier evaluation

    # --- Calculate Betas and Alphas ---
    betas = linear_beta_schedule(config.NUM_TIMESTEPS, config.BETA_START, config.BETA_END).to(config.DEVICE)
    # Alternative schedule:
    # betas = cosine_beta_schedule(config.NUM_TIMESTEPS).to(config.DEVICE)

    alphas = (1. - betas).to(config.DEVICE)
    alphas_cumprod = torch.cumprod(alphas, axis=0).to(config.DEVICE)
    alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0).to(config.DEVICE)
    sqrt_recip_alphas = torch.sqrt(1.0 / alphas).to(config.DEVICE)

    # --- Create Model and Optimizer ---
    model = SimpleUNet(image_channels=3, time_emb_dim=32, num_prev_frames=config.NUM_PREV_FRAMES).to(config.DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)

    # --- Load Checkpoint (if available) ---
    if config.LOAD_CHECKPOINT:
        # map_location lets a checkpoint saved on one device be resumed on
        # another (for example GPU-trained weights on a CPU-only machine).
        checkpoint = torch.load(config.LOAD_CHECKPOINT, map_location=config.DEVICE)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        START_EPOCH = checkpoint['epoch']
        print(f"Loaded checkpoint from epoch {START_EPOCH}")
    else:
        START_EPOCH = 0

    print(f"--- Training Configuration ---")
    # Print model parameter count
    print(f"Model Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
    # CUDA memory counters only exist for CUDA devices.
    if torch.device(config.DEVICE).type == "cuda" and torch.cuda.is_available():
        print(f"  Mem Allocated: {torch.cuda.memory_allocated(config.DEVICE) / 1024**2:.2f} MB")
        print(f"  Max Mem Allocated: {torch.cuda.max_memory_allocated(config.DEVICE) / 1024**2:.2f} MB")
        print(f"  Mem Reserved: {torch.cuda.memory_reserved(config.DEVICE) / 1024**2:.2f} MB")
        print(f"  Max Mem Reserved: {torch.cuda.max_memory_reserved(config.DEVICE) / 1024**2:.2f} MB")
    print(f"--------------------------")
    # --- Train the Model ---
    losses = train(model, train_dataloader, optimizer, betas, alphas_cumprod, START_EPOCH, config.NUM_EPOCHS, config.DEVICE,
                   config.SAVE_MODEL_EVERY, config.SAMPLE_EVERY, config.CHECKPOINT_DIR, config.SAMPLE_DIR, config.PLOT_DIR, config.PLOT_EVERY, config.USE_FP16,
                   config.ACCUMULATION_STEPS, config.NUM_PREV_FRAMES, early_stopping_patience=config.EARLY_STOPPING_PATIENCE, early_stopping_percentage=config.EARLY_STOPPING_PERCENTAGE, min_epochs=config.MIN_EPOCHS)
    print("Training complete!")

    # --- Final Loss Plot ---
    # The x axis is the index into `losses` (epochs of this run), not the absolute epoch.
    plt.figure(figsize=(10, 5))
    plt.plot(losses)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss")
    plt.grid(True)
    plt.savefig(os.path.join(config.PLOT_DIR, "loss_plot_final.png"))  # Save to plot dir
    plt.close()

Loaded combined CSV with columns: ['session_id', 'image_path', 'timestamp', 'action']
Total rows in CSV: 23081, Valid sequence start indices: 23037
Loaded existing dataset split.
Loaded checkpoint from epoch 45
--- Training Configuration ---
Model Parameters: 34,956,835
  Mem Allocated: 537.50 MB
  Max Mem Allocated: 537.50 MB
  Mem Reserved: 554.00 MB
  Max Mem Reserved: 554.00 MB
--------------------------


Epoch 46/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 46
Epoch 46, Step 4607:
  Mem Allocated: 563.15 MB
  Max Mem Allocated: 1854.76 MB
  Mem Reserved: 2238.00 MB
  Max Mem Reserved: 2238.00 MB
Saved best model at epoch 46 with loss 0.0026981648200950937


Epoch 47/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 47
Epoch 47, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2240.00 MB
  Max Mem Reserved: 2240.00 MB
Saved best model at epoch 47 with loss 0.00264527996398398
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_46.pth


Epoch 48/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 48
Epoch 48, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2240.00 MB
  Max Mem Reserved: 2240.00 MB
Saved best model at epoch 48 with loss 0.0026370867454843434
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_47.pth


Epoch 49/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 49
Epoch 49, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2240.00 MB
  Max Mem Reserved: 2240.00 MB
Saved best model at epoch 49 with loss 0.0026286655644965785
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_48.pth


Epoch 50/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 50
Epoch 50, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Epoch 50: Avg Loss = 0.002438, Moving Avg = 0.002591, Time = 1:08:04.441282
Saved best model at epoch 50 with loss 0.0025906079998564897
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_49.pth


Epoch 51/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 51
Epoch 51, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 51 with loss 0.0025814538655132076
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_50.pth


Epoch 52/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 52
Epoch 52, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 52 with loss 0.0025814245701020083
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_51.pth


Epoch 53/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 53
Epoch 53, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 53 with loss 0.002552939325357113
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_52.pth


Epoch 54/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 54
Epoch 54, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 54 with loss 0.002548779446864961
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_53.pth


Epoch 55/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 55
Epoch 55, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 55 with loss 0.0025299194315774936
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_54.pth


Epoch 56/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 56
Epoch 56, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 56 with loss 0.0025021579898588837
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_55.pth


Epoch 57/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 57
Epoch 57, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 57 with loss 0.002496046151012226
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_56.pth


Epoch 58/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 58
Epoch 58, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 58 with loss 0.002485351083412961
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_57.pth


Epoch 59/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 59
Epoch 59, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 59 with loss 0.002467426008808597
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_58.pth


Epoch 60/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 60
Epoch 60, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Epoch 60: Avg Loss = 0.002468, Moving Avg = 0.002470, Time = 3:22:31.579400


Epoch 61/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 61
Epoch 61, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 61 with loss 0.002459503233069426
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_59.pth


Epoch 62/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 62
Epoch 62, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 62 with loss 0.0024449884532348025
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_61.pth


Epoch 63/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 63
Epoch 63, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB


Epoch 64/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 64
Epoch 64, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 64 with loss 0.0024309510097582335
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_62.pth


Epoch 65/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 65
Epoch 65, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 65 with loss 0.002413161471441564
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_64.pth


Epoch 66/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 66
Epoch 66, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 66 with loss 0.002395367413291972
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_65.pth


Epoch 67/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 67
Epoch 67, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 67 with loss 0.002379805663510284
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_66.pth


Epoch 68/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 68
Epoch 68, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB


Epoch 69/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 69
Epoch 69, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 69 with loss 0.0023679036107643646
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_67.pth


Epoch 70/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 70
Epoch 70, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Epoch 70: Avg Loss = 0.002476, Moving Avg = 0.002369, Time = 5:36:27.711079


Epoch 71/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

Saved sample image at epoch 71
Epoch 71, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 71 with loss 0.00235959810770857
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_69.pth


Epoch 72/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

Saved sample image at epoch 96
Epoch 96, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Saved best model at epoch 96 with loss 0.0021212030719788624
Deleted previous best model: output_two_action_model_v2_test\checkpoints\model_best_epoch_95.pth


Epoch 97/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Saved model checkpoint at epoch 100
Saved sample image at epoch 100
Epoch 100, Step 4607:
  Mem Allocated: 562.94 MB
  Max Mem Allocated: 1859.26 MB
  Mem Reserved: 2242.00 MB
  Max Mem Reserved: 2242.00 MB
Epoch 100: Avg Loss = 0.002223, Moving Avg = 0.002137, Time = 12:16:57.753558


Epoch 101/1000:   0%|          | 0/4608 [00:00<?, ?it/s]

In [7]:
# Display the configured sampling interval (passed to train() as `sample_every`).
config.SAMPLE_EVERY

20