In [None]:

# Cell 1: Dataset Loading with SUBSET SAMPLING
import pandas as pd
import numpy as np
import cv2
import os
from torch.utils.data import Dataset, DataLoader
import torch

os.environ['CUDA_VISIBLE_DEVICES'] = '1'  


def get_directional_kernels():
    # 8 edge detection kernels: N, NE, E, SE, S, SW, W, NW
    k = np.array([[1, 2, 1], [0, 0, 0], [-1, -2, -1]]) # Vertical
    kernels = [
        k,                              # S
        np.rot90(k, 1),                 # W
        np.rot90(k, 2),                 # N
        np.rot90(k, 3),                 # E
        np.fliplr(k),                   # SW
        np.flipud(k),                   # NE
        np.fliplr(np.rot90(k, 1)),      # NW
        np.flipud(np.rot90(k, 3)),      # SE
    ]
    return kernels

def get_directional_maps(image):
    kernels = get_directional_kernels()
    edge_maps = []
    for kern in kernels:    
        em = cv2.filter2D(image, cv2.CV_32F, kern, borderType=cv2.BORDER_REPLICATE)
        em = np.abs(em)
        maxv = em.max()
        if maxv > 1e-8:
            em = em / maxv 
        else:
            em = np.zeros_like(em, dtype=np.float32)
        
        edge_maps.append(em.astype(np.float32))
    return np.stack(edge_maps, axis=0)  # [8, H, W]


class MathEquation9ChDataset(Dataset):
    def __init__(self, csv_file, dataset_root, split='train', transform=None, max_samples=None):
        """
        Args:
            csv_file: Path to CSV file with 'filename' column
            dataset_root: Root directory of dataset
            split: 'train', 'val', or 'test' - determines subdirectory
            transform: Optional transforms
            max_samples: Maximum number of samples to use (None = use all)
        """
        df = pd.read_csv(csv_file)
        
        # SAMPLE SUBSET if max_samples is specified
        if max_samples is not None and max_samples < len(df):
            print(f"Sampling {max_samples:,} from {len(df):,} available samples...")
            # Use random sampling with fixed seed for reproducibility
            df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
            print(f"Sampled dataset size: {len(df):,}")
        
        self.data_frame = df
        self.dataset_root = dataset_root
        self.split = split  # train, val, or test
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        # Get filename from CSV
        filename = self.data_frame.iloc[idx]['filename']
        
        # Construct full path: dataset_root/split/filename
        img_full_path = os.path.join(self.dataset_root, self.split, filename)
        img_full_path = os.path.normpath(img_full_path).replace('\\', '/')
        
        image = cv2.imread(img_full_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_full_path}")
        image = image.astype(np.float32) / 255.0
        H, W = image.shape
        
        # 9 channel construction
        channels = np.zeros((9, H, W), dtype=np.float32)
        channels[0] = image  # Greyscale base
        channels[1:] = get_directional_maps(image)  # 8 directions
        label = self.data_frame.iloc[idx]['normalized_label']
        sample = {'image': torch.tensor(channels, dtype=torch.float32), 'label': label}
        if self.transform:
            sample['image'] = self.transform(sample['image'])
        return sample

# ==============================
# CONFIGURE SUBSET SIZES HERE
# ==============================
TRAIN_SAMPLES = 10000
VAL_SAMPLES = 2000
TEST_SAMPLES = 2000

# Usage:
DATASET_ROOT ='/home/ie643_errorcode500/errorcode500-working/Mathwritting-10k'
TRAIN_CSV = os.path.join(DATASET_ROOT, 'train_database.csv')

# Let's first check if the CSV exists and print its contents
if os.path.exists(TRAIN_CSV):
    df = pd.read_csv(TRAIN_CSV)
    print("="*60)
    print("CSV DATASET INFORMATION")
    print("="*60)
    print("Columns:", df.columns.tolist())
    print(f"Total training samples available: {len(df):,}")
    print(f"Will use: {TRAIN_SAMPLES:,} samples")
    print("="*60)
    print("\nFirst few filenames:")
    print(df['filename'].head())
else:
    print(f"CSV file not found at {TRAIN_CSV}")

# Create dataset with SUBSET
train_dataset = MathEquation9ChDataset(
    TRAIN_CSV, 
    DATASET_ROOT, 
    split='train',
    max_samples=TRAIN_SAMPLES  # KEY PARAMETER
)

train_loader = DataLoader(
    train_dataset, 
    batch_size=32,  # Increased batch size
    shuffle=True,
    num_workers=8,  # Parallel data loading
    pin_memory=True,  # Faster GPU transfer
    persistent_workers=True  # Keep workers alive
)

print(f"\n{'='*60}")
print("TRAINING DATASET READY")
print(f"{'='*60}")
print(f"Dataset size: {len(train_dataset):,} samples")
print(f"Batch size: 32")
print(f"Batches per epoch: {len(train_loader):,}")
print(f"{'='*60}\n")

try:
    for batch in train_loader:
        images, labels = batch['image'], batch['label']
        print(f"✓ Batch loaded successfully")
        print(f"  Image tensor shape: {images.shape}")
        print(f"  First 3 labels: {labels[:3]}")
        break
except Exception as e:
    print(f"\n✗ Error loading batch: {str(e)}")

CSV DATASET INFORMATION
Columns: ['filename', 'sample_id', 'label', 'normalized_label', 'is_symbol']
Total training samples available: 10,000
Will use: 10,000 samples

First few filenames:
0    a9fa242701017325.png
1    78f6cbf89778f0aa.png
2    d167b3da5fc40cc2.png
3    a1fbae100abe06bd.png
4    5b06d5ab01e8019b.png
Name: filename, dtype: object

TRAINING DATASET READY
Dataset size: 10,000 samples
Batch size: 32
Batches per epoch: 313

✓ Batch loaded successfully
  Image tensor shape: torch.Size([32, 9, 480, 1600])
  First 3 labels: ['S_{b}|b+\\rangle=\\frac{\\hbar}{2}|b+\\rangle', '[\\begin{matrix}1&a\\\\ 0&0\\end{matrix}]', '\\int pdq']


In [2]:
import pandas as pd
from collections import Counter

# Load all labels from train/val/test CSVs
csv_files = [
    'train_database.csv',
    'val_database.csv',
    'test_database.csv'
]


all_labels = []
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(DATASET_ROOT, csv_file))
    all_labels.extend(df['normalized_label'].astype(str).tolist())

# Build character-level vocabulary
special_tokens = ['<PAD>', '<SOS>', '<EOS>']
char_counter = Counter()
for label in all_labels:
    char_counter.update(list(label))

vocab = special_tokens + sorted(char_counter.keys())
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
idx2char = {idx: ch for ch, idx in char2idx.items()}

print(f"Vocabulary size: {len(vocab)}")
print("First 20 tokens:", vocab[:20])

# Encode a label string to indices
def encode_label(label, max_len=128):
    tokens = [char2idx['<SOS>']] + [char2idx[ch] for ch in label] + [char2idx['<EOS>']]
    if len(tokens) < max_len:
        tokens += [char2idx['<PAD>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]
    return tokens

# Example usage
sample_label = all_labels[0]
encoded = encode_label(sample_label)
print("Original label:", sample_label)
print("Encoded:", encoded[:20])

# For your dataset class, you can add:
# label_indices = encode_label(label)
# sample = {'image': image_tensor, 'label': label_indices}


Vocabulary size: 92
First 20 tokens: ['<PAD>', '<SOS>', '<EOS>', ' ', '!', '#', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3']
Original label: \frac{54.88043}{x}
Encoded: [1, 59, 68, 80, 63, 65, 89, 21, 20, 14, 24, 24, 16, 20, 19, 91, 89, 86, 91, 2]


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, num_layers=4, dropout_p=0.0):
        super().__init__()
        layers = []
        for i in range(num_layers):
            layers.append(nn.Conv2d(
                in_channels if i == 0 else out_channels,
                out_channels,
                kernel_size=3,
                stride=1,
                padding=1
            ))
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
            if dropout_p > 0:
                layers.append(nn.Dropout2d(p=dropout_p))
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)

class WatcherFCN(nn.Module):
    def __init__(self, in_channels=9):
        super().__init__()
        # First blocks without dropout
        self.block1 = ConvBlock(in_channels, 32)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.block2 = ConvBlock(32, 64)
        self.pool2 = nn.MaxPool2d(2, 2)
        # Last blocks with 20% dropout
        self.block3 = ConvBlock(64, 64)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.block4 = ConvBlock(64, 128, dropout_p=0.2)
        self.pool4 = nn.MaxPool2d(2, 2)

    def forward(self, x):
        x = self.block1(x)
        x = self.pool1(x)
        x = self.block2(x)
        x = self.pool2(x)
        x = self.block3(x)
        x = self.pool3(x)
        x = self.block4(x)
        x = self.pool4(x)
        return x

# Example usage:
model = WatcherFCN(in_channels=9)
dummy_input = torch.randn(2, 9, 480, 1600)
output = model(dummy_input)
print(output.shape) 


torch.Size([2, 128, 30, 100])


In [4]:
batch_size, channels, height, width = output.shape
encoder_outputs = output.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
# encoder_outputs: [batch, 3000, 512]
encoder_outputs.shape

torch.Size([2, 3000, 128])

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CoverageAttention(nn.Module):
    def __init__(self, encoder_dim, decoder_dim, attention_dim, coverage_dim):
        super().__init__()
        self.W_a = nn.Linear(decoder_dim, attention_dim)
        self.U_a = nn.Linear(encoder_dim, attention_dim)
        self.U_f = nn.Linear(coverage_dim, attention_dim)
        self.v = nn.Linear(attention_dim, 1)

    def forward(self, encoder_outputs, decoder_hidden, coverage):
        # encoder_outputs: [batch, L, encoder_dim]
        # decoder_hidden: [batch, decoder_dim]
        # coverage: [batch, L, coverage_dim]
        Wh = self.W_a(decoder_hidden).unsqueeze(1)  # [batch, 1, att_dim]
        Ua = self.U_a(encoder_outputs)              # [batch, L, att_dim]
        Uf = self.U_f(coverage)                     # [batch, L, att_dim]
        att = torch.tanh(Wh + Ua + Uf)              # [batch, L, att_dim]
        scores = self.v(att).squeeze(-1)            # [batch, L]
        alpha = F.softmax(scores, dim=1)            # [batch, L]
        context = torch.sum(encoder_outputs * alpha.unsqueeze(-1), dim=1)  # [batch, encoder_dim]
        return context, alpha

# class ParserGRUDecoder(nn.Module):
#     def __init__(self, vocab_size, encoder_dim=128, embed_dim=256, decoder_dim=256, attention_dim=256, coverage_dim=1):
#         super().__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim)
#         self.gru = nn.GRUCell(embed_dim + encoder_dim, decoder_dim)
#         self.attention = CoverageAttention(encoder_dim, decoder_dim, attention_dim, coverage_dim)
#         self.fc = nn.Linear(decoder_dim + encoder_dim, vocab_size)

#     def forward(self, encoder_outputs, targets, max_len):
#         batch_size, L, encoder_dim = encoder_outputs.size()
#         device = encoder_outputs.device
#         coverage = torch.zeros(batch_size, L, 1, device=device)
#         inputs = torch.full((batch_size,), 1, dtype=torch.long, device=device)  # <SOS> token index
#         hidden = torch.zeros(batch_size, 256, device=device)
#         outputs = []
#         for t in range(max_len):
#             embedded = self.embedding(inputs)  # [batch, embed_dim]
#             context, alpha = self.attention(encoder_outputs, hidden, coverage)
#             gru_input = torch.cat([embedded, context], dim=1)
#             hidden = self.gru(gru_input, hidden)
#             output = self.fc(torch.cat([hidden, context], dim=1))
#             outputs.append(output)
#             # Teacher forcing: use ground truth if available
#             if targets is not None and t < targets.size(1):
#                 inputs = targets[:, t]
#             else:
#                 inputs = output.argmax(dim=1)
#             coverage = coverage + alpha.unsqueeze(-1)
#         outputs = torch.stack(outputs, dim=1)  # [batch, max_len, vocab_size]
#         return outputs


#Modified ParserGRUDecoder

class ParserGRUDecoder(nn.Module):
    def __init__(self, vocab_size, encoder_dim=128, embed_dim=256, decoder_dim=256, attention_dim=256, coverage_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Modify input size to include context vector
        self.gru = nn.GRUCell(embed_dim + encoder_dim, decoder_dim)
        self.attention = CoverageAttention(encoder_dim, decoder_dim, attention_dim, coverage_dim)
        # Change output layer to use all available information
        self.out = nn.Sequential(
            nn.Linear(decoder_dim + encoder_dim + embed_dim, decoder_dim),
            nn.Tanh(),
            nn.Linear(decoder_dim, vocab_size)
        )
        self.decoder_dim = decoder_dim

    def forward(self, encoder_outputs, targets, max_len):
        batch_size, L, encoder_dim = encoder_outputs.size()
        device = encoder_outputs.device
        coverage = torch.zeros(batch_size, L, 1, device=device)
        inputs = torch.full((batch_size,), 1, dtype=torch.long, device=device)  # <SOS> token index
        hidden = torch.zeros(batch_size, self.decoder_dim, device=device)
        outputs = []

        for t in range(max_len):
            # 1. Get current input embedding
            embedded = self.embedding(inputs)  # [batch, embed_dim]
            
            # 2. Calculate attention and context
            context, alpha = self.attention(encoder_outputs, hidden, coverage)
            
            # 3. Update GRU hidden state with concatenated input
            gru_input = torch.cat([embedded, context], dim=1)
            hidden = self.gru(gru_input, hidden)
            
            # 4. Generate output using all available information
            # Concatenate current embedding, hidden state, and context
            output = self.out(torch.cat([embedded, hidden, context], dim=1))
            outputs.append(output)
            
            # 5. Teacher forcing or use own predictions
            if targets is not None and t < targets.size(1):
                inputs = targets[:, t]
            else:
                inputs = output.argmax(dim=1)
            
            # 6. Update coverage vector
            coverage = coverage + alpha.unsqueeze(-1)
        
        outputs = torch.stack(outputs, dim=1)  # [batch, max_len, vocab_size]
        return outputs

# Example usage:
# encoder_outputs: [batch, L, encoder_dim] (flatten FCN output to [batch, L, 512])
# targets: [batch, max_len] (token indices)
# decoder = ParserGRUDecoder(vocab_size=len(vocab))
# outputs = decoder(encoder_outputs, targets, max_len)


This is the main training code which we were using before. Now I am using a better code which is below this commented cell


In [None]:
# Cell 4: Training with GPU A6000 optimization
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import torch.nn.functional as F

# ==============================
# Training Configuration for A6000
# ==============================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Enable TF32 for faster training on A6000
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Use cudnn benchmarking for optimal performance
    torch.backends.cudnn.benchmark = True

num_epochs = 10
max_len = 128
best_loss = float('inf')

watcher = WatcherFCN(in_channels=9).to(device)
decoder = ParserGRUDecoder(vocab_size=len(vocab)).to(device)

pad_idx = vocab.index('<PAD>')
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# optimizer = optim.Adadelta(list(watcher.parameters()) + list(decoder.parameters()))
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

optimizer = optim.AdamW(
    list(watcher.parameters()) + list(decoder.parameters()),
    lr=1e-4,  # Much lower learning rate
    weight_decay=0.01
)

# FIXED: Better scheduler
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=1e-3,
    epochs=num_epochs,
    steps_per_epoch=len(train_loader),
    pct_start=0.1
)

# Mixed precision training for A6000
scaler = torch.cuda.amp.GradScaler()

# ==============================
# Helper: Apply Weight Noise Regularization
# ==============================
# def apply_weight_noise(model, std=0.01):
#     """Adds Gaussian noise to model weights for regularization."""
#     with torch.no_grad():
#         for p in model.parameters():
#             if p.requires_grad:
#                 p.add_(torch.randn_like(p) * std)

################################## Modifying the code here. ##################################
def apply_weight_noise(model, std=0.01):
    """Adds Gaussian noise to model weights for regularization."""
    with torch.no_grad():
        for p in model.parameters():
            if p.requires_grad:
                p.add_(torch.randn_like(p) * std)

# ADDED: Quick validation function
@torch.no_grad()
def quick_validate(watcher, decoder, val_loader, num_samples=5):
    """Quick validation check during training"""
    watcher.eval()
    decoder.eval()
    correct = 0
    total = 0
    
    for batch in val_loader:
        if total >= num_samples:
            break
        images = batch['image'].to(device, non_blocking=True)
        labels = batch['label']
        
        with torch.cuda.amp.autocast():
            watcher_output = watcher(images)
            batch_size, channels, height, width = watcher_output.shape
            encoder_outputs = watcher_output.permute(0, 2, 3, 1).reshape(
                batch_size, height * width, channels
            )
            outputs = decoder(encoder_outputs, None, max_len=128)
        
        for i in range(min(len(outputs), num_samples - total)):
            pred_indices = outputs[i].argmax(dim=-1)
            pred_text = ''.join([idx2char[idx.item()] for idx in pred_indices 
                               if idx2char[idx.item()] not in ['<PAD>', '<SOS>', '<EOS>']])
            true_text = labels[i]
            
            if pred_text == true_text:
                correct += 1
            total += 1
            
            if total <= 3:  # Print first 3 examples
                print(f"  Pred: {pred_text[:50]}...")
                print(f"  True: {true_text[:50]}...")
                print("  " + "-" * 40)
    
    watcher.train()
    decoder.train()
    return correct / total if total > 0 else 0

# Create a small validation loader for quick checks
VAL_CSV = os.path.join(DATASET_ROOT, 'val_database.csv')
val_dataset_quick = MathEquation9ChDataset(
    VAL_CSV, 
    DATASET_ROOT, 
    split='val',
    max_samples=100  # Small subset for quick validation
)
val_loader_quick = DataLoader(
    val_dataset_quick, 
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)
# ==============================
# Training Loop with GPU Optimization
# ==============================
try:
    for epoch in range(num_epochs):
        watcher.train()
        decoder.train()
        total_loss = 0
        batch_count = 0
        
        pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        
        # for batch in pbar:
        #     # Non-blocking GPU transfer
        #     images = batch['image'].to(device, non_blocking=True)
        #     labels = [encode_label(lbl, max_len) for lbl in batch['label']]
        #     labels = torch.tensor(labels, dtype=torch.long, device=device)

        #     optimizer.zero_grad(set_to_none=True)
            
        #     try:
        #         # Mixed precision training
        #         with torch.cuda.amp.autocast():
        #             watcher_output = watcher(images)
        #             batch_size, channels, height, width = watcher_output.shape
        #             encoder_outputs = watcher_output.permute(0, 2, 3, 1).reshape(
        #                 batch_size, height * width, channels
        #             )

        #             outputs = decoder(encoder_outputs, labels, max_len)
        #             outputs = outputs.view(-1, outputs.size(-1))
        #             labels_flat = labels.view(-1)

        #             loss = criterion(outputs, labels_flat)

        #         # Scaled backpropagation
        #         scaler.scale(loss).backward()
        #         scaler.unscale_(optimizer)
        #         torch.nn.utils.clip_grad_norm_(
        #             list(watcher.parameters()) + list(decoder.parameters()), 
        #             max_norm=5.0
        #         )
        #         scaler.step(optimizer)
        #         scaler.update()

        #         total_loss += loss.item()
        #         batch_count += 1
        #         pbar.set_postfix({
        #             'loss': f'{loss.item():.4f}',
        #             'gpu_mem': f'{torch.cuda.memory_allocated(0)/1e9:.1f}GB'
        #         })
                
        #     except RuntimeError as e:
        #         print(f"Error in batch: {str(e)}")
        #         torch.cuda.empty_cache()
        #         continue
        ###################################### Modifying the code here. ######################################
        for batch in pbar:
            # Non-blocking GPU transfer
            images = batch['image'].to(device, non_blocking=True)
            labels = [encode_label(lbl, max_len) for lbl in batch['label']]
            labels = torch.tensor(labels, dtype=torch.long, device=device)

            optimizer.zero_grad(set_to_none=True)
            
            try:
                # Mixed precision training
                with torch.cuda.amp.autocast():
                    watcher_output = watcher(images)
                    batch_size, channels, height, width = watcher_output.shape
                    encoder_outputs = watcher_output.permute(0, 2, 3, 1).reshape(
                        batch_size, height * width, channels
                    )

                    outputs = decoder(encoder_outputs, labels, max_len)
                    outputs = outputs.view(-1, outputs.size(-1))
                    labels_flat = labels.view(-1)

                    loss = criterion(outputs, labels_flat)

                # Scaled backpropagation
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                
                # ADDED: Gradient monitoring
                total_norm = torch.nn.utils.clip_grad_norm_(
                    list(watcher.parameters()) + list(decoder.parameters()), 
                    max_norm=1.0  # Reduced from 5.0 to 1.0
                )
                
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()  # Step scheduler after each batch

                total_loss += loss.item()
                batch_count += 1
                
                # ADDED: Better monitoring every 50 batches
                if batch_count % 50 == 0:
                    print(f"\nBatch {batch_count}, Loss: {loss.item():.4f}, Grad Norm: {total_norm:.4f}")
                
                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'grad_norm': f'{total_norm:.3f}',  # Added gradient norm
                    'lr': f'{scheduler.get_last_lr()[0]:.2e}',  # Added learning rate
                    'gpu_mem': f'{torch.cuda.memory_allocated(0)/1e9:.1f}GB'
                })
                
            except RuntimeError as e:
                print(f"Error in batch: {str(e)}")
                torch.cuda.empty_cache()
                continue

        avg_loss = total_loss / batch_count
        print(f"\nEpoch {epoch+1}, Average Loss: {avg_loss:.4f}")
        print(f"GPU Memory: {torch.cuda.memory_allocated(0)/1e9:.2f}GB / {torch.cuda.memory_reserved(0)/1e9:.2f}GB")

        # Apply weight noise (annealing regularization)

        ###########################Modifying the code here. ###########################
        # apply_weight_noise(watcher, std=0.01)
        # apply_weight_noise(decoder, std=0.01)
        # ADDED: Quick validation check every 2 epochs
        if epoch % 2 == 0:
            print(f"\nQuick validation check (Epoch {epoch+1}):")
            val_acc = quick_validate(watcher, decoder, val_loader_quick, num_samples=10)
            print(f"Quick validation accuracy: {val_acc:.2%}\n")

        #scheduler.step(avg_loss)

        # Save best model
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'epoch': epoch,
                'watcher_state_dict': watcher.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                 'scheduler_state_dict': scheduler.state_dict(),
                'loss': best_loss,
            }, 'best_model.pth')
            print(f"Saved best model with loss: {best_loss:.4f}")

        # Clear GPU cache after each epoch
        torch.cuda.empty_cache()

except KeyboardInterrupt:
    print("\nTraining interrupted by user")
except Exception as e:
    print(f"\nError during training: {str(e)}")
    import traceback
    traceback.print_exc()
finally:
    torch.save({
        'watcher_state_dict': watcher.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss / batch_count if 'total_loss' in locals() and batch_count > 0 else None,
    }, 'final_model.pth')
    print("Final model saved.")
    torch.cuda.empty_cache()

Using device: cuda
GPU Name: NVIDIA RTX A6000
GPU Memory: 50.91 GB


Epoch 1/10:  10%|██▊                        | 32/313 [01:02<09:05,  1.94s/it, loss=2.6135, gpu_mem=1.0GB]



Training interrupted by user
Final model saved.


In [7]:
# Cell 5: Evaluation with SUBSET SAMPLING
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load models
watcher = WatcherFCN(in_channels=9)
decoder = ParserGRUDecoder(vocab_size=len(vocab))

# Load checkpoint
checkpoint = torch.load('best_model.pth', map_location=device)
watcher.load_state_dict(checkpoint['watcher_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])

# Move to device and set to eval mode
watcher = watcher.to(device)
decoder = decoder.to(device)
watcher.eval()
decoder.eval()

print("Model loaded successfully!")
print(f"Best training loss: {checkpoint['loss']:.4f}")

# Evaluation function with GPU optimization
@torch.no_grad()
def evaluate_model(watcher, decoder, data_loader, max_samples=50):
    """Evaluate model and show sample predictions"""
    watcher.eval()
    decoder.eval()
    
    total_correct = 0
    total_samples = 0
    samples = []
    
    for batch in tqdm(data_loader, desc='Evaluating'):
        images = batch['image'].to(device, non_blocking=True)
        labels = batch['label']
        
        # Forward pass with mixed precision
        with torch.cuda.amp.autocast():
            watcher_output = watcher(images)
            batch_size, channels, height, width = watcher_output.shape
            encoder_outputs = watcher_output.permute(0, 2, 3, 1).reshape(
                batch_size, height * width, channels
            )
            
            # Generate predictions (no teacher forcing)
            outputs = decoder(encoder_outputs, None, max_len=128)
        
        # Convert to text
        for i in range(len(outputs)):
            pred_indices = outputs[i].argmax(dim=-1)
            pred_text = ''.join([idx2char[idx.item()] for idx in pred_indices 
                               if idx2char[idx.item()] not in ['<PAD>', '<SOS>', '<EOS>']])
            true_text = labels[i]
            
            is_correct = (pred_text == true_text)
            total_correct += is_correct
            total_samples += 1
            
            # Save samples
            if len(samples) < max_samples:
                samples.append({
                    'pred': pred_text,
                    'true': true_text,
                    'correct': is_correct
                })
        
        # Clear cache periodically
        if total_samples % 200 == 0:
            torch.cuda.empty_cache()
        
        if total_samples >= max_samples:
            break
    
    # Print results
    accuracy = total_correct / total_samples
    print(f"\n{'='*60}")
    print(f"Accuracy: {accuracy:.2%} ({total_correct}/{total_samples})")
    print(f"{'='*60}\n")
    
    # Print samples
    for i, sample in enumerate(samples, 1):
        status = '✓' if sample['correct'] else '✗'
        print(f"Sample {i} {status}")
        print(f"Predicted: {sample['pred']}")
        print(f"True     : {sample['true']}")
        print("-" * 60)
    
    return accuracy, samples

# Evaluate on validation set with SUBSET
VAL_CSV = os.path.join(DATASET_ROOT, 'val_database.csv')
val_dataset = MathEquation9ChDataset(
    VAL_CSV, 
    DATASET_ROOT, 
    split='val',
    max_samples=VAL_SAMPLES  # Use subset
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=32,
    shuffle=False,
    num_workers=8,
    pin_memory=True
)

print(f"Validation dataset: {len(val_dataset):,} samples")
print("Evaluating on Validation Set...")
val_accuracy, val_samples = evaluate_model(watcher, decoder, val_loader, max_samples=50)

# Evaluate on test set with SUBSET
TEST_CSV = os.path.join(DATASET_ROOT, 'test_database.csv')
test_dataset = MathEquation9ChDataset(
    TEST_CSV, 
    DATASET_ROOT, 
    split='test',
    max_samples=TEST_SAMPLES  # Use subset
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=32,
    shuffle=False,
    num_workers=8,
    pin_memory=True
)

print(f"\nTest dataset: {len(test_dataset):,} samples")
print("Evaluating on Test Set...")
test_accuracy, test_samples = evaluate_model(watcher, decoder, test_loader, max_samples=50)

print(f"\n{'='*60}")
print("EVALUATION COMPLETE")
print(f"{'='*60}")
print(f"Validation Accuracy: {val_accuracy:.2%}")
print(f"Test Accuracy: {test_accuracy:.2%}")
print(f"GPU Memory: {torch.cuda.memory_allocated(0)/1e9:.2f}GB")
print(f"{'='*60}\n")

torch.cuda.empty_cache()

FileNotFoundError: [Errno 2] No such file or directory: 'best_model.pth'