In [42]:
import torch 
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import os
import cv2
import torch
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler

In [43]:


class HandwritingDataset(Dataset):
    def __init__(self, csv_file, vocab, img_height=32, max_width=512):
        self.data = pd.read_csv(csv_file)
        self.vocab = vocab
        self.char2idx = {char: idx for idx, char in enumerate(vocab)}
        self.img_height = img_height
        self.max_width = max_width

        self.transform = transforms.Compose([
            transforms.ToTensor(),  # Converts to [0,1] and shape [C,H,W]
            transforms.Normalize((0.5,), (0.5,))  # Normalize grayscale
        ])

    def __len__(self):
        return len(self.data)

    def encode_label(self, text):
        return [self.char2idx[char] for char in text if char in self.char2idx]

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = row['FILENAME']
        label = str(row['IDENTITY'])

        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # [H, W]
        if img is None:
            raise ValueError(f"Image not found: {img_path}")

        # Resize to height and pad width if needed
        h, w = img.shape
        new_w = int(self.img_height * w / h)
        new_w = min(new_w, self.max_width)
        img_resized = cv2.resize(img, (new_w, self.img_height))  # [H, W]

        # Pad to max_width
        padded_img = 255 * np.ones((self.img_height, self.max_width), dtype=np.uint8)
        padded_img[:, :new_w] = img_resized

        img_tensor = self.transform(padded_img)  # [1, H, W]
        label_encoded = self.encode_label(label)
        label_encoded = torch.tensor(label_encoded, dtype=torch.long)
        label_length = torch.tensor(len(label_encoded), dtype=torch.long)

        return {
            'image': img_tensor,
            'label_encoded': label_encoded,
            'label_lengths': label_length,
            'label_text': label,
            'filename': img_path
        }

In [44]:
class CRNN(nn.Module):
    def __init__(self, img_h, n_channels, n_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(n_channels, 64, 3, 1, 1),  # [B, 64, H, W]
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                 # [B, 64, H/2, W/2]

            nn.Conv2d(64, 128, 3, 1, 1),        # [B, 128, H/2, W/2]
            nn.ReLU(),
            nn.MaxPool2d(2, 2),                 # [B, 128, H/4, W/4]

            nn.Conv2d(128, 256, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),               # [B, 256, H/8, W/4]

            nn.Conv2d(256, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),               # [B, 512, H/16, W/4]

            nn.Conv2d(512, 512, 2, 1, 0),       # [B, 512, H/16 - 1, W/4 - 1]
            nn.ReLU()
        )

        self.rnn = nn.Sequential(
            nn.LSTM(512, 256, num_layers=2, bidirectional=True, batch_first=True)
        )

        self.dense = nn.Linear(512, n_classes)

    def forward(self, x):
        conv = self.cnn(x)  # [B, 512, H, W]
        b, c, h, w = conv.size()
        assert h == 1, "the height of conv must be 1"
        conv = conv.squeeze(2)  # remove height dim -> [B, 512, W]
        conv = conv.permute(0, 2, 1)  # [B, W, 512] for RNN

        rnn_out, _ = self.rnn(conv)  # [B, W, 512]
        out = self.dense(rnn_out)   # [B, W, n_classes]
        out = out.permute(1, 0, 2)  # for CTC loss: [W, B, n_classes]
        return out

In [45]:
def crnn_collate_fn(batch):
    """
    batch: list of items returned by __getitem__()
    """
    images = [item['image'] for item in batch]
    labels = [item['label_encoded'] for item in batch]
    label_lengths = torch.tensor([len(l) for l in labels], dtype=torch.long)

    # Stack images into [B, 1, H, W]
    images = torch.stack(images)

    # Concatenate labels into a 1D tensor as required by CTC Loss
    labels_concat = torch.cat(labels, dim=0)

    return {
        'images': images,                      # [B, 1, H, W]
        'labels': labels_concat,               # [Total number of label tokens]
        'label_lengths': label_lengths,        # [B]
        'label_text': [item['label_text'] for item in batch],  # For debugging
        'input_lengths': torch.full(size=(len(images),), fill_value=images.shape[-1] // 4, dtype=torch.long)
        # input_lengths assumes CNN downscales W by 4x
    }

In [59]:
vocab = list("abcdefghijklmnopqrstuvwxyz0123456789")
csv_file=r"C:\Users\Raihan\OneDrive\Desktop\DPIIT HACKATHON\cvsi_fullpath.csv"
dataset = HandwritingDataset(csv_file=csv_file, vocab=vocab)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True,collate_fn=crnn_collate_fn)

In [60]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CRNN(img_h=32, n_channels=1, n_classes=len(vocab) + 1).to(device)

In [61]:
criterion = nn.CTCLoss(blank=len(vocab), zero_infinity=True)
optimizer = optim.AdamW(model.parameters(), lr=1e-3,weight_decay=1e-4)

In [62]:
scaler=torch.amp.GradScaler("cuda")

In [None]:
scaler = GradScaler()
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_loader):
        images = batch['images'].to(device)                  # [B, 1, H, W]
        targets = batch['labels'].to(device)                 # 1D flattened labels
        target_lengths = batch['label_lengths'].to(device)   # actual lengths of labels

        optimizer.zero_grad()

        with autocast('cuda'):
            outputs = model(images)                          # [B, T, C]
            outputs = outputs.permute(1, 0, 2)               # CTC expects [T, B, C]

            input_lengths = torch.full(
                size=(outputs.size(1),),                     # batch size
                fill_value=outputs.size(0),                  # all have same T
                dtype=torch.long
            ).to(device)

            loss = criterion(outputs, targets, input_lengths, target_lengths)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

  scaler = GradScaler()
  with autocast():
  0%|          | 0/10343 [00:01<?, ?it/s]


TypeError: full() received an invalid combination of arguments - got (size=int, fill_value=int, dtype=torch.dtype, ), but expected one of:
 * (tuple of ints size, Number fill_value, *, tuple of names names, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, Number fill_value, *, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
