In [2]:
!pip install torch torchvision transformers



In [11]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, RobertaTokenizer
from transformers import VisionEncoderDecoderModel
from transformers import get_scheduler

In [9]:
from google.colab import drive
from PIL import Image
import os
import torch
from torch.utils.data import Dataset
from transformers import ViTFeatureExtractor, RobertaTokenizer

# Make the Drive contents reachable under /content/drive
drive.mount('/content/drive')

# Root folder of the dataset inside Drive (adjust to your own layout)
base_path = '/content/drive/My Drive/HandwritingDataset/'

# Every split has one label CSV and one image folder following the same
# naming pattern, so both sets of paths are derived from the split name.
train_csv, val_csv, test_csv = (
    os.path.join(base_path, f'{split}_v2.csv') for split in ('train', 'val', 'test')
)
train_dir, val_dir, test_dir = (
    os.path.join(base_path, f'{split}_v2/{split}') for split in ('train', 'val', 'test')
)

# Tokenizer and image feature extractor, both loaded from the same checkpoint
tokenizer = RobertaTokenizer.from_pretrained("microsoft/trocr-base-handwritten")
feature_extractor = ViTFeatureExtractor.from_pretrained("microsoft/trocr-base-handwritten")

# Dataset class
class HandwritingDataset(Dataset):
    """Image/label pairs for handwriting recognition.

    Each item is built from one row of ``data``: the image named in the
    ``FILENAME`` column is loaded from ``img_dir`` and passed through the
    feature extractor, and the text in the ``IDENTITY`` column is tokenized
    to a fixed length.
    """

    def __init__(self, data, img_dir, feature_extractor, tokenizer, max_target_length=128, transform=None):
        """
        Args:
            data: Pandas DataFrame containing filenames and labels.
            img_dir: Path to the directory containing images.
            feature_extractor: ViTFeatureExtractor instance for image preprocessing.
            tokenizer: RobertaTokenizer instance for label tokenization.
            max_target_length: Maximum length of tokenized labels.
            transform: Additional transformations for images (optional).
        """
        self.data = data
        self.img_dir = img_dir
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one training example.

        Args:
            idx: Positional row index into ``data``.

        Returns:
            Dict with ``"pixel_values"`` (feature-extractor output with the
            leading batch dimension squeezed away) and ``"labels"`` (tensor of
            token ids padded/truncated to ``max_target_length``).

        Raises:
            FileNotFoundError: The image file for this row does not exist.
            ValueError: The row has no label (None / NaN).
        """
        row = self.data.iloc[idx]
        image = self._load_image(os.path.join(self.img_dir, row['FILENAME']))

        if self.transform:
            image = self.transform(image)

        # Extract features; squeeze(0) removes the batch axis added by return_tensors="pt"
        pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        labels = self._encode_label(row['IDENTITY'], idx)

        return {"pixel_values": pixel_values, "labels": labels}

    @staticmethod
    def _load_image(img_path):
        """Open ``img_path`` and return it as an RGB PIL image."""
        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been copied into the converted image.
            with Image.open(img_path) as handle:
                return handle.convert("RGB")
        except FileNotFoundError:
            print(f"Image not found: {img_path}")
            raise

    def _encode_label(self, label, idx):
        """Tokenize one label into a fixed-length tensor of token ids."""
        if not isinstance(label, str):
            # NaN is the only float that differs from itself; pandas uses it
            # for empty CSV cells.
            is_nan = isinstance(label, float) and label != label
            if label is None or is_nan:
                raise ValueError(
                    f"Row {idx} has no IDENTITY label; drop or fill missing labels "
                    "before building the dataset"
                )
            # Labels made only of digits are parsed as numbers by pandas.
            label = str(label)

        token_ids = self.tokenizer(
            label,
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True
        ).input_ids
        return torch.tensor(token_ids)


In [None]:
# Load the label table of every split; HandwritingDataset reads the
# FILENAME and IDENTITY columns from these frames.
train_data = pd.read_csv(train_csv)
val_data = pd.read_csv(val_csv)
test_data = pd.read_csv(test_csv)

# Define transforms
# Only PIL-level transforms belong here. The feature extractor does its own
# tensor conversion, rescaling and normalisation, so applying ToTensor() first
# would hand it images that are already scaled to [0, 1].
# NOTE(review): the feature extractor presumably also resizes to the size the
# checkpoint expects, which would make this Resize redundant - confirm.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
])

# Prepare datasets and dataloaders
train_dataset = HandwritingDataset(train_data, train_dir, feature_extractor, tokenizer, transform=transform)
val_dataset = HandwritingDataset(val_data, val_dir, feature_extractor, tokenizer, transform=transform)
test_dataset = HandwritingDataset(test_data, test_dir, feature_extractor, tokenizer, transform=transform)

# Only the training split is shuffled
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

In [7]:
# Pre-trained TrOCR checkpoint, moved to the GPU
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.to("cuda")

# AdamW plus a "linear" schedule with no warm-up; the schedule length is
# 10 passes over train_loader, matching the default num_epochs of train_model.
optimizer = AdamW(params=model.parameters(), lr=5e-5)
num_training_steps = 10 * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Define training function
def train_model(model, train_loader, val_loader, optimizer, lr_scheduler, num_epochs=10):
    """Train `model` for `num_epochs` epochs, validating after each one.

    Args:
        model: Model called as ``model(pixel_values=..., labels=...)`` whose
            output exposes a ``loss`` attribute.
        train_loader: Iterable of batches with "pixel_values" and "labels" entries.
        val_loader: Loader handed to ``validate_model`` at the end of every epoch.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, right after the optimizer.
        num_epochs: Number of passes over ``train_loader``.
    """
    device = "cuda"
    for epoch in range(num_epochs):
        # validate_model() leaves the model in eval mode, so train mode has to
        # be restored at the start of every epoch, not just once before the loop.
        model.train()
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = 0
        for batch in tqdm(train_loader):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(pixel_values=pixel_values, labels=labels)

            loss = outputs.loss
            train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        # Mean of the per-batch losses for this epoch
        print(f"Training Loss: {train_loss / len(train_loader):.4f}")

        # Validation
        validate_model(model, val_loader)

# Validation function
def validate_model(model, val_loader):
    """Print the mean per-batch loss of `model` over `val_loader`.

    The model is put into eval mode and stays in it when the function
    returns. Batches are moved to the "cuda" device and no gradients are
    recorded.
    """
    device = "cuda"
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            # Move exactly the two tensors the model consumes onto the device
            on_device = {key: batch[key].to(device) for key in ("pixel_values", "labels")}
            running_loss += model(**on_device).loss.item()
    mean_loss = running_loss / len(val_loader)
    print(f"Validation Loss: {mean_loss:.4f}")

In [8]:
def _unpack_batch(batch, device):
    """Return (images, labels) on `device` from a dict batch or an (images, labels) pair."""
    if isinstance(batch, dict):
        # Batches collated from dict samples arrive as a dict; iterating one
        # would yield its key strings, not the tensors.
        images, labels = batch["pixel_values"], batch["labels"]
    else:
        images, labels = batch
    return images.to(device), labels.to(device)


def train_model(model, train_loader, val_loader, optimizer, criterion=None, device=None,
                num_epochs=10, lr_scheduler=None):
    """Train `model` and report the mean training and validation loss per epoch.

    Args:
        model: Model called as ``model(pixel_values=..., labels=...)`` whose
            output exposes a ``loss`` attribute.
        train_loader: Iterable of training batches, each either a dict with
            "pixel_values" and "labels" or an ``(images, labels)`` pair.
        val_loader: Iterable of validation batches in the same format.
        optimizer: Optimizer stepped once per training batch.
        criterion: Kept for backward compatibility. The loss is taken from the
            model output, so a loss function passed here is not used. An
            object with a callable ``step`` attribute is treated as the
            learning-rate scheduler, because calls elsewhere in this notebook
            pass the scheduler in this position.
        device: Device for the model and batches. Defaults to "cuda" when
            available, otherwise "cpu".
        num_epochs: Number of passes over ``train_loader``. Replaces the
            global ``EPOCHS`` this function used to read.
        lr_scheduler: Optional scheduler stepped after every optimizer step.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if lr_scheduler is None and callable(getattr(criterion, "step", None)):
        lr_scheduler = criterion

    model.to(device)
    for epoch in range(num_epochs):
        # Training
        model.train()
        total_loss = 0
        for batch in train_loader:
            images, labels = _unpack_batch(batch, device)
            optimizer.zero_grad()
            outputs = model(pixel_values=images, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            if lr_scheduler is not None:
                lr_scheduler.step()
            total_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                images, labels = _unpack_batch(batch, device)
                outputs = model(pixel_values=images, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / max(len(val_loader), 1)
        print(f"Validation Loss: {avg_val_loss:.4f}")


In [None]:
# Call train_model with the model, loaders, optimizer and scheduler built in
# earlier cells; the scheduler is passed positionally as the fifth argument
# and the epoch count is given by keyword.
train_model(model, train_loader, val_loader, optimizer, lr_scheduler, num_epochs=10)

In [None]:
def test_model(model, test_loader):
    """Generate text for every batch in `test_loader` and decode it.

    Uses the module-level `tokenizer` to decode both the generated ids and
    the label ids, skipping special tokens in each.

    Returns:
        Tuple ``(predictions, references)`` of two lists of strings, in
        loader order.
    """
    device = "cuda"
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            generated_ids = model.generate(pixel_values=pixel_values)

            # Turn token ids back into text on both sides
            predictions += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            references += tokenizer.batch_decode(labels, skip_special_tokens=True)

    return predictions, references

In [None]:
from sklearn.metrics import f1_score

def calculate_f1(predictions, references):
    """Average the position-wise, support-weighted character F1 over all pairs.

    Each prediction/reference pair is compared character by character with
    ``f1_score(..., average="weighted")``. That function requires both
    sequences to have the same length, so the shorter one is padded with a
    sentinel that counts as a mismatch against any real character.

    Args:
        predictions: Predicted strings.
        references: Reference strings, paired with `predictions` by position.

    Returns:
        Mean of the per-pair scores, or 0.0 when there are no pairs.
    """
    pad = ""  # cannot collide with a real character, which is always length 1
    char_f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_chars, ref_chars = list(pred), list(ref)
        width = max(len(pred_chars), len(ref_chars))
        if width == 0:
            # Two empty strings are an exact match; there is nothing to score.
            char_f1_scores.append(1.0)
            continue
        pred_chars += [pad] * (width - len(pred_chars))
        ref_chars += [pad] * (width - len(ref_chars))
        char_f1_scores.append(
            f1_score(ref_chars, pred_chars, average="weighted", zero_division=0)
        )
    if not char_f1_scores:
        return 0.0
    return sum(char_f1_scores) / len(char_f1_scores)

# Decode the model's output for every batch of test_loader, together with the
# matching reference strings.
predictions, references = test_model(model, test_loader)
# Average the per-pair character F1 and print it with four decimals.
f1 = calculate_f1(predictions, references)
print(f"Character-wise F1 Score: {f1:.4f}")

In [None]:
if __name__ == "__main__":
    # End-to-end run: build the three splits, fine-tune, evaluate on the test
    # split, then save the predictions and the model.
    import os
    import pandas as pd  # used below for read_csv / DataFrame
    import torch
    # AdamW comes from torch.optim, matching the notebook's import cell; the
    # copy exported by transformers is deprecated. torch's AdamW applies a
    # default weight decay of 0.01.
    from torch.optim import AdamW
    from torch.utils.data import DataLoader
    from torchvision import transforms
    from transformers import ViTFeatureExtractor, RobertaTokenizer
    from transformers import VisionEncoderDecoderModel, get_scheduler

    # Paths to data (relative to the current working directory)
    train_csv = 'train_v2.csv'
    val_csv = 'val_v2.csv'
    test_csv = 'test_v2.csv'
    train_dir = 'train_v2/train'
    val_dir = 'val_v2/val'
    test_dir = 'test_v2/test'

    # Load feature extractor and tokenizer
    feature_extractor = ViTFeatureExtractor.from_pretrained("microsoft/trocr-base-handwritten")
    tokenizer = RobertaTokenizer.from_pretrained("microsoft/trocr-base-handwritten")

    # Define image transformations
    # Only PIL-level transforms belong here. The feature extractor does its
    # own tensor conversion, rescaling and normalisation, so applying
    # ToTensor() first would hand it images already scaled to [0, 1].
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
    ])

    # Prepare datasets and dataloaders
    train_dataset = HandwritingDataset(pd.read_csv(train_csv), train_dir, feature_extractor, tokenizer, transform=transform)
    val_dataset = HandwritingDataset(pd.read_csv(val_csv), val_dir, feature_extractor, tokenizer, transform=transform)
    test_dataset = HandwritingDataset(pd.read_csv(test_csv), test_dir, feature_extractor, tokenizer, transform=transform)

    # Only the training split is shuffled
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)
    test_loader = DataLoader(test_dataset, batch_size=8)

    # Initialize model
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
    model.to("cuda")

    # Align the decoder's vocabulary size with the tokenizer.
    # NOTE(review): presumably a no-op when tokenizer and checkpoint already
    # share a vocabulary - confirm.
    model.config.decoder.vocab_size = tokenizer.vocab_size
    model.decoder.resize_token_embeddings(tokenizer.vocab_size)

    # Define optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=5e-5)
    num_training_steps = len(train_loader) * 10  # Assuming 10 epochs
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Train and validate the model
    train_model(model, train_loader, val_loader, optimizer, lr_scheduler, num_epochs=10)

    # Test the model and evaluate F1 score
    predictions, references = test_model(model, test_loader)
    f1 = calculate_f1(predictions, references)
    print(f"Character-wise F1 Score on Test Data: {f1:.4f}")

    # Save predictions and model
    results = pd.DataFrame({"Predicted": predictions, "Reference": references})
    results.to_csv("test_results.csv", index=False)

    model.save_pretrained("fine_tuned_trocr_model")
    tokenizer.save_pretrained("fine_tuned_trocr_model")
