In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise the Kaggle input tree: one line per visited directory with the
# number of immediate sub-directories and files it contains.
for dirpath, dirnames, filenames in os.walk('/kaggle/input'):
    print(f"there are {len(dirnames)} directories and {len(filenames)} files in {dirpath}")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import requests
import zipfile
import pathlib
from pathlib import Path

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Root directory of the competition's public data inside the Kaggle input mount.
data_path = pathlib.Path("/kaggle/input/ai-of-god-3/Public_data")

In [None]:
# Derive the training image folder, the test image folder and the training
# label CSV from the dataset root.
train_path, test_path, train_csv = (
    data_path / entry for entry in ("train_images", "test_images", "train.csv")
)

In [None]:
import os, csv, pathlib
from PIL import Image
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """Dataset over the ``*.png`` files of a directory, optionally paired with tokenized labels.

    In ``"train"`` mode the CSV at ``csv_file`` is read once at construction; it
    must provide the columns ``unique Id`` and ``transcription``. Each image is
    matched to its row through its file name without extension. In any other
    mode the CSV is never opened and items are images only.

    Args:
        root: Directory searched (non-recursively) for ``*.png`` files.
        csv_file: Path of the label CSV; only read when ``mode == "train"``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``; its
            result must expose ``["input_ids"]``.
        transform: Optional callable applied to the RGB PIL image.
        mode: ``"train"`` yields ``(image, input_ids)``; anything else yields ``image``.
        max_length: Length forwarded to the tokenizer for padding/truncation.

    Attributes:
        paths: Sorted list of image paths; sorting makes index -> image stable
            across runs (glob order is otherwise filesystem dependent).
        labels: ``{'id': [...], 'transcription': [...]}`` with the CSV rows in
            file order (kept as parallel lists so it can be turned into a table).
    """

    def __init__(self, root, csv_file, tokenizer, transform=None, mode="train", max_length=32):
        self.root = root
        self.paths = sorted(pathlib.Path(self.root).glob("*.png"))
        self.transform = transform
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length
        self.labels = {'id': [], 'transcription': []}
        # id -> transcription map so that __getitem__ does not scan the id list
        # for every sample.
        self._transcription_by_id = {}

        if mode == "train":
            # newline='' is what the csv module asks for so that quoted fields
            # containing line breaks are parsed correctly.
            with open(csv_file, mode='r', newline='') as file:
                for row in csv.DictReader(file):
                    image_id = row['unique Id']
                    transcription = row['transcription']
                    self.labels['id'].append(image_id)
                    self.labels['transcription'].append(transcription)
                    # Keep the first row when an id is duplicated, which is
                    # what the previous list.index() lookup returned.
                    self._transcription_by_id.setdefault(image_id, transcription)

    def __len__(self):
        """Return the number of images found under ``root``."""
        return len(self.paths)

    def _transcription_for(self, image_id):
        """Return the transcription stored for ``image_id``.

        Raises:
            ValueError: If the label CSV has no row with that id.
        """
        try:
            return self._transcription_by_id[image_id]
        except KeyError:
            raise ValueError(f"no transcription found for image id {image_id!r}") from None

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            ``(image, input_ids)`` in ``"train"`` mode, where ``input_ids`` is
            the tokenizer output with its leading dimension squeezed away;
            otherwise just ``image``.

        Raises:
            ValueError: In ``"train"`` mode, if the image has no CSV row.
        """
        image_path = self.paths[index]
        # Convert inside the context manager so the file handle is released as
        # soon as the pixel data has been read.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        if self.mode == "train":
            image_id = os.path.splitext(os.path.basename(image_path))[0]
            transcription = self._transcription_for(image_id)
            encoding = self.tokenizer(
                transcription,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            input_ids = encoding["input_ids"].squeeze(0)  # [max_len]
            return image, input_ids

        return image


In [None]:
from transformers import AutoTokenizer
from torchvision import transforms
from torch.utils.data import DataLoader

# Image preprocessing: resize every image to (128, 512) -- wider than tall for
# OCR -- and convert it to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((128, 512)),
        transforms.ToTensor(),
    ]
)

# Pretrained "bert-base-uncased" tokenizer used to encode the transcriptions.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [None]:

# Build the labelled dataset from the competition files located via
# `train_path` / `train_csv`, not from placeholder relative paths.
data = CustomImageDataset(train_path, train_csv, tokenizer, transform)


In [None]:
from torch.utils.data import random_split
# 80/20 split of the labelled data; the remainder goes to the held-out part so
# the two sizes always add up to len(data).
train_size = int(0.8 * len(data))
test_size = len(data) - train_size

# A dedicated, seeded generator keeps the split identical across runs instead
# of depending on the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_data, test_data = random_split(data, [train_size, test_size], generator=split_generator)

In [None]:

# Only the training batches are shuffled.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# The held-out subset produced by random_split is named `test_data`; the
# previously referenced `val_data` was never defined.
val_loader = DataLoader(test_data, batch_size=16)


In [None]:
import pandas as pd
# Tabulate the id/transcription lists held by the dataset and preview the
# first ten rows.
label_df = pd.DataFrame.from_dict(data.labels)
label_df.head(10)

In [None]:
# Preview up to 10 randomly chosen samples of the dataset.
# NOTE(review): `display_random_images` is defined in the following cell, so on
# a fresh kernel (Restart & Run All) this call raises NameError unless that
# cell has been executed first -- consider moving this cell below it.
display_random_images(data,10,seed=42)

In [None]:
import random
import matplotlib.pyplot as plt
import math
def display_random_images(dataset, n, seed=None):
    """Plot up to ten randomly chosen samples of ``dataset`` in a single column.

    Args:
        dataset: Indexable object supporting ``len()`` whose items are either
            an image tensor or a tuple/list starting with ``(image, label, ...)``.
            Images are expected channel-first, since they are permuted with
            ``(1, 2, 0)`` before plotting.
        n: Number of samples requested; capped at 10 and at ``len(dataset)``.
        seed: Optional seed. When given (``0`` included) it fixes which samples
            are drawn and, as before, also seeds torch's global RNG.

    Returns:
        None. Nothing is drawn when there is nothing to show.
    """
    n = min(n, 10, len(dataset))
    if n <= 0:
        return

    if seed is not None:
        torch.manual_seed(seed)
        # The indices are drawn with Python's `random`, so that is the
        # generator that has to be seeded for a reproducible selection.
        rng = random.Random(seed)
    else:
        rng = random

    sample_indices = rng.sample(range(len(dataset)), n)
    plt.figure(figsize=(16, 16))
    for position, sample_idx in enumerate(sample_indices, start=1):
        sample = dataset[sample_idx]
        if isinstance(sample, (tuple, list)):
            # Works for (image, label) pairs as well as longer tuples.
            image = sample[0]
            label = sample[1] if len(sample) > 1 else None
        else:
            # Unlabelled datasets yield the image alone.
            image, label = sample, None

        channels_last = image.permute(1, 2, 0)
        plt.subplot(n, 1, position)
        plt.imshow(channels_last)
        plt.axis("off")
        title = f"class : {label} | \n size : {channels_last.shape}"
        plt.title(title, fontdict={'fontsize': 8})


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

class CNNEncoder(nn.Module):
    """ResNet-18 based encoder that turns an image batch into one vector per image.

    Args:
        output_dim: Size of the vector produced for each image.
    """

    def __init__(self, output_dim=512):
        super().__init__()
        resnet = models.resnet18(pretrained=True)
        # Keep every child module of the ResNet except the final two
        # (presumably its own pooling and classification head -- confirm
        # against torchvision).
        trunk_layers = list(resnet.children())[:-2]
        self.cnn = nn.Sequential(*trunk_layers)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.linear = nn.Linear(resnet.fc.in_features, output_dim)

    def forward(self, x):
        """Encode ``x`` and return a ``[B, output_dim]`` tensor."""
        pooled = self.avgpool(self.cnn(x))   # [B, C, 1, 1]
        batch_size = pooled.size(0)
        flattened = pooled.view(batch_size, -1)  # [B, C]
        return self.linear(flattened)


In [None]:

class OCRLSTMDecoder(nn.Module):
    """LSTM decoder whose initial hidden state is derived from an image encoding.

    Args:
        input_dim: Size of the encoder vector fed to ``forward``.
        hidden_dim: LSTM hidden size.
        vocab_size: Number of token ids (embedding rows and output logits).
        embedding_dim: Size of the token embeddings.
        num_layers: Number of stacked LSTM layers.
        pad_token_id: Token id used as ``padding_idx`` of the embedding.
    """

    def __init__(self, input_dim=512, hidden_dim=512, vocab_size=30522, embedding_dim=256, num_layers=1, pad_token_id=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.init_linear = nn.Linear(input_dim, hidden_dim)
        self.pad_token_id = pad_token_id

    def forward(self, encoder_output, tgt_input_ids):
        """Return logits for every target position.

        Args:
            encoder_output: ``[B, input_dim]`` image encoding.
            tgt_input_ids: Token ids laid out time-major, ``[T, B]``, because
                the LSTM is created without ``batch_first``. Batch-major ids
                must be transposed by the caller first.

        Returns:
            Logits of shape ``[T, B, vocab_size]``.
        """
        embedded = self.embedding(tgt_input_ids)  # [T, B, E]
        projected = torch.tanh(self.init_linear(encoder_output))  # [B, H]
        # nn.LSTM expects one initial state per layer; the single-layer shape
        # used previously failed as soon as num_layers > 1. Every layer starts
        # from the same projected image encoding.
        h0 = projected.unsqueeze(0).repeat(self.lstm.num_layers, 1, 1)  # [L, B, H]
        c0 = torch.zeros_like(h0)  # [L, B, H]
        output, _ = self.lstm(embedded, (h0, c0))  # [T, B, H]
        logits = self.fc(output)  # [T, B, vocab]
        return logits


In [None]:
class CNNLSTM_CTC_OCR(nn.Module):
    """ResNet-18 trunk followed by a bidirectional LSTM, producing per-column logits for CTC.

    Args:
        cnn_out: Channels after the 1x1 reduction; also the LSTM input size.
        hidden_dim: Hidden size of each LSTM direction.
        vocab_size: Number of output classes per time step.
    """

    def __init__(self, cnn_out=512, hidden_dim=256, vocab_size=30522):
        super().__init__()
        backbone = models.resnet18(pretrained=True)
        self.cnn = nn.Sequential(*list(backbone.children())[:-2])  # Remove last 2 layers
        self.reduce_conv = nn.Conv2d(512, cnn_out, kernel_size=1)

        self.lstm = nn.LSTM(cnn_out, hidden_dim, bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(hidden_dim * 2, vocab_size)  # BiLSTM

    def forward(self, x):  # x: [B, 3, H, W]
        """Return logits of shape ``[T, B, vocab_size]`` where ``T`` is the feature-map width."""
        features = self.cnn(x)  # [B, 512, H', W']
        features = self.reduce_conv(features)  # [B, cnn_out, H', W']

        # The LSTM is built with input_size == cnn_out, so each time step must
        # carry exactly cnn_out features. Flattening height into the feature
        # axis (H' * cnn_out) only matched when H' == 1 and raised a size
        # mismatch otherwise; averaging over height works for any H'.
        features = features.mean(dim=2)  # [B, cnn_out, W']
        features = features.permute(0, 2, 1).contiguous()  # [B, W', cnn_out]

        lstm_out, _ = self.lstm(features)  # [B, W', 2*hidden_dim]
        logits = self.classifier(lstm_out)  # [B, W', vocab]
        return logits.permute(1, 0, 2)  # [T, B, vocab] for CTC


In [None]:
import numpy as np
from tqdm import tqdm

def wer(reference, hypothesis):
    """Word error rate between two whitespace-tokenised strings.

    The word-level edit distance (substitutions, insertions and deletions all
    cost 1) is divided by the number of reference words.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text.

    Returns:
        A Python float. When the reference has no words the distance is
        divided by 1 instead of 0, so the result is ``0.0`` for an empty
        hypothesis and the number of hypothesis words otherwise.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # Rolling rows with plain Python ints: no fixed-width dtype that could
    # overflow once a sentence has more than 255 words.
    # previous_row[j] is the distance between the reference prefix processed so
    # far and the first j hypothesis words.
    previous_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current_row = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                cost = previous_row[j - 1]
            else:
                cost = 1 + min(previous_row[j], current_row[j - 1], previous_row[j - 1])
            current_row.append(cost)
        previous_row = current_row

    return previous_row[-1] / max(len(ref_words), 1)


In [None]:
def train_ctc(model, train_loader, val_loader, tokenizer, optimizer, device, epochs=10):
    criterion = nn.CTCLoss(blank=tokenizer.pad_token_id, zero_infinity=True)
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for images, input_ids in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            images = images.to(device)
            input_ids = input_ids.to(device)

            # Forward pass
            logits = model(images)  # [T, B, V]
            log_probs = logits.log_softmax(2)  # required by CTC

            # Get target lengths
            tgt_lengths = (input_ids != tokenizer.pad_token_id).sum(dim=1)  # [B]
            input_lengths = torch.full(size=(log_probs.size(1),), fill_value=log_probs.size(0), dtype=torch.long)

            # CTC loss
            loss = criterion(log_probs, input_ids, input_lengths, tgt_lengths)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f}")

        # Validation with greedy decoding
        model.eval()
        preds, refs = [], []
        with torch.no_grad():
            for images, input_ids in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                images = images.to(device)
                input_ids = input_ids.to(device)

                logits = model(images)  # [T, B, V]
                pred_ids = logits.argmax(dim=-1).permute(1, 0)  # [B, T]

                for pred_seq, true_seq in zip(pred_ids, input_ids):
                    pred_tokens = []
                    prev = tokenizer.pad_token_id
                    for p in pred_seq.cpu().numpy():
                        if p != prev and p != tokenizer.pad_token_id:
                            pred_tokens.append(p)
                        prev = p
                    pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)
                    true_text = tokenizer.decode(true_seq.cpu().numpy(), skip_special_tokens=True)
                    preds.append(pred_text)
                    refs.append(true_text)

        avg_wer = np.mean([wer(r, p) for r, p in zip(refs, preds)])
        print(f"Epoch {epoch+1} | Val WER: {avg_wer:.4f}")


In [None]:

# CTC model whose output layer is sized to the tokenizer vocabulary.
model = CNNLSTM_CTC_OCR(
    cnn_out=256,
    hidden_dim=256,
    vocab_size=tokenizer.vocab_size,
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train on the GPU when available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

train_ctc(model, train_loader, val_loader, tokenizer, optimizer, device, epochs=10)
