In [61]:
!ls /content/drive/MyDrive/


'capstone test.ipynb'   seq2seq_project  'train (2).zip'   valid_data
'Colab Notebooks'      'test (2).zip'	  train_data
'recognition (2).zip'   test_data	 'valid (2).zip'


In [62]:
import zipfile

zip_path = '/content/drive/MyDrive/recognition (2).zip'
output_dir = '/content/dataset'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(output_dir)

# Check that files have been extracted correctly
!ls /content/dataset


recognition.zip


In [63]:
import zipfile

inner_zip_path = '/content/dataset/recognition.zip'
final_output_dir = '/content/dataset/final'

with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
    zip_ref.extractall(final_output_dir)

# Confirm extraction
!ls /content/dataset/final


recognition


In [66]:
!ls /content/dataset/final


recognition


In [67]:
!ls /content/dataset/final/recognition


test  test.csv	train  train.csv


# load and prep data

In [72]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
from torchvision import transforms

# Your special characters for Hindi (use your own if you want)
vocab = {'<PAD>':0, '<SOS>':1, '<EOS>':2}
characters = "अआइईउऊऋएऐओऔकखगघचछजझटठडढतथदधनपफबभमयरलवशषसह"
for i, ch in enumerate(characters, 3):
    vocab[ch] = i

transform = transforms.Compose([
    transforms.Resize((128, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])

class HindiTextDataset(Dataset):
    def __init__(self, csv_file, vocab, transform=None):
        self.data = pd.read_csv(csv_file)
        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def text_to_indices(self, text):
        return [self.vocab['<SOS>']] + [self.vocab.get(c, 0) for c in text] + [self.vocab['<EOS>']]

    def __getitem__(self, idx):
        # Use the "Filepath" column in your CSV for image filename
        img_path = f"/content/dataset/final/recognition/{self.data.iloc[idx]['Filepath']}"
        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)
        # Use the "Text" column in your CSV for labels
        text = self.text_to_indices(self.data.iloc[idx]['Text'])
        return img, torch.tensor(text)

def collate_fn(batch):
    imgs, texts = zip(*batch)
    imgs = torch.stack(imgs)
    lengths = [len(t) for t in texts]
    max_len = max(lengths)
    padded_texts = torch.zeros(len(texts), max_len, dtype=torch.long)
    for i, t in enumerate(texts):
        padded_texts[i, :lengths[i]] = t
    return imgs, padded_texts, lengths

train_csv = '/content/dataset/final/recognition/train.csv'
train_dataset = HindiTextDataset(train_csv, vocab, transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


# Model Definition (CNN Encoder + Attention LSTM Decoder)

In [73]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNEncoder(nn.Module):
    def __init__(self, output_dim=128):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(),
            nn.MaxPool2d(2,2),
            nn.Conv2d(64,128,3,padding=1), nn.BatchNorm2d(128), nn.ReLU(),
            nn.MaxPool2d(2,2)
        )
        self.output_dim = output_dim
    def forward(self, x):
        features = self.conv(x)
        b, c, h, w = features.size()
        features = features.permute(0,3,1,2).contiguous().view(b, w, -1)
        return features

class Attention(nn.Module):
    def __init__(self, enc_dim, dec_dim):
        super().__init__()
        self.attn = nn.Linear(enc_dim + dec_dim, dec_dim)
        self.v = nn.Parameter(torch.rand(dec_dim))
    def forward(self, encoder_outputs, hidden):
        batch_size = encoder_outputs.size(0)
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return F.softmax(attention, dim=1)

class DecoderLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, enc_dim, dec_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim + enc_dim, dec_dim, batch_first=True)
        self.attention = Attention(enc_dim, dec_dim)
        self.fc_out = nn.Linear(dec_dim, vocab_size)
    def forward(self, input_token, hidden, cell, encoder_output):
        embedded = self.embedding(input_token).unsqueeze(1)
        attn_weights = self.attention(encoder_output, hidden.squeeze(0)).unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_output)
        lstm_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell, attn_weights


# Traning Loop

In [74]:
import pandas as pd

df = pd.read_csv('/content/dataset/final/recognition/train.csv')
print(df.columns)
print(df.head())


Index(['Filepath', 'Text', 'Language'], dtype='object')
                             Filepath       Text Language
0    train/english/A_image_1005_0.jpg   CULTURAL  english
1    train/english/A_image_1005_1.jpg  JAGANNATH  english
2   train/english/A_image_1005_10.jpg     Daily)  english
3  train/english/A_image_1005_100.jpg        the  english
4  train/english/A_image_1005_101.jpg     Temple  english


In [77]:
from torch.utils.data import Subset

# Use only the first 6 samples
small_dataset = Subset(train_dataset, range(6))
train_loader = DataLoader(small_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = CNNEncoder().to(device)
decoder = DecoderLSTM(len(vocab), 128, 128*32, 256).to(device)

optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])

for epoch in range(1):  # Run for only 1 epoch (quick test)
    encoder.train()
    decoder.train()
    for imgs, texts, lengths in train_loader:
        imgs = imgs.to(device)
        texts = texts.to(device)
        optimizer.zero_grad()
        encoder_outputs = encoder(imgs)
        hidden = torch.zeros(1, imgs.size(0), 256).to(device)
        cell = torch.zeros(1, imgs.size(0), 256).to(device)
        input_token = texts[:, 0]
        loss = 0
        max_len = texts.size(1)
        for t in range(1, max_len):
            output, hidden, cell, _ = decoder(input_token, hidden, cell, encoder_outputs)
            loss += criterion(output, texts[:, t])
            input_token = texts[:, t]
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss {loss.item()/max_len}")


Epoch 1, Loss nan


# interference

In [78]:
def predict_image(image_path, encoder, decoder, vocab, max_len=30):
    # Prepare image
    img = Image.open(image_path).convert('RGB')
    img = transform(img).unsqueeze(0).to(device)

    # Inverse vocab for index to char
    inv_vocab = {v: k for k, v in vocab.items()}

    # Encoder forward
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        encoder_outputs = encoder(img)
        hidden = torch.zeros(1, 1, 256).to(device)
        cell = torch.zeros(1, 1, 256).to(device)

        input_token = torch.tensor([vocab['<SOS>']], device=device)
        predicted_indices = []
        for _ in range(max_len):
            output, hidden, cell, _ = decoder(input_token, hidden, cell, encoder_outputs)
            top1 = output.argmax(1)
            if top1.item() == vocab['<EOS>']:
                break
            predicted_indices.append(top1.item())
            input_token = top1

    predicted_text = ''.join([inv_vocab[idx] for idx in predicted_indices if idx in inv_vocab])
    return predicted_text


In [79]:
import pandas as pd

# Load test CSV (update path if necessary)
test_df = pd.read_csv('/content/dataset/final/recognition/test.csv')

for idx in range(5):  # Predict for first 5 test images
    image_path = '/content/dataset/final/recognition/' + test_df.iloc[idx]['Filepath']
    actual_text = test_df.iloc[idx]['Text']
    predicted_text = predict_image(image_path, encoder, decoder, vocab)
    print(f"Test Sample {idx+1}")
    print("Actual Text:", actual_text)
    print("Predicted Text:", predicted_text)
    print("-"*40)


Test Sample 1
Actual Text: JUBILEE
Predicted Text: 
----------------------------------------
Test Sample 2
Actual Text: FUNCTION
Predicted Text: 
----------------------------------------
Test Sample 3
Actual Text: OF
Predicted Text: 
----------------------------------------
Test Sample 4
Actual Text: জয়ন্তী
Predicted Text: 
----------------------------------------
Test Sample 5
Actual Text: 20
Predicted Text: 
----------------------------------------


In [81]:
idx = 1  # Choose row number
image_path = '/content/dataset/final/recognition/' + test_df.iloc[idx]['Filepath']
actual_text = test_df.iloc[idx]['Text']
predicted_text = predict_image(image_path, encoder, decoder, vocab)
print("Actual Text:", actual_text)
print("Predicted Text:", predicted_text)


Actual Text: FUNCTION
Predicted Text: 
