In [29]:
%pip install numpy==1.22.4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

<contextlib.ExitStack at 0x1d1cf66ca50>

## Data Preparation
- Create Dataloader class

Note: Working on Part (a) as of now.  
Guiding light: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [112]:
START_TOKEN = "START"
END_TOKEN = "END"
UNK_TOKEN = "UNK"

class Vocabulary:
    def __init__(self, freq_dict, wd_to_id, id_to_wd):
        self.freq_dict = freq_dict
        self.wd_to_id = wd_to_id
        self.id_to_wd = id_to_wd
        self.N = len(freq_dict)
    
    def get_id(self, word):
        if word in self.wd_to_id:
            return self.wd_to_id[word]
        else:
            return self.wd_to_id[UNK_TOKEN]

class LatexFormulaDataset(Dataset):
    """Latex Formula Dataset: Image and Text"""
    
    def __init__(self, csv_file, root_dir, transform = None):
        """
        Arguments:
            csv_file (string): Path to the csv file with image name and text
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        #@TODO: May want to preload images
        self.df = pd.read_csv(csv_file)
        
        print('Debuggin:-----------------')
        print(self.df.head())
        self.root_dir = root_dir
        self.transform = transform

        '''Tokenize the formula by splitting on spaces'''
        self.df['formula'] = self.df['formula'].apply(lambda x: x.split())
        self.vocab= self.construct_vocab()  

        self.maxlen = 0
        for formula in self.df['formula']:
            if len(formula) > self.maxlen:
                self.maxlen = len(formula)

        self.df['formula'] = self.df['formula'].apply(lambda x: x + [UNK_TOKEN]*(self.maxlen - len(x)))

        #Embedding layer
        self.embed = nn.Embedding(self.vocab.N, 512)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
        Returns sample of type image, textformula
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.df.iloc[idx, 0])
        image = io.imread(img_name)
        formula = self.df.iloc[idx, 1]
        formula = np.array([formula], dtype=str).reshape(-1, 1)
        formula = [[self.vocab.get_id(wd[0]) for wd in formula]] 
        sample = {'image': image, 'formula': torch.tensor(formula, dtype=torch.int64)}

        if self.transform:
            sample['image'] = self.transform(sample['image'])
            
        return sample 
    
    def construct_vocab(self):
        """
        Constructs vocabulary from the dataset formulas
        """
        freq_dict = {}
        for formula in self.df['formula']:
            for wd in formula:
                if wd not in freq_dict:
                    freq_dict[wd] = 1
                else:
                    freq_dict[wd] += 1
        freq_dict[START_TOKEN] = 1
        freq_dict[END_TOKEN] = 1
        freq_dict[UNK_TOKEN] = 1
        N = len(freq_dict)
        wd_to_id = {}
        for i, wd in enumerate(freq_dict):
            wd_to_id[wd] = i
        id_to_wd = {v: k for k, v in wd_to_id.items()}
    
        #pad the formulas with 
        return Vocabulary(freq_dict, wd_to_id, id_to_wd)      

def get_dataloader(csv_path, image_root, batch_size, transform = None):
    """
    Returns dataloader for the dataset
    """
    dataset = LatexFormulaDataset(csv_path, image_root, transform) #checked
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader, dataset
     

### Encoder Network
- A CNN to encode image to more meaningful vector

In [142]:
class EncoderCNN(nn.Module):
    def __init__(self):
        super().__init__()
    
        #@TODO:reduce number of layers: eliminate pools and acts
        self.conv1 = nn.Conv2d(3, 32, (5, 5))
        self.act1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d((2, 2))
        
        self.conv2 = nn.Conv2d(32, 64, (5, 5))
        self.act2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d((2, 2))
        
        self.conv3 = nn.Conv2d(64, 128, (5, 5))
        self.act3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d((2, 2))
        
        self.conv4 = nn.Conv2d(128, 256, (5, 5))
        self.act4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d((2, 2))
        
        self.conv5 = nn.Conv2d(256, 512, (5, 5))
        self.act5 = nn.ReLU()
        self.pool5 = nn.MaxPool2d((2, 2))
        
        self.avg_pool = nn.AvgPool2d((3, 3))
    
    def forward(self, x):
        x = self.act1(self.conv1(x))
        x = self.pool1(x)
        
        x = self.act2(self.conv2(x))
        x = self.pool2(x)
        
        x = self.act3(self.conv3(x))
        x = self.pool3(x)
        
        x = self.act4(self.conv4(x))
        x = self.pool4(x)
        
        x = self.act5(self.conv5(x))
        x = self.pool5(x)
        
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1, 512) 
        return x

### Vocabulary
- https://github.com/harvardnlp/im2markup/blob/master

### Decoder Network

In [206]:
class DecoderRNN(nn.Module):
    """
    INPUTS
    context_size : size of the context vector
    hidden_size : size of the hidden latent vectors
    embed_size : literal
    vocab_size : literal
    output_size : one_hot?
    """
    def __init__(self, vocab, context_size, hidden_size, embed_size, output_size, max_length):
        super().__init__()

        #class variables
        self.embed_size = embed_size
        self.context_size = context_size
        self.max_length = max_length
        self.vocab = vocab
        vocab_size = vocab.N

        #compute input size, concatenating context and prev. output embedding
        input_size = context_size + embed_size

        self.embedding = nn.Embedding(vocab_size, embed_size)

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers = 1)

        self.out = nn.Linear(hidden_size, output_size) #output_size = vocab_size
    
    def forward(self, context, target_tensor = None):
        """
        target_tensor is of size MAX_LENGTH
        """
        #START Token handling
        batch_size = context.size(0)
        start_id = self.vocab.get_id(START_TOKEN)
        start_tensor = torch.empty(batch_size, 1, dtype = torch.int64).fill_(start_id)

        decoder_input = torch.concatenate((context, self.embedding(start_tensor)), dim = 0)

        print(f'Context shape: {context.shape}, decoder_input shape: {decoder_input.shape}')
        print(f'embedding shape: {self.embedding(start_tensor).shape}')
        print('====================================')

        decoder_hidden = context  #dimensions are same
        decoder_outputs = []

        for i in range(self.max_length):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                input_tensor = self.vocab.get_id(target_tensor[i])  #assuming target_tensor[i] is just a number
                ground_truth_embed = self.embedding(input_tensor)
                decoder_input = torch.concatenate((context, ground_truth_embed), dim = 0)
            else:
                #embed the last output, which was an index of vocab
                last_out_embed = self.embedding(decoder_outputs[-1])
                decoder_input = torch.concatenate((context, last_out_embed), dim = 0)

        return decoder_outputs, decoder_hidden, None
        
    def forward_step(self, input, hidden):
        print('+++++++++++++++++++++++++=')
        print(f'Input shape: {input.shape}, hidden shape: {hidden.shape}')
        output, hidden = self.lstm(input, hidden)
        print(f'New hidden shape: {hidden.shape}')
        output = self.out(hidden)

        #get the output as just an index tensor
        output = torch.argmax(output, dim = -1)

        return output, hidden

### Training Code.
- Dataloader automatically loads in batches. The data need not be modified by us.

In [207]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data['image'], data['formula']

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_output = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_output)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [208]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [209]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [210]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss() #as stated in assignment

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

## Training

In [211]:
batch_size = 32
vocab_size = 1000
CONTEXT_SIZE = 512
HIDDEN_SIZE = 512
OUTPUT_SIZE  = vocab_size
MAX_LENGTH = 10000

In [212]:
# image processing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Lambda(lambda x: x/255.0), #min-max normalisation
])

In [213]:
#part a
#train_csv_path = "/kaggle/input/converting-handwritten-equations-to-latex-code/col_774_A4_2023/SyntheticData/train.csv"
#image_root_path = "/kaggle/input/converting-handwritten-equations-to-latex-code/col_774_A4_2023/SyntheticData/images"
train_csv_path = "data/SyntheticData/train.csv"
image_root_path = "data/SyntheticData/images"
train_dataloader, train_dataset = get_dataloader(train_csv_path, image_root_path, batch_size, transform)
vocab = train_dataset.vocab
MAX_LENGTH = train_dataset.maxlen

print(train_dataset.df.shape)

Debuggin:-----------------
            image                                            formula
0  74d337e8a0.png  $ \gamma _ { \Omega R , 5 } ^ { T } = - \gamma...
1  2d0f18f71d.png  $ l ^ { ( -- ) \underline { { m } } } u _ { \u...
2  6d9b9de88d.png  $ \left[ H , \gamma _ { i } ^ { \left( 2 \righ...
3  38c6d510bb.png  $ < a _ { i } > \; \propto \; \int _ { \omega ...
4  24537a86e3.png  $ \Psi ( \mu _ { 1 } , \ldots , \mu _ { K } ) ...
(75000, 2)


In [214]:
#create a network instance
encoder = EncoderCNN()
decoder = DecoderRNN(vocab, CONTEXT_SIZE, HIDDEN_SIZE, 512, OUTPUT_SIZE, MAX_LENGTH)
train(train_dataloader, encoder, decoder, 10)

Context shape: torch.Size([32, 1, 512]), decoder_input shape: torch.Size([64, 1, 512])
embedding shape: torch.Size([32, 1, 512])
+++++++++++++++++++++++++=
Input shape: torch.Size([64, 1, 512]), hidden shape: torch.Size([32, 1, 512])


RuntimeError: For batched 3-D input, hx and cx should also be 3-D but got (2-D, 2-D) tensors