In [7]:
%pip install numpy==1.22.4

Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn
import torch.nn.functional as F

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

<contextlib.ExitStack at 0x291797e80>

## Data Preparation
- Create the `Dataset` class (to be wrapped in a `DataLoader` later)

Note: Working on Part (a) as of now.

In [9]:
# Special tokens. LatexFormulaDataset.construct_vocab() adds all three to the vocabulary.
START_TOKEN = "START"  # its embedding is the decoder's very first input
END_TOKEN = "END"
UNK_TOKEN = "UNK"  # also used to pad shorter formulas up to the longest formula

class Vocabulary:
    """Plain container bundling the token statistics and lookup tables.

    Attributes:
        freq_dict: mapping token -> occurrence count.
        wd_to_id: mapping token -> integer id.
        id_to_wd: mapping integer id -> token.
        N: number of distinct tokens, i.e. ``len(freq_dict)``.
    """

    def __init__(self, freq_dict, wd_to_id, id_to_wd):
        # The vocabulary size is derived from the frequency table.
        self.N = len(freq_dict)
        self.freq_dict, self.wd_to_id, self.id_to_wd = freq_dict, wd_to_id, id_to_wd

class LatexFormulaDataset(Dataset):
    """Latex Formula Dataset: Image and Text"""

    def __init__(self, csv_file, root_dir, transform = None):
        """
        Arguments:
            csv_file (string): Path to the csv file with image name and text
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        #@TODO: May want to preload images
        self.df = pd.read_csv(csv_file)

        self.root_dir = root_dir
        self.transform = transform

        # Tokenize each formula by splitting on whitespace.
        self.df['formula'] = self.df['formula'].apply(lambda x: x.split())
        # The vocabulary is built from the unpadded token lists.
        self.vocab = self.construct_vocab()

        # Pad every formula with UNK_TOKEN up to the length of the longest one,
        # so that all samples have the same number of tokens.
        maxlen = max((len(formula) for formula in self.df['formula']), default=0)
        self.df['formula'] = self.df['formula'].apply(lambda x: x + [UNK_TOKEN]*(maxlen - len(x)))

    def __len__(self):
        """Number of samples, i.e. rows of the csv file."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Returns sample of type image, textformula

        The sample is a dict with keys 'image' (the image read from disk, passed
        through ``self.transform`` when one was given) and 'formula' (a numpy
        string array of shape (maxlen, 1) holding the padded tokens).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional access: column 0 is used as the image file name and
        # column 1 as the (tokenized, padded) formula.
        img_name = os.path.join(self.root_dir,
                                self.df.iloc[idx, 0])
        image = io.imread(img_name)
        formula = self.df.iloc[idx, 1]
        formula = np.array([formula], dtype=str).reshape(-1, 1)
        sample = {'image': image, 'formula': formula}

        if self.transform:
            sample['image'] = self.transform(sample['image'])

        return sample

    def construct_vocab(self):
        """
        Constructs vocabulary from the dataset formulas

        Counts every token in ``self.df['formula']``, then registers the three
        special tokens with a count of 1 (overwriting any count they already
        had). Ids are assigned in first-seen order.
        """
        freq_dict = {}
        for formula in self.df['formula']:
            for wd in formula:
                freq_dict[wd] = freq_dict.get(wd, 0) + 1
        freq_dict[START_TOKEN] = 1
        freq_dict[END_TOKEN] = 1
        freq_dict[UNK_TOKEN] = 1

        wd_to_id = {wd: i for i, wd in enumerate(freq_dict)}
        id_to_wd = {i: wd for wd, i in wd_to_id.items()}

        return Vocabulary(freq_dict, wd_to_id, id_to_wd)

In [10]:
# image processing
# ToTensor() already rescales uint8 images from [0, 255] to floats in [0.0, 1.0]
# (min-max normalisation), so dividing by 255 again afterwards would squash the
# pixel values into [0, 1/255].
# NOTE: this rebinds the name `transform`, hiding the `skimage.transform` module
# imported at the top of the notebook.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

In [11]:
# image = io.imread('/kaggle/input/converting-handwritten-equations-to-latex-code/col_774_A4_2023/SyntheticData/images/100009e256.png')
# image = np.asarray(image)
# image.shape

In [12]:
#part a

# train_csv_path = "/kaggle/input/converting-handwritten-equations-to-latex-code/col_774_A4_2023/SyntheticData/train.csv"
# image_root_path = "/kaggle/input/converting-handwritten-equations-to-latex-code/col_774_A4_2023/SyntheticData/images"
# train_set = LatexFormulaDataset(train_csv_path, image_root_path, transform)

# print(train_set[1]['image'])

### Encoder Network
- A CNN that encodes an image into a more meaningful feature vector

In [13]:
class Encoder(nn.Module):
    """CNN that encodes an image batch into 512-dimensional context vectors.

    Five Conv(5x5) -> ReLU -> MaxPool(2x2) stages widen the channels
    3 -> 32 -> 64 -> 128 -> 256 -> 512, followed by a 3x3 average pool.
    For 224x224 inputs the spatial size shrinks
    224 -> 110 -> 53 -> 24 -> 10 -> 3 -> 1, leaving one 512-vector per image.
    """

    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, 32, (5, 5))
        self.act1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d((2,2))

        self.conv2 = nn.Conv2d(32, 64, (5, 5))
        self.act2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d((2, 2))

        self.conv3 = nn.Conv2d(64, 128, (5, 5))
        self.act3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d((2, 2))

        self.conv4 = nn.Conv2d(128, 256, (5, 5))
        self.act4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d((2, 2))

        self.conv5 = nn.Conv2d(256, 512, (5, 5))
        self.act5 = nn.ReLU()
        self.pool5 = nn.MaxPool2d((2, 2))

        self.avg_pool = nn.AvgPool2d((3, 3))

    def forward(self, x):
        """Encode images into context vectors.

        Args:
            x: image tensor of shape (M, 3, H, W); H and W must reduce to a 1x1
                feature map (e.g. 224x224).

        Returns:
            Tensor of shape (1, M, 512): one context vector per image.

        Raises:
            ValueError: if the final feature map is not 1x1.
        """
        x = self.pool1(self.act1(self.conv1(x)))
        x = self.pool2(self.act2(self.conv2(x)))
        x = self.pool3(self.act3(self.conv3(x)))
        x = self.pool4(self.act4(self.conv4(x)))
        x = self.pool5(self.act5(self.conv5(x)))
        x = self.avg_pool(x)

        # Guard the reshape below: with a larger feature map it would silently
        # spread one image's features over several "batch" rows.
        if tuple(x.shape[-2:]) != (1, 1):
            raise ValueError(
                f"Encoder expects the final feature map to be 1x1, got {tuple(x.shape[-2:])}"
            )

        # (M, 512, 1, 1) -> (1, M, 512); the batch size is inferred instead of
        # being hard-coded to 1.
        return x.reshape((1, -1, 512))

### Vocabulary
- https://github.com/harvardnlp/im2markup/blob/master

### Decoder Network

In [14]:


class Decoder(nn.Module):
    """
    LSTM decoder that emits one distribution over the vocabulary per step.

    Inputs:
    (here M is whatever the batch size is passed)

    context_size : size of the context vector [shape: (1,M,context_size)]
    vocab : vocabulary object; `vocab.N` is the vocabulary size and
        `vocab.wd_to_id` maps a token to its integer id
    n_layers: number of layers [for our purposes, defaults to 1]
    hidden_size : size of the hidden state vectors [shape: (n_layers,M,hidden_size)]
    embed_size : size of the embedding vectors [shape: (1,M,embed_size)]
    max_length : maximum length of the formula (number of decoding steps)
    """
    def __init__(self, context_size, vocab, n_layers = 1, hidden_size = 512, embed_size = 512,  max_length = 100):
        super().__init__()
        self.context_size = context_size
        self.vocab = vocab
        self.vocab_size = vocab.N
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.max_length = max_length

        # Every LSTM step sees the context vector concatenated with a token embedding.
        self.input_size = context_size + embed_size
        self.embed = nn.Embedding(self.vocab_size, embed_size)

        self.lstm = nn.LSTM(self.input_size, hidden_size, n_layers)
        self.linear = nn.Linear(hidden_size, self.vocab_size)
        self.softmax = nn.Softmax(dim = 2)

    def forward(self, context, target_tensor = None):
        """
        context is the context vector from the encoder [shape: (1,M,context_size)]
        target_tensor is the formula in tensor form [shape: (1,M,max_length)] (in the last dimension, it is sequence of indices of formula tokens)
            if target_tensor is not None, then we are in Teacher Forcing mode
            else the previous step's most likely token is fed back as the next input

        Returns a tensor of shape (max_length, M, vocab_size) holding the softmax
        probabilities produced at every decoding step.
        """
        batch_size = context.shape[1]
        device = context.device

        # Initialize hidden state and cell state on the same device/dtype as the
        # context so the module also works after being moved off the CPU.
        #@TODO: Some caveat in the size of the cell vector. Should it be same as hidden_size? (check nn.LSTM documentation)
        hidden = torch.zeros((self.n_layers, batch_size, self.hidden_size), device=device, dtype=context.dtype)
        cell = torch.zeros((self.n_layers, batch_size, self.hidden_size), device=device, dtype=context.dtype)

        # Initialize the input with the embedding of the start token, one copy
        # per batch element (a single embedding cannot be reshaped to M rows).
        start_ids = torch.full((1, batch_size), self.vocab.wd_to_id[START_TOKEN], dtype=torch.long, device=device)
        step_input = torch.cat([context, self.embed(start_ids)], dim = 2)

        step_outputs = []
        for i in range(self.max_length):
            output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
            output = self.softmax(self.linear(output))
            step_outputs.append(output)

            if target_tensor is not None:
                # Teacher forcing: feed the ground-truth token of this step.
                next_ids = target_tensor[0, :, i].reshape((1, batch_size))
            else:
                # Feed back the most likely token of the last prediction.
                next_ids = torch.argmax(output, dim = 2)
            step_input = torch.cat([context, self.embed(next_ids)], dim = 2)

        if not step_outputs:
            # max_length == 0: nothing was decoded.
            return torch.zeros((0, batch_size, self.vocab_size), device=device, dtype=context.dtype)
        return torch.cat(step_outputs, dim = 0)
        
            







In [15]:
# Model hyperparameters. None of the cells shown above read these yet.
vocab_size = 1000
CONTEXT_SIZE = 512  # same value as the 512 features the Encoder reshapes its output to
HIDDEN_SIZE = 512  # same value as the Decoder's default hidden_size
OUTPUT_SIZE  = vocab_size
MAX_LENGTH = 10000  # NOTE(review): the Decoder's own max_length default is 100 — confirm which is intended

In [16]:
# Scratch cell: prints a 0-dim tensor (presumably a quick check that torch works).
print(torch.tensor(3))

tensor(3)
