# Sequential CRNN

This notebook trains a sequential CRNN model for the speech-to-text (STT) task. The processed audio features are first fed to convolutional layers; the resulting feature maps are then fed to recurrent layers, whose output is projected to the final predictions. In contrast, a parallel CRNN runs the convolutional and recurrent layers side by side and fuses their features to produce the output.

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import h5py
import numpy as np
import json

## Vocab Building & Tokenization

### Vocab Functions

In [2]:
def build_vocab():
    """
    Create the fixed character vocabulary used for CTC training.

    Id 0 is reserved for the '<blank>' token; the 26 lowercase English letters
    and the space character follow, numbered from 1 in alphabet order.

    Returns:
        dict: Mapping from each token to its integer id.
    """
    # The blank comes first so that plain enumeration gives it id 0.
    symbols = ["<blank>", *"abcdefghijklmnopqrstuvwxyz "]
    return {symbol: index for index, symbol in enumerate(symbols)}

def save_vocab(vocab, filepath):
    """Write the vocabulary mapping to `filepath` as JSON, overwriting any existing file."""
    with open(filepath, 'w') as handle:
        handle.write(json.dumps(vocab))

def load_vocab(filepath):
    """Read the JSON file at `filepath` and return the vocabulary mapping it contains."""
    with open(filepath, 'r') as handle:
        return json.load(handle)

### Tokenization Functions

In [3]:
def encode_label(label, vocab):
    """
    Translate a transcript into a list of token ids.

    Each character of `label` is looked up in `vocab`, preserving order.
    A character missing from `vocab` raises KeyError.
    """
    return list(map(vocab.__getitem__, label))

def decode_label(encoded_label, vocab):
    """
    Convert a sequence of token ids back into text.

    Only the CTC blank token is dropped. Id 1 is a real character in the
    vocabulary produced by build_vocab ('a'), so it must not be skipped.

    Args:
        encoded_label: Iterable of token ids (Python ints, or scalar values
            that int() accepts, such as 0-d tensors).
        vocab: Mapping from token to integer id.

    Returns:
        str: The decoded text with blank tokens removed.

    Raises:
        KeyError: If an id has no entry in `vocab`.
    """
    # Fall back to 0 when the vocabulary carries no explicit blank entry.
    blank_id = vocab.get("<blank>", 0)
    inv_vocab = {token_id: token for token, token_id in vocab.items()}
    # int() normalizes tensor / NumPy scalars so they hit the int-keyed inverse mapping.
    token_ids = (int(token_id) for token_id in encoded_label)
    return ''.join(inv_vocab[token_id] for token_id in token_ids if token_id != blank_id)

### Building the Vocab (use only if you don't already have the vocab built!)

In [None]:
# Build the character vocabulary, display it, and write it to 'vocab.json'
# in the current working directory so later runs can load it instead.
vocab = build_vocab()
print(vocab)
save_vocab(vocab, 'vocab.json')

### Loading the vocab

In [4]:
# Load the previously saved vocabulary from 'vocab.json' in the working directory.
vocab = load_vocab('vocab.json')
# Number of vocabulary entries; the CRNN class uses it as the size of its output layer.
VOCAB_SIZE = len(vocab)

## Dataset Class Definition

In [5]:
class SpeechDataset(Dataset):
    """
    Fixed-length audio features and encoded transcripts read from an HDF5 file.

    Every top-level key of the file is treated as one sample group. Each group
    is expected to hold a 2-D feature array, indexed as [feature, time] by the
    padding code, and a byte-string 'label' entry.

    Args:
        hdf5_path: Path to the HDF5 file.
        vocab: Character-to-id mapping handed to encode_label.
        max_length_frames: Number of time frames every sample is padded or
            truncated to.
        feature_key: Name of the feature array inside each sample group.
            Defaults to 'melspectrogram'; pass the name actually stored in the
            file when it holds a different feature type.

    NOTE(review): labels are returned at their natural length, so batching more
    than one sample with the default DataLoader collation presumably needs
    equal-length labels or a padding collate_fn - confirm.
    """

    def __init__(self, hdf5_path, vocab, max_length_frames=247, feature_key='melspectrogram'):
        super().__init__()
        self.hdf5_path = hdf5_path
        self.vocab = vocab
        # Maximum sequence length for padding, 247 = 8s @ 16000 Hz, 512 hop length for MFCC
        self.max_length_frames = max_length_frames
        self.feature_key = feature_key
        # Only the key list is kept; the file is reopened on every item access.
        with h5py.File(hdf5_path, 'r') as file:
            self.keys = list(file.keys())

    def __len__(self):
        """Return the number of sample groups found in the file."""
        return len(self.keys)

    @staticmethod
    def _fit_to_length(features, num_frames):
        """Zero-pad or truncate the time axis (axis 1) of `features` to `num_frames`."""
        padding_length = num_frames - features.shape[1]
        if padding_length > 0:
            return np.pad(features, ((0, 0), (0, padding_length)), mode='constant', constant_values=0)
        return features[:, :num_frames]

    def __getitem__(self, idx):
        """
        Load one sample.

        Returns:
            tuple: (features, label, input_length, label_length) where
            `features` is a float32 tensor shaped [1, Freq, max_length_frames],
            `label` is an int tensor of token ids, `input_length` equals
            max_length_frames and `label_length` is the number of label tokens.

        Raises:
            KeyError: If the sample group has no `feature_key` entry.
        """
        key = self.keys[idx]
        with h5py.File(self.hdf5_path, 'r') as f:
            group = f[key]
            if self.feature_key not in group:
                # Name the available entries so a mismatched feature name is easy to diagnose.
                raise KeyError(
                    f"Feature '{self.feature_key}' not found in sample '{key}'; "
                    f"available entries: {list(group.keys())}"
                )
            features = np.array(group[self.feature_key]).astype(np.float32)
            label_str = group['label'][()].decode('utf-8')

        features = self._fit_to_length(features, self.max_length_frames)
        features = np.expand_dims(features, 0)  # Shape: [1, Freq, Time]
        label = encode_label(label_str, self.vocab)
        input_length = self.max_length_frames
        label_length = len(label)

        return torch.tensor(features), torch.tensor(label, dtype=torch.int), input_length, label_length


## CRNN Class Definition

In [6]:
class CRNN(nn.Module):
    """
    Sequential CRNN: a small CNN front end followed by a GRU and a linear classifier.

    Args:
        num_mfcc_features: Size of the feature (frequency) axis of the input.
        hidden_size: Hidden size of the GRU.
        num_layers: Number of stacked GRU layers.
        num_classes: Number of output classes, including the CTC blank. When
            None, the module-level VOCAB_SIZE is used.

    forward() accepts input shaped [Batch, 1, Features, Time] or
    [Batch, Features, Time] and returns unnormalized scores shaped
    [Batch, Time // 4, num_classes].
    """

    def __init__(self, num_mfcc_features, hidden_size, num_layers=2, num_classes=None):
        super().__init__()
        # Number of output classes, including the blank for CTC
        self.fc_out_size = VOCAB_SIZE if num_classes is None else num_classes

        # Convolutional layers with Batch Normalization and Dropout
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),  # BatchNorm after convolution
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),  # Dropout after pooling
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),  # BatchNorm after convolution
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(0.25),  # Dropout after pooling
        )

        # Size of the RNN's input. The input is laid out as [Batch, 1, Features, Time]:
        # the 3x3 convolutions (padding=1) keep both spatial sizes, and each of the two
        # MaxPool2d(2) layers halves them with floor division, so the feature (height)
        # axis shrinks to num_mfcc_features // 4 and the time (width) axis to Time // 4.
        # Every time step therefore carries 64 channels * (num_mfcc_features // 4) values.
        self.rnn_input_size = 64 * (num_mfcc_features // 4)

        # Recurrent layers
        self.rnn = nn.GRU(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, self.fc_out_size)

    def forward(self, x):
        """Run the CNN, GRU and classifier; returns [Batch, Time // 4, num_classes] scores."""
        # Add the channel dimension only when it is missing. Inputs that already
        # carry it ([Batch, 1, Features, Time]) must not be unsqueezed again,
        # otherwise Conv2d receives a 5-D tensor.
        if x.dim() == 3:
            x = x.unsqueeze(1)

        # Apply convolutional layers
        x = self.conv(x)

        # Prepare the output of the CNN for the RNN
        batch, channels, height, width = x.size()
        x = x.permute(0, 3, 1, 2).contiguous()  # Change to [Batch, Width, Channels, Height]
        x = x.view(batch, width, channels * height)  # Flatten the feature maps per time step

        # Apply RNN
        output, _ = self.rnn(x)

        # Apply fully connected layer
        output = self.fc(output)

        return output

## Training function

### Saver/Loader functions

In [7]:
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """
    Write a training snapshot to disk.

    Args:
        state: Object to serialize (the training loop passes a dict of epoch,
            model state and optimizer state).
        filename: Destination path of the checkpoint file.
    """
    torch.save(obj=state, f=filename)

def load_checkpoint(checkpoint, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint dict.

    Args:
        checkpoint: Dict with 'state_dict', 'optimizer' and 'epoch' entries,
            as written by the training loop.
        model: Module whose parameters are restored in place.
        optimizer: Optimizer whose state is restored in place.

    Returns:
        int: The epoch index to resume from. The training loop stores the
        index of the epoch that just finished, so resuming starts one past it;
        returning the stored value unchanged would repeat that epoch.
    """
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'] + 1

### Train Function

In [8]:
def train(model, device, train_loader, optimizer, epochs, start_epoch=0):
    """
    Train `model` with CTC loss and save a checkpoint after every epoch.

    Args:
        model: Network returning scores shaped [Batch, Time, Classes].
        device: Device the input and label batches are moved to.
        train_loader: Iterable yielding (features, labels, input_lengths,
            label_lengths) batches. The yielded input_lengths are ignored
            because the model shortens the time axis; the lengths passed to
            the loss are taken from the model output instead.
        optimizer: Optimizer updating the model parameters.
        epochs: Epoch index at which training stops (exclusive).
        start_epoch: Epoch index to start from, e.g. when resuming.

    The printed loss is the mean over the batches of the epoch (nan when the
    loader yields no batches).
    """
    model.train()
    criterion = nn.CTCLoss(blank=0, zero_infinity=True)

    for epoch in range(start_epoch, epochs):
        running_loss = 0.0
        num_batches = 0
        for mels, labels, _, label_lengths in train_loader:
            mels = mels.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()

            # [Batch, Time, Classes] log-probabilities
            log_probs = F.log_softmax(model(mels), dim=2)

            # Each sample spans every output time step, so the length is the
            # size of the time axis (dim 1), not the batch size (dim 0).
            input_lengths = torch.full(
                size=(log_probs.size(0),),
                fill_value=log_probs.size(1),
                dtype=torch.long,
                device=device,
            )

            # CTCLoss expects [Time, Batch, Classes]
            loss = criterion(log_probs.permute(1, 0, 2), labels, input_lengths, label_lengths)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        mean_loss = running_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')

        # Save model at each epoch
        save_checkpoint({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, filename=f"checkpoint_epoch_{epoch}.pth.tar")

## Execution

### Params

In [11]:
# Paths declaration
# Absolute path to the HDF5 feature file on the author's machine; change it to your local copy.
hdf5_path = r"C:\Users\jonec\Documents\SUTD\T6\AI\STT\Recorded-Lecture-Transcription-STT\reduced_mfcc_dataset.h5"

# Model params declaration
learning_rate = 0.001  # Learning rate passed to the Adam optimizer
epochs = 10  # Upper bound of the epoch range passed to train()
batch_size = 128  # Samples per DataLoader batch
num_mfcc_features = 13  # Size of the input feature axis; CRNN derives its GRU input size from it
hidden_size = 256  # GRU hidden size
num_layers = 2  # Number of stacked GRU layers

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset and DataLoader instantiation
# NOTE(review): SpeechDataset reads the 'melspectrogram' entry of each sample by default, and the
# recorded run of this cell failed with a KeyError reporting that this entry does not exist in the
# file - confirm the feature name actually stored in the HDF5 file.
dataset = SpeechDataset(hdf5_path, vocab)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the checkpoint file. Use only if you are indeed loading from a checkpoint
# checkpoint = torch.load("checkpoint_epoch_X.pth.tar")  # Replace X with the checkpoint epoch

# Model initialisation
model = CRNN(num_mfcc_features=num_mfcc_features, hidden_size=hidden_size, num_layers=num_layers).to(device)

# Optimizer setup (created after the model has been moved to the device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Load the model and optimizer state from the checkpoint. Again, use only if you are indeed loading from a checkpoint
# start_epoch = load_checkpoint(checkpoint, model, optimizer)

# Training execution. When resuming, also pass start_epoch=start_epoch so training does not restart from epoch 0.
train(model, device, loader, optimizer, epochs=epochs)

Using device: cuda


  from .autonotebook import tqdm as notebook_tqdm


KeyError: "Unable to synchronously open object (object 'melspectrogram' doesn't exist)"