In [1]:

"""speech_LSTM.ipynb

Speech recognition model with LSTM architecture
"""

'speech_LSTM.ipynb\n\nSpeech recognition model with LSTM architecture\n'

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import DataLoader, random_split
import soundfile as sf
import os
import numpy as np
import matplotlib.pyplot as plt
from itertools import dropwhile
import string
import warnings
warnings.filterwarnings("ignore")
import random
import pandas as pd

In [3]:
# Dataset configuration
# Directory the LibriSpeech archives are downloaded into and read from.
# The default is the original hard-coded location; set the LIBRISPEECH_ROOT
# environment variable to use a different directory (e.g. when /content is
# not writable on the current machine).
root_path = os.environ.get('LIBRISPEECH_ROOT', '/content/librispeech')
os.makedirs(root_path, exist_ok=True)


In [4]:

# Text transformation module with explicit blank token
class TextTransform:
    """Character-level codec between transcripts and integer label sequences.

    Vocabulary layout: apostrophe (0), space (1), 'a'..'z' (2..27) and a
    final '<BLANK>' entry (28) whose index is exposed as ``blank_idx``.
    """

    def __init__(self):
        self.chars = ["'", " "] + list(string.ascii_lowercase)  # Space is actual space character
        self.char_map = {c: i for i, c in enumerate(self.chars)}
        self.char_map['<BLANK>'] = len(self.chars)  # Add blank token at the end
        self.index_map = {v: k for k, v in self.char_map.items()}
        self.blank_idx = self.char_map['<BLANK>']

    def text_to_int(self, text):
        """Encode ``text`` (lower-cased first) as a list of vocabulary indices.

        Characters that are not in the vocabulary are skipped. They must not be
        encoded as ``blank_idx``: that index is reserved for the CTC blank and
        is not allowed to appear inside a target sequence.
        """
        lookup = self.char_map
        # '<BLANK>' is a multi-character key, so a single character never hits it.
        return [lookup[c] for c in text.lower() if c in lookup]

    def int_to_text(self, labels):
        """Decode an iterable of indices into a string, dropping every blank."""
        return ''.join(self.index_map[i] for i in labels if i != self.blank_idx)

    def int_to_text_remove_pad(self, int_sequence):
        """Decode ``int_sequence`` ignoring blank entries used as padding.

        Every blank is dropped during decoding, so stripping the trailing run
        of blanks first gives the same result as decoding directly.
        """
        return self.int_to_text(int_sequence)

text_transform = TextTransform()

In [5]:

# Audio preprocessing
# Training: mel spectrogram (128 mel bins at 16 kHz) followed by random
# frequency and time masking as data augmentation. The model's n_feats
# setting has to agree with the number of mel bins chosen here.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(n_mels=128, sample_rate=16000),
    torchaudio.transforms.FrequencyMasking(30),
    torchaudio.transforms.TimeMasking(100),
)

# Validation: the same mel spectrogram without any masking. The arguments
# spell out the library defaults so both pipelines visibly share one setup.
valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)


In [6]:
def data_processing(data, data_type="train"):
    """Collate a list of dataset samples into padded batch tensors.

    Args:
        data: iterable of 6-tuples whose first element is the waveform and
            whose third element is the transcript; the other fields are ignored.
            Assumes each waveform is mono with shape (1, num_samples) -- TODO confirm.
        data_type: ``"train"`` selects the augmenting transform pipeline, any
            other value selects the validation pipeline.

    Returns:
        spectrograms: zero-padded features; given the assumption above the
            layout is (batch, 1, n_mels, max_time).
        labels: integer label matrix (batch, max_label_len), padded with the
            blank index.
        input_lengths: per-sample spectrogram frame count halved.
        label_lengths: per-sample number of real (unpadded) labels.
    """
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []

    audio_transforms = train_audio_transforms if data_type == 'train' else valid_audio_transforms

    for (waveform, _, utterance, _, _, _) in data:
        # Drop dim 0 and put time first so pad_sequence pads along time.
        spec = audio_transforms(waveform).squeeze(0).transpose(0, 1)
        spectrograms.append(spec)
        # text_to_int lower-cases internally; labels are class indices, so
        # store them as integers rather than floats.
        label = torch.tensor(text_transform.text_to_int(utterance), dtype=torch.long)
        labels.append(label)
        input_lengths.append(spec.shape[0] // 2)  # Account for initial stride=2
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    # Pad with the blank index: the default padding value 0 is the apostrophe's
    # index, which makes padding indistinguishable from a real character.
    labels = nn.utils.rnn.pad_sequence(
        labels, batch_first=True, padding_value=text_transform.blank_idx)

    return spectrograms, labels, input_lengths, label_lengths


In [7]:
# Model architecture
class CNNLayerNorm(nn.Module):
    """LayerNorm applied along dim 2 of a convolutional activation."""

    def __init__(self, n_feats):
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # nn.LayerNorm normalises the trailing axis, so dims 2 and 3 are
        # swapped before the call and swapped back afterwards.
        feats_last = x.transpose(2, 3).contiguous()
        normed = self.layer_norm(feats_last)
        restored = normed.transpose(2, 3)
        return restored.contiguous()

class ResidualCNN(nn.Module):
    """Two (LayerNorm -> GELU -> Dropout -> Conv2d) stages with a skip connection.

    The block input is added to the output of the second convolution, so the
    two tensors must have the same shape for the addition to work.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super().__init__()
        pad = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=pad)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=pad)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        # Each stage is (norm, dropout, conv); the activation sits between
        # the normalisation and the dropout.
        stages = (
            (self.layer_norm1, self.dropout1, self.cnn1),
            (self.layer_norm2, self.dropout2, self.cnn2),
        )
        out = x
        for norm, drop, conv in stages:
            out = conv(drop(F.gelu(norm(out))))
        return out + x

class BidirectionalLSTM(nn.Module):
    """LayerNorm -> GELU -> single-layer bidirectional LSTM -> Dropout.

    Because the LSTM is bidirectional, the last axis of the output has size
    ``2 * hidden_size``.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super().__init__()
        self.BiLSTM = nn.LSTM(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.gelu(self.layer_norm(x))
        # The LSTM also returns its final (hidden, cell) state; only the
        # per-step outputs are used.
        sequence_out = self.BiLSTM(activated)[0]
        return self.dropout(sequence_out)

class SpeechRecognitionModel(nn.Module):
    """Convolutional front-end + stacked bidirectional LSTMs + per-frame classifier.

    Args:
        n_cnn_layers: number of ResidualCNN blocks after the first convolution.
        n_rnn_layers: number of BidirectionalLSTM layers.
        rnn_dim: hidden size of each LSTM direction and of the linear bridge.
        n_class: number of output classes per frame.
        n_feats: size of the feature axis of the input spectrogram.
        stride: stride of the first convolution.
        dropout: dropout probability used throughout.

    ``forward`` returns per-frame class scores laid out as
    (batch, frames, n_class).
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # NOTE(review): the feature axis is assumed to be halved by the first
        # convolution, which only matches stride=2 -- confirm before changing stride.
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)

        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        # Every layer must use batch_first=True: activations stay
        # (batch, time, feature) all the way through this nn.Sequential, so a
        # layer built with batch_first=False would run its recurrence along the
        # batch axis and mix information between different utterances.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalLSTM(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                              hidden_size=rnn_dim, dropout=dropout, batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        # Merge dims 1 and 2 (channels x features), then move the last
        # (time) axis in front of them: (batch, time, channels*features).
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3]).transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        return self.classifier(x)


In [8]:
# Training configuration
pipeline_params = dict(
    # optimisation
    batch_size=16,
    epochs=10,
    learning_rate=5e-4,
    # network size
    n_cnn_layers=3,
    n_rnn_layers=5,
    rnn_dim=512,
    # one output class per vocabulary entry, blank token included
    n_class=len(text_transform.char_map),
    # feature-axis size of the input spectrograms
    n_feats=128,
    stride=2,
    dropout=0.1,
)

In [9]:
# Metrics and utilities
def levenshtein_distance(ref, hyp):
    """Return the edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1. ``ref`` and ``hyp``
    may be strings (character distance) or lists of tokens (token distance).
    """
    # Only the previous row of the DP table is needed to build the next one.
    previous = list(range(len(hyp) + 1))
    for row, ref_item in enumerate(ref, start=1):
        current = [row]
        for col, hyp_item in enumerate(hyp, start=1):
            substitution = previous[col - 1] + (0 if ref_item == hyp_item else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

def wer(ref, hyp):
    """Word error rate: word-level edit distance divided by the reference word count.

    Returns 0.0 when the reference contains no words.
    """
    reference_tokens, hypothesis_tokens = ref.split(), hyp.split()
    if not reference_tokens:
        return 0.0
    errors = levenshtein_distance(reference_tokens, hypothesis_tokens)
    return errors / len(reference_tokens)

def cer(ref, hyp):
    """Character error rate: character-level edit distance divided by ``len(ref)``.

    Returns 0.0 when the reference is empty.
    """
    n_chars = len(ref)
    return levenshtein_distance(ref, hyp) / n_chars if n_chars else 0.0

def greedy_decoder(output_probs):
    """Best-path decoding: arg-max per frame, merge repeats, drop blanks.

    The arg-max is taken over dim 2. Dims 0 and 1 of the result are then
    swapped and one label list is produced per index along dim 1 of
    ``output_probs``; for each list to be one utterance the input therefore
    has to be laid out as (time, batch, classes).

    Returns:
        A list of lists of int class indices.
    """
    blank = text_transform.blank_idx
    best_ids = output_probs.max(dim=2).indices

    decoded_batches = []
    for frame_ids in best_ids.transpose(0, 1).tolist():
        collapsed = []
        last = -1  # sentinel that never equals a real class index
        for current in frame_ids:
            # Keep a label only when it starts a new run and is not blank.
            if current != last and current != blank:
                collapsed.append(current)
            last = current
        decoded_batches.append(collapsed)
    return decoded_batches


In [10]:
# Training and validation
def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and print the mean loss, WER and CER.

    Args:
        model: network returning per-frame scores as (batch, frames, classes).
        loader: iterable of (specs, labels, input_lens, label_lens) batches.
        criterion: CTC loss taking (log_probs, labels, input_lens, label_lens).
        device: device the batch tensors are moved to.

    Returns:
        The loss averaged over batches (NaN when ``loader`` yields no batch).
    """
    model.eval()
    total_loss = 0.0
    all_refs, all_hyps = [], []

    with torch.no_grad():
        for batch in loader:
            specs, labels, input_lens, label_lens = batch
            specs, labels = specs.to(device), labels.to(device)

            # Log-probabilities laid out (frames, batch, classes) for the CTC loss.
            outputs = F.log_softmax(model(specs), dim=2).transpose(0, 1)
            loss = criterion(outputs, labels, input_lens, label_lens)
            total_loss += loss.item()

            # greedy_decoder swaps dims 0 and 1 itself, so it must receive the
            # frames-first tensor; handing it a batch-first tensor makes it
            # decode across the batch axis instead of across time.
            # Each utterance is decoded on its own valid frames only, so
            # frames that exist purely because of batch padding are ignored.
            hyps = []
            for i, n_frames in enumerate(input_lens):
                utterance_log_probs = outputs[:int(n_frames), i:i + 1]
                hyps.append(text_transform.int_to_text(greedy_decoder(utterance_log_probs)[0]))

            # Cut each reference at its true length so padding entries are
            # never decoded as characters.
            refs = [
                text_transform.int_to_text(label[:int(n_labels)].tolist())
                for label, n_labels in zip(labels, label_lens)
            ]

            all_refs.extend(refs)
            all_hyps.extend(hyps)

    n_batches = len(loader)
    if n_batches and all_refs:
        avg_loss = total_loss / n_batches
        avg_wer = np.mean([wer(r, h) for r, h in zip(all_refs, all_hyps)])
        avg_cer = np.mean([cer(r, h) for r, h in zip(all_refs, all_hyps)])
    else:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        avg_loss = avg_wer = avg_cer = float('nan')

    print(f'Validation Loss: {avg_loss:.4f} | WER: {avg_wer:.4f} | CER: {avg_cer:.4f}')
    return avg_loss

def train(model, train_loader, valid_loader, params, device):
    """Train ``model`` with CTC loss, validating after every epoch.

    Uses AdamW with ``params["learning_rate"]`` and a cosine-annealing
    schedule spanning ``params["epochs"]`` epochs. The loss of every 100th
    batch is printed, and the final weights are written to
    ``speech_lstm_model.pth`` in the working directory.
    """
    n_epochs = params["epochs"]
    optimizer = torch.optim.AdamW(model.parameters(), params["learning_rate"])
    # The blank class is the one the text transform reserves for it.
    criterion = nn.CTCLoss(blank=text_transform.blank_idx).to(device)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    for epoch_no in range(1, n_epochs + 1):
        model.train()
        for step, batch in enumerate(train_loader):
            specs, labels, input_lens, label_lens = batch
            specs = specs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            log_probs = F.log_softmax(model(specs), dim=2)
            # Dims 0 and 1 are swapped before the loss is computed.
            loss = criterion(log_probs.transpose(0, 1), labels, input_lens, label_lens)
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                print(f'Epoch {epoch_no} [{step}/{len(train_loader)}] Loss: {loss.item():.4f}')

        # The learning rate is updated once per epoch, before validation runs.
        scheduler.step()
        validate(model, valid_loader, criterion, device)

    torch.save(model.state_dict(), "speech_lstm_model.pth")

In [12]:
import torch
# Select the compute device first, then report what is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("Using device:", device)

CUDA available: False
Device: CPU
Using device: cpu


In [13]:

# Main execution
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fetch (when missing) and open the two LibriSpeech subsets
    train_dataset = torchaudio.datasets.LIBRISPEECH(root_path, url="train-clean-100", download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH(root_path, url="test-clean", download=True)

    # Batch both subsets; only the training loader shuffles, and each loader
    # collates with the matching transform pipeline
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=pipeline_params["batch_size"],
        shuffle=True,
        collate_fn=lambda samples: data_processing(samples, "train"),
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=pipeline_params["batch_size"],
        collate_fn=lambda samples: data_processing(samples, "valid"),
    )

    # Build the network from the shared configuration
    model = SpeechRecognitionModel(
        n_cnn_layers=pipeline_params["n_cnn_layers"],
        n_rnn_layers=pipeline_params["n_rnn_layers"],
        rnn_dim=pipeline_params["rnn_dim"],
        n_class=pipeline_params["n_class"],
        n_feats=pipeline_params["n_feats"],
        stride=pipeline_params["stride"],
        dropout=pipeline_params["dropout"],
    ).to(device)

    # Train; the test-clean subset is what gets evaluated after each epoch
    train(model, train_loader, test_loader, pipeline_params, device)

  0%|          | 3.00M/5.95G [00:04<2:32:50, 696kB/s]


KeyboardInterrupt: 