In [None]:
import zipfile
import os
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:

# Unpack the attached TIMIT archive into the directory that holds the zip file
zip_path = 'timit.zip'
extract_to_path = os.path.split(zip_path)[0]

archive = zipfile.ZipFile(zip_path)
with archive:
    archive.extractall(extract_to_path)

print("Files extracted successfully!")

In [None]:
# Read the sentence list: one utterance per line, fields separated by tabs
with open('./timit/timit/allsenlist.txt') as handle:
    lines = list(handle)

# Strip surrounding whitespace from each line, then cut it at the tabs
data = []
for raw_line in lines:
    data.append(raw_line.strip().split('\t'))

# One row per utterance; 'dummy' is the middle field and is dropped in a later cell
df = pd.DataFrame(data, columns=['filename', 'dummy', 'text'])
df

In [None]:
# NA handling: a source line with only two tab-separated fields leaves its
# second field in 'dummy' and nothing in 'text'. Copy it across for those rows,
# then drop the helper column.
# This is done column-wise because assigning to the row yielded by
# DataFrame.iterrows() is not guaranteed to write back into the frame.
if 'dummy' in df.columns:  # guard so re-running this cell is harmless
    df['text'] = df['text'].fillna(df['dummy'])
    del df['dummy']
df

In [None]:
# Speaker gender is encoded in the fifth character of the file id:
# 'm' means male, anything else is treated as female.
# Computed for every row instead of a hard-coded count of 160, so the cell
# neither crashes on a shorter list nor leaves NaN on a longer one.
gender = pd.Series(
    np.where(df['filename'].str[4] == 'm', 'male', 'female'),
    index=df.index,
)
df['gender'] = gender
df.head(160)

In [None]:
# Turn every file id into the relative path of its .wav file.
# All rows are converted (preprocess_data loads every row of df), and rows that
# already carry the prefix are skipped so re-running the cell does not
# prepend the directory twice.
audio_prefix = './timit/timit/'
needs_path = ~df['filename'].str.startswith(audio_prefix, na=False)
df.loc[needs_path, 'filename'] = audio_prefix + df.loc[needs_path, 'filename'] + '.wav'
df

In [None]:
# Preprocessing the data
def preprocess_labels(text):
    """Convert a transcript into a list of integer character ids.

    The text is lower-cased first. Letters 'a'..'z' become 0..25 and a space
    becomes 26; every other character is dropped.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz '
    label = []
    for char in text.lower():
        position = alphabet.find(char)
        if position >= 0:
            label.append(position)
    return label


# Updating the data to (spectrograms, labels)
def preprocess_data(df, max_sequence_length, sample_rate=16000, n_mels=128):
    """Load every utterance in ``df`` as a fixed-width log-mel spectrogram plus its label ids.

    Args:
        df: DataFrame read positionally: column 0 holds the audio file path,
            column 1 holds the transcript.
        max_sequence_length: Number of time frames each spectrogram is padded
            or truncated to.
        sample_rate: Sampling rate passed to librosa for loading and for the
            mel filter bank (default 16000, as before).
        n_mels: Number of mel bands (default 128, as before).

    Returns:
        A tuple ``(spectrograms, labels)``: a list of arrays of shape
        ``(n_mels, max_sequence_length)`` and a list of the label-id lists
        produced by ``preprocess_labels``, in row order.
    """
    spectrograms = []
    labels = []
    for file_path, text in zip(df.iloc[:, 0], df.iloc[:, 1]):
        audio, _ = librosa.load(file_path, sr=sample_rate)
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

        n_frames = spectrogram.shape[1]
        if n_frames < max_sequence_length:
            # With ref=np.max the loudest bin sits at 0 dB and everything else
            # is negative, so zero-padding would append full-scale frames.
            # Pad with the quietest value of this utterance instead.
            spectrogram = np.pad(
                spectrogram,
                ((0, 0), (0, max_sequence_length - n_frames)),
                mode='constant',
                constant_values=spectrogram.min(),
            )
        else:
            spectrogram = spectrogram[:, :max_sequence_length]

        spectrograms.append(spectrogram)
        labels.append(preprocess_labels(text))
    return spectrograms, labels


# CTC model with BLSTM
class CTCModel(nn.Module):
    """Convolution -> bidirectional LSTM -> linear layer producing per-frame class scores.

    ``forward`` takes a batch shaped ``(batch, 1, input_size, time)`` and
    returns raw scores shaped ``(batch, time, num_classes)``.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        conv_channels = 64
        # 3x3 convolution with padding 1: keeps both spatial sizes and lifts
        # the single input channel to 64 feature maps.
        self.conv = nn.Conv2d(
            in_channels=1,
            out_channels=conv_channels,
            kernel_size=(3, 3),
            padding=(1, 1),
        )
        # Each time step feeds the LSTM all channels of all input_size rows.
        self.lstm = nn.LSTM(
            input_size=conv_channels * input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Return per-frame class scores for the batch ``x``."""
        feature_maps = self.conv(x)
        # Bring the last axis to position 1, then merge the remaining two axes
        # into one feature vector per step.
        frames = feature_maps.permute(0, 3, 1, 2).flatten(start_dim=2)
        recurrent_out, _ = self.lstm(frames)
        return self.fc(recurrent_out)


if __name__ == "__main__":
    max_sequence_length = 500
    spectrograms, labels = preprocess_data(df, max_sequence_length)

    # Converting to PyTorch tensors.
    # Stack into one ndarray first: torch.tensor() on a Python list of
    # ndarrays converts element by element and is extremely slow.
    sequences_tensor = torch.tensor(np.stack(spectrograms), dtype=torch.float32).unsqueeze(1)
    # pad_sequence right-pads the shorter label sequences with 0; the true
    # lengths are kept in label_lengths and passed to the loss separately.
    labels_tensor = torch.nn.utils.rnn.pad_sequence([torch.tensor(label, dtype=torch.long) for label in labels], batch_first=True)
    label_lengths = torch.tensor([len(label) for label in labels], dtype=torch.long)

    dataset = TensorDataset(sequences_tensor, labels_tensor, label_lengths)
    batch_size = 32
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # Instantiating the model
    input_size = 128  # number of mel frequency bins
    hidden_size = 256
    num_layers = 3
    num_classes = 26  # number of letters (a-z)
    # preprocess_labels emits 0-25 for letters and 26 for the space, so the
    # CTC blank needs an index of its own. Output layout:
    #   0..num_classes-1 -> letters, num_classes -> space, num_classes+1 -> blank
    model = CTCModel(input_size, hidden_size, num_layers, num_classes + 2)  # +1 space, +1 blank

    # The blank must not coincide with any target label (space is num_classes).
    criterion = nn.CTCLoss(blank=num_classes + 1, reduction='mean', zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: minimise the CTC loss over the whole loader for a fixed number of epochs
num_epochs = 2000

# The loader is built with drop_last=True, so a dataset smaller than one batch
# yields no batches at all; fail early with a clear message instead of a
# ZeroDivisionError when averaging the loss.
if len(dataloader) == 0:
    raise RuntimeError("dataloader produced no batches; reduce batch_size or disable drop_last")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    num_batches = 0
    # Batch variables get their own names so the dataset-level `labels` and
    # `label_lengths` are not overwritten by the last batch.
    for batch_sequences, batch_labels, batch_label_lengths in dataloader:
        optimizer.zero_grad()
        logits = model(batch_sequences)  # (batch, time, classes)
        # Every sample in the batch is given the full output length as its input length.
        input_lengths = torch.full((batch_sequences.size(0),), logits.size(1), dtype=torch.long)
        # nn.CTCLoss expects log-probabilities shaped (time, batch, classes).
        log_probs = logits.log_softmax(2).permute(1, 0, 2)

        ctc_loss = criterion(log_probs, batch_labels, input_lengths, batch_label_lengths)

        ctc_loss.backward()
        # Clip the global gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += ctc_loss.item()
        num_batches += 1

    average_loss = total_loss / num_batches
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

print("Training completed.")

In [None]:
# Evaluation
# NOTE(review): this iterates the same shuffled, drop_last loader used for
# training, so the printed results are on training data only.
model.eval()

predictions = []
ground_truths = []

# Mapping from numerical labels to characters:
# 0..num_classes-1 are 'a'..'z', num_classes is the space.
label_to_char = {i: chr(ord('a') + i) for i in range(num_classes)}
label_to_char[num_classes] = ' '
label_to_char[num_classes + 1] = ''

# Decode with the blank index the loss was configured with, so training and
# decoding cannot disagree about which class is the blank.
blank_index = criterion.blank

with torch.no_grad():
    for batch_sequences, batch_labels, batch_label_lengths in dataloader:
        logits = model(batch_sequences)  # (batch, time, classes)
        # argmax over classes is unchanged by log_softmax, so take it on the raw scores.
        best_path = logits.argmax(dim=-1)

        for frame_labels, label, length in zip(best_path, batch_labels, batch_label_lengths):
            # CTC greedy decoding: merge consecutive repeats first, then drop blanks.
            collapsed = torch.unique_consecutive(frame_labels).tolist()
            pred_chars = ''.join(label_to_char[p] for p in collapsed if p != blank_index)
            # Only the first `length` entries are real labels; the rest is padding.
            label_chars = ''.join(label_to_char[l.item()] for l in label[:length])
            predictions.append(pred_chars)
            ground_truths.append(label_chars)

for prediction, ground_truth in zip(predictions, ground_truths):
    print(f"Prediction: {prediction}")
    print(f"Ground Truth: {ground_truth}")
    print()