# Speech

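Reference on PyTorch/torchaudio audio backends (relevant to the `Couldn't find appropriate backend` error shown in the cells below):
https://github.com/facebookresearch/demucs/issues/570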


In [7]:


%pip install torch torchaudio

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os

import IPython.display
import matplotlib.pyplot as plt
import torch
import torchaudio

# Local directory that holds downloaded sample data.
_SAMPLE_DIR = "_data"
YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no")

# Create the download target up front, then fetch the YESNO dataset into it.
os.makedirs(YESNO_DATASET_PATH, exist_ok=True)
dataset = torchaudio.datasets.YESNO(root=YESNO_DATASET_PATH, download=True)

def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    """Draw a spectrogram of the first channel of ``waveform``.

    Args:
        waveform: Tensor exposing ``.numpy()``; only index 0 along the first
            axis is plotted.
        sample_rate: Sampling frequency handed to ``Axes.specgram`` as ``Fs``.
        title: Figure title.
    """
    first_channel = waveform.numpy()[0]

    fig, axis = plt.subplots()
    axis.specgram(first_channel, Fs=sample_rate)
    fig.suptitle(title)
    fig.tight_layout()

# Preview one dataset item: plot its spectrogram and embed an audio player.
i = 1
waveform, sample_rate, label = dataset[i]
plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}")
# `IPython.display` has to be imported explicitly (see the imports at the top of
# this cell); the bare name `IPython` is not predefined in the kernel namespace.
# Audio() is documented for NumPy arrays / lists, so convert the tensor first.
IPython.display.Audio(waveform.numpy(), rate=sample_rate)

RuntimeError: Couldn't find appropriate backend to handle uri _data/yes_no/waves_yesno/0_0_0_1_0_0_0_1.wav and format None.

In [None]:
import os
# set before importing pytorch to avoid all non-deterministic operations on GPU
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader
from torchaudio.datasets import LIBRISPEECH
from torchaudio.transforms import MelSpectrogram
import matplotlib.pyplot as plt
import IPython.display as ipd
from torch.nn import Module, Conv2d, Linear, TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F

# Definición del modelo
class SpeechRecognitionModel(Module):
    """Convolutional front end + Transformer encoder emitting per-frame class scores.

    Input:  ``(batch, 1, input_size, time)`` spectrograms (mel bins x frames).
    Output: ``(batch, time, num_classes)`` unnormalised scores (logits).

    The two convolutions down-sample the mel axis only. The time axis keeps its
    full resolution (stride 1, padding 1) so that frame counts measured on the
    input spectrogram remain valid sequence lengths for a CTC loss.

    Args:
        num_classes: Number of output classes per frame (including the CTC blank).
        input_size: Number of mel bins in the input; also used as the Transformer
            model width (``d_model``).
        num_heads: Attention heads per Transformer layer (must divide ``input_size``).
        num_layers: Number of Transformer encoder layers.
        hidden_size: Width of the Transformer feed-forward sub-layer.

    Raises:
        ValueError: If ``input_size`` is too small to survive both convolutions.
    """

    def __init__(self, num_classes, input_size=128, num_heads=4, num_layers=3, hidden_size=256):
        super().__init__()
        # Convolutional layers: reduce the mel dimension, preserve the time dimension.
        self.conv1 = Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 1), padding=(0, 1))
        self.conv2 = Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 1), padding=(0, 1))

        reduced_bins = self._conv_output_size(self._conv_output_size(input_size))
        if reduced_bins < 1:
            raise ValueError(
                f"input_size={input_size} is too small for two kernel-3/stride-2 convolutions"
            )

        # Project the flattened (channels x reduced mel bins) features to d_model;
        # the Transformer cannot consume the raw conv feature width.
        self.proj = Linear(self.conv2.out_channels * reduced_bins, input_size)

        # Transformer for sequence modelling over time frames (batch-first layout).
        transformer_layer = TransformerEncoderLayer(
            d_model=input_size,
            nhead=num_heads,
            dim_feedforward=hidden_size,
            batch_first=True,
        )
        self.transformer_encoder = TransformerEncoder(transformer_layer, num_layers=num_layers)

        # Linear layer mapping each frame to class scores.
        self.fc = Linear(input_size, num_classes)

    @staticmethod
    def _conv_output_size(size, kernel=3, stride=2):
        """Length of an unpadded axis after one convolution."""
        return (size - kernel) // stride + 1

    def forward(self, x):
        """Map ``(batch, 1, input_size, time)`` to ``(batch, time, num_classes)``."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))  # (batch, channels, reduced_bins, time)
        # Move time next to batch and merge channels with mel bins into one feature axis.
        x = x.permute(0, 3, 1, 2).flatten(2)  # (batch, time, channels * reduced_bins)
        x = self.proj(x)  # (batch, time, d_model)
        x = self.transformer_encoder(x)
        x = self.fc(x)  # (batch, time, num_classes)
        return x

# Cargar y procesar datos
def _encode_target(utterance):
    """Turn one label into a 1-D ``long`` tensor of class indices.

    Strings are encoded case-insensitively as: space -> 0, ``a``..``z`` -> 1..26,
    apostrophe -> 27 (leaving 28 free for the CTC blank). Anything that is not a
    string (e.g. a list of integer labels) is assumed to already hold class
    indices and is converted as-is.

    Raises:
        ValueError: If a string contains a character outside that vocabulary.
    """
    if not isinstance(utterance, str):
        return torch.as_tensor(utterance, dtype=torch.long)

    indices = []
    for char in utterance.lower():
        if char == " ":
            indices.append(0)
        elif char == "'":
            indices.append(27)
        elif "a" <= char <= "z":
            indices.append(ord(char) - ord("a") + 1)
        else:
            raise ValueError(f"Unsupported character {char!r} in utterance {utterance!r}")
    return torch.tensor(indices, dtype=torch.long)


def data_processing(data):
    """Collate dataset samples into padded spectrogram / target batches.

    Each sample must be a sequence whose element 0 is the waveform and whose
    element 2 is the label (a transcript string or a sequence of integer
    labels). Samples are read by index, so extra trailing fields are ignored and
    both 3-field and 6-field samples are accepted.

    Args:
        data: List of dataset samples as handed over by ``DataLoader``.

    Returns:
        Tuple ``(spectrograms, targets, input_lengths, target_lengths)`` where
        ``spectrograms`` is ``(batch, 1, n_mels, max_time)``, ``targets`` is
        ``(batch, max_target_len)`` zero-padded class indices, and the two
        length lists hold the unpadded frame / label counts per sample.
    """
    spectrogram_transform = MelSpectrogram()
    waveforms = [sample[0] for sample in data]
    utterances = [sample[2] for sample in data]

    # (1, n_mels, time) -> (time, n_mels) so that pad_sequence pads along time.
    spectrograms = [spectrogram_transform(w).squeeze(0).transpose(0, 1) for w in waveforms]
    input_lengths = [len(s) for s in spectrograms]

    targets = [_encode_target(u) for u in utterances]
    target_lengths = [len(t) for t in targets]

    # (batch, time, n_mels) -> (batch, 1, n_mels, time)
    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    targets = nn.utils.rnn.pad_sequence(targets, batch_first=True)
    return spectrograms, targets, input_lengths, target_lengths

# Configuración
batch_size = 10
num_epochs = 10
num_classes = 29  # 26 letters + space, apostrophe and the CTC blank symbol

os.makedirs('data', exist_ok=True)

# Datasets and loaders
#train_dataset = LIBRISPEECH("./data", url="train-clean-100", download=True)
#test_dataset = LIBRISPEECH("./data", url="test-clean", download=True)
# YESNO is loaded as one collection here, so build a held-out test subset
# ourselves; otherwise train and test would be the very same samples.
# The seeded generator keeps the split identical across runs.
yesno_dataset = torchaudio.datasets.YESNO("./data", download=True)
num_test_samples = max(1, len(yesno_dataset) // 5)
train_dataset, test_dataset = torch.utils.data.random_split(
    yesno_dataset,
    [len(yesno_dataset) - num_test_samples, num_test_samples],
    generator=torch.Generator().manual_seed(0),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_processing)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_processing)

# Instantiate model, optimizer and loss
model = SpeechRecognitionModel(num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The blank is the last class index; derive it so it cannot drift from num_classes.
criterion = nn.CTCLoss(blank=num_classes - 1, zero_infinity=True)

# Entrenamiento del modelo
def train(model, train_loader, criterion, optimizer, epochs=None):
    """Train ``model`` with a CTC-style criterion and report the loss per batch.

    Args:
        model: Module mapping a spectrogram batch to ``(batch, time, n_class)``
            unnormalised scores.
        train_loader: Iterable yielding
            ``(spectrograms, targets, input_lengths, target_lengths)`` batches.
        criterion: Loss taking ``(log_probs, targets, input_lengths,
            target_lengths)`` with ``log_probs`` shaped ``(time, batch, n_class)``.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``train_loader``. Defaults to the
            module-level ``num_epochs`` when omitted.

    Returns:
        List with the loss value of every processed batch, in order.
    """
    if epochs is None:
        epochs = num_epochs

    model.train()
    losses = []
    for epoch in range(epochs):
        for batch_idx, (spectrograms, targets, input_lengths, target_lengths) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(spectrograms)  # (batch, time, n_class)
            # CTCLoss expects log-probabilities, not raw scores.
            output = F.log_softmax(output, dim=2)
            output = output.permute(1, 0, 2)  # Reorder to (time, batch, n_class)
            loss = criterion(output, targets, input_lengths, target_lengths)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')
    return losses

# Run the training loop on the objects configured above.
train(model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer)

# Función para visualizar espectrograma
def plot_spectrogram(spectrogram):
    """Show a mel spectrogram as an image on a decibel scale.

    Args:
        spectrogram: 2-D tensor of non-negative values, rows = mel bins,
            columns = frames. Assumed to be a power spectrogram (the default
            output of torchaudio's ``MelSpectrogram``) — verify if another
            transform is used.
    """
    # The colour bar is labelled in dB, so convert power to decibels
    # (10 * log10) rather than plotting log2 values. Clamping avoids -inf
    # for silent or zero-padded bins.
    spectrogram_db = 10.0 * torch.log10(spectrogram.clamp(min=1e-10))

    plt.figure(figsize=(10, 4))
    plt.imshow(spectrogram_db, aspect='auto', origin='lower',
               cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.xlabel('Frame')
    plt.ylabel('Mel bin')
    plt.tight_layout()
    plt.show()

# Reproducir un archivo de audio
def play_audio(file_path):
    """Load an audio file and render an inline audio player for it.

    Args:
        file_path: Path of the audio file to load with ``torchaudio.load``.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    # An Audio object only renders automatically when it is the last expression
    # of a cell; inside a function it must be displayed explicitly.
    ipd.display(ipd.Audio(waveform.numpy(), rate=sample_rate))

# Prueba de visualización y reproducción de un archivo
# Use a file from the YESNO data that is actually downloaded into ./data;
# the LibriSpeech download is commented out, so its files are not available.
test_audio_path = "./data/waves_yesno/1_1_0_0_0_1_1_1.wav"
waveform, sample_rate = torchaudio.load(test_audio_path)
spectrogram = MelSpectrogram()(waveform)
plot_spectrogram(spectrogram[0])  # first (only) channel: (n_mels, time)
play_audio(test_audio_path)

# Evaluación del modelo con un solo archivo de audio
def evaluate(model, audio_path, blank=28):
    """Transcribe one audio file with greedy CTC decoding.

    Args:
        model: Module mapping ``(batch, 1, n_mels, time)`` spectrograms to
            ``(batch, time, n_class)`` scores.
        audio_path: Path of the audio file to load with ``torchaudio.load``.
        blank: Class index of the CTC blank symbol.

    Returns:
        Decoded text. Class indices map to characters as: 0 -> space,
        1..26 -> ``a``..``z``, 27 -> apostrophe.
    """
    model.eval()
    waveform, _ = torchaudio.load(audio_path)
    # (channel, n_mels, time) -> (1, channel, n_mels, time). No further transpose:
    # the model is fed mel bins before time, same as the training batches.
    spectrogram = MelSpectrogram()(waveform).unsqueeze(0)
    with torch.no_grad():
        output = model(spectrogram)  # (batch, time, n_class)
    best_path = torch.argmax(output, dim=2)[0].tolist()  # one class index per frame

    special_chars = {0: " ", 27: "'"}
    decoded = []
    previous = None
    for index in best_path:
        # Greedy CTC rule: collapse runs of the same symbol, then drop blanks.
        if index != previous and index != blank:
            decoded.append(special_chars.get(index, chr(index + 96)))
        previous = index
    return ''.join(decoded)

# Uso de evaluate para probar un archivo
# Decode the sample file with the trained model and show the resulting text.
print(evaluate(model=model, audio_path=test_audio_path))


100%|██████████| 4.49M/4.49M [00:03<00:00, 1.54MB/s]
  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Couldn't find appropriate backend to handle uri data/waves_yesno/1_1_0_0_0_1_1_1.wav and format None.