In [1]:
%%capture
!pip install speechbrain

In [2]:
%%capture
import speechbrain
import librosa
import pandas as pd
import torchaudio
import torchaudio.transforms as T
import torch
import matplotlib.pyplot as plt
import IPython.display as ipd
from speechbrain.pretrained import SepformerSeparation as separator

In [3]:
# Switch to the project's src/ directory: later cells use paths relative to it
# ("../out/..." for experiment outputs and "../data/..." for the audio files).
%cd "/content/drive/Othercomputers/Mi portátil/projecte/StutterFormer/src"

/content/drive/Othercomputers/Mi portátil/projecte/StutterFormer/src


In [4]:
def load_audio(dataset, experiment, seed, random_state=None):
  """Pick one random row from an experiment's dataset CSV and return its paths.

  The CSV is read from ``../out/<experiment>/<seed>/save/<dataset>.csv``,
  relative to the current working directory.

  Args:
    dataset: CSV file name without the ``.csv`` extension (e.g. ``"test"``).
    experiment: Name of the experiment directory under ``../out``.
    seed: Name of the seed directory; may be a string or an int.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so the same
      row can be drawn again. ``None`` (default) keeps the draw random.

  Returns:
    A ``(file_path_stutter, file_path_speech)`` tuple taken from the sampled row.

  Raises:
    ValueError: If the CSV contains no rows.
  """
  # An f-string formats `seed` itself, so an int seed no longer raises TypeError
  # the way string concatenation did.
  csv_path = f"../out/{experiment}/{seed}/save/{dataset}.csv"
  data = pd.read_csv(csv_path)
  if data.empty:
    raise ValueError(f"No rows to sample from in {csv_path}")

  entry = data.sample(n=1, random_state=random_state)
  return entry["file_path_stutter"].item(), entry["file_path_speech"].item()


def plot_waveform(waveform, sr, title="Waveform", printBig=False, printTitle=False):
  """Plot every channel of a waveform against time in seconds.

  Args:
    waveform: Torch tensor holding the samples, either 1-D (a single channel)
      or 2-D with channels on the first axis.
    sr: Sample rate used to convert frame indices to seconds.
    title: Figure title; only drawn when ``printTitle`` is true.
    printBig: When true, resize the figure to 8x6 inches at 300 dpi.
    printTitle: When true, draw ``title`` above the plot.
  """
  # detach()/cpu() make the conversion safe for tensors that track gradients
  # or live on an accelerator; for plain CPU tensors they are no-ops.
  samples = waveform.detach().cpu().numpy()
  if samples.ndim == 1:
    # Treat a bare 1-D signal as a single channel.
    samples = samples[None, :]

  num_channels, num_frames = samples.shape
  time_axis = torch.arange(0, num_frames) / sr

  # squeeze=False always yields a 2-D array of axes, so the same loop handles
  # mono and multi-channel audio (a bare Axes is returned only when squeezed).
  figure, axes = plt.subplots(num_channels, 1, squeeze=False)
  for channel, ax in enumerate(axes[:, 0]):
    ax.plot(time_axis, samples[channel], linewidth=1)
    ax.grid(True)
    ax.set_ylim([-1, 1])
    if num_channels > 1:
      ax.set_ylabel(f"Channel {channel + 1}")

  if printTitle:
    figure.suptitle(title, y=0.92)
  if printBig:
    figure.set_size_inches(8, 6)
    figure.set_dpi(300)
  plt.show(block=False)


def plot_spectrogram(specgram, title=None, ylabel="freq_bin", printBig=False, printTitle=False):
  """Show a power spectrogram as a decibel-scaled image with a colorbar.

  Args:
    specgram: 2-D power spectrogram; it is converted with
      ``librosa.power_to_db`` before being drawn.
    title: Axes title; only applied when ``printTitle`` is true.
    ylabel: Accepted for interface compatibility but currently not applied,
      because axis labelling is disabled in this function.
    printBig: When true, resize the figure to 8x6 inches at 300 dpi.
    printTitle: When true, set ``title`` on the axes.
  """
  figure, axis = plt.subplots(1, 1)
  if printTitle:
    axis.set_title(title)

  # origin="lower" puts row 0 of the array at the bottom of the image.
  spec_db = librosa.power_to_db(specgram)
  image = axis.imshow(spec_db, origin="lower", aspect="auto")
  figure.colorbar(image, ax=axis)

  if printBig:
    figure.set_size_inches(8, 6)
    figure.set_dpi(300)
  plt.show(block=False)

In [30]:
# Which trained run to inspect; both values are directory names under ../out/.
EXPERIMENT = "tiny-stutterformer"
SEED = "2345"

# Draw a random (stutter, speech) pair from the run's "test" CSV ...
stutter_path, speech_path = load_audio("test", EXPERIMENT, SEED)
# ... then pin a fixed utterance. These assignments replace the sampled pair,
# so the rest of the notebook always uses the files below.
stutter_path = "../data/LibriStutter/LibriStutter Audio/1502/122619/1502-122619-0055.flac"
speech_path = "../data/LibriSpeech/1502/122619/1502-122619-0055.flac"

# torchaudio.load returns the samples together with the file's sample rate.
stutter_waveform, stutter_sr = torchaudio.load(stutter_path)
speech_waveform, speech_sr = torchaudio.load(speech_path)

# Load the best checkpoint of the selected run.
checkpoint_dir = f"../out/{EXPERIMENT}/{SEED}/save/best"
model_cache_dir = f"/content/pretrained_models/{EXPERIMENT}"
model = separator.from_hparams(source=checkpoint_dir, savedir=model_cache_dir)

# Keep index 0 along the last axis of the 3-D model output.
# NOTE(review): presumably the axes are (batch, time, source) — confirm
# against the SpeechBrain documentation.
separated = model.separate_file(path=stutter_path)
enhanced = separated[:, :, 0]

In [None]:
# Play the reference recording at the sample rate reported by torchaudio.load
# instead of a hard-coded 16000, so playback is correct for any input file.
print(speech_path)
ipd.Audio(speech_waveform, rate=speech_sr)

../data/LibriSpeech/4018/107338/4018-107338-0027.flac


In [16]:
# Play the stuttered input at the sample rate reported by torchaudio.load
# instead of a hard-coded 16000, so playback is correct for any input file.
print(stutter_path)
ipd.Audio(stutter_waveform, rate=stutter_sr)

../data/LibriStutter/LibriStutter Audio/1502/122619/1502-122619-0055.flac


In [17]:
# Play the model output. The 16000 Hz rate is hard-coded here.
# NOTE(review): confirm it matches the sample rate of the audio the model returns.
ipd.Audio(enhanced, rate=16000)

In [None]:
# Figure options shared by the three waveform plots.
printBig = True
printTitle = False

# (waveform, sample rate, title) for each plot, in display order.
# The model output is plotted with the input's sample rate.
waveform_panels = (
    (stutter_waveform, stutter_sr, "Muestra de entrada (LibriStutter)"),
    (enhanced, stutter_sr, "Salida del modelo"),
    (speech_waveform, speech_sr, "Muestra de referencia (LibriSpeech)"),
)
for panel_waveform, panel_sr, panel_title in waveform_panels:
  plot_waveform(panel_waveform, panel_sr, title=panel_title, printBig=printBig, printTitle=printTitle)

In [31]:
# Figure options shared by the three spectrogram plots.
printBig = True
printTitle = False

# STFT settings; win_length=None defers to the transform's own default.
n_fft = 1024
win_length = None
hop_length = 512

# Power spectrogram (power=2.0), which is the input that plot_spectrogram
# converts to decibels.
spectrogram = T.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
)

# Apply the same transform to the input, the model output and the reference.
stutter_spec = spectrogram(stutter_waveform)
estimated = spectrogram(enhanced)
speech_spec = spectrogram(speech_waveform)

# (spectrogram, title) for each plot, in display order.
spectrogram_panels = (
    (stutter_spec, "Muestra de entrada (LibriStutter)"),
    (estimated, "Salida del modelo"),
    (speech_spec, "Muestra de referencia (LibriSpeech)"),
)
for panel_spec, panel_title in spectrogram_panels:
  # Only the first entry along the leading axis is drawn.
  plot_spectrogram(panel_spec[0], title=panel_title, printBig=printBig, printTitle=printTitle)

Output hidden; open in https://colab.research.google.com to view.