In [None]:
%pip install torch torchaudio onnxruntime silero-vad transformers datasets[audio] whisper-timestamped

In [1]:
from datasets import load_dataset

# Load the validation split of the "clean" configuration of
# distil-whisper/librispeech_long, then keep the first example's "audio" entry.
# Later cells read its "array" and "sampling_rate" fields.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

(…)-00000-of-00001-913508124a40cb97.parquet:   0%|          | 0.00/1.98M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

In [2]:
sample["sampling_rate"]

16000

In [4]:
min(sample["array"])

np.float64(-0.644012451171875)

In [3]:
from IPython.display import Audio

Audio(sample["array"], rate=sample["sampling_rate"])

In [4]:
import whisper_timestamped as whisper

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [5]:
import torch
import torchaudio
import numpy as np

# Sample rate every later cell assumes for `audio_resampled`.
TARGET_SAMPLE_RATE = 16000

# Cast to float32 once, before any torch work. Resample caches its kernel as
# float32 by default, so handing it a float64 waveform fails with a dtype
# mismatch. np.array (rather than np.asarray) always copies, so the dataset's
# own buffer is never aliased.
audio_float32 = np.array(sample["array"], dtype=np.float32)

if sample["sampling_rate"] != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample["sampling_rate"], new_freq=TARGET_SAMPLE_RATE
    )
    # Resample works on (..., time) tensors; add a leading axis for the call
    # and strip it again afterwards.
    audio_tensor = torch.from_numpy(audio_float32).unsqueeze(0)
    audio_resampled = resampler(audio_tensor).squeeze(0).numpy()
else:
    # Already at the target rate: no need to round-trip through torch.
    audio_resampled = audio_float32

In [6]:
audio_resampled.dtype

dtype('float32')

In [7]:
# Load the "tiny" model via whisper_timestamped, placed on the CPU.
model = whisper.load_model("tiny", device="cpu")

In [8]:
# Transcribe the prepared audio as English. The result shown below is a dict with
# the full "text" and a list of "segments", each carrying per-word "text",
# "start", "end" and "confidence" entries.
result = whisper.transcribe(model, audio_resampled, language="en")

100%|██████████| 6245/6245 [00:04<00:00, 1418.59frames/s]


In [9]:
result

{'text': " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennils, pictures, are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampoo or a turkish bath. Next man.",
 'segments': [{'id': 0,
   'seek': 0,
   'start': np.float64(0.54),
   'end': np.float64(5.36),
   'text': ' Mr. Quilter is the apostle of the middle classes, and we are

In [13]:
result["segments"][0]["words"]

[{'text': 'Mr.',
  'start': np.float64(0.54),
  'end': np.float64(0.78),
  'confidence': 0.776},
 {'text': 'Quilter',
  'start': np.float64(0.94),
  'end': np.float64(1.26),
  'confidence': 0.915},
 {'text': 'is',
  'start': np.float64(1.26),
  'end': np.float64(1.44),
  'confidence': 0.966},
 {'text': 'the',
  'start': np.float64(1.44),
  'end': np.float64(1.62),
  'confidence': 0.993},
 {'text': 'apostle',
  'start': np.float64(1.62),
  'end': np.float64(2.06),
  'confidence': 0.932},
 {'text': 'of',
  'start': np.float64(2.06),
  'end': np.float64(2.32),
  'confidence': 0.997},
 {'text': 'the',
  'start': np.float64(2.32),
  'end': np.float64(2.44),
  'confidence': 0.995},
 {'text': 'middle',
  'start': np.float64(2.44),
  'end': np.float64(2.68),
  'confidence': 0.827},
 {'text': 'classes,',
  'start': np.float64(2.68),
  'end': np.float64(3.2),
  'confidence': 0.9},
 {'text': 'and',
  'start': np.float64(3.48),
  'end': np.float64(3.54),
  'confidence': 0.922},
 {'text': 'we',
  '

In [19]:
from typing import List

class Word:
    """One transcribed word together with its time span.

    The constructor stores its three arguments unchanged:
        word:  text of the word.
        start: start time of the word.
        end:   end time of the word.
    """

    def __init__(self, word, start, end):
        self.word, self.start, self.end = word, start, end

    def __repr__(self):
        # Compact "text | start | end" form so a list of words reads like a table.
        text, begin, finish = self.word, self.start, self.end
        return f"{text} | {begin} | {finish}"

def get_words(result):
    """Flatten a transcription result into a single list of ``Word`` objects.

    Walks ``result["segments"]`` in order and, inside each segment, its
    ``"words"`` entries, building a ``Word`` from every entry's ``"text"``,
    ``"start"`` and ``"end"`` fields.

    Returns:
        A list of ``Word`` in the order the entries were encountered.
    """
    words: List[Word] = [
        Word(entry["text"], entry["start"], entry["end"])
        for segment in result["segments"]
        for entry in segment["words"]
    ]
    return words

words = get_words(result)
words


[Mr. | 0.54 | 0.78,
 Quilter | 0.94 | 1.26,
 is | 1.26 | 1.44,
 the | 1.44 | 1.62,
 apostle | 1.62 | 2.06,
 of | 2.06 | 2.32,
 the | 2.32 | 2.44,
 middle | 2.44 | 2.68,
 classes, | 2.68 | 3.2,
 and | 3.48 | 3.54,
 we | 3.54 | 3.64,
 are | 3.64 | 3.8,
 glad | 3.8 | 4.08,
 to | 4.08 | 4.3,
 welcome | 4.3 | 4.58,
 his | 4.58 | 4.88,
 gospel. | 4.88 | 5.36,
 Nor | 6.46 | 6.7,
 is | 6.7 | 7.0,
 Mr. | 7.0 | 7.24,
 Quilter's | 7.38 | 7.8,
 manner | 7.8 | 8.12,
 less | 8.12 | 8.46,
 interesting | 8.46 | 9.12,
 than | 9.12 | 9.42,
 his | 9.42 | 9.68,
 matter. | 9.68 | 10.3,
 He | 11.14 | 11.42,
 tells | 11.42 | 11.66,
 us | 11.66 | 12.06,
 that | 12.06 | 12.38,
 at | 12.38 | 12.52,
 this | 12.52 | 12.8,
 festive | 12.8 | 13.18,
 season | 13.18 | 13.7,
 of | 13.7 | 13.92,
 the | 13.92 | 14.22,
 year, | 14.22 | 14.66,
 with | 14.94 | 15.14,
 Christmas | 15.14 | 15.62,
 and | 15.62 | 15.94,
 roast | 15.94 | 16.2,
 beef | 16.2 | 16.54,
 looming | 16.54 | 16.81,
 before | 16.81 | 17.32,
 us, | 17.32

In [20]:
len(audio_resampled)

999280

In [None]:
def chop_word(audio, word: "Word", sample_rate: int):
    """Return a copy of ``audio`` with the samples covered by ``word`` removed.

    Args:
        audio: Array of samples, sliced along its first axis.
        word: Object exposing ``start`` and ``end`` times; each is multiplied
            by ``sample_rate`` and truncated to get a sample index.
        sample_rate: Sample rate of ``audio`` (samples per unit of
            ``word.start`` / ``word.end``).

    Returns:
        A new array: everything before the word followed by everything after
        it. ``audio`` itself is not modified.
    """
    total = len(audio)
    # Clamp both indices into [0, total]. Without this a negative start would
    # be read by slicing as "count from the end" and drop the wrong samples.
    start = min(max(int(word.start * sample_rate), 0), total)
    # Never let the end fall before the start; an inverted span removes nothing
    # instead of duplicating samples.
    end = min(max(int(word.end * sample_rate), start), total)
    # np.concatenate is available in every NumPy release; np.concat is an alias
    # that only exists from NumPy 2.0 on.
    return np.concatenate((audio[:start], audio[end:]))

# audio_resampled is always at 16 kHz by this point (it is resampled earlier
# whenever the source rate differs). Convert word times to sample indices with
# that rate, not with the dataset's original sampling rate.
chopped_audio = chop_word(audio_resampled, words[2], 16000)
len(chopped_audio), len(audio_resampled)

(997200, 999280)

In [40]:
# chopped_audio is cut from the 16 kHz audio_resampled buffer, so play it back at
# that rate rather than the dataset's original sampling rate.
Audio(chopped_audio, rate=16000)