<a href="https://colab.research.google.com/github/i-am-neo/whisper_test/blob/main/whisper_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Whisper

The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

In [None]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-dp60g943
  Running command git clone -q https://github.com/openai/whisper.git /tmp/pip-req-build-dp60g943
Collecting transformers>=4.19.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.7 MB/s 
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 55.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 49.4 MB/s 
Building wheels for collected packages: whisper
  Buil

# Loading the LibriSpeech dataset

The following will load the test-clean split of the LibriSpeech corpus using torchaudio.

In [None]:
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Run on the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
# To force CPU execution while debugging, assign 'cpu' to DEVICE instead.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

In [None]:
class LibriSpeech(torch.utils.data.Dataset):
    """
    Thin wrapper around torchaudio's LibriSpeech reader that yields
    (log-Mel spectrogram, transcript) pairs.

    Every waveform is flattened and passed through ``whisper.pad_or_trim``
    (30 seconds per the original author's note), so the tail of the few
    utterances that are longer than that is dropped.
    """
    def __init__(self, split="test-clean", device=DEVICE):
        # device: where the padded/trimmed audio is moved before the spectrogram is computed
        self.device = device
        # downloaded on first use and cached under ~/.cache
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            url=split,
            root=os.path.expanduser("~/.cache"),
            download=True,
        )

    def __len__(self):
        """Number of utterances in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return (mel spectrogram, transcript text) for utterance number `item`."""
        waveform, rate, transcript, _, _, _ = self.dataset[item]
        assert rate == 16000
        clip = whisper.pad_or_trim(waveform.flatten())
        clip = clip.to(self.device)

        return (whisper.log_mel_spectrogram(clip), transcript)

In [None]:
# Wrap the LibriSpeech test-clean split and serve it in batches of 16 items.
# NOTE(review): the name `dataset` is rebound to a plain dict of our own recordings
# in a later cell ("# our data"); after that cell runs, this object is only
# reachable through `loader`.
dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=16)

# Running inference on the dataset using a base Whisper model

The following will take a few minutes to transcribe all utterances in the dataset.

In [None]:
# Load the "base.en" checkpoint, then report whether it is multilingual and how
# many parameters it holds (sum of the element counts of all parameter tensors).
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 46.7MiB/s]


Model is English-only and has 71,825,408 parameters.


In [None]:
# Decoding options: English, library defaults otherwise.
# Variants tried earlier (kept for reference, not active):
#   whisper.DecodingOptions(language="en", without_timestamps=True)  # short-form, no timestamps
#   whisper.DecodingOptions(language="en", prompt='Hello')
options = whisper.DecodingOptions(language="en")

In [None]:
def download_data(data):
  """
  Make sure the wav file and the reference transcript of one recording exist locally.

  arguments:
    data is a dict with the keys 'user', 'fn_ext' and 'wavfn_format';
    'wavfn_format' is a str.format template that receives user= and fn=
    (fn is fn_ext up to its first '.')
  returns:
    tuple (wav file name, reference transcript file name), both relative to the
    current working directory; a name is returned even when its download failed,
    the failure is only reported on stdout
  """
  import subprocess  # local import: keeps this cell independent of other cells' imports

  base_url = "https://neo.vidd.ai/output/wav_staging/"
  user = data['user']
  fn_ext = data['fn_ext']
  fn = fn_ext.split('.')[0]
  wav_fn = data['wavfn_format'].format(user=user, fn=fn)
  reference_fn = f"{user}.{fn_ext}.ytvtt_transcript_mod_display"

  for name in (wav_fn, reference_fn):
    # files fetched by an earlier run are not downloaded again
    if os.path.exists('./' + name):
      continue
    url = base_url + name
    try:
      # argument list instead of a shell string: nothing in the name is interpreted by a shell
      res = subprocess.run(["wget", url]).returncode
    except OSError as err:
      # wget itself could not be started; report it and carry on like any other failure
      print(f'download failed {url}, wget could not be run: {err}')
      continue
    # a non-zero exit status means wget did not fetch the file, so do not claim success
    if res == 0:
      print(f'downloaded {url}, res:{res}')
    else:
      print(f'download failed {url}, res:{res}')

  return(wav_fn, reference_fn)

In [None]:
def line_breaks(result):
  """
  Return result["text"] with a newline added after every '. ' and every '? '.

  The space that follows the punctuation mark is kept, so each break reads
  '. \n' or '? \n'.
  """
  broken = result["text"]
  for mark in ('. ', '? '):
    broken = broken.replace(mark, mark + "\n")
  return broken


In [None]:
def read_text(fn):
  """
  Read a text file (used for the reference transcripts) and return its full content.

  The file is opened in a `with` block so the handle is closed as soon as the
  content has been read.
  """
  with open(fn, "r") as f:
    return f.read()

In [None]:
import whisper
import os
  
# Rebind the global `model` to the "large" checkpoint; the "base" variant is kept
# below as a commented-out alternative.
# NOTE(review): the saved output of this cell records a RuntimeError; the cause is
# not shown here (possibly GPU memory) - confirm before relying on this cell.
#model = whisper.load_model("base")
model = whisper.load_model("large")

RuntimeError: ignored

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# our data
koeris_data1 = { 'user':'58fac5f8-24ed-4c08-a9db-db64a95089b3', 'fn_ext':'MVj78TB3NDk.mp4', 'wavfn_format':'{user}.{fn}.16000.wav' }
koeris_data2 = { 'user':'58fac5f8-24ed-4c08-a9db-db64a95089b3', 'fn_ext':'LsgKwy8KIE4.mp4', 'wavfn_format':'{user}.{fn}.16000.wav' }
koeris_data3 = { 'user':'58fac5f8-24ed-4c08-a9db-db64a95089b3', 'fn_ext':'LsgKwy8KIE4.mp4_1', 'wavfn_format':'{user}.LsgKwy8KIE4.mp4_1.16000.wav' }
koeris_data4 = { 'user':'58fac5f8-24ed-4c08-a9db-db64a95089b3', 'fn_ext':'LsgKwy8KIE4.mp4_2', 'wavfn_format':'{user}.LsgKwy8KIE4.mp4_2.16000.wav' }
                
adena_data1 = { 'user':'a28ae61a-8733-4e6c-98c7-6e5b05444c36', 'fn_ext':'tVS-fUwpaRs.mp4', 'wavfn_format':'{user}.{fn}.16000.wav' }
cactus_data1 = { 'user': '4d06531b-f237-447f-8993-a68aa65fb6e8', 'fn_ext':'GKc6i29eawI.mp4','wavfn_format':'{user}.{fn}.16000.wav' }
cactus_data2 = { 'user': '4d06531b-f237-447f-8993-a68aa65fb6e8', 'fn_ext':'7XDQ5N2CIv4.mp4','wavfn_format':'{user}.{fn}.16000.wav'}

dataset = { 'koeris_data1': koeris_data1,
             'koeris_data2': koeris_data2,
             'adena_data1': adena_data1,
             'cactus_data1': cactus_data1,
             'cactus_data2': cactus_data2
            }
                
dataset2 = { 
           'koeris_data3': koeris_data3,
           'koeris_data4': koeris_data4,
            }
                

In [None]:
def compute_wer(reference, hypothesis):
  """
  Word error rate between two lists of text lines.

  Every line of both lists is passed through the global `normalizer` first;
  the normalized reference lines and hypothesis lines are then handed to
  jiwer.wer (reference first). Returns the value jiwer.wer produces.
  """
  hyp_lines = list(map(normalizer, hypothesis))
  ref_lines = list(map(normalizer, reference))
  return jiwer.wer(ref_lines, hyp_lines)

In [None]:
def process(test_data):
  """
  Download, transcribe and load the reference for one recording.

  arguments:
    test_data is one recording dict as accepted by download_data()
  returns:
    dict with 'hypothesis' (transcription split into lines), 'reference'
    (reference text split into lines) and 'segments' (result['segments'] as
    returned by model.transcribe)
  """
  wav_fn, reference_fn = download_data(test_data)
  result = model.transcribe(wav_fn)
  reference_lines = read_text(reference_fn).split('\n')

  # the reference text has line breaks, so break the transcription the same way
  hypothesis_lines = line_breaks(result).split('\n')

  return {
    'hypothesis': hypothesis_lines,
    'reference': reference_lines,
    'segments': result['segments'],
  }

In [None]:
# Break the transcription into lines after sentence-ending punctuation.
# NOTE(review): no cell above this one assigns a global `result`; a later cell
# assigns it from model.transcribe(...), so this cell fails on a fresh
# top-to-bottom run.
text = line_breaks(result)
#print(text)

In [None]:
# Normalize hypothesis and reference with the global `normalizer` before scoring.
# NOTE(review): both names are rebound to their own normalized value, and
# `reference_text` has no top-level assignment in the cells visible above -
# this cell depends on state left behind by earlier interactive runs.
text = normalizer(text)
reference_text = normalizer(reference_text)

In [None]:
print(text[:100])

you hello michael how is it going mark fine sir you good early in the morning for me so i am just do


In [None]:
# jiwer.wer expects the reference (ground truth) first and the hypothesis second.
# The error count is divided by the reference length, so swapping them changes the score.
wer = jiwer.wer(reference_text, text)

print(f"WER: {wer * 100:.2f} %")

In [None]:
def save_text(fn, text):
  """Write the strings in `text` to file `fn`, separated by newlines (no trailing newline)."""
  with open(fn, mode='w') as out_file:
    joined = '\n'.join(text)
    out_file.write(joined)

In [None]:
def save_text2(fn, text):
  """Write the single string `text` to file `fn`, replacing any previous content."""
  with open(fn, mode='w') as out_file:
    out_file.write(text)

In [None]:
def save_json(fn, data):
  """Serialize `data` with json.dumps (default settings) and write it to file `fn`."""
  with open(fn, mode="w") as out_file:
    serialized = json.dumps(data)
    out_file.write(serialized)

In [None]:
# process  dataset2
# Fetch the wav file and the reference transcript of every dataset2 entry up front.
# download_data() skips files that already exist, so re-running this cell is cheap.
import json, time
for i in dataset2:
  wav_fn, reference_fn = download_data(dataset2[i])



downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_1.16000.wav, res:0
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_1.ytvtt_transcript_mod_display, res:2048
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_2.16000.wav, res:0
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_2.ytvtt_transcript_mod_display, res:2048


In [None]:

# Transcribe every dataset2 recording with the global `model`, print how long each
# one took, and save the plain text plus the segment list under the entry's key.
# NOTE(review): no language is passed to transcribe(); the saved transcripts of
# these files contain long runs of non-English text - consider fixing the
# language explicitly, confirm against the whisper documentation.
for i in dataset2:
  wav_fn, reference_fn = download_data(dataset2[i])
  st = time.time()
  res = model.transcribe(wav_fn)
  elapsed_time = time.time() - st
  # wall-clock duration formatted as HH:MM:SS
  print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
  hypothesis = res['text']
  save_text2(i+'_transcript.txt', hypothesis)
  save_json(i+'_segments.json', res['segments'])

downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_1.ytvtt_transcript_mod_display, res:2048


RuntimeError: ignored

In [None]:
!ls *wav

58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_1.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4_2.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav


In [None]:
!nvidia-smi

Wed Oct 19 21:58:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    28W /  70W |  15094MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!ls *wav


58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav


In [None]:
print(res['text'])

 ಠದಕ� invites move ಠದಂ introduces the game ចានាន្រគៅាន្ម្រាបាន្រានាន្រាប្រាន្រាន្រាន្រាន្រាន្រាន្រាន្រា្រាន្រាន្រាន្រាន្រាន្រាន្រាន្រាន្រាន្រា ត្លានះារាន្រាន្រាន្រា្រា់្រា់្រាន្ម្រាន្រាន្រាន្រាន្រាន្រ្រាន្រ្រាន្រាន្ម្រាន្រាន្រ្រា្រាន្ម្រា� अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जानते हैं जो भी उसे प्रश्ट्र देखा रहा हैं अच्छारी जा

In [None]:
import json
# Run process() on every recording in `dataset`, save transcript and segments
# under the entry's key, and store hypothesis/reference on the entry itself so a
# later cell can compute the WER.
for i in dataset:
  res = process(dataset[i])
  hypothesis = res['hypothesis']
  save_text(i+'_transcript.txt', hypothesis)
  save_json(i+'_segments.json', res['segments'])
  reference = res['reference']
  # hypothesis and reference are lists of lines, so [:100] shows up to 100 lines, not 100 characters
  print(f'hypothesis:{hypothesis[:100]}')
  print(f' reference:{reference[:100]}')
  dataset[i]['hypothesis'] = hypothesis
  dataset[i]['reference'] = reference



downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav, res:0
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.ytvtt_transcript_mod_display, res:0


RuntimeError: ignored

In [None]:
!ls *txt

koeris_data3_transcript.txt  koeris_data4_transcript.txt


In [None]:
!head koeris_data3_transcript.txt

 ໂຈ໅ຈຈຈໂແໄໄໍ໋ໍ໋ໆ ໒ຈໍ໎ໆໍ່໋໋ໂ໇ເ໋ໍໄເເເໍໍແໍໍເ໇ໍໍ໋໇ໍໂເໍ່ໍໍໆໍໍໍເໍໍໍໍໍໍ່ໍໍໍໍ໋ໍ່ໍໍໍ� You can do this!!! Whole chicken wait!!! We gonna make ងូមូុតាត្មាត្មាត្មាច្មាត្មាត្មាត្មាណ្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាត្មាន្ម្មាត្ I didn't want to keep the miserable lifework, but I had my second leg and head into the turkey山前vert to go for a rideā हम ज HARR Gottes conserva משिल ज़िसा हम ज़िसा ល១្ម្ម វ្ម ១្ម្ម ២្. ល្ម្ម ២្ម ៨។។័? ឡ្្ម វ្ម! ល្ម ល្ម វ្ម ល្ម ៨។។ ៨។។័? NFL there seems to be been shakes over for the technicalities and all the lingo we use, you know, upstream and downstream and so on and so forth it sounds more complicated than it is so it can walk you through there fairly simply and then I can probably also give you a demo of the system, meaning the actual... not the virtual reality experience ឡაំ ឡაំ, ឡაំ, ឡაំ, ឡაំ, ឡაំ, ឡაំ, ឡაំ, ឡე, ឡ ᕥំម្។antisix,yet. So, that's what I'm attempting to chat a little bit. This week សវ្លនៃត្តានាន្មាះច្មា្មាែោ្រ្ឞ្រាន្តានំ្មាន្លាន្លាន្មាន្ឞ

In [None]:
from google.colab import files
# Send the dataset2 outputs (plain transcripts and segment dumps) to the browser.
# These names match what the dataset2 transcription loop writes: <key>_transcript.txt
# and <key>_segments.json.
files.download('koeris_data3_transcript.txt')
files.download('koeris_data4_transcript.txt')
files.download('koeris_data3_segments.json')
files.download('koeris_data4_segments.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
# Download every segment dump that exists. A missing file is reported and skipped
# so it does not abort the remaining downloads (the saved run of this cell ended
# in FileNotFoundError).
for _key in ('adena_data1', 'koeris_data1', 'koeris_data2',
             'cactus_data1', 'cactus_data2', 'koeris_data3', 'koeris_data4'):
  _fn = _key + '_segments.json'
  if os.path.exists(_fn):
    files.download(_fn)
  else:
    print(f'skipping {_fn}: file not found')

FileNotFoundError: ignored

In [None]:
!ls *wav

4d06531b-f237-447f-8993-a68aa65fb6e8.7XDQ5N2CIv4.16000.wav
4d06531b-f237-447f-8993-a68aa65fb6e8.GKc6i29eawI.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.16000.wav
58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav
a28ae61a-8733-4e6c-98c7-6e5b05444c36.tVS-fUwpaRs.16000.wav


In [None]:
# Transcribe one recording with verbose=True (per-segment progress is printed while
# decoding) and keep the segment list for inspection.
result = model.transcribe('58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav', verbose=True)
segments = result['segments']
print(segments)

Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: english
[00:00.000 --> 00:00.500]  uh
[00:30.000 --> 00:32.740]  you
[01:00.000 --> 01:02.060]  you
[01:30.000 --> 01:55.560]  I
[03:00.000 --> 03:07.000]  Hello Michael.
[03:07.000 --> 03:22.640]  How's it going Mark?
[03:22.640 --> 03:23.640]  Fine sir, you?
[03:23.640 --> 03:24.640]  Good.
[03:24.640 --> 03:28.760]  Early in the morning for me so I'm just doing some family stuff here.
[03:28.760 --> 03:30.640]  So I'm curling on my computer but.
[03:30.640 --> 03:31.640]  No problem.
[03:31.640 --> 03:35.000]  Yeah, everybody's got to get fed, kid, cats, etc.
[03:35.000 --> 03:37.120]  I know the drill.
[03:37.120 --> 03:38.120]  Know the drill.
[03:38.120 --> 03:39.120]  Indeed, indeed, indeed, indeed.
[03:39.120 --> 03:44.480]  So talk to me briefly while we have like maybe three seconds here.
[03:44.480 --> 03:48.000]  How are we going to work with you after you retire

In [None]:
from google.colab import files
uploaded = files.upload()

Saving adena_data1_segments.json to adena_data1_segments.json
Saving cactus_data2_segments.json to cactus_data2_segments.json
Saving cactus_data1_segments.json to cactus_data1_segments.json
Saving koeris_data2_segments.json to koeris_data2_segments.json
Saving koeris_data1_segments.json to koeris_data1_segments.json
Saving cactus_data1_transcript.txt to cactus_data1_transcript.txt
Saving cactus_data2_transcript.txt to cactus_data2_transcript.txt
Saving koeris_data1_transcript.txt to koeris_data1_transcript.txt
Saving koeris_data2_transcript.txt to koeris_data2_transcript.txt
Saving adena_data1_transcript.txt to adena_data1_transcript.txt


In [None]:
SAMPLE_RATE = 16000
import librosa
def read_wav_line(fn, sec_start, sec_len):
  """
  Load one slice of an audio file.

  arguments:
    fn: path of the audio file
    sec_start: passed to librosa.load as `offset`
    sec_len: passed to librosa.load as `duration`
  returns:
    the audio data returned by librosa.load, resampled to SAMPLE_RATE
    (the sample rate that librosa.load also returns is dropped)
  """
  loaded = librosa.load(fn, sr=SAMPLE_RATE, offset=sec_start, duration=sec_len)
  return loaded[0]

In [None]:
wav_fn, reference_fn = download_data(koeris_data1)

downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav, res:0
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.ytvtt_transcript_mod_display, res:0


In [None]:
import json
def read_segments(fn):
  """
  Load the JSON document stored in file `fn` (as written by save_json) and return
  the decoded Python object.

  The file is opened in a `with` block so the handle is closed once parsing is done.
  """
  with open(fn, "r") as f:
    return json.load(f)

In [None]:
segments = read_segments('koeris_data1_segments.json')

In [None]:
# check results
# Play the audio that belongs to one transcribed segment to compare it with the text.
import IPython.display as ipd

seg_num = 87
segment = segments[seg_num]
print(segment)
start = segment['start']
seg_len = segment['end'] - start
# rebuild the wav file name the same way download_data() does
user = koeris_data1['user']
fn_ext = koeris_data1['fn_ext']
fn = fn_ext.split('.')[0]
wav_fn = koeris_data1['wavfn_format'].format(user=user, fn=fn)
line_audio = read_wav_line(wav_fn, start, seg_len)
# play back at the rate the audio was loaded with, so the two cannot drift apart
ipd.Audio(data=np.asarray(line_audio), autoplay=True, rate=SAMPLE_RATE)

{'id': 87, 'seek': 47548, 'start': 493.12, 'end': 500.8, 'text': " On the software side, we still haven't resolved the burning issue of not being able to use", 'tokens': [50364, 663, 307, 264, 3069, 3174, 300, 286, 669, 1103, 259, 28842, 322, 13, 50508, 50508, 286, 1866, 428, 3796, 5186, 11, 370, 286, 486, 483, 322, 300, 965, 13, 50640, 50640, 1033, 13, 50690, 50690, 2561, 13, 50740, 50740, 400, 45786, 11, 498, 291, 434, 322, 11, 1767, 4160, 385, 281, 767, 360, 300, 965, 498, 321, 393, 13, 50954, 50954, 663, 1116, 312, 1687, 665, 13, 51246, 51246, 1282, 264, 4722, 1252, 11, 321, 920, 2378, 380, 20772, 264, 9488, 2734, 295, 406, 885, 1075, 281, 764, 51630, 51630], 'temperature': 0.0, 'avg_logprob': -0.20046677796737009, 'compression_ratio': 1.4855769230769231, 'no_speech_prob': 2.627169669722207e-05}


In [None]:
print(wav_fn)
# Boundaries (in seconds) of the line to listen to.
# For comparison, the YouTube caption timing of the same line: start = 506.71, end = 516.8

# from whisper, seg_num = 89
start = 507.12
end = 516.2
sec_len = end - start
audio, _ = librosa.load(wav_fn, sr=SAMPLE_RATE, offset=start, duration=sec_len)
# play back at the rate the audio was loaded with
ipd.Audio(data=np.asarray(audio), autoplay=True, rate=SAMPLE_RATE)

58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.16000.wav


In [None]:
# whisper base model
# Compute the WER of every recording and store it on the entry under 'wer'.
# Requires the 'reference' and 'hypothesis' keys that the process() loop adds to
# each entry, so that loop has to finish first.
# NOTE(review): the saved output below does not contain the 'wer:' lines this
# loop prints - it appears to come from an earlier version of the cell.
for i in dataset:
  wer = compute_wer(dataset[i]['reference'], dataset[i]['hypothesis'])
  print(f'wer:{wer}')

  dataset[i]['wer'] = wer
  print(f"{i}:{dataset[i]['wer'] * 100:.2f} %")

koeris_data1:14.34 %
koeris_data2:12.77 %
adena_data1:80.58 %
cactus_data1:29.12 %
cactus_data2:29.30 %


In [None]:
from google.colab import runtime
runtime.unassign()

In [None]:
from huggingface_hub import notebook_login
# Never commit access tokens. The token that was hard-coded here is exposed in the
# notebook history and should be revoked; read it from the environment instead.
hg_token = os.environ.get('HF_TOKEN')
# notebook_login() is called without arguments and prompts for the token itself;
# it does not read hg_token.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
"""
original    https://github.com/farisalasmary/wav2vec2-kenlm
            Email: farisalasmary@gmail.com
            Date:  Sep 15, 2021
"""
"""
This code uses some of the works in the following repos:
https://github.com/parlance/ctcdecode
https://github.com/SeanNaren/deepspeech.pytorch
https://github.com/Wikidepia/wav2vec2-indonesian/blob/master/notebooks/kenlm-wav2vec2.ipynb
"""

import argparse
from decoder import *
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
import librosa
import noisereduce as nr
import re, json, sys
import numpy as np
import multiprocessing

# for noise reduction
#from pysndfx import AudioEffectsChain
#import math
#import python_speech_features
#import scipy as sp
#from scipy import signal

def find_word(word, decoded_arr):
  """
  Return the index of the first element of decoded_arr that contains `word`,
  or False when no element does.

  Note for callers: a match at index 0 and "not found" are both falsy; use
  `is False` to tell them apart.
  """
  hits = (idx for idx, entry in enumerate(decoded_arr) if word in entry)
  return next(hits, False)


def get_topn_logits(logits, threshold=1):
  """
  Find the positions where the best and the second-best logit are close together.

  arguments:
    logits: tensor indexed as [batch, position, vocabulary]; only batch entry 0 is examined
    threshold: largest absolute difference between the top-1 and top-2 value for a
      position to be reported
  returns:
    tuple (positions whose top-1/top-2 difference is <= threshold,
           top-2 indices, top-2 values, top-1 indices, top-1 values);
    the last four are plain lists with one entry per position
  """
  # the two largest values (and their vocabulary indices) at every position
  best_two = torch.topk(logits, 2, dim=-1)
  indices = best_two.indices[0]
  values = best_two.values[0]

  top1n_indices_list = indices[:, 0].tolist()
  top1n_values = values[:, 0]
  top1n_values_list = top1n_values.tolist()
  print('top1n_indices_list):',top1n_indices_list)

  top2n_values = values[:, 1]
  top2n_values_list = top2n_values.tolist()
  top2n_indices_list = indices[:, 1].tolist()
  print('len(top2n_indices_list):',len(top2n_indices_list))
  print('top2n_indices_list):',top2n_indices_list)

  # a small gap means the second candidate is nearly as likely as the first
  close_positions = []
  for pos, (best, second) in enumerate(zip(top1n_values, top2n_values)):
    gap = abs(best - second)
    if gap <= threshold:
      print('i:{i}, top1:{top1}, top2:{top2} diff:{diff}'.format(i=pos, diff=gap, top1=best, top2=second))
      close_positions.append(pos)

  return (close_positions, top2n_indices_list, top2n_values_list, top1n_indices_list, top1n_values_list)


def insert_runner_ups_logits(logits, top2n_logits_list, top2n_indices_list, runner_ups, top1n_logits_list, top1n_indices_list):
  """
  Duplicate selected rows of logits[0], promoting the runner-up entry in each copy.

  For every index i in runner_ups a copy of row logits[0, i] is inserted directly
  after the original row. In the copy, the entry at top2n_indices_list[i] is
  overwritten with top1n_logits_list[i] + 1 (presumably so the runner-up becomes
  the largest entry of the copy - this assumes top1n_logits_list[i] is the row
  maximum; confirm against the caller).

  returns:
    3-dimensional tensor (leading dimension of size 1) holding the rows of
    logits[0] plus one inserted row per entry of runner_ups
  arguments:
    logits: tensor indexed as [batch, row, vocabulary]; only batch entry 0 is used
    top2n_indices_list: per row, the index to overwrite in the inserted copy
    top1n_logits_list: per row, the value the overwrite is based on
    runner_ups: row indexes to duplicate; the insert position is computed as
      count + i, which assumes the indexes are in ascending order
    top2n_logits_list, top1n_indices_list: not read; kept so existing callers
      keep working
  """
  supp_x = logits[0].detach().clone()

  for count, i in enumerate(runner_ups):
    # `count` rows were inserted by earlier iterations, so (for ascending
    # runner_ups) original row i now sits at index count + i; cut right after it
    # so the original row comes before its modified copy
    split_at = count + i + 1
    first_half = supp_x[:split_at, :]
    second_half = supp_x[split_at:, :]
    print('first_half size:',first_half.size())

    # copy of the original row with the runner-up entry raised
    supp_row = logits[0, i].detach().clone()
    supp_row[top2n_indices_list[i]] = top1n_logits_list[i] + 1

    # unsqueeze makes the row 2-dimensional so it can be concatenated with the halves
    supp_x = torch.cat((first_half, supp_row.unsqueeze(0), second_half), dim=0)

  # restore the leading batch dimension
  supp_x = supp_x.unsqueeze(0)
  print('supp_x.size():',supp_x.size())
  return(supp_x)

def insert_runner_ups(predicted_ids, top2n_predicted_ids, runner_ups):
  """
  Merge runner-up predictions into the list of top predictions.

  Every id of predicted_ids is kept, in order. For a position listed in
  runner_ups the id from top2n_predicted_ids is added right after it, unless
  both ids convert to the word-delimiter or pad token of the global
  `processor`'s tokenizer.

  returns:
    list of ids
  """
  sep = processor.tokenizer.convert_ids_to_tokens(processor.tokenizer.word_delimiter_token_id)
  pad = processor.tokenizer.convert_ids_to_tokens(processor.tokenizer.pad_token_id)
  blanks = (sep, pad)

  merged = []
  print('original predicted_ids len:',len(predicted_ids))
  for pos in range(len(predicted_ids)):
    best_id = predicted_ids[pos]
    best_tok = processor.tokenizer.convert_ids_to_tokens(best_id)
    second_tok = processor.tokenizer.convert_ids_to_tokens(top2n_predicted_ids[pos])
    merged.append(best_id)
    # a runner-up is only interesting when at least one of the two is a real symbol
    both_blank = (best_tok in blanks) and (second_tok in blanks)
    if pos in runner_ups and not both_blank:
      merged.append(top2n_predicted_ids[pos])
  print('resulting predicted_ids len:',len(merged))
  return merged


def show_tokens_with_values(predicted_ids, top2n_predicted_ids, runner_ups):
  """
  Render the predictions as one string, marking runner-ups as 'top1*top2'.

  Positions listed in runner_ups are shown as the top-1 token, '*', and the
  top-2 token; they are dropped entirely when both tokens are the word-delimiter
  or pad token. All other positions show the top-1 token, except that a bare
  '<pad>' is skipped. Tokens come from the global `processor`'s tokenizer.
  """
  tokens_list = []
  sep = processor.tokenizer.convert_ids_to_tokens(processor.tokenizer.word_delimiter_token_id)
  pad = processor.tokenizer.convert_ids_to_tokens(processor.tokenizer.pad_token_id)
  sep_and_pad = [sep, pad]
  for i in range(len(predicted_ids)):
    token1 = processor.tokenizer.convert_ids_to_tokens(predicted_ids[i])
    token2 = processor.tokenizer.convert_ids_to_tokens(top2n_predicted_ids[i])
    if i in runner_ups:
      # interested only in runner_ups that convert to a letter in the vocabulary
      if (token1 in sep_and_pad) and (token2 in sep_and_pad):
        continue
      this_str = token1 + '*' + token2
    else:
      this_str = token1
    if this_str == '<pad>':
      continue
    tokens_list.append(this_str)
  return(''.join(tokens_list))


# This definition used to be indented under the function above with its body at the
# same column, which is an IndentationError; it is a top-level function.
def show_tokens(predicted_ids):
  """
  Render predicted_ids as one string of tokens, skipping every id that equals the
  pad token id of the global `processor`'s tokenizer.
  """
  tokens_list = []
  for _id in predicted_ids:
    if _id == processor.tokenizer.pad_token_id:
      continue
    token = processor.tokenizer.convert_ids_to_tokens(_id)
    tokens_list.append(token)
  return(''.join(tokens_list))

def show_logit_probs(logits):
  """
  Debug helper: for every element of `logits`, print the token of its first
  position and the softmax of the rows whose arg-max id is >= 1.

  Prints only; returns None. Tokens come from the global `processor`'s tokenizer.
  """
  for logit in logits:
    pred_ids = torch.argmax(logit, dim=-1)
    raw_token = processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist())
    # NOTE(review): this compares the value returned by convert_ids_to_tokens with
    # a token *id*; if the tokenizer returns strings the test can never be true.
    # Left unchanged - confirm whether the id was meant to be compared instead.
    if raw_token == processor.tokenizer.pad_token_id:
      continue
    # keep only the rows whose predicted id is >= 1, then turn them into probabilities
    keep = pred_ids.ge(1).unsqueeze(-1).expand(logit.size())
    kept_rows = torch.masked_select(logit, keep).view(-1, logit.size()[-1])
    voice_prob = torch.nn.functional.softmax(kept_rows, dim=-1)
    print('raw_tokens',raw_token)
    print('voice_prob:',voice_prob)
def remove_repeating_chars(text):
  """
  Collapse every run of the same character into a single occurrence,
  e.g. 'aabbbc' becomes 'abc'. An empty input is returned as is.
  """
  if len(text) == 0:
    return text
  # (.)\1+ matches a character followed by one or more repeats of itself
  run_of_same_char = re.compile(r'(.)\1+')
  return run_of_same_char.sub(r'\1', text)

def spoken_for(a, b):
  """Return True when at least one element of `a` also occurs in `b`, else False."""
  shared = set(a).intersection(b)
  return len(shared) > 0

def make_transcript(tokenizer, model, audiofile_path):
    """Transcribe an audio file block by block and return the decoded texts.

    The file is read with ``librosa.stream``. Every block is passed through the
    module-level ``processor`` and the given ``model``; the logits are
    accumulated across blocks. What happens next depends on module-level flags:

    * ``kenlm`` truthy: once enough blocks have been accumulated, the
      concatenated logits (both the plain ones and the ones "supplemented" with
      runner-up predictions) are decoded greedily and with the module-level
      ``beam_decoder``; the texts are stored in the returned dict under the
      current ``stream_id``.
    * otherwise: the block is decoded with ``processor.batch_decode`` (when
      ``hubert`` is truthy) or ``tokenizer.decode`` and handed to ``timestamp``.

    Args:
        tokenizer: only used on the non-kenlm, non-hubert path
            (``tokenizer.decode``).
        model: called as ``model(net_input)``; the result must expose
            ``.logits``. It is put in eval mode and moved to the module-level
            ``device`` on every block.
        audiofile_path: path handed to the librosa functions.

    Returns:
        dict: ``results`` keyed by ``stream_id``. Each value is a dict with the
        keys ``raw_tokens``, ``supp_raw_tokens``, ``greedy_text``,
        ``supp_greedy_text``, ``beam0`` and ``supp_beam0``. Entries are only
        added on the kenlm path, so the dict is empty otherwise.

    NOTE(review): relies on names that are not defined in this function:
    ``processor``, ``device``, ``beam_decoder``, ``get_topn_logits``,
    ``insert_runner_ups_logits`` and ``timestamp``. ``greedy_decoder`` is
    declared global but never used here.
    """
    global kenlm
    global hubert
    global greedy_decoder

    transcript = ""
    raw_tokens_transcript = ""
    supp_raw_tokens_transcript = ""
    supp_greedy_transcript = ""
    # Ensure that the sample rate is 16k
    # (this only prints the file's native rate; nothing is enforced here)
    print(librosa.get_samplerate(audiofile_path))

    # speech_array is not used below; sample_rate feeds the duration_sec computation
    speech_array, sample_rate = librosa.load(audiofile_path, sr=16000)
    duration = librosa.get_duration(filename=audiofile_path)
    print('file duration in secs:',duration, ' in mins:',duration/60)

    # settings from https://colab.research.google.com/github/flashlight/flashlight/blob/master/flashlight/app/asr/tutorial/notebooks/InferenceAndAlignmentCTC.ipynb#scrollTo=2FNsgvIghzfr
    n_fft = 512
    """
    # was ist das
    hop_length = sample_rate // 100
    # win_length corresponds to 25ms window
    win_length = int(hop_length * 2.5)
    print('what is hop_length//100?',hop_length, ' win_length:',win_length)
    """

    # override
    # (the first two values are immediately superseded by the assignments below)
    hop_length = 16000
    block_length = 30
    # power of two
    # block_length of 8 creates about a "thought"-length
    # block_length of 4 creates about a "phrase"-length
    block_length = 4 * 1
    frame_length = 16000
    # hop_length = The number of samples to advance between frames
    hop_length = int(frame_length * .85)

    print('block_length:{block_length}, hop_length:{hop_length}'.format(block_length=block_length, hop_length=hop_length))

    """
    # sort of works, but for inaccurate timestamps
    hop_length = 16000
    block_length = 30
    frame_length = 16000
    """
    # stream in chunks
    stream = librosa.stream(
        audiofile_path,
        block_length=block_length,
        frame_length=frame_length,
        hop_length=hop_length
    )


    # Per-block bookkeeping. Only words, timestamps, decoded_ids, beam_offsets,
    # offsets and durations are written below; lens and predictions stay empty.
    stream_id = 0
    words = {}
    timestamps = {}
    beam_offsets = {}
    decoded_ids = {}
    offsets = {}
    lens = {}
    durations = {}
    predictions = {}
    prev_duration_sec = 0

    # block_length = 2
    base_constant = 74
    # =(4/2*50)/2 + constant
    dim_length = int((block_length / 2 * 50)) + base_constant
    # Placeholders only: both tensors are overwritten by the first block's logits.
    logits = torch.empty(1, 74, 32)
    concatenated_logits = torch.empty(1, dim_length, 32)
    total_blocks = 0
    concat_predicted_ids = []
    runner_ups = []
    top2n_predicted_ids_list = []
    greedy_concat_predicted_ids = []

    # Estimated block count, rounded up. It is printed and used as an upper
    # bound for the decode trigger in the kenlm branch.
    # NOTE(review): divides seconds by block_length (a frame count) - confirm.
    num_blocks = int(duration / block_length)
    if (duration % block_length):
      num_blocks += 1
    print('len(stream):', num_blocks)

    greedy_transcript = ''

    results = {}
    for speech in stream:
        # NOTE(review): this treats the last axis as the channel axis - confirm
        # the layout librosa.stream yields for multi-channel input.
        if len(speech.shape) > 1:
            speech = speech[:, 0] + speech[:, 1]

        # Only the number of STFT frames (DB.shape[1]) is kept from this.
        D = np.abs(librosa.stft(speech, n_fft=n_fft, hop_length=hop_length, win_length=block_length, center=False))
        DB = librosa.amplitude_to_db(D, ref=np.max)
        durations[stream_id] = DB.shape[1]

        # try to get foreground "vocals"
        #speech = reduce_noise_centroid_mb(speech, sample_rate)
        # reduce noise
        #speech = nr.reduce_noise(y = speech, sr=sample_rate, n_std_thresh_stationary=1.5,stationary=True)
        # up the volume
        # https://stackoverflow.com/questions/13329617/change-the-volume-of-a-wav-file-in-python
        #speech = np.fromstring(speech, np.int16) / 10 * 5
        """
        db_min = np.min(DB)
        db_max = np.max(DB)
        max_min_multiple = int(db_max / db_min)
        print('db_min:{db_min},db_max:{db_max},max_min_multiple:{max_min_multiple}'.format(db_min=db_min,db_max=db_max,max_min_multiple=max_min_multiple))
        speech = speech * max_min_multiple
        """
        # stretch audio
        #speech = stretch(speech)

        """
        # amplify
        avg_loudness = loudness(speech)
        print('avg_loudness:',avg_loudness)
        speech = amplify(speech)
        """

        inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        #print('inputs.shape:',inputs.input_values.shape)
        input_values = inputs.input_values

        # 30 secs for block_length=30
        duration_sec = input_values.shape[1] / sample_rate
        print('duration_sec:',duration_sec)

        # offset timestamp for this stream from the beginning of file, in seconds
        stream_offset = stream_id * (prev_duration_sec + (hop_length / frame_length))
        offsets[stream_id] = stream_offset

        net_input = inputs.to(device).input_values
        net_input = net_input.to(device)

        #print("Model Prediction...")
        model = model.eval().to(device)
        with torch.no_grad():
          # for facebook model
          #logits = model(net_input, attention_mask=inputs.attention_mask).logits
          # for non-facebook model
          logits = model(net_input).logits
          logits_size = logits.size()
          #print('logits:',logits)
          print('logits_size:',logits_size)

          # predicted ids
          predicted_ids = torch.argmax(logits, dim=-1)

          # if no words detected, try upping the pitch
          #speech = up_pitch(speech, sample_rate, pitch_rate=2)


          # get runner_ups
          runner_ups, top2n_predicted_ids_list, top2n_logits_list, top1n_predicted_ids_list, top1n_logits_list  = get_topn_logits(logits, threshold=.52)
          print('stream_id:{stream_id}, block_id:{block_id}, runner_ups:{runner_ups}'.format(stream_id=stream_id, block_id=total_blocks, runner_ups=runner_ups))

          supp_logits = insert_runner_ups_logits(logits, top2n_logits_list, top2n_predicted_ids_list, runner_ups, top1n_logits_list, top1n_predicted_ids_list)

          # Start a new accumulation window, or append along dim 1 to the current one.
          if total_blocks == 0:
            concatenated_logits = logits
            supp_concatenated_logits = supp_logits
          else:
            concatenated_logits = torch.cat((concatenated_logits, logits), 1)
            supp_concatenated_logits = torch.cat((supp_concatenated_logits, supp_logits), 1)
          print('concatenated_logits_size:',concatenated_logits.size())
          total_blocks += 1

          print('total_blocks:',total_blocks)
        # reset text
        text = ''
        greedy_text = ''
        supp_greedy_text = ''
        raw_tokens_text = ''
        supp_raw_tokens_text = ''
        # kenlm
        if kenlm:
          # decode every 8th block
          # (the threshold actually used is min(8/block_length, num_blocks) accumulated blocks)
          if (total_blocks >= min(8/block_length, num_blocks)):
            #concat_raw_tokens = show_tokens(concat_predicted_ids)
            #print('concat_raw_tokens:',concat_raw_tokens)

            #greedy_concat_raw_tokens = " ".join(processor.tokenizer.convert_ids_to_tokens(greedy_concat_predicted_ids[0].tolist()))
            #print('greedy_concat_raw_tokens:',greedy_concat_raw_tokens)

            # supplemented logits
            supp_concat_predicted_ids = torch.argmax(supp_concatenated_logits, dim=-1)
            supp_raw_tokens_text = show_tokens(supp_concat_predicted_ids[0].tolist())
            print('supp_raw_tokens_text:',supp_raw_tokens_text)

            supp_greedy_text = processor.decode(supp_concat_predicted_ids[0])

            beam_decoded_output, beam_decoded_offsets, beam_scores = beam_decoder.decode(supp_concatenated_logits)
            supp_beam0 = (beam_decoded_output[0][0]).lower()
            print('supp_ score:{score}, text0:{text}'.format(text=supp_beam0, score=beam_scores[0][0]))
            print('supp_ score:{score}, text1:{text}'.format(text=(beam_decoded_output[0][1]).lower(), score=beam_scores[0][1]))

            # unsupplemented logits
            concat_predicted_ids = torch.argmax(concatenated_logits, dim=-1)
            raw_tokens_text = show_tokens(concat_predicted_ids[0].tolist())
            print('raw_tokens_text:',raw_tokens_text)

            greedy_text = processor.decode(concat_predicted_ids[0])
            print('greedy: text:{text}'.format(text=greedy_text))

            beam_decoded_output, beam_decoded_offsets, beam_scores = beam_decoder.decode(concatenated_logits)
            print('size of beam:', len(beam_decoded_output[0]))
            beam0 = (beam_decoded_output[0][0]).lower()

            print('score:{score}, text0:{text}'.format(text=beam0, score=beam_scores[0][0]))
            print('score:{score}, text1:{text}'.format(text=(beam_decoded_output[0][1]).lower(), score=beam_scores[0][1]))
            beam_offsets[stream_id] = (beam_decoded_offsets[0][0]).tolist()
            # Restart the accumulation window; the next block re-seeds both
            # concatenated tensors because total_blocks is 0 again.
            total_blocks = 0
            concatenated_logits = logits
            concat_predicted_ids = []

            this_result = { 'raw_tokens':'', 'supp_raw_tokens':'', 'greedy_text':'', 'supp_greedy_text':'', 'beam0':'', 'supp_beam0':'' }
            this_result['raw_tokens'] = raw_tokens_text
            this_result['supp_raw_tokens'] = supp_raw_tokens_text
            this_result['greedy_text'] = greedy_text
            this_result['supp_greedy_text'] = supp_greedy_text
            this_result['beam0'] = beam0
            this_result['supp_beam0'] = supp_beam0
            results[stream_id] = this_result

          # NOTE(review): `text` was reset to '' above and is not reassigned on
          # this path, so this branch looks unreachable and `transcript` stays
          # empty when kenlm is on - confirm whether beam0 was meant here.
          if len(text.strip()) > 0:
            words[stream_id] = text
            transcript += ' '
            transcript += beam0
            print('beam0:',beam0)
            print('transcript:',transcript)

          # The running transcripts below are accumulated but not returned.
          if len(greedy_text.strip()) > 0:
            greedy_transcript += ' '
            greedy_transcript += greedy_text
            print('greedy_transcript:',greedy_transcript)

          if len(supp_greedy_text.strip()) > 0:
            supp_greedy_transcript += ' '
            supp_greedy_transcript += supp_greedy_text

          if len(raw_tokens_text.strip()) > 0:
            raw_tokens_transcript += ' '
            raw_tokens_transcript += raw_tokens_text

          if len(supp_raw_tokens_text.strip()) > 0:
            supp_raw_tokens_transcript += ' '
            supp_raw_tokens_transcript += supp_raw_tokens_text

          stream_id += 1
          prev_duration_sec = duration_sec
          continue
        # no kenlm
        if hubert:
          text = processor.batch_decode(predicted_ids)
          text = text[0].lower()
        else:
          text = tokenizer.decode(predicted_ids[0]).lower()
        words[stream_id], timestamps[stream_id], decoded_ids[stream_id] = timestamp(text, predicted_ids, duration_sec)
        transcript += text
        stream_id += 1
        prev_duration_sec = duration_sec

    # Only `results` is returned; it is filled exclusively by the kenlm branch.
    return(results)

    
def write_output(data, fn):
  """Write ``data`` as JSON to the file ``fn``, then print the JSON-quoted name.

  Args:
    data: any JSON-serialisable object.
    fn: path of the file to create or overwrite.
  """
  with open(fn, 'w') as handle:
    json.dump(data, handle)
  quoted_name = json.dumps(fn)
  print(quoted_name)



In [None]:
!pip install transformers torch datasets librosa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 28.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 73.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 40.5 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 70.4 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24

In [None]:
def our_transcript(model, tokenizer, processor, audiofile_path):
    """Greedy-decode an audio file block by block and return the joined text.

    The file is read with ``librosa.stream``; each block is run through
    ``processor`` and ``model``, the arg-max ids are decoded with
    ``processor.batch_decode``, and the first decoded string of every block is
    appended to the transcript.

    Args:
        model: called as ``model(**inputs)``; the result must expose ``.logits``.
        tokenizer: unused; kept so existing callers keep working.
        processor: callable that builds the model inputs from a waveform and
            provides ``batch_decode``.
        audiofile_path: path of the audio file to transcribe.

    Returns:
        str: the per-block transcriptions joined by single spaces, stripped.
    """
    # Rate reported to the processor. librosa.stream does not resample, so the
    # file itself is expected to be 16 kHz already.
    sampling_rate = 16000

    duration = librosa.get_duration(filename=audiofile_path)
    print('file duration in secs:',duration, ' in mins:',duration/60)

    # block_length of 8 creates about a "thought"-length
    # block_length of 4 creates about a "phrase"-length
    block_length = 4 * 1
    frame_length = 16000
    # hop_length = the number of samples to advance between frames
    hop_length = int(frame_length * .85)

    print('block_length:{block_length}, hop_length:{hop_length}'.format(block_length=block_length, hop_length=hop_length))
    # stream in chunks
    stream = librosa.stream(
        audiofile_path,
        block_length=block_length,
        frame_length=frame_length,
        hop_length=hop_length
    )

    # Informational estimate only (rounded up); nothing below depends on it.
    # NOTE(review): divides seconds by block_length (a frame count) - confirm.
    num_blocks = int(duration / block_length)
    if (duration % block_length):
        num_blocks += 1
    print('len(stream):', num_blocks)

    block_texts = []
    for stream_id, speech in enumerate(stream):
        # Multi-channel blocks are laid out as (channels, samples); mix them
        # down along the channel axis. The previous code indexed the sample
        # axis instead, i.e. it added two single samples per channel.
        if speech.ndim > 1:
            speech = librosa.to_mono(speech)

        inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt")

        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # transcribe speech
        transcription = processor.batch_decode(predicted_ids)
        if transcription:
            print(f"{stream_id}:{transcription}")
            block_texts.append(transcription[0])
    return(' '.join(block_texts).strip())


In [None]:
# our own acoustic model
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, HubertForCTC
from datasets import load_dataset

import torch
import librosa
import re, json, sys, os
import numpy as np

model_name = 'i-am-neo/trial9-yt-hubert-ll60k'

# Pick the character vocabulary that matches the checkpoint named in
# model_name, write it to disk and build the CTC tokenizer from that file.
if 'trial9' in model_name:
  batch3_clean_vocab = {"3": 0, "7": 1, "w": 2, "1": 3, "e": 4, "x": 5, "r": 6, "6": 7, "/": 8, "i": 9, "u": 10, "n": 11, "0": 12, "q": 13, "$": 14, "t": 15, "z": 16, "a": 17, "y": 18, "5": 19, "p": 21, "f": 22, "j": 23, "c": 24, "2": 25, "b": 26, "+": 27, "8": 28, "'": 29, "g": 30, "_": 31, "9": 32, "4": 33, "&": 34, "%": 35, "m": 36, "s": 37, "v": 38, "d": 39, "@": 40, "o": 41, "k": 42, "l": 43, "h": 44, "\u00ed": 45, "|": 20, "[UNK]": 46, "[PAD]": 47}
  with open("batch3_clean_vocab.json", "w") as outfile:
    json.dump(batch3_clean_vocab, outfile)
  tokenizer = Wav2Vec2CTCTokenizer("batch3_clean_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
if 'trial20' in model_name:
  batch3_clean_nonumbers_vocab = {"_": 0, "o": 1, "b": 2, "&": 3, "g": 4, "$": 5, "m": 6, "e": 7, "d": 8, "p": 9, "x": 10, "a": 11, "l": 12, "/": 13, "n": 14, "h": 15, "'": 16, "t": 17, "y": 18, "s": 19, "f": 20, "%": 21, "u": 22, "w": 23, "@": 24, "+": 25, "j": 26, "q": 27, "\u00ed": 28, "v": 29, "c": 30, "3": 31, "r": 32, "k": 33, "z": 35, "i": 36, "|": 34, "[UNK]": 37, "[PAD]": 38}
  with open("batch3_clean_nonumbers_vocab.json", "w") as outfile:
    # Write the no-numbers vocabulary itself. This branch used to dump
    # batch3_clean_vocab, which is the wrong table and is undefined when only
    # this branch runs.
    json.dump(batch3_clean_nonumbers_vocab, outfile)
  tokenizer = Wav2Vec2CTCTokenizer("batch3_clean_nonumbers_vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# use_auth_token=True makes the download use the locally stored Hugging Face token.
model = HubertForCTC.from_pretrained(model_name, use_auth_token=True)
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

Downloading:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [None]:
wav_fn, reference_fn = download_data(koeris_data1)
print(koeris_data2)
print(wav_fn)

downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4.output.wav, res:0
downloaded wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4.ytvtt_transcript_mod_display, res:2048
{'user': '58fac5f8-24ed-4c08-a9db-db64a95089b3', 'fn_ext': 'LsgKwy8KIE4.mp4', 'wavfn_format': '{user}.{fn_ext}.output.wav'}
58fac5f8-24ed-4c08-a9db-db64a95089b3.LsgKwy8KIE4.mp4.output.wav


In [None]:
!rm 58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.output.wav

In [None]:
wav_fn, reference_fn = download_data(cactus_data1)
print(wav_fn)
transcript = our_transcript(model, tokenizer, processor, wav_fn)

downloaded wget https://neo.vidd.ai/output/wav_staging/4d06531b-f237-447f-8993-a68aa65fb6e8.GKc6i29eawI.mp4.output.wav, res:2048
downloaded wget https://neo.vidd.ai/output/wav_staging/4d06531b-f237-447f-8993-a68aa65fb6e8.GKc6i29eawI.mp4.ytvtt_transcript_mod_display, res:2048
4d06531b-f237-447f-8993-a68aa65fb6e8.GKc6i29eawI.mp4.output.wav


FileNotFoundError: ignored

In [None]:
!wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.5.wav

--2022-09-24 23:55:42--  https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.5.wav
Resolving neo.vidd.ai (neo.vidd.ai)... 54.67.78.144
Connecting to neo.vidd.ai (neo.vidd.ai)|54.67.78.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18923598 (18M) [audio/x-wav]
Saving to: ‘58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.5.wav’


2022-09-24 23:55:43 (20.2 MB/s) - ‘58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.5.wav’ saved [18923598/18923598]



In [None]:
!wget https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.line_55.wav

--2022-09-24 23:55:48--  https://neo.vidd.ai/output/wav_staging/58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.line_55.wav
Resolving neo.vidd.ai (neo.vidd.ai)... 54.67.78.144
Connecting to neo.vidd.ai (neo.vidd.ai)|54.67.78.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 467022 (456K) [audio/x-wav]
Saving to: ‘58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.line_55.wav.1’


2022-09-24 23:55:49 (1.25 MB/s) - ‘58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.line_55.wav.1’ saved [467022/467022]



In [None]:

wav_fn = '58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.line_55.wav'
wav_fn = '58fac5f8-24ed-4c08-a9db-db64a95089b3.MVj78TB3NDk.mp4.5.wav'
transcript = our_transcript(model, tokenizer, processor, wav_fn)

file duration in secs: 591.36  in mins: 9.856
block_length:4, hop_length:13600
len(stream): 148
0:['on on kgi side on canmeras']
1:["we had then it's a mora who's ar it  an was gorou"]
2:['join us and deal with that so that was good']
3:['um _u']
4:['what aboulte wei']
5:['o mike m yeah']
6:['i think the the the action item']
7:['m from that meeting two was to send over to']
8:['sharing and allbri list of the']
9:['ce odgles yeah that intad is the action item']
10:['i am i saw your imy yesterday so']
11:["we'll get on that today okay thanks and ambea"]
12:["if you're on seaser reminding meto actually dou thap today we can"]
13:["that'd be super good"]
14:['_an on the']
15:["software side we still haven't resolved"]
16:['the borning issue of not being able']
17:['to use the sarch arious cs']
18:['one thousand you know']
19:['we should probably escalate that to nimbel as a mag']
20:['is but we should also']
21:["we're gong to talk to  do a loop yep"]
22:['steve this is something i though

KeyboardInterrupt: ignored

In [None]:
print(transcript)

un on the software side we still haven't resolved the burning issue of not being able to use the searchard us cee s one thousand _


In [None]:
# wer for our model
transcript = normalize(transcript)
reference_text = read_text(reference_fn)
reference_text = normalize(reference_text)
# jiwer.wer expects the reference first and the hypothesis second; the error
# count is divided by the number of reference words, so the order matters.
wer = jiwer.wer(reference_text, transcript)

print(f"WER: {wer * 100:.2f} %")

In [None]:
# Decode every batch from `loader` and collect the predicted and reference texts.
# NOTE(review): `model` is rebound to a HubertForCTC in the acoustic-model cell
# earlier in this notebook; this loop calls `model.decode(mels, options)` and so
# appears to expect the Whisper model - confirm the intended run order.
hypotheses, references = [], []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses += [decoded.text for decoded in results]
    references += texts

  0%|          | 0/164 [00:00<?, ?it/s]

In [None]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...
...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [None]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [None]:
# Add a normalised "<name>_clean" column next to each raw text column.
for column in ("hypothesis", "reference"):
    data[f"{column}_clean"] = [normalizer(text) for text in data[column]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,"He hoped there would be stew for dinner, turni...",HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...,he hoped there would be stew for dinner turnip...,he hoped there would be stew for dinner turnip...
1,"Stuffered into you, his belly counseled him.",STUFF IT INTO YOU HIS BELLY COUNSELLED HIM,stuffered into you his belly counseled him,stuff it into you his belly counseled him
2,After early nightfall the yellow lamps would l...,AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...,after early nightfall the yellow lamps would l...,after early nightfall the yellow lamps would l...
3,"Hello Bertie, any good in your mind?",HELLO BERTIE ANY GOOD IN YOUR MIND,hello bertie any good in your mind,hello bertie any good in your mind
4,Number 10. Fresh Nelly is waiting on you. Good...,NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...,number 10 fresh nelly is waiting on you good n...,number 10 fresh nelly is waiting on you good n...
...,...,...,...,...
2615,"Oh, to shoot my soul's full meaning into futur...",OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...,0 to shoot my soul is full meaning into future...,0 to shoot my soul is full meaning into future...
2616,"Then I, long tried by natural ills, received t...",THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...,then i long tried by natural ills received the...,then i long tried by natural ills received the...
2617,I love thee freely as men strive for right. I ...,I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...,i love thee freely as men strive for right i l...,i love thee freely as men strive for right i l...
2618,"I love thee with the passion put to use, in my...",I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...,i love thee with the passion put to use in my ...,i love thee with the passion put to use in my ...


In [None]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

WER: 4.26 %
