In [1]:
import youtube_dl
import os
class Downloader:
    class MyLogger(object):
        def debug(self, msg):
            pass
        def warning(self, msg):
            pass
        def error(self, msg):
            print(msg)

    @classmethod
    def download(cls, url, filename):
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': filename,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            'logger': cls.MyLogger(),
            'progress_hooks': [cls.my_hook],
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

    @staticmethod
    def my_hook(d):
        if d['status'] == 'finished':
            print('Done downloading, now converting ...')

In [2]:
filename = 'out.wav'
if not os.path.exists(filename):
    dlr = Downloader()
    dlr.download('https://www.youtube.com/watch?v=8rJu-eltak0', filename)

In [11]:
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from tqdm import tqdm

class Wave2Vec2:
    def __init__(self, pretrained_model_name_or_path="facebook/wav2vec2-base-960h"):
        self.tokenizer = Wav2Vec2Tokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_name_or_path)
        self.device = torch.device('cuda')
        self.model.to(self.device)
        print("[INFO] model initialized..")
        
    def load_wav_file(self, filename):
        speech, rate = librosa.load(filename,sr=16000)
        encoded_audio = self.tokenizer(speech, return_tensors = 'pt').input_values
        encoded_audio = encoded_audio.to(self.device)
        print("[INFO] wav file vectorized..")
        return encoded_audio
    
    def predict(self, encoded_audio, BATCH_SIZE=64,ignoreLast=False):
        try:
            SPILT_SIZE = encoded_audio.shape[1] // (BATCH_SIZE-1)
            batches = torch.split(encoded_audio,SPILT_SIZE, dim=1)
            print(f"[INFO] split data into batches with SPILT_SIZE:{SPILT_SIZE}")
            if ignoreLast:
                transcriptions = [self.__predict(batch, self.model, self.tokenizer) \
                    for batch in tqdm(batches[:-1])]
            else:
                transcriptions = [self.__predict(batch, self.model, self.tokenizer) \
                    for batch in tqdm(batches[:])]
            print("[INFO] prediction done")
            return transcriptions  
            
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"CUDA out of memory...Try to increase batch size param or restart kernel")
            elif "Kernel size can't greater than actual input size" in str(e):
                print(f"Kernel size issue...Try to set ignoreLast to True")
        
            print(e)
            
            return -1

    @staticmethod
    def __predict(batch, model, tokenizer):
        with torch.no_grad():
            logits = model(batch).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = tokenizer.decode(predicted_ids[0])
        return transcription

In [12]:
cls = Wave2Vec2("facebook/wav2vec2-large-robust-ft-libri-960h")
encoded_audio = cls.load_wav_file(filename)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


[INFO] model initialized..


  return f(*args, **kwargs)


[INFO] wav file vectorized..


In [16]:
transcriptions = cls.predict(encoded_audio, 32,ignoreLast=True)

[INFO] split data into batches with SPILT_SIZE:744512


100%|██████████| 31/31 [00:27<00:00,  1.13it/s]

[INFO] prediction done





In [15]:
# stich back transcriptions
full_text = f"\n".join(f"{i}: {t}" for i,t in enumerate(transcriptions)) 
# print(full_text)

TypeError: 'int' object is not iterable

In [None]:
output_text_filename = "output.txt"
if not os.path.exists(output_text_filename):
    #make file
    ...

with open(output_text_filename,'w') as f:
    f.write(full_text)

In [None]:
import pkg_resources, os, time

for package in pkg_resources.working_set:
    print("%s: %s" % (package, time.ctime(os.path.getctime(package.location))))

regex 2022.6.2: Fri Jun 17 20:54:51 2022
certifi 2022.5.18.1: Fri Jun 17 20:54:51 2022
pytz 2022.1: Fri Jun 17 20:54:51 2022
youtube-dl 2021.12.17: Fri Jun 17 20:54:51 2022
pywin32 302: Fri Jun 17 20:54:51 2022
setuptools 61.2.0: Fri Jun 17 20:54:51 2022
cryptography 37.0.1: Fri Jun 17 20:54:51 2022
pyzmq 22.3.0: Fri Jun 17 20:54:51 2022
pyOpenSSL 22.0.0: Fri Jun 17 20:54:51 2022
attrs 21.4.0: Fri Jun 17 20:54:51 2022
argon2-cffi 21.3.0: Fri Jun 17 20:54:51 2022
packaging 21.3: Fri Jun 17 20:54:51 2022
pip 21.2.4: Fri Jun 17 20:54:51 2022
argon2-cffi-bindings 21.2.0: Fri Jun 17 20:54:51 2022
Pillow 9.0.1: Fri Jun 17 20:54:51 2022
ipython 7.31.1: Fri Jun 17 20:54:51 2022
ipywidgets 7.6.5: Fri Jun 17 20:54:51 2022
jupyter-client 7.2.2: Fri Jun 17 20:54:51 2022
ipykernel 6.9.1: Fri Jun 17 20:54:51 2022
notebook 6.4.11: Fri Jun 17 20:54:51 2022
nbconvert 6.4.4: Fri Jun 17 20:54:51 2022
jupyter-console 6.4.3: Fri Jun 17 20:54:51 2022
tornado 6.1: Fri Jun 17 20:54:51 2022
PyYAML 6.0: Fri Jun