In [1]:
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


import youtube_dl  
youtube_dl.downloader()  
youtube-dl --extract-audio --audio-format wav -o 'ONEPIECE.wav' 'https://www.youtube.com/watch?v=KM8tNu1lBhU'

In [None]:
import youtube_dl

class MyLogger(object):
    def debug(self, msg):
        pass
    def warning(self, msg):
        pass
    def error(self, msg):
        print(msg)

def my_hook(d):
    if d['status'] == 'finished':
        print('Done downloading, now converting ...')

ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }],
    'logger': MyLogger(),
    'progress_hooks': [my_hook],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=dJAoK5zK36M'])

In [4]:
filename = "English Audio Speech-to-Text Transcript with Hugging Face _ Python NLP-dJAoK5zK36M.wav"
device = torch.device('cuda')
BATCH_SIZE = 64

In [5]:
# vectorise input audio data
speech, rate = librosa.load(filename,sr=16000)
input_values = tokenizer(speech, return_tensors = 'pt').input_values
input_values = input_values.to(device)

SPILT_SIZE = input_values.shape[1] // (BATCH_SIZE-1)

batches = torch.split(input_values,SPILT_SIZE, dim=1)
len(batches), batches[0].shape, batches[-1].shape

(64, torch.Size([1, 157353]), torch.Size([1, 10]))

In [6]:
# import os
# import torch.nn as nn
# os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
# os.environ['CUDA_VISIBLE_DEVICES']='0,1'
# os.environ['PYTORCH_CUDA_ALLOC_CONF']='max_split_size_mb:128'
# # os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING']='1'
# print(os.environ['PYTORCH_CUDA_ALLOC_CONF'])
# # print(os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'])``

In [7]:
print(torch.cuda.device_count(), \
torch.cuda.device(0), \
torch.cuda.get_device_name(0))

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    
# print(torch.cuda.max_memory_cached(device=None))
# print(torch.cuda.memory_allocated(device=None))
# print(torch.cuda.memory_stats(device=None))
print(torch.cuda.memory_summary(device=0, abbreviated=False))

2 <torch.cuda.device object at 0x00000230524B2148> NVIDIA GeForce GTX 1070
NVIDIA GeForce GTX 1070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   38912 KB |   38912 KB |   38912 KB |       0 B  |
|       from large pool |   38912 KB |   38912 KB |   38912 KB |       0 B  |
|       from small pool |       0 KB |       0 KB |       0 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |   38912 KB |   38912 KB |   38912 KB |       0 B  |
|       from large pool |   38912 KB |   38912 KB |   38912 KB |       

In [8]:
# model = nn.DataParallel(model.cuda())
# model = nn.DataParallel(model.to(device))

In [9]:
def predict(batch, model):
    with torch.no_grad():
        logits = model(batch).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.decode(predicted_ids[0])
    return transcription

In [10]:
# transcribe audio data, ignoring last batch
torch.cuda.empty_cache()
model.to(device)
transcriptions = [predict(batch,model) for batch in batches[:-1]]
transcriptions

['HAY FRENCH WELCOME TO ON LITTLE CODEN WHAT IF I TELL YOU THAT YOU CAN ACTUALLY DO SPEECH TRANSCRIPTION THE SAME WAY YOU DO ANEL PY WITH HUGGING FACE TRANSFORMERS E',
 "S THAT IS A REALITY THAT HUGGING FACE TRANSFORMER'S LATEST UPTATIS GOD IN THE LATEST TRANSFORMERS SUBDAT HUGGING FACE HAS ADDED A THE VERY POPULAR FACE BOUQUES MODEL WAVE T",
 'WEK SO WAVTOO WEK LETS YOU OR AT LEAST I SHOULD SAY THAT THE HUGGING FACE AP OF WAVTOO WEK LETS YOU JUST GIVE HER AN AUDIO FILE AND THEN GET HER TRANSCRIPTERD',
 "ENGLISH TRANSCRIPTION OUT OFFERD SO CURRENTLY AS FUR AS I KNOW THESE MORAL WORKS FINE FOR ENGLISH BUT I DONT NOW WHETHER THERE IS ANY OTHER LANGUAGE THAT IS AVAILABLE FOR THIS THING SO LET'S QUICKLY GO AHEAD AND THEN SEE HOW",
 "WE CAN DO SPEECH TRANSCRIPTION WITH HUGGING FIST TRANSFORMER'S MODEL THE FIRST SHIM THAT WE HAVE TO DO IS I'M UNUNDER TO GIVE YOU A LITTLE BIT MORE CONTEXT",
 'IHAVE NOT EVENT SWHICH DON MY AM GEPW ENVIRNMENT A GEPW OR ENVIROMNMENT SOME GOOD WOUL DO THIS ON SEE

In [11]:
# stich back transcriptions
full_text = f"\n".join(f"{i}: {t}" for i,t in enumerate(transcriptions)) 
full_text

"0: HAY FRENCH WELCOME TO ON LITTLE CODEN WHAT IF I TELL YOU THAT YOU CAN ACTUALLY DO SPEECH TRANSCRIPTION THE SAME WAY YOU DO ANEL PY WITH HUGGING FACE TRANSFORMERS E\n1: S THAT IS A REALITY THAT HUGGING FACE TRANSFORMER'S LATEST UPTATIS GOD IN THE LATEST TRANSFORMERS SUBDAT HUGGING FACE HAS ADDED A THE VERY POPULAR FACE BOUQUES MODEL WAVE T\n2: WEK SO WAVTOO WEK LETS YOU OR AT LEAST I SHOULD SAY THAT THE HUGGING FACE AP OF WAVTOO WEK LETS YOU JUST GIVE HER AN AUDIO FILE AND THEN GET HER TRANSCRIPTERD\n3: ENGLISH TRANSCRIPTION OUT OFFERD SO CURRENTLY AS FUR AS I KNOW THESE MORAL WORKS FINE FOR ENGLISH BUT I DONT NOW WHETHER THERE IS ANY OTHER LANGUAGE THAT IS AVAILABLE FOR THIS THING SO LET'S QUICKLY GO AHEAD AND THEN SEE HOW\n4: WE CAN DO SPEECH TRANSCRIPTION WITH HUGGING FIST TRANSFORMER'S MODEL THE FIRST SHIM THAT WE HAVE TO DO IS I'M UNUNDER TO GIVE YOU A LITTLE BIT MORE CONTEXT\n5: IHAVE NOT EVENT SWHICH DON MY AM GEPW ENVIRNMENT A GEPW OR ENVIROMNMENT SOME GOOD WOUL DO THIS ON S