https://huggingface.co/openai/whisper-large-v3

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda"
torch_dtype = torch.bfloat16

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=False,
    use_safetensors=True,
    use_flash_attention_2=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)


dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
print(sample)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # max_new_tokens=128,
    chunk_length_s=30,
    batch_size=1,
    # return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
result = pipe(dataset[0]["audio"])
print(result["text"])

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ]), 'sampling_rate': 16000}
 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Upguards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a 

In [3]:
import soundfile as sf

data, samplerate = sf.read(
    "./experiments/temp_audio.wav",
    dtype="float32",
)

In [4]:
pipe({"sampling_rate": samplerate, "raw": data})

{'text': ' Шаблонный метод это поведенческий паттерн проектирования, который определяет скелет алгоритма, перекладывая ответственность за некоторые его шаги на подклассы. Паттерн позволяет подклассам переопределять шаги алгоритма, не меняя его общей структуры.'}

### voice input

https://www.gradio.app/guides/real-time-speech-recognition

In [2]:
import gradio as gr
import numpy as np
from IPython.display import clear_output
import numpy as np
import soundfile as sf


def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    sample = {"sampling_rate": sr, "raw": y}
    print(sample)
    sf.write("./experiments/temp_audio.wav", y, sr, subtype="PCM_24")
    result = pipe(sample)
    result = result["text"]
    return result


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"]),
    "text",
)
demo.launch(
    inbrowser=True,
    # server_port=7866,
)
clear_output()
# print("Launched on http://127.0.0.1:7866/")

{'sampling_rate': 48000, 'raw': array([ 0.        ,  0.        ,  0.        , ...,  0.00072714,
       -0.00054536, -0.00090893], dtype=float32)}
{'sampling_rate': 48000, 'raw': array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -9.1555528e-05, -1.5259255e-04, -1.5259255e-04], dtype=float32)}


### https://github.com/SYSTRAN/faster-whisper (НЕ РАБОТАЕТ)

In [4]:
import numpy as np
import soundfile as sf
import librosa

rate = 48000
samplerate = 16000
data, samplerate = sf.read(
    "./experiments/temp_audio.wav",
    dtype="float32",
)
# Write out audio as 24bit PCM WAV
# sf.write('stereo_file.wav', data, samplerate, subtype='PCM_24')
data_16k = librosa.resample(
    y=data,
    orig_sr=rate,
    target_sr=samplerate,
)

In [2]:
# pipe({"sampling_rate": samplerate, "raw": data_16k})

In [4]:
from faster_whisper import WhisperModel

model_size = "large-v3"

# Run on GPU with FP16
# model = WhisperModel(
#     model_size,
#     device="cuda",
#     compute_type="float16",
# )

# or run on GPU with INT8
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="int8_float16",
)
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

In [5]:
segments, info = model.transcribe(
    audio=data,
    beam_size=5,
    # language='ru'
)

print(
    "Detected language '%s' with probability %f"
    % (info.language, info.language_probability)
)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

Detected language 'nn' with probability 0.563965
[0.00s -> 8.24s]  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20.


In [4]:
info

TranscriptionInfo(language='nn', language_probability=0.517578125, duration=9.72, duration_after_vad=9.72, all_language_probs=[('nn', 0.517578125), ('en', 0.081298828125), ('jw', 0.07403564453125), ('ko', 0.0728759765625), ('la', 0.04852294921875), ('ja', 0.0289764404296875), ('ru', 0.0257720947265625), ('haw', 0.0130615234375), ('es', 0.0113525390625), ('tr', 0.01108551025390625), ('fi', 0.01108551025390625), ('pl', 0.01058197021484375), ('fr', 0.0087738037109375), ('zh', 0.00798797607421875), ('pt', 0.00749969482421875), ('sv', 0.007442474365234375), ('de', 0.006671905517578125), ('ar', 0.005401611328125), ('it', 0.00492095947265625), ('th', 0.0048065185546875), ('uk', 0.004730224609375), ('ro', 0.0038604736328125), ('id', 0.0034618377685546875), ('nl', 0.0021839141845703125), ('km', 0.001941680908203125), ('si', 0.0018243789672851562), ('vi', 0.001781463623046875), ('da', 0.0016870498657226562), ('el', 0.0016870498657226562), ('cy', 0.00160980224609375), ('te', 0.0012683868408203125

### https://github.com/m-bain/whisperX

In [1]:
import whisperx
import gc

device = "cuda"
batch_size = 1  # reduce if low on GPU mem
# compute_type = "float16"  # change to "int8" if low on GPU mem (may reduce accuracy)
compute_type = "int8"  # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model(
    "large-v3", device, compute_type=compute_type, vad_model=None
)

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")


No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../home/user-name-goes-here/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.2+cu121. Bad things might happen unless you revert torch to 1.x.


In [2]:
audio_file = "sample_audio/temp_audio_ru.wav"
audio = whisperx.load_audio(audio_file)
audio

array([ 0.        ,  0.        ,  0.        , ..., -0.0428772 ,
       -0.04476929, -0.04360962], dtype=float32)

In [3]:
from datasets import load_dataset

dataset = load_dataset(
	"distil-whisper/librispeech_long", "clean", split="validation"
)
sample = ["audio"]
sampledataset[0]

{'path': '0d38672e0bbdbdc460af55b8bb84a15b2730db2819f2af64f9c777d4d586f2de',
 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
        0.0005188 ]),
 'sampling_rate': 16000}

In [7]:
import numpy as np
np.array(sample['array'], dtype=np.float32)

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00024414, 0.00048828,
       0.0005188 ], dtype=float32)

In [4]:
result = model.transcribe(audio, batch_size=2)
print(result["segments"][0]["text"])  # before alignment

Detected language: ru (1.00) in first 30s of audio...
 Шаблонный метод – это поведенческий паттерн проектирования, который определяет скелет алгоритма, перекладывая ответственность за некоторые его шаги на подклассы. Паттерн позволяет подклассам переопределять шаги алгоритма, не меняя его общей структуры.


In [8]:
result = model.transcribe(np.array(sample['array'], dtype=np.float32), batch_size=2)
print(result["segments"][0]["text"]) 

Detected language: en (1.00) in first 30s of audio...
 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.


In [2]:
import gradio as gr
import numpy as np
from IPython.display import clear_output
import soundfile as sf
import librosa

samplerate = 16_000


def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    sample = {"sampling_rate": sr, "raw": y}
    print(sample)
    data_16k = librosa.resample(
        y=y,
        orig_sr=sr,
        target_sr=samplerate,
    )
    result = model.transcribe(data_16k, batch_size=batch_size)
    if len(result["segments"]) > 0:
        result = result["segments"][0]["text"]
    print(result)
    return result


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"]),
    "text",
)
demo.launch(
    inbrowser=True,
    # server_port=7866,
)
clear_output()
# print("Launched on http://127.0.0.1:7866/")

{'sampling_rate': 48000, 'raw': array([0.        , 0.        , 0.        , ..., 0.02306926, 0.03407959,
       0.04320243], dtype=float32)}
Detected language: ru (1.00) in first 30s of audio...
