In [None]:
!pip install torchaudio soundfile librosa
!pip install datasets -U



In [None]:
!pip uninstall -y datasets
!pip install datasets==2.21.0
!pip install transformers accelerate soundfile librosa jiwer torchaudio
!apt install git-lfs

Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==2.21.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m527.3/527.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m177.6/177.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
^C


In [None]:
#wav2vec2
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_dataset, Audio
import re
import json
import shutil
from google.colab import drive
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)


drive.mount('/content/drive')

MODEL_ID = "facebook/wav2vec2-large-xlsr-53"
print("Loading Russian dataset (Golos - 1000 samples)...")
dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split="train[:1000]")


chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\‚Äú\%\‚Äò\‚Äù\]]'

def remove_special_characters(batch):
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
    return batch

dataset = dataset.map(remove_special_characters)

def extract_all_chars(batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab_dict = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names)
vocab_list = list(set(vocab_dict["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
    return batch

encoded_dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names, num_proc=1)

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, padding=self.padding, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_ID,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model.freeze_feature_encoder()

training_args = TrainingArguments(
    output_dir="./wav2vec2-russian-master",
    group_by_length=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=15,
    fp16=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-4,
    warmup_steps=300,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    tokenizer=processor.feature_extractor,
)

print("Starting Master Training...")
trainer.train()


print("Saving model to Google Drive...")
drive_path = "/content/drive/My Drive/wav2vec2-russian-master"
trainer.save_model(drive_path)
processor.save_pretrained(drive_path)
print(f"DONE! Model safe at: {drive_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Russian dataset (Golos - 1000 samples)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting Master Training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


  torch._C._get_cudnn_allow_tf32(),


Step,Training Loss,Validation Loss
500,3.1688,3.103037
1000,1.2135,0.664822
1500,0.7551,0.334109


Saving model to Google Drive...
DONE! Model safe at: /content/drive/My Drive/wav2vec2-russian-master


In [None]:
# continue wav2vec2

args_refine = TrainingArguments(
    output_dir="./wav2vec2-russian-master-refined",
    group_by_length=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=20,
    fp16=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    warmup_steps=100,
    save_total_limit=1,
)

trainer_refine = Trainer(
    model=model,
    data_collator=data_collator,
    args=args_refine,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    tokenizer=processor.feature_extractor,
)

print("Starting Refinement Training (20 more epochs)...")
trainer_refine.train()

print("Saving refined model to Google Drive...")
drive_path_refined = "/content/drive/My Drive/wav2vec2-russian-refined"
trainer_refine.save_model(drive_path_refined)
processor.save_pretrained(drive_path_refined)
print(f"DONE! Refined model safe at: {drive_path_refined}")

  trainer_refine = Trainer(


Starting Refinement Training (20 more epochs)...


Step,Training Loss,Validation Loss
500,0.5269,0.17431
1000,0.3658,0.116004
1500,0.3034,0.07712
2000,0.3474,0.060028
2500,0.3447,0.05335


Saving refined model to Google Drive...
DONE! Refined model safe at: /content/drive/My Drive/wav2vec2-russian-refined


In [None]:
import torch
import librosa
from google.colab import files


print("Please upload your audio file now:")
uploaded = files.upload()
filename = next(iter(uploaded))

print(f"Processing {filename}...")
speech, rate = librosa.load(filename, sr=16000)

input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")

print("Transcribing...")
with torch.no_grad():
    logits = model(input_values).logits

pred_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(pred_ids)[0]

print("\n" + "="*40)
print("TARGET: –°—ä–µ—à—å –∂–µ –µ—â—ë —ç—Ç–∏—Ö –º—è–≥–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É–∑—Å–∫–∏—Ö –±—É–ª–æ–∫, –¥–∞ –≤—ã–ø–µ–π —á–∞—é.")
print(f"WAV2VEC2: {transcription}")
print("="*40)

Please upload your audio file now:


Saving pangramm6.mp4 to pangramm6 (3).mp4
Processing pangramm6 (3).mp4...


  speech, rate = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Transcribing...

TARGET: –°—ä–µ—à—å –∂–µ –µ—â—ë —ç—Ç–∏—Ö –º—è–≥–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É–∑—Å–∫–∏—Ö –±—É–ª–æ–∫, –¥–∞ –≤—ã–ø–µ–π —á–∞—é.
WAV2VEC2: —Å–µ—à—å –∂–∏–µ—â–µ —ç—Ç–∏—Ö –º—è–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É—Å–∫–∏—Ö –±—É–ª–∞–∫ –¥–æ–≤—ã –ø–æ–π—á–∞—é


In [3]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset, Audio
from jiwer import wer, cer
from tqdm.auto import tqdm
import re
import io
import soundfile as sf
import torchaudio

MODEL_NAME = "openai/whisper-small"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"üöÄ Loading Whisper ({MODEL_NAME}) on {DEVICE}...")

processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")

print("Loading Golos Dataset (First 100 samples)...")
dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test[:100]")

dataset = dataset.cast_column("audio", Audio(decode=False))

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\‚Äú\%\‚Äò\‚Äù\]]'
def normalize_text(text):
    if not text: return ""
    text = re.sub(chars_to_ignore_regex, "", text.lower())
    return " ".join(text.split())

predictions = []
references = []

print(f"\nRunning Inference...")

resampler = None

for item in tqdm(dataset):
    audio_bytes = item["audio"]["bytes"]
    audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))

    audio_tensor = torch.tensor(audio_array, dtype=torch.float32)

    if sample_rate != 16000:
        if resampler is None or resampler.orig_freq != sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        audio_tensor = resampler(audio_tensor)

    input_features = processor(audio_tensor, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)

    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    ref_norm = normalize_text(item["transcription"])
    pred_norm = normalize_text(transcription)

    references.append(ref_norm)
    predictions.append(pred_norm)

final_wer = wer(references, predictions)
final_cer = cer(references, predictions)

print("\n" + "="*50)
print(f"WHISPER ({MODEL_NAME}) RESULTS")
print("="*50)
print(f"WER: {final_wer:.2%}")
print(f"CER: {final_cer:.2%}")
print("="*50)

for i in range(5):
    print(f"Ref:  {references[i]}")
    print(f"Pred: {predictions[i]}")
    print("-" * 30)

üöÄ Loading Whisper (openai/whisper-small) on cuda...
Loading Golos Dataset (First 100 samples)...

Running Inference...


  0%|          | 0/100 [00:00<?, ?it/s]

Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



WHISPER (openai/whisper-small) RESULTS
WER: 34.86%
CER: 18.86%
Ref:  —à–µ—Å—Ç—å–¥–µ—Å—è—Ç —Ç—ã—Å—è—á —Ç–µ–Ω–≥–µ —Å–∫–æ–ª—å–∫–æ –±—É–¥–µ—Ç —Å—Ç–æ–∏—Ç—å
Pred: 60 —Ç—ã—Å—è—á —Ç–µ–Ω–≥–µ —Å–∫–æ–ª—å–∫–æ –±—É–¥–µ—Ç —Å—Ç–æ–∏—Ç—å
------------------------------
Ref:  –ø–æ–∫–∞–∂–∏ –º–Ω–µ –Ω–∞ —Å–º–æ—Ç—Ä–µ—à–∫–µ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª —Å–∏–Ω–µ—Ä–≥–∏—è —Ç–≤
Pred: –ø–æ–∫–∞–∂–∏ –º–Ω–µ –Ω–∞ —Å–º–æ—Ç—Ä—ë–∂–∫–µ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª —Å–∏–Ω–µ—Ä–≥–∏–∏ —Ç–≤
------------------------------
Ref:  –∑–∞–∫–∞–∑–∞—Ç—å —è–±–ª–æ–∫–∏ –∑–µ–ª–µ–Ω—ã–µ
Pred: –∑–∞–∫–æ—Å–æ—Ç —Å —è–±–ª–æ–∫–∏ –∑–µ–ª–µ–Ω—É—é
------------------------------
Ref:  –∞–ª–∏—Å–∞ –∑–∞–∫–∞–∂–∏ –∫–∏–ª–æ–≥—Ä–∞–º–º–æ–≤—ã–π —Ç–æ—Ä—Ç –≥—Ä–∞—Ñ—Å–∫–∏–µ —Ä–∞–∑–≤–∞–ª–∏–Ω—ã
Pred: –∞–ª–∏—Å–∞ –∑–∞–∫–∞–∂–∏ –∫–∏–ª–æ–≥—Ä–∞–º–º–æ–≤—ã–π —Ç–æ—Ä—Ç –≥—Ä–∞—Ñ—Å–∫–∏–µ —Ä–∞–∑–≤–∞–ª–µ–Ω—ã
------------------------------
Ref:  –∏—â–∏ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª –ø—Ä–æ –±–∏–∑–Ω–µ—Å –Ω–∞ —Ç–∏–≤–∏
Pred: –∏—â–∏ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª –ø—Ä–æ –±–∏–∑–Ω–µ—Å –Ω–∞ —Ç–≤
------------------------------


In [4]:
import torch
import torchaudio
import librosa
import soundfile as sf
import io
import re
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer, cer
from tqdm.auto import tqdm

MODEL_ID = "bond005/wav2vec2-large-ru-golos"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_SAMPLES = 100

print(f"üöÄ Loading Specialist Model ({MODEL_ID}) on {DEVICE}...")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE)

print("Loading Golos Dataset...")
dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split=f"test[:{NUM_SAMPLES}]")
dataset = dataset.cast_column("audio", Audio(decode=False))

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\‚Äú\%\‚Äò\‚Äù\]]'
def normalize_text(text):
    if not text: return ""
    text = re.sub(chars_to_ignore_regex, "", text.lower())
    return " ".join(text.split())

predictions = []
references = []

print(f"\nRunning Inference...")
resampler = torchaudio.transforms.Resample(48000, 16000).to(DEVICE)

for item in tqdm(dataset):
    audio_bytes = item["audio"]["bytes"]
    audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))
    audio_tensor = torch.tensor(audio_array, dtype=torch.float32).to(DEVICE)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(DEVICE)
        audio_tensor = resampler(audio_tensor)

    if audio_tensor.ndim == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    inputs = processor(audio_tensor.squeeze().cpu().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]

    ref_norm = normalize_text(item["transcription"])
    pred_norm = normalize_text(transcription)

    references.append(ref_norm)
    predictions.append(pred_norm)

final_wer = wer(references, predictions)
final_cer = cer(references, predictions)

print("\n" + "="*50)
print(f"SPECIALIST ({MODEL_ID}) RESULTS")
print("="*50)
print(f"WER: {final_wer:.2%}")
print(f"CER: {final_cer:.2%}")
print("="*50)

for i in range(5):
    print(f"Ref:  {references[i]}")
    print(f"Pred: {predictions[i]}")
    print("-" * 30)

üöÄ Loading Specialist Model (bond005/wav2vec2-large-ru-golos) on cuda...


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Loading Golos Dataset...

Running Inference...


  0%|          | 0/100 [00:00<?, ?it/s]


SPECIALIST (bond005/wav2vec2-large-ru-golos) RESULTS
WER: 8.17%
CER: 1.51%
Ref:  —à–µ—Å—Ç—å–¥–µ—Å—è—Ç —Ç—ã—Å—è—á —Ç–µ–Ω–≥–µ —Å–∫–æ–ª—å–∫–æ –±—É–¥–µ—Ç —Å—Ç–æ–∏—Ç—å
Pred: —à–µ—Å—Ç—å–¥–µ—Å—è—Ç —Ç—ã—Å—è—á —Ç–µ–Ω–≥–µ —Å–∫–æ–ª—å–∫–æ –±—É–¥–µ—Ç —Å—Ç–æ–∏—Ç—å
------------------------------
Ref:  –ø–æ–∫–∞–∂–∏ –º–Ω–µ –Ω–∞ —Å–º–æ—Ç—Ä–µ—à–∫–µ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª —Å–∏–Ω–µ—Ä–≥–∏—è —Ç–≤
Pred: –ø–æ–∫–∞–∂–∏ –º–Ω–µ –Ω–∞ —Å–º–æ—Ç—Ä–µ—à–∫–µ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª —Å–∏–Ω–µ—Ä–≥–∏—è —Ç–≤
------------------------------
Ref:  –∑–∞–∫–∞–∑–∞—Ç—å —è–±–ª–æ–∫–∏ –∑–µ–ª–µ–Ω—ã–µ
Pred: –∑–∞–∫–∞–∑–∞—Ç—å —Å —è–±–ª–æ–∫–∏ –∑–µ–ª–µ–Ω—ã–µ
------------------------------
Ref:  –∞–ª–∏—Å–∞ –∑–∞–∫–∞–∂–∏ –∫–∏–ª–æ–≥—Ä–∞–º–º–æ–≤—ã–π —Ç–æ—Ä—Ç –≥—Ä–∞—Ñ—Å–∫–∏–µ —Ä–∞–∑–≤–∞–ª–∏–Ω—ã
Pred: –∞–ª–∏—Å–∞ –∑–∞–∫–∞–∂–∏ –∫–∏–ª–æ–≥—Ä–∞–º–º–æ–≤—ã–π —Ç–æ—Ä—Ç –≥—Ä–∞—Ñ—Å–∫–∏–µ —Ä–∞–∑–≤–∞–ª–∏–Ω–Ω—ã–µ
------------------------------
Ref:  –∏—â–∏ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª –ø—Ä–æ –±–∏–∑–Ω–µ—Å –Ω–∞ —Ç–∏–≤–∏
Pred: –∏—â–∏ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª –ø—Ä–æ –±–∏–∑–Ω–µ—Å –Ω–∞ —Ç–∏–≤

In [5]:
import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from google.colab import files

MODEL_ID = "bond005/wav2vec2-large-ru-golos"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PANGRAM_TEXT = "—Å—ä–µ—à—å –∂–µ –µ—â—ë —ç—Ç–∏—Ö –º—è–≥–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É–∑—Å–∫–∏—Ö –±—É–ª–æ–∫ –¥–∞ –≤—ã–ø–µ–π —á–∞—é"

print(f"üöÄ Loading Champion Model ({MODEL_ID}) on {DEVICE}...")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE)

print("\n Upload your Pangram Audio File (wav/mp3)...")
uploaded = files.upload()
filename = next(iter(uploaded))

print(f"Processing {filename}...")
speech, sr = librosa.load(filename, sr=16000)

input_values = processor(speech, sampling_rate=16000, return_tensors="pt", padding="longest").input_values.to(DEVICE)

model.eval()
with torch.no_grad():
    logits = model(input_values).logits

pred_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(pred_ids)[0]

print("\n" + "="*50)
print("PANGRAM TEST RESULTS")
print("="*50)
print(f"TARGET:     {PANGRAM_TEXT}")
print(f"PREDICTION: {transcription}")
print("="*50)

üöÄ Loading Champion Model (bond005/wav2vec2-large-ru-golos) on cuda...

 Upload your Pangram Audio File (wav/mp3)...


Saving pangramm6.mp4 to pangramm6.mp4
Processing pangramm6.mp4...


  speech, sr = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



PANGRAM TEST RESULTS
TARGET:     —Å—ä–µ—à—å –∂–µ –µ—â—ë —ç—Ç–∏—Ö –º—è–≥–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É–∑—Å–∫–∏—Ö –±—É–ª–æ–∫ –¥–∞ –≤—ã–ø–µ–π —á–∞—é
PREDICTION: —Å–µ—à–∂–µ –µ—â–µ —ç—Ç–∏—Ö –º—è–≥–∫–∏—Ö —Ñ—Ä–∞–Ω—Ü—É–∑—Å–∫–∏—Ö –±—É–ª–æ–∫ –¥–∞ –≤—ã –ø–æ–π—á–∞—é
