In [None]:
#libraries
!pip install torchaudio soundfile librosa
!pip install datasets -U

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pya

In [None]:
#datasets
!pip uninstall datasets -y
!pip install datasets==2.19.0
!pip install transformers accelerate evaluate jiwer soundfile librosa -U

Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Collecting datasets==2.19.0
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets==2.19.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.0)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.3.1-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow_hotfix-0.7-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, fsspec, datasets
  Attempting uninstall: fsspec

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
^C


In [None]:
# open ai whisper
import os
import torch
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

print("🚀 Libraries Loaded. Setting up Whisper...")

MODEL_ID = "openai/whisper-small"
LANGUAGE = "Russian"
TASK = "transcribe"

feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID, language=LANGUAGE, task=TASK)
processor = WhisperProcessor.from_pretrained(MODEL_ID, language=LANGUAGE, task=TASK)

print("📥 Loading Dataset (PolyAI/Minds14)...")

dataset = load_dataset("PolyAI/minds14", "ru-RU", split="train")

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=16000).input_features[0]
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch

print("⚙️  Processing Data...")
cols_to_remove = ["path", "audio", "transcription", "english_transcription", "intent_class", "lang_id"]
encoded_dataset = dataset.map(prepare_dataset, remove_columns=cols_to_remove, num_proc=1)

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

print("🧠 Loading Whisper Model...")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=LANGUAGE, task=TASK)
model.config.suppress_tokens = []
model.freeze_encoder()

cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    print(f"\n[DEBUG] Ref:  {label_str[0][:50]}...")
    print(f"[DEBUG] Pred: {pred_str[0][:50]}...")

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer, "wer": wer}

print("🔥 Starting Whisper Training...")
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_russian_finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=500,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

trainer.train()

🚀 Libraries Loaded. Setting up Whisper...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

📥 Loading Dataset (PolyAI/Minds14)...


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/34.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/539 [00:00<?, ? examples/s]

⚙️  Processing Data...


Map:   0%|          | 0/539 [00:00<?, ? examples/s]

🧠 Loading Whisper Model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

🔥 Starting Whisper Training...


  trainer = Seq2SeqTrainer(
You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Cer,Wer
100,0.2714,0.178835,0.597528,0.652304
200,0.0413,0.029739,0.437901,0.464723
300,0.0084,0.006232,0.367238,0.386748
400,0.0033,0.003024,0.352342,0.35785
500,0.0025,0.002526,0.328946,0.338063


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



[DEBUG] Ref:  Здравствуйте я бы хотела пересмотреть свои предыду...
[DEBUG] Pred: Здравствуйте я бы хотела пересмотреть свои предыду...





[DEBUG] Ref:  Здравствуйте я бы хотела пересмотреть свои предыду...
[DEBUG] Pred: अज्वाँच्विट्...





[DEBUG] Ref:  Здравствуйте я бы хотела пересмотреть свои предыду...
[DEBUG] Pred: Здравствуйте я бы хотела пересмотреть свои предыду...





[DEBUG] Ref:  Здравствуйте я бы хотела пересмотреть свои предыду...
[DEBUG] Pred: Здравствуйте я бы хотела пересмотреть свои предыду...





[DEBUG] Ref:  Здравствуйте я бы хотела пересмотреть свои предыду...
[DEBUG] Pred: Здравствуйте я бы хотела пересмотреть свои предыду...


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=500, training_loss=0.23136868069320918, metrics={'train_runtime': 3631.6456, 'train_samples_per_second': 2.203, 'train_steps_per_second': 0.138, 'total_flos': 2.2884822245376e+18, 'train_loss': 0.23136868069320918, 'epoch': 14.71111111111111})

In [None]:
import torch
import librosa
import numpy as np
from google.colab import files

def test_whisper_upload(model, processor):
    print("Uploading...")

    uploaded = files.upload()
    file_name = list(uploaded.keys())[0]

    print(f"Received: {file_name}")
    print("Processing...")

    # librosa handles mp3/m4a conversion automatically
    audio, sr = librosa.load(file_name, sr=16000)

    input_features = processor.feature_extractor(audio, sampling_rate=16000, return_tensors="pt").input_features.to("cuda")

    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(input_features, max_length=225)

    transcription = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("-" * 40)
    print("TARGET:   Съешь же ещё этих мягких французских булок, да выпей чаю.")
    print(f"WHISPER:  {transcription}")
    print("-" * 40)

test_whisper_upload(model, processor)

Uploading...


Saving pangramm6.mp4 to pangramm6.mp4
Received: pangramm6.mp4
Processing...


  audio, sr = librosa.load(file_name, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


----------------------------------------
TARGET:   Съешь же ещё этих мягких французских булок, да выпей чаю.
WHISPER:  Сьешь же ещё этих мягких французских булок Да выпей чаю
----------------------------------------


In [None]:
# messy dataset lead to bad punctutation, the model still performs better than expected (the dataset) as we can see it capitalized
# the Д in "Да выпей чаю" trying insert a sentence break where the grammar demanded it
print("📚 TEXTBOOK EXAMPLES:")
for i in range(3):
    print(f"Label {i}: {dataset[i]['transcription']}")

📚 TEXTBOOK EXAMPLES:
Label 0: Здравствуйте я бы хотела пересмотреть свои предыдущие последние операции которые проходили по моей карте прямым помимо ему счёту Покажите пожалуйста операции последних трёх месяцев
Label 1: Покажи мне мои последние транзакции если платёж который я не знаю Да я признаю платёж
Label 2: покажите мне мои последние последние транзакции


In [None]:
# crnn
import os
import subprocess
import sys

print("Fixing Environment...")
subprocess.run("pip install torch torchaudio datasets==2.19.0 evaluate jiwer soundfile librosa -U -q", shell=True)
print("Libraries ready.")

import torch
import torch.nn as nn
import torchaudio
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset, Audio
import numpy as np
import librosa
from google.colab import files

BATCH_SIZE = 8
LEARNING_RATE = 1e-3
EPOCHS = 50       # 50 Epochs on diverse data is usually enough to see change
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Training CRNN on {DEVICE}...")

#dataset
print("Streaming Google Fleurs (Russian)...")
ds_stream = load_dataset("google/fleurs", "ru_ru", split="train", streaming=True)

print("Downloading & Caching 500 Wiki-sentences...")
data_cache = []
for i, item in enumerate(ds_stream):
    if i >= 500: break
    # We keep it simple. We assume the audio is usable.
    data_cache.append({
        "audio": item["audio"]["array"],
        "transcription": item["transcription"] # Fleurs uses 'transcription' or 'raw_transcription'
    })

print(f"Loaded {len(data_cache)} sentences.")
print(f"Sample: {data_cache[0]['transcription']}")

# Build Vocabulary
all_text = "".join([x["transcription"] for x in data_cache]).lower()
vocab_list = sorted(list(set(all_text)))
vocab_dict = {char: i + 1 for i, char in enumerate(vocab_list)}
vocab_dict["[BLANK]"] = 0
id_to_char = {v: k for k, v in vocab_dict.items()}
print(f"Vocab Size: {len(vocab_dict)}")

spec_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64).to(DEVICE)

def collate_fn(batch):
    waveforms, labels, label_lengths = [], [], []
    for item in batch:
        wav = torch.tensor(item["audio"]).float()
        waveforms.append(wav)
        txt = item["transcription"].lower()
        label = torch.tensor([vocab_dict.get(c, 0) for c in txt])
        labels.append(label)
        label_lengths.append(len(label))

    waveforms = pad_sequence(waveforms, batch_first=True)
    labels = pad_sequence(labels, batch_first=True, padding_value=0)
    return waveforms.unsqueeze(1), labels, torch.tensor(label_lengths)

loader = DataLoader(data_cache, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

#the model
class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2, 2)
        )
        self.lstm = nn.LSTM(input_size=1024, hidden_size=64, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.transpose(2, 3)
        x = self.cnn(x)
        b, c, f, t = x.size()
        x = x.permute(0, 3, 1, 2).reshape(b, t, c*f)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return F.log_softmax(x, dim=2)

model = CRNN(num_classes=len(vocab_dict)).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
ctc_loss = nn.CTCLoss(blank=0)

def decode(pred_logits):
    pred_ids = torch.argmax(pred_logits, dim=2).squeeze()
    if pred_ids.dim() > 1: pred_ids = pred_ids[0]
    chars = []
    prev_char = -1
    for i in pred_ids.tolist():
        if i != 0 and i != prev_char:
            chars.append(id_to_char.get(i, ""))
        prev_char = i
    return "".join(chars)

print(" CRNN Training (Wikipedia Data)...")
model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    for batch in loader:
        waveform = batch[0].to(DEVICE)
        labels = batch[1].to(DEVICE)
        label_len = batch[2]

        spec = spec_transform(waveform).transpose(2, 3)
        preds = model(spec).permute(1, 0, 2)
        input_lengths = torch.full(size=(preds.size(1),), fill_value=preds.size(0), dtype=torch.long)

        loss = ctc_loss(preds, labels, input_lengths, label_len)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:03d} | Loss: {total_loss / len(loader):.4f}")

#pangram test
print("\n FINAL PANGRAM TEST (CRNN + WIKI DATA)")
print("Uploading...")

uploaded = files.upload()
if uploaded:
    file_name = list(uploaded.keys())[0]
    audio, sr = librosa.load(file_name, sr=16000)
    wav_tensor = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
    spec = spec_transform(wav_tensor).transpose(1, 2).unsqueeze(1)

    model.eval()
    with torch.no_grad():
        logits = model(spec)
        prediction = decode(logits)

    print("-" * 40)
    print(f"CRNN (Fleurs): {prediction}")
    print("-" * 40)

Fixing Environment...
Libraries ready.
Training CRNN on cuda...
Streaming Google Fleurs (Russian)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading & Caching 500 Wiki-sentences...
Loaded 500 sentences.
Sample: эта идея пришла из китая где излюбленным цветком был цвет сливы
Vocab Size: 80
 CRNN Training (Wikipedia Data)...
Epoch 010 | Loss: 3.2810
Epoch 020 | Loss: 3.2364
Epoch 030 | Loss: 3.1061
Epoch 040 | Loss: 2.8588
Epoch 050 | Loss: 2.6738

 FINAL PANGRAM TEST (CRNN + WIKI DATA)
Uploading...


Saving pangramm6.mp4 to pangramm6.mp4
----------------------------------------
CRNN (Fleurs): ее еше ае се те е та с с аеаеазе ие те чеае
----------------------------------------


  audio, sr = librosa.load(file_name, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [None]:
#crnn +100 epochs
MORE_EPOCHS = 100

print(f"Pushing for {MORE_EPOCHS} more epochs to find Consonants...")

model.train()

for epoch in range(MORE_EPOCHS):
    total_loss = 0
    for batch in loader:
        waveform = batch[0].to(DEVICE)
        labels = batch[1].to(DEVICE)
        label_len = batch[2]

        spec = spec_transform(waveform).transpose(2, 3)
        preds = model(spec).permute(1, 0, 2)
        input_lengths = torch.full(size=(preds.size(1),), fill_value=preds.size(0), dtype=torch.long)

        loss = ctc_loss(preds, labels, input_lengths, label_len)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Extra Epoch {epoch+1:03d} | Loss: {total_loss / len(loader):.4f}")

print("\nRE-TESTING PANGRAM")
print(f"Using file: {file_name}")

audio, sr = librosa.load(file_name, sr=16000)
wav_tensor = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
spec = spec_transform(wav_tensor).transpose(1, 2).unsqueeze(1)

model.eval()
with torch.no_grad():
    logits = model(spec)
    prediction = decode(logits)

print("-" * 40)
print(f"CRNN (Total 150 Epochs): {prediction}")
print("-" * 40)

Pushing for 100 more epochs to find Consonants...
Extra Epoch 010 | Loss: 2.5505
Extra Epoch 020 | Loss: 2.4178
Extra Epoch 030 | Loss: 2.3008
Extra Epoch 040 | Loss: 2.2117
Extra Epoch 050 | Loss: 2.0973
Extra Epoch 060 | Loss: 1.9936
Extra Epoch 070 | Loss: 1.8887
Extra Epoch 080 | Loss: 1.8102
Extra Epoch 090 | Loss: 1.7309
Extra Epoch 100 | Loss: 1.6038

RE-TESTING PANGRAM
Using file: pangramm6.mp4
----------------------------------------
CRNN (Total 150 Epochs): щеящи яиясиях иеаих ашщяеи аиья я з веля елшея
----------------------------------------


  audio, sr = librosa.load(file_name, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [None]:
#crnn +100 epochs
import jiwer

MORE_EPOCHS = 100

print(f"Pushing for {MORE_EPOCHS} more epochs (Total will be 250)...")
print("Tracking CER (Character Error) and WER (Word Error)...")

model.train()

def calculate_metrics(preds, labels):

    pred_ids = torch.argmax(preds, dim=2).transpose(0, 1)

    decoded_preds = []
    decoded_targets = []

    for i in range(len(pred_ids)):
        p_chars = []
        prev = -1
        for pid in pred_ids[i].tolist():
            if pid != 0 and pid != prev:
                p_chars.append(id_to_char.get(pid, ""))
            prev = pid
        decoded_preds.append("".join(p_chars))

        t_chars = []
        for lid in labels[i].tolist():
            if lid != 0:
                t_chars.append(id_to_char.get(lid, ""))
        decoded_targets.append("".join(t_chars))

    try:
        wer = jiwer.wer(decoded_targets, decoded_preds)
        cer = jiwer.cer(decoded_targets, decoded_preds)
    except:
        wer, cer = 1.0, 1.0

    return wer, cer

for epoch in range(MORE_EPOCHS):
    total_loss = 0
    total_cer = 0
    total_wer = 0
    batches = 0

    for batch in loader:
        waveform = batch[0].to(DEVICE)
        labels = batch[1].to(DEVICE)
        label_len = batch[2]

        spec = spec_transform(waveform).transpose(2, 3)
        preds = model(spec).permute(1, 0, 2)
        input_lengths = torch.full(size=(preds.size(1),), fill_value=preds.size(0), dtype=torch.long)

        loss = ctc_loss(preds, labels, input_lengths, label_len)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        with torch.no_grad():
            b_wer, b_cer = calculate_metrics(preds, labels)
            total_cer += b_cer
            total_wer += b_wer
            batches += 1

    if (epoch + 1) % 10 == 0:
        avg_loss = total_loss / batches
        avg_cer = total_cer / batches
        avg_wer = total_wer / batches

        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f} | CER: {avg_cer:.2f} | WER: {avg_wer:.2f}")

print("\nFINAL TEST: 250 EPOCHS")
print(f"Using file: {file_name}")

try:
    audio, sr = librosa.load(file_name, sr=16000)
    wav_tensor = torch.tensor(audio).float().unsqueeze(0).to(DEVICE)
    spec = spec_transform(wav_tensor).transpose(1, 2).unsqueeze(1)

    model.eval()
    with torch.no_grad():
        logits = model(spec)
        prediction = decode(logits)

    print("-" * 40)
    print(f"CRNN (250 Epochs): {prediction}")
    print("-" * 40)
except Exception as e:
    print(f"Could not load audio for test: {e}")

Pushing for 100 more epochs (Total will be 250)...
Tracking CER (Character Error) and WER (Word Error)...
Epoch 010 | Loss: 1.5849 | CER: 0.45 | WER: 0.96
Epoch 020 | Loss: 1.4936 | CER: 0.43 | WER: 0.95
Epoch 030 | Loss: 1.4248 | CER: 0.42 | WER: 0.93
Epoch 040 | Loss: 1.3732 | CER: 0.41 | WER: 0.92
Epoch 050 | Loss: 1.2842 | CER: 0.38 | WER: 0.89
Epoch 060 | Loss: 1.2939 | CER: 0.38 | WER: 0.90
Epoch 070 | Loss: 1.1581 | CER: 0.35 | WER: 0.85
Epoch 080 | Loss: 1.1450 | CER: 0.35 | WER: 0.85
Epoch 090 | Loss: 1.1896 | CER: 0.36 | WER: 0.86
Epoch 100 | Loss: 1.0381 | CER: 0.32 | WER: 0.82

FINAL TEST: 250 EPOCHS
Using file: pangramm6.mp4
----------------------------------------
CRNN (250 Epochs): -щи1ивевщющи аяи сях еясти ха ышяри аи жиа в в визявеюшеaеяеe
----------------------------------------


  audio, sr = librosa.load(file_name, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [2]:
# deepspeech TAKE 1
class DeepSpeech2(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(DeepSpeech2, self).__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        )

        rnn_input_dim = 32 * (n_feats // 4)

        self.rnn = nn.GRU(input_size=rnn_input_dim, hidden_size=rnn_dim,
                          num_layers=n_rnn_layers, batch_first=True, bidirectional=True)

        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)

        b, c, t, f = x.size()
        x = x.permute(0, 2, 1, 3)

        x = x.reshape(b, t, c*f)

        x, _ = self.rnn(x)
        x = self.classifier(x)
        return x

model = DeepSpeech2(n_cnn_layers=2, n_rnn_layers=3, rnn_dim=512,
                    n_class=VOCAB_SIZE, n_feats=128).to(DEVICE)

criterion = nn.CTCLoss(blank=0).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

print("\nStarting DeepSpeech Training (Fixed)...")
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader):
        try:
            specs, targets, input_lengths, target_lengths = batch
            specs, targets = specs.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            output = model(specs)
            output = output.transpose(0, 1).log_softmax(2)

            input_lengths_cnn = input_lengths // 4

            loss = criterion(output, targets, input_lengths_cnn, target_lengths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        except RuntimeError as e:
            if "size mismatch" in str(e):
                print("Skipping bad batch (size mismatch)...")
                continue
            else:
                raise e

    print(f"Epoch {epoch+1} | Loss: {train_loss/len(train_loader):.4f}")


Starting DeepSpeech Training (Fixed)...


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 1 | Loss: 3.9138


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 2 | Loss: 3.3990


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 3 | Loss: 3.3583


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 4 | Loss: 3.2929


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 5 | Loss: 3.2276


In [3]:
# 15 more epochs
EXTRA_EPOCHS = 15

print(f"Continuing training for {EXTRA_EPOCHS} more epochs...")
for epoch in range(EXTRA_EPOCHS):
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader):
        try:
            specs, targets, input_lengths, target_lengths = batch
            specs, targets = specs.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            output = model(specs)
            output = output.transpose(0, 1).log_softmax(2)

            input_lengths_cnn = input_lengths // 4

            loss = criterion(output, targets, input_lengths_cnn, target_lengths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        except RuntimeError:
            continue

    print(f"Epoch {epoch + 6} | Loss: {train_loss/len(train_loader):.4f}")

print("\nQuick Test on Training Data (Memorization Check):")
with torch.no_grad():
    model.eval()
    specs, targets, _, _ = next(iter(train_loader))
    specs = specs.to(DEVICE)
    output = model(specs)
    pred_text = decode_prediction(output.transpose(0, 1))[0]
    ref_text = "".join([id2char[x] for x in targets[0].tolist()])

    print(f"Target: {ref_text}")
    print(f"Pred:   {pred_text}")

Continuing training for 15 more epochs...


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 6 | Loss: 3.1399


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 7 | Loss: 3.0063


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 8 | Loss: 2.8480


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 9 | Loss: 2.6931


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 10 | Loss: 2.5494


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 11 | Loss: 2.4120


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 12 | Loss: 2.3043


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 13 | Loss: 2.1873


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 14 | Loss: 2.0948


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 15 | Loss: 2.0082


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 16 | Loss: 1.9213


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 17 | Loss: 1.8397


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 18 | Loss: 1.7645


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 19 | Loss: 1.6899


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 20 | Loss: 1.6098

Quick Test on Training Data (Memorization Check):


NameError: name 'decode_prediction' is not defined

In [5]:
# 30 more epochs
MORE_EPOCHS = 30
print(f"Pushing for {MORE_EPOCHS} more epochs to reach convergence...")

for epoch in range(MORE_EPOCHS):
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader):
        try:
            specs, targets, input_lengths, target_lengths = batch
            specs, targets = specs.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            output = model(specs)
            output = output.transpose(0, 1).log_softmax(2)

            input_lengths_cnn = input_lengths // 4

            loss = criterion(output, targets, input_lengths_cnn, target_lengths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        except RuntimeError:
            continue

    current_epoch = epoch + 21
    avg_loss = train_loss/len(train_loader)

    print(f"Epoch {current_epoch} | Loss: {avg_loss:.4f}")

    if avg_loss < 0.5:
        print("Loss is below 0.5! We have likely converged.")
        break


print("\nFinal Check on Training Data:")
with torch.no_grad():
    model.eval()
    specs, targets, _, _ = next(iter(train_loader))
    specs = specs.to(DEVICE)
    output = model(specs)

    pred_text = decode_prediction(output)[0]
    ref_text = "".join([id2char[x] for x in targets[0].tolist() if x != 0])

    print(f"Target: {ref_text}")
    print(f"Pred:   {pred_text}")

Pushing for 30 more epochs to reach convergence...


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 21 | Loss: 1.5626


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 22 | Loss: 1.4748


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 23 | Loss: 1.4089


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 24 | Loss: 1.3486


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 25 | Loss: 1.2853


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 26 | Loss: 1.2099


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 27 | Loss: 1.1588


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 28 | Loss: 1.0927


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 29 | Loss: 1.0362


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 30 | Loss: 0.9814


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 31 | Loss: 0.9349


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 32 | Loss: 0.8731


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 33 | Loss: 0.8236


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 34 | Loss: 0.7729


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 35 | Loss: 0.7344


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 36 | Loss: 0.6829


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 37 | Loss: 0.6426


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 38 | Loss: 0.6043


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 39 | Loss: 0.5623


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 40 | Loss: 0.5327


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 41 | Loss: 0.5256


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 42 | Loss: 0.5105


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 43 | Loss: 0.4331
Loss is below 0.5! We have likely converged.

Final Check on Training Data:
Target: запускай картину жанра спорт
Pred:   затпувтай кар тину жанро с пот 


In [7]:
import torch
import torchaudio
import librosa
import numpy as np
from google.colab import files

PANGRAM_TEXT = "съешь же ещё этих мягких французских булок, да выпей чаю"

print(f"Target (Pangram): {PANGRAM_TEXT}")
print("-" * 50)

print("Upload your Pangram Audio File (wav/mp3)...")
uploaded = files.upload()
filename = next(iter(uploaded))

print("\nProcessing audio...")

speech, sr = librosa.load(filename, sr=16000)

audio_tensor = torch.tensor(speech, dtype=torch.float32).unsqueeze(0).to(DEVICE) # (1, Time)


mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128).to(DEVICE)

with torch.no_grad():

    spec = mel_transform(audio_tensor)

    spec = spec.transpose(1, 2).unsqueeze(1)

model.eval()
with torch.no_grad():
    output = model(spec)


    pred_text = decode_prediction(output)[0]

print("\n" + "="*50)
print(f"TARGET:     {PANGRAM_TEXT}")
print(f"PREDICTION: {pred_text}")
print("="*50)

Target (Pangram): съешь же ещё этих мягких французских булок, да выпей чаю
--------------------------------------------------
Upload your Pangram Audio File (wav/mp3)...


Saving pangramm6.mp4 to pangramm6 (1).mp4

Processing audio...


  speech, sr = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



TARGET:     съешь же ещё этих мягких французских булок, да выпей чаю
PREDICTION: смесь мейде со рарисцить няднаыйд хоносойьсте дезод да выь дыячая




In [8]:
import torch
import random
from jiwer import wer, cer

NUM_SAMPLES = 10

print(f"\nRunning DeepSpeech on {NUM_SAMPLES} samples from Golos Test Set...")
print("="*60)
print(f"{'TARGET (Reference)':<40} | {'PREDICTION (DeepSpeech)':<40}")
print("-" * 85)

model.eval()
predictions = []
references = []

test_iter = iter(test_loader)

with torch.no_grad():
    for i in range(NUM_SAMPLES):
        try:
            specs, targets, _, _ = next(test_iter)
        except StopIteration:
            break

        specs = specs.to(DEVICE)
        output = model(specs)

        pred_text = decode_prediction(output)[0]


        ref_text = "".join([id2char[x] for x in targets[0].tolist() if x != 0])

        predictions.append(pred_text)
        references.append(ref_text)


        print(f"{ref_text[:38]:<40} | {pred_text[:38]:<40}")

batch_wer = wer(references, predictions)
batch_cer = cer(references, predictions)

print("="*60)
print(f"Mini-Batch WER: {batch_wer:.2%}")
print(f"Mini-Batch CER: {batch_cer:.2%}")
print("="*60)


Running DeepSpeech on 10 samples from Golos Test Set...
TARGET (Reference)                       | PREDICTION (DeepSpeech)                 
-------------------------------------------------------------------------------------
шестьдесят тысяч тенге сколько будет с   | шестьи сатьристве пэнрит вролько водли  
покажи мне на смотрешке телеканал сине   | покажи не ва смотрешке телекана мерги   
заказать яблоки зеленые                  | скосуть вмязвыйки длеляоне              
алиса закажи килограммовый торт графск   | алиси виткодиькилов надореть от двт ви  
ищи телеканал про бизнес на тиви         | ичсиль тилыканал ро  жеместьна тили     
михаила мурадяна                         | нифа лила мородяало                     
любовницы две тысячи тринадцать пятнад   | слют борня встдвеки шеьквн надцс рески  
найди боевики                            | надиь дери цадри                        
гетто сезон три                          | дерхар свезонь три                      
хочу посмотреть р

In [1]:
#deepspeech TAKE 2 20000 samples
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import io
import soundfile as sf
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, Audio
import re
from tqdm.auto import tqdm
import os
from google.colab import drive
import random
from jiwer import wer, cer

print("Mounting Google Drive...")
drive.mount('/content/drive')

DRIVE_SAVE_PATH = "/content/drive/MyDrive/Colab Notebooks/my_deepspeech_models"
os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)
print(f"Checkpoints will be saved to: {DRIVE_SAVE_PATH}")

BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_VALIDATION_SAMPLES = 100

print("Loading 20,000 samples...")
dataset_stream = load_dataset("bond005/sberdevices_golos_10h_crowd", split="train[:20000]")
test_stream = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test[:500]")

dataset_stream = dataset_stream.cast_column("audio", Audio(decode=False))
test_stream = test_stream.cast_column("audio", Audio(decode=False))

print("Building Vocabulary...")
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\]]'
def clean_text(text):
    if text is None: return ""
    return re.sub(chars_to_ignore_regex, '', text).lower()

all_text = " ".join([clean_text(x['transcription']) for x in dataset_stream])
vocab = sorted(list(set(all_text)))
char2id = {c: i+1 for i, c in enumerate(vocab)}
char2id["<blank>"] = 0
id2char = {v: k for k, v in char2id.items()}
VOCAB_SIZE = len(char2id)

train_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128).to(DEVICE)

class AudioDataset(Dataset):
    def __init__(self, hf_dataset):
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        audio_bytes = item["audio"]["bytes"]
        audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))
        audio = torch.tensor(audio_array, dtype=torch.float32).to(DEVICE)

        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(DEVICE)
            audio = resampler(audio)

        text = clean_text(item["transcription"])
        targets = torch.tensor([char2id[c] for c in text if c in char2id], dtype=torch.long)
        spec = train_audio_transforms(audio).transpose(0, 1)
        return spec, targets

def collate_fn(batch):
    specs, targets = zip(*batch)
    specs_padded = torch.nn.utils.rnn.pad_sequence(specs, batch_first=True).unsqueeze(1)
    targets_padded = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)
    input_lengths = torch.tensor([s.shape[0] for s in specs])
    target_lengths = torch.tensor([len(t) for t in targets])
    return specs_padded, targets_padded, input_lengths, target_lengths

train_loader = DataLoader(AudioDataset(dataset_stream), batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(AudioDataset(test_stream), batch_size=1, collate_fn=collate_fn)

class DeepSpeech2(nn.Module):
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(DeepSpeech2, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        )
        rnn_input_dim = 32 * (n_feats // 4)
        self.rnn = nn.GRU(input_size=rnn_input_dim, hidden_size=rnn_dim,
                          num_layers=n_rnn_layers, batch_first=True, bidirectional=True)
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        b, c, t, f = x.size()
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(b, t, c*f)
        x, _ = self.rnn(x)
        x = self.classifier(x)
        return x

def decode_prediction(output_tensor):
    arg_maxes = torch.argmax(output_tensor, dim=2)
    decodes = []
    for i in range(arg_maxes.shape[0]):
        decode = []
        for j in range(arg_maxes.shape[1]):
            if arg_maxes[i][j] != 0:
                if j == 0 or arg_maxes[i][j] != arg_maxes[i][j-1]:
                    decode.append(arg_maxes[i][j].item())
        decodes.append("".join([id2char[x] for x in decode]))
    return decodes

model = DeepSpeech2(n_cnn_layers=2, n_rnn_layers=3, rnn_dim=512,
                    n_class=VOCAB_SIZE, n_feats=128).to(DEVICE)
criterion = nn.CTCLoss(blank=0).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

print("\nStarting Training...")
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]")
    for batch in loop:
        try:
            specs, targets, input_lengths, target_lengths = batch
            specs, targets = specs.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            output = model(specs)
            output = output.transpose(0, 1).log_softmax(2)

            input_lengths_cnn = input_lengths // 4

            loss = criterion(output, targets, input_lengths_cnn, target_lengths)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            loop.set_postfix(loss=loss.item())
        except RuntimeError:
            continue

    avg_loss = train_loss/len(train_loader)

    model.eval()
    val_preds = []
    val_refs = []

    test_indices = list(range(len(test_stream)))
    random.shuffle(test_indices)
    subset_indices = test_indices[:NUM_VALIDATION_SAMPLES]

    with torch.no_grad():
        for i in subset_indices:
            try:
                item = test_stream[i]
                audio_bytes = item["audio"]["bytes"]
                audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))
                audio = torch.tensor(audio_array, dtype=torch.float32).to(DEVICE)
                if sample_rate != 16000:
                    resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(DEVICE)
                    audio = resampler(audio)

                spec = train_audio_transforms(audio).transpose(0, 1).unsqueeze(0) # Add batch dim

                output = model(spec)
                pred_text = decode_prediction(output)[0]
                ref_text = clean_text(item["transcription"])

                val_preds.append(pred_text)
                val_refs.append(ref_text)
            except:
                continue

    epoch_wer = wer(val_refs, val_preds)
    epoch_cer = cer(val_refs, val_preds)

    print(f"\nEPOCH {epoch+1} REPORT:")
    print(f"   Avg Loss: {avg_loss:.4f}")
    print(f"   Test WER: {epoch_wer:.2%}")
    print(f"   Test CER: {epoch_cer:.2%}")

    save_path = f"{DRIVE_SAVE_PATH}/model_epoch_{epoch+1}_wer_{int(epoch_wer*100)}.pt"
    torch.save(model.state_dict(), save_path)
    print(f"Saved to Drive: {save_path}\n")

print("Training Complete!")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checkpoints will be saved to: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models
Loading 20,000 samples...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Building Vocabulary...





Starting Training...


Epoch 1/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 1 REPORT:
   Avg Loss: 3.4692
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_1_wer_0.pt



Epoch 2/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 2 REPORT:
   Avg Loss: 2.9800
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_2_wer_0.pt



Epoch 3/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 3 REPORT:
   Avg Loss: 2.4770
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_3_wer_0.pt



Epoch 4/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 4 REPORT:
   Avg Loss: 2.1830
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_4_wer_0.pt



Epoch 5/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 5 REPORT:
   Avg Loss: 1.9912
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_5_wer_0.pt



Epoch 6/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 6 REPORT:
   Avg Loss: 1.8371
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_6_wer_0.pt



Epoch 7/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 7 REPORT:
   Avg Loss: 1.7087
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_7_wer_0.pt



Epoch 8/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 8 REPORT:
   Avg Loss: 1.5974
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_8_wer_0.pt



Epoch 9/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 9 REPORT:
   Avg Loss: 1.4914
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_9_wer_0.pt



Epoch 10/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 10 REPORT:
   Avg Loss: 1.3896
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_10_wer_0.pt



Epoch 11/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 11 REPORT:
   Avg Loss: 1.2983
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_11_wer_0.pt



Epoch 12/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 12 REPORT:
   Avg Loss: 1.2103
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_12_wer_0.pt



Epoch 13/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 13 REPORT:
   Avg Loss: 1.1212
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_13_wer_0.pt



Epoch 14/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 14 REPORT:
   Avg Loss: 1.0437
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_14_wer_0.pt



Epoch 15/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 15 REPORT:
   Avg Loss: 0.9603
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_15_wer_0.pt



Epoch 16/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 16 REPORT:
   Avg Loss: 0.8886
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_16_wer_0.pt



Epoch 17/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 17 REPORT:
   Avg Loss: 0.8166
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_17_wer_0.pt



Epoch 18/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 18 REPORT:
   Avg Loss: 0.7516
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_18_wer_0.pt



Epoch 19/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 19 REPORT:
   Avg Loss: 0.6902
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_19_wer_0.pt



Epoch 20/20 [Train]:   0%|          | 0/500 [00:00<?, ?it/s]


EPOCH 20 REPORT:
   Avg Loss: 0.6263
   Test WER: 0.00%
   Test CER: 0.00%
Saved to Drive: /content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_20_wer_0.pt

Training Complete!


In [2]:
#small test
import torch
import torch.nn as nn
import torchaudio
import io
import soundfile as sf
from datasets import load_dataset, Audio
from google.colab import drive
import os
import re
from tqdm.auto import tqdm
from jiwer import wer, cer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LOAD_PATH = "/content/drive/MyDrive/Colab Notebooks/my_deepspeech_models/model_epoch_20_wer_0.pt"

if 'model' not in locals():
    print("Please run the setup/model definition cell first!")
else:
    print("Running Full Evaluation (WER & CER)...")
    model.eval()
    predictions = []
    references = []

    test_stream = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test[:100]")
    test_stream = test_stream.cast_column("audio", Audio(decode=False))
    spec_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128).to(DEVICE)

    def decode_prediction(output_tensor):
        arg_maxes = torch.argmax(output_tensor, dim=2)
        decodes = []
        for i in range(arg_maxes.shape[0]):
            decode = []
            for j in range(arg_maxes.shape[1]):
                if arg_maxes[i][j] != 0:
                    if j == 0 or arg_maxes[i][j] != arg_maxes[i][j-1]:
                        decode.append(arg_maxes[i][j].item())
            decodes.append("".join([id2char[x] for x in decode]))
        return decodes

    with torch.no_grad():
        for item in tqdm(test_stream):
            audio_bytes = item["audio"]["bytes"]
            audio_array, sample_rate = sf.read(io.BytesIO(audio_bytes))
            audio = torch.tensor(audio_array, dtype=torch.float32).to(DEVICE)

            if sample_rate != 16000:
                resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(DEVICE)
                audio = resampler(audio)

            spec = spec_transform(audio).transpose(0, 1).unsqueeze(0).unsqueeze(0)

            output = model(spec)
            pred_text = decode_prediction(output)[0]
            ref_text = clean_text(item["transcription"])

            predictions.append(pred_text)
            references.append(ref_text)

    real_wer = wer(references, predictions)
    real_cer = cer(references, predictions)

    print("\n" + "="*40)
    print(f"REAL Final WER: {real_wer:.2%}")
    print(f"REAL Final CER: {real_cer:.2%}")
    print("="*40)

    for i in range(5):
        print(f"Ref:  {references[i]}")
        print(f"Pred: {predictions[i]}")
        sample_cer = cer(references[i], predictions[i])
        print(f"Sample CER: {sample_cer:.2%}")
        print("-" * 20)

Running Full Evaluation (WER & CER)...


  0%|          | 0/100 [00:00<?, ?it/s]


REAL Final WER: 95.62%
REAL Final CER: 53.42%
Ref:  шестьдесят тысяч тенге сколько будет стоить
Pred: шестьйест тистч тенре сколько фудетстой
Sample CER: 23.26%
--------------------
Ref:  покажи мне на смотрешке телеканал синергия тв
Pred: покажи мне на смотрешке теле канал снергию тув
Sample CER: 8.89%
--------------------
Ref:  заказать яблоки зеленые
Pred: сковотьсвядлыке зереное
Sample CER: 52.17%
--------------------
Ref:  алиса закажи килограммовый торт графские развалины
Pred: алисо закажикино дрображный фон рак на слан
Sample CER: 50.00%
--------------------
Ref:  ищи телеканал про бизнес на тиви
Pred: кичщи телеканалт о дзнс на дии
Sample CER: 31.25%
--------------------


In [3]:
#pangram
import torch
import torchaudio
import librosa
import numpy as np
from google.colab import files

PANGRAM_TEXT = "съешь ещё этих мягких французских булок, да выпей чаю."

print(f"Target (Pangram): {PANGRAM_TEXT}")
print("-" * 50)

print("Uploading (wav/mp3)...")
uploaded = files.upload()
filename = next(iter(uploaded))

print(f"\nProcessing {filename}...")
speech, sr = librosa.load(filename, sr=16000)

audio_tensor = torch.tensor(speech, dtype=torch.float32).to(DEVICE)
spec_transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128).to(DEVICE)

spec = spec_transform(audio_tensor).transpose(0, 1).unsqueeze(0).unsqueeze(0)

model.eval()
with torch.no_grad():
    output = model(spec)
    pred_text = decode_prediction(output)[0]

print("\n" + "="*50)
print(f"TARGET:     {PANGRAM_TEXT}")
print(f"PREDICTION: {pred_text}")
print("="*50)

Target (Pangram): съешь ещё этих мягких французских булок, да выпей чаю.
--------------------------------------------------
Uploading (wav/mp3)...


Saving pangramm6.mp4 to pangramm6.mp4

Processing pangramm6.mp4...


  speech, sr = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



TARGET:     съешь ещё этих мягких французских булок, да выпей чаю.
PREDICTION: сесщи я яячо редца нефки фонслулских фулотк довыть долчаю
