In [2]:
!pip uninstall -y datasets
!pip install datasets==2.21.0
!pip install transformers accelerate soundfile librosa jiwer torchaudio
!apt install git-lfs

Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==2.21.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
[31mERROR: pip's dependency resolver does not currently tak

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [1]:
#hubert
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets import load_dataset, Audio
import re
import json
import shutil
import os
from google.colab import drive
from transformers import (
    HubertForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

#this training was done overnight and saved to google drive
drive.mount('/content/drive')
drive_save_path = "/content/drive/My Drive/hubert-russian-overnight"

MODEL_ID = "facebook/hubert-base-ls960"

print("Loading Russian dataset (Golos - 2000 samples)...")
dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split="train[:2000]")

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\]]'

def remove_special_characters(batch):
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
    return batch

dataset = dataset.map(remove_special_characters)

def extract_all_chars(batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocab_dict = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names)
vocab_list = list(set(vocab_dict["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
    return batch

encoded_dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names, num_proc=1)

# validation split (for early stopping and avoiding overfitting)
train_testvalid = encoded_dataset.train_test_split(test_size=0.1)
train_dataset = train_testvalid["train"]
eval_dataset = train_testvalid["test"]

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, padding=self.padding, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

model = HubertForCTC.from_pretrained(
    MODEL_ID,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    mask_time_prob=0.05,
    layerdrop=0.1,
)
model.freeze_feature_encoder()

training_args = TrainingArguments(
    output_dir="./hubert-russian-checkpoints",
    group_by_length=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    save_steps=400,
    eval_steps=400,
    logging_steps=100,
    learning_rate=1e-4,
    warmup_steps=300,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("Starting HuBERT Training...")
trainer.train()

print(f"Training finished. Saving Best Model to {drive_save_path}...")
trainer.save_model(drive_save_path)
processor.save_pretrained(drive_save_path)
print("SUCCESS!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading Russian dataset (Golos - 2000 samples)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Starting HuBERT Training...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


  torch._C._get_cudnn_allow_tf32(),


Step,Training Loss,Validation Loss
400,3.2351,3.240108
800,2.4774,1.711775
1200,1.1648,0.915641
1600,0.9131,0.746301
2000,0.8604,0.756039
2400,0.6487,0.740955
2800,0.6439,0.722328
3200,0.5266,0.714211
3600,0.4736,0.740517
4000,0.4481,0.71061


Training finished. Saving Best Model to /content/drive/My Drive/hubert-russian-overnight...
SUCCESS!


In [2]:
!pip install jiwer transformers librosa

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [6]:
#pangram
import torch
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor
from google.colab import files

drive_path = "/content/drive/My Drive/hubert-russian-overnight"
print(f"Loading HuBERT from: {drive_path}...")

model = HubertForCTC.from_pretrained(drive_path).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(drive_path)

print("Upload your pangram (or re-upload if needed):")
uploaded = files.upload()
filename = next(iter(uploaded))

print(f"Processing {filename}...")
speech, rate = librosa.load(filename, sr=16000)
input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")

with torch.no_grad():
    logits = model(input_values).logits

pred_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(pred_ids)[0]

print("\n" + "="*40)
print("PANGRAM: Съешь же еще этих мягких француских булок, да выпей чаю.")
print(f"HuBERT: {transcription}")
print("="*40)

Loading HuBERT from: /content/drive/My Drive/hubert-russian-overnight...
Upload your pangram (or re-upload if needed):


Saving pangramm6.mp4 to pangramm6 (2).mp4
Processing pangramm6 (2).mp4...


  speech, rate = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



PANGRAM: Съешь же еще этих мягких француских булок, да выпей чаю.
HuBERT: сьяр  жей ще этих мерки фансуских булок далвы прыйчавю


In [9]:
!pip uninstall -y datasets
!pip install datasets==2.21.0

Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==2.21.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
[31mERROR: pip's dependency resolver does not currently tak

In [1]:
import torch
from datasets import load_dataset, Audio
from transformers import HubertForCTC, Wav2Vec2Processor
from jiwer import wer, cer
import re
from tqdm.auto import tqdm

drive_path = "/content/drive/My Drive/hubert-russian-overnight"
print(f"Loading HuBERT from: {drive_path}...")
model = HubertForCTC.from_pretrained(drive_path).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(drive_path)

print("Loading Test Dataset (200 samples)...")
test_dataset = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test[:200]")

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\]]'

def remove_special_characters(batch):
    text = batch["transcription"] if batch["transcription"] is not None else ""
    batch["transcription"] = re.sub(chars_to_ignore_regex, '', text).lower() + " "
    return batch

print("Cleaning data...")
test_dataset = test_dataset.map(remove_special_characters)
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

print("Running Evaluation...")
predictions = []
references = []

model.eval()

for i in tqdm(range(len(test_dataset))):
    if not test_dataset[i]["transcription"].strip():
        continue

    inputs = processor(test_dataset[i]["audio"]["array"],
                       sampling_rate=16000,
                       return_tensors="pt")

    with torch.no_grad():
        input_values = inputs.input_values.to("cuda")
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids)[0]

    predictions.append(transcription)
    references.append(test_dataset[i]["transcription"])

wer_score = wer(references, predictions)
cer_score = cer(references, predictions)

print("\n" + "="*40)
print(f"OFFICIAL HUBERT REPORT CARD:")
print(f"Word Error Rate (WER):      {wer_score:.2%}")
print(f"Character Error Rate (CER): {cer_score:.2%}")
print("="*40)

print("\n--- Error Analysis Examples ---")
for i in range(0, 5):
    if i < len(references):
        print(f"Ref:  {references[i]}")
        print(f"Pred: {predictions[i]}")
        print("-" * 20)

Loading HuBERT from: /content/drive/My Drive/hubert-russian-overnight...
Loading Test Dataset (200 samples)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Cleaning data...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Running Evaluation...


  0%|          | 0/200 [00:00<?, ?it/s]


OFFICIAL HUBERT REPORT CARD:
Word Error Rate (WER):      57.11%
Character Error Rate (CER): 15.02%

--- Error Analysis Examples ---
Ref:  шестьдесят тысяч тенге сколько будет стоить 
Pred: шестьдесят тысяча тнге сколько будет стоить
--------------------
Ref:  покажи мне на смотрешке телеканал синергия тв 
Pred: покажи мне на смотрешке телеканал синергия тв
--------------------
Ref:  заказать яблоки зеленые 
Pred: заказод с яблоки зиленые
--------------------
Ref:  алиса закажи килограммовый торт графские развалины 
Pred: алиса закажи килограмовый торт гравские разавалены
--------------------
Ref:  ищи телеканал про бизнес на тиви 
Pred: ищи телеканал про бизныс на тиви
--------------------


In [10]:
#manual rule based ctc
import torch
import torch.nn.functional as F

def manual_ctc_decode(logits, vocab_map):
    """
    Manually implements the CTC Greedy Decoding rules.
    """
    # Rule 1: ARGMAX (Greedy Choice)
    probs = F.softmax(logits, dim=-1)
    best_paths = torch.argmax(probs, dim=-1)

    decoded_raw = []
    decoded_final = []

    prev_char_idx = -1

    for i, char_idx in enumerate(best_paths):
        char_idx = char_idx.item()

        # Store raw path for visualization
        decoded_raw.append(vocab_map.get(char_idx, ""))

        # Rule 2: COLLAPSE REPEATS
        if char_idx != prev_char_idx:
            # Rule 3: REMOVE BLANKS
            if char_idx != 0:
                decoded_final.append(vocab_map.get(char_idx, ""))

        prev_char_idx = char_idx

    return "".join(decoded_final), decoded_raw

# New Vocab: 0=Blank, plus the letters for P-R-I-V-E-T
vocab_map = {
    0: "<blank>",
    1: "П",
    2: "Р",
    3: "И",
    4: "В",
    5: "Е",
    6: "Т"
}

# demo sentence: "П П - Р Р И И - В Е Е Т Т"
# This simulates stuttering (repeats) and pauses (blanks/zeros)
# Indices: 1, 1, 0, 2, 2, 3, 3, 0, 4, 5, 5, 6, 6
fake_indices = [1, 1, 0, 2, 2, 3, 3, 0, 4, 5, 5, 6, 6]

# Turn these indices into "Fake Logits"
T = len(fake_indices)
C = len(vocab_map)
fake_logits = torch.zeros(T, C)

for t, class_idx in enumerate(fake_indices):
    fake_logits[t, class_idx] = 10.0

#run the algorithm
prediction_text, raw_path = manual_ctc_decode(fake_logits, vocab_map)

print("="*40)
print("MANUAL CTC DECODING DEMO")
print("="*40)
print(f"1. Raw Input Sequence (Argmax Path):")
print(f"   {raw_path}")
print("-" * 40)
print(f"2. Apply 'Collapse & Remove Blank' Rules:")
print(f"   '{prediction_text}'")
print("-" * 40)
print(f"3. Success?")
print(f"   {'YES' if prediction_text == 'ПРИВЕТ' else 'NO'}")
print("="*40)

MANUAL CTC DECODING DEMO
1. Raw Input Sequence (Argmax Path):
   ['П', 'П', '<blank>', 'Р', 'Р', 'И', 'И', '<blank>', 'В', 'Е', 'Е', 'Т', 'Т']
----------------------------------------
2. Apply 'Collapse & Remove Blank' Rules:
   'ПРИВЕТ'
----------------------------------------
3. Success?
   YES


In [13]:
#pangram
import torch
import torch.nn.functional as F
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from google.colab import files
# We use the pro model because it's already loaded and good
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-russian"
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

print("pangram test:")
uploaded = files.upload()
filename = next(iter(uploaded))
speech, rate = librosa.load(filename, sr=16000)

input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")
with torch.no_grad():
    logits = model(input_values).logits[0] # Take first item in batch

# First, we need the vocabulary map from the model
vocab_map = {v: k for k, v in processor.tokenizer.get_vocab().items()}

def real_manual_decode(logits, tokenizer):
    probs = F.softmax(logits, dim=-1)
    best_paths = torch.argmax(probs, dim=-1)

    decoded_str = ""
    prev_idx = -1

    # Iterate over time steps
    for idx in best_paths:
        idx = idx.item()

        # In Wav2Vec2, the blank token is usually ID 0 (<pad>)
        if idx != prev_idx and idx != processor.tokenizer.pad_token_id:
            # Convert Int -> Char
            char = processor.tokenizer.decode([idx])
            decoded_str += char

        prev_idx = idx

    return decoded_str

print("\n" + "="*40)
print("MANUAL DECODE")
print("="*40)
prediction = real_manual_decode(logits, processor)
print("PANGRAM: съешь же ещё этих мягких французских булок, да выпей чаю")
print(f"Prediction: {prediction}")
print("="*40)

pangram test:


Saving pangramm6.mp4 to pangramm6 (7).mp4

MANUAL DECODE


  speech, rate = librosa.load(filename, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


PANGRAM: съешь же ещё этих мягких французских булок, да выпей чаю
Prediction: съечьжеещеэтихмягкихфранцузскихбулокдавыпойчаю
