In [1]:
!pip install transformers datasets librosa jiwer==2.3.0
!pip freeze > requirements.txt
# Add requirements, just in case
#!cat requirements.txt



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Replace with your actual path
BASE = '/content/drive/MyDrive/LR_ASR/preprocessing/processed_data/14.0-delta-2023-06-23'
CSV = f'{BASE}/manifest_sw_14_0_delta.csv'
AUDIO_DIR = f'{BASE}/cleaned_sw_audio_14_0_delta'
# If you're using a pre-configured HuggingFace processor, you can comment out or remove VOCAB
VOCAB = '/content/drive/MyDrive/LR_ASR/baseline/vocab.json'

In [4]:
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC
)
#
# 1)
  # 1-1) Feature Extractor (basic XLS-R settings)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

  # 1-2) Tokenizer (using custom vocab.json)
tokenizer = Wav2Vec2CTCTokenizer(
    VOCAB,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [5]:
!pip install -U evaluate jiwer==2.3.0



In [6]:
import os
import pandas as pd
import librosa
import torch
import json
from datasets import load_metric

df = pd.read_csv(CSV)
# Replace original local paths with the correct mounted paths
df["wav_path"] = df["wav_path"].apply(lambda p: os.path.join(AUDIO_DIR, os.path.basename(p)))

# 2. If you want to use the pre configured processor instead,
# comment out the custom processor and
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)
# then uncomment the line below.
# processor = Wav2Vec2Processor.from_pretrained("alokmatta/wav2vec2-large-xlsr-53-sw")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# 3. Inference function, This function is documented in the README.
#    Please review it there and let me know if you spot any issues!
def transcribe(path):
    speech, _ = librosa.load(path, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]

# 4. Run inference on the dataset
df["prediction"] = df["wav_path"].map(transcribe)

# 5. Compute WER and CER
wer = load_metric("wer")
cer = load_metric("cer")

print("Baseline WER:", wer.compute(predictions=df["prediction"], references=df["transcript"]))
print("Baseline CER:", cer.compute(predictions=df["prediction"], references=df["transcript"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  wer = load_metric("wer")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Baseline WER: 1.0
Baseline CER: 0.970987927438215


In [7]:
print(df['prediction'][3])
print(df['transcript'][3])

tdod'd'dwdfdc
Hata hivyo maelfu ya watu waliojitokeza kumsikiliza rais wa chama ,Chamisa , akizungumza inaonyesha hakuna shaka yoyote


In [8]:
df

Unnamed: 0,wav_path,duration,transcript,prediction
0,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,3.252,Ugongwa wa kupinda shingo,3dfdcdfd
1,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,5.979,Dalili ya ugonjwa wa miguu na midomo ni mnyama...,tdfdfdfdcd
2,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,4.046,Wanyama walioathirika na ugonjwa wa ukurutu wa...,3dcdcdfdfdf
3,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,7.815,Hata hivyo maelfu ya watu waliojitokeza kumsik...,tdod'd'dwdfdc
4,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,7.174,Marais Uhuru Kenyata wa Kenya Yoweri Museveni ...,3fdfdfd3fdfdfdfdcdfdfdfdfjdfdfd3dfdfdfdfdfdfd
...,...,...,...,...
265,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,6.413,"Lakini, kwa mujibu wa ripoti ya Benki ya Dunia...",tofdfdfdfdcdfdcdodfdfdfdzdtdfdfdfdfjdf
266,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,2.774,Binti ya Dos Santos apinga kuzikwa kwake Angola,sd3dfdcdfdf
267,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,8.852,Ripoti ya Baraza la Jumuiya ya Afrika Masharik...,ufdfdfdfdfdfotzdfdcdfdfdfdfdcdfdfdfdf
268,/content/drive/MyDrive/LR_ASR/preprocessing/pr...,5.095,Toa taarifa ukishuku kuwepo kwa taarifa yoyote...,dfdfdcdfdfdfdfdfdfdfdw


Note: If WER and CER scores remain unchanged after training, we will revisit and adjust the processor configuration.

#### Todo:
> 1. Train the model on 1 hour of preprocessed Swahili data.
> 2. Update the transcribe() function to use beam search instead of greedy decoding  
(This may be resource intensive in our current development environment.)
> 3. Refine and validate vocab.json

In [9]:
!pip install accelerate -U



In [10]:
import os
import pandas as pd
import librosa
import torch
from datasets import Dataset, load_metric
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer
)
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import numpy as np

In [104]:
print(f"Total samples in dataset: {len(df)}")

Total samples in dataset: 270


In [105]:
train_size = int(0.8 * len(df))
test_size = len(df) - train_size

print(f"Train samples: {train_size} (~{train_size*6/3600:.2f} hours)")
print(f"Test samples: {test_size} (~{test_size*6/3600:.2f} hours)")

Train samples: 216 (~0.36 hours)
Test samples: 54 (~0.09 hours)


In [16]:
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

Train samples: 216
Test samples: 54


In [103]:
from datasets import Dataset, Audio

dataset = Dataset.from_pandas(df[["wav_path", "transcript"]])
dataset = dataset.cast_column("wav_path", Audio(sampling_rate=16000))

dataset = dataset.train_test_split(test_size=0.2)

In [106]:
def normalize(batch):
    batch["transcript"] = batch["transcript"].upper()
    return batch

dataset = dataset.map(normalize)

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [129]:
def prepare_dataset(batch):
    audio = batch["wav_path"]

    audio_inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    batch["input_values"] = audio_inputs["input_values"][0]

    label_inputs = tokenizer(batch["transcript"])
    batch["labels"] = label_inputs["input_ids"]

    return batch


encoded_dataset = dataset.map(prepare_dataset, remove_columns=dataset["train"].column_names, num_proc=4)

Map (num_proc=4):   0%|          | 0/216 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=4):   0%|          | 0/54 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [130]:
print(type(encoded_dataset["train"][0]["input_values"]))
print(np.shape(encoded_dataset["train"][0]["input_values"]))

<class 'list'>
(35888,)


In [136]:
from dataclasses import dataclass
from typing import Union, List, Dict
import torch

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )

        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

In [141]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [142]:
import evaluate
import numpy as np

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [148]:
from transformers import TrainingArguments, Trainer

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(tokenizer),
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/LR_ASR/models/wav2vec2-sw-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=10,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=100,
    save_total_limit=2,
    fp16=True,
    push_to_hub=False,
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [149]:
encoded_dataset["train"][0]["input_values"]

[-0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.0006367168971337378,
 -0.00046059838496148586,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.0006367168971337378,
 -0.0006367168971337378,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148586,
 -0.00046059838496148

In [150]:
from torch.utils.data import DataLoader

dl = DataLoader(encoded_dataset["train"], batch_size=8, collate_fn=data_collator)
batch = next(iter(dl))
print(batch.keys())
print(batch["input_values"].shape)
print(batch["labels"].shape)

dict_keys(['input_values', 'attention_mask', 'labels'])
torch.Size([8, 72272])
torch.Size([8, 66])


In [151]:
print("Tokenizer vocab size:", len(tokenizer))
print("Model vocab size:", model.config.vocab_size)

Tokenizer vocab size: 41
Model vocab size: 41


In [None]:
trainer.train()