In [1]:
!pip install transformers datasets librosa jiwer
!pip freeze > requirements.txt
# Add requirements, just in case
!cat requirements.txt

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.13.0
absl-py==1.4.0
accelerate==1.8.1
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.15
aiosignal==1.3.2
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
array_record==0.7.2
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.23.0.39.50
astunpars

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Replace with your actual path
BASE = '/content/drive/MyDrive/ASR_wav2vec_project/preprocessing/processed_data/14.0-delta-2023-06-23'
CSV = f'{BASE}/manifest_sw_14_0_delta.csv'
AUDIO_DIR = f'{BASE}/cleaned_sw_audio_14_0_delta'
# If you're using a pre-configured HuggingFace processor, you can comment out or remove VOCAB
VOCAB = f'{BASE}/vocab.json'

In [4]:
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2Processor,
    Wav2Vec2ForCTC
)
#
# 1)
  # 1-1) Feature Extractor (basic XLS-R settings)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

  # 1-2) Tokenizer (using custom vocab.json)
tokenizer = Wav2Vec2CTCTokenizer(
    VOCAB,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [8]:
import os
import pandas as pd
import librosa
import torch
import json
from evaluate import load

df = pd.read_csv(CSV)
# Replace original local paths with the correct mounted paths
df["wav_path"] = df["wav_path"].apply(lambda p: os.path.join(AUDIO_DIR, os.path.basename(p)))

# 2. If you want to use the pre configured processor instead,
# comment out the custom processor and
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)
# then uncomment the line below.
# processor = Wav2Vec2Processor.from_pretrained("alokmatta/wav2vec2-large-xlsr-53-sw")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# 3. Inference function, This function is documented in the README.
#    Please review it there and let me know if you spot any issues!
def transcribe(path):
    speech, _ = librosa.load(path, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]

# 4. Run inference on the dataset
df["prediction"] = df["wav_path"].map(transcribe)

# 5. Compute WER and CER
wer = load("wer")
cer = load("cer")

print("Baseline WER:", wer.compute(predictions=df["prediction"], references=df["transcript"]))
print("Baseline CER:", cer.compute(predictions=df["prediction"], references=df["transcript"]))

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Baseline WER: 1.0
Baseline CER: 0.9747803552240693


In [None]:
print(df['prediction'][3])
print(df['transcript'][3])

zjkjkjkjzjz
Hata hivyo maelfu ya watu waliojitokeza kumsikiliza rais wa chama ,Chamisa , akizungumza inaonyesha hakuna shaka yoyote


In [None]:
df

Unnamed: 0,wav_path,duration,transcript,prediction
0,/content/drive/MyDrive/ASR_wav2vec_project/pre...,3.252,Ugongwa wa kupinda shingo,kjkj
1,/content/drive/MyDrive/ASR_wav2vec_project/pre...,5.979,Dalili ya ugonjwa wa miguu na midomo ni mnyama...,zjkj
2,/content/drive/MyDrive/ASR_wav2vec_project/pre...,4.046,Wanyama walioathirika na ugonjwa wa ukurutu wa...,kjkjkj
3,/content/drive/MyDrive/ASR_wav2vec_project/pre...,7.815,Hata hivyo maelfu ya watu waliojitokeza kumsik...,zjkjkjkjzjz
4,/content/drive/MyDrive/ASR_wav2vec_project/pre...,7.174,Marais Uhuru Kenyata wa Kenya Yoweri Museveni ...,yjyjkj
...,...,...,...,...
265,/content/drive/MyDrive/ASR_wav2vec_project/pre...,6.413,"Lakini, kwa mujibu wa ripoti ya Benki ya Dunia...",yjkjkjkjyjyjyjyj
266,/content/drive/MyDrive/ASR_wav2vec_project/pre...,2.774,Binti ya Dos Santos apinga kuzikwa kwake Angola,yjyjkjyjy
267,/content/drive/MyDrive/ASR_wav2vec_project/pre...,8.852,Ripoti ya Baraza la Jumuiya ya Afrika Masharik...,kjyjyjyjkjyjyjkjyj
268,/content/drive/MyDrive/ASR_wav2vec_project/pre...,5.095,Toa taarifa ukishuku kuwepo kwa taarifa yoyote...,jkjyjz


Note: If WER and CER scores remain unchanged after training, we will revisit and adjust the processor configuration.

#### Todo:
> 1. Train the model on 1 hour of preprocessed Swahili data.
> 2. Update the transcribe() function to use beam search instead of greedy decoding  
(This may be resource intensive in our current development environment.)
> 3. Refine and validate vocab.json