In [18]:
# Whisper Fine-Tuning sur Common Voice Bassa

## 1. Configuration

from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
import torchaudio
import torch
import pandas as pd
import os


In [16]:


## 2. Chargement du corpus Common Voice Bassa (après extraction)


# Dossiers
data_dir = "Bassa"
clips_wav_dir = os.path.join(data_dir, "clips_wav")

# Chargement des TSV
train_df = pd.read_csv(os.path.join(data_dir, "train.tsv"), sep="\t")
dev_df = pd.read_csv(os.path.join(data_dir, "dev.tsv"), sep="\t")
test_df = pd.read_csv(os.path.join(data_dir, "test.tsv"), sep="\t")

# Modifier les chemins pour utiliser les .wav au lieu des .mp3
for df in [train_df, dev_df, test_df]:
    df['audio_path'] = df['path'].apply(
        lambda p: os.path.join(clips_wav_dir, p.replace('.mp3', '.wav'))
    )

# Vérification
print(train_df[['audio_path', 'sentence']].head())

                                      audio_path                    sentence
0  Bassa\clips_wav\common_voice_bas_41203802.wav                     Tôs nu.
1  Bassa\clips_wav\common_voice_bas_41203803.wav              Mbas i nhamba.
2  Bassa\clips_wav\common_voice_bas_41203804.wav  Ba ntip babaa hiloga hini.
3  Bassa\clips_wav\common_voice_bas_41203806.wav      A nlôm e; a nlôm hyéé.
4  Bassa\clips_wav\common_voice_bas_41203812.wav    Ba njul bôt i kédé hisi.


In [12]:
# Vérifier si les fichiers .wav sont bien là
for name, df in zip(['train', 'dev', 'test'], [train_df, dev_df, test_df]):
    missing = df[~df['audio_path'].apply(os.path.exists)]
    print(f"{name} — Fichiers audio manquants : {len(missing)}")


train — Fichiers audio manquants : 0
dev — Fichiers audio manquants : 0
test — Fichiers audio manquants : 0


In [19]:
import torchaudio

# Exemple : charger un fichier wav
waveform, sample_rate = torchaudio.load(train_df.loc[0, 'audio_path'])
print(waveform.shape, sample_rate)


torch.Size([1, 53568]) 16000


# Whisper Train

In [1]:
## 1. Configuration


import os
import pandas as pd
import torchaudio
from datasets import Dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


## 2. Chemins d'accès aux données


data_dir = "Bassa"
clips_wav_dir = os.path.join(data_dir, "clips_wav")

# Charger les fichiers TSV
def load_split(split_name):
    df = pd.read_csv(os.path.join(data_dir, f"{split_name}.tsv"), sep="\t")
    df = df[df['sentence'].notnull()]  # enlever les phrases manquantes
    df['audio_path'] = df['path'].apply(lambda p: os.path.join(clips_wav_dir, p.replace('.mp3', '.wav')))
    return df

train_df = load_split("train")
dev_df = load_split("dev")


In [3]:


## 3. Création du Dataset compatible Hugging Face


def df_to_dataset(df):
    return Dataset.from_pandas(df[['audio_path', 'sentence']])

train_dataset = df_to_dataset(train_df)
dev_dataset = df_to_dataset(dev_df)

In [4]:
train_dataset.shape, dev_dataset.shape

((2109, 2), (1328, 2))

In [5]:
## 4. Traitement audio et texte pour Whisper


processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")


def preprocess(example):
    # Charger et resampler l'audio
    speech_array, sampling_rate = torchaudio.load(example['audio_path'])
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = resampler(speech_array)

    # Extraire les features audio
    inputs = processor(
        speech_array.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    )

    # Extraire les labels texte
    labels = processor.tokenizer(
        example["sentence"],
        return_tensors="pt",
        padding=False,
        truncation=True
    )

    return {
        "input_features": inputs.input_features[0],  # torch.Tensor
        "labels": labels.input_ids[0]               # torch.Tensor
    }


'''
# Appliquer le prétraitement
train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
dev_dataset = dev_dataset.map(preprocess, remove_columns=dev_dataset.column_names)
'''

'\n# Appliquer le prétraitement\ntrain_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)\ndev_dataset = dev_dataset.map(preprocess, remove_columns=dev_dataset.column_names)\n'

In [6]:
## 5. Chargement du modèle
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []


In [7]:
import transformers
print(transformers.__version__)


4.53.0


In [8]:
def whisper_data_collator(features):
    audio_paths = [f["audio_path"] for f in features]
    texts = [f["sentence"] for f in features]

    input_features = []
    for path in audio_paths:
        speech_array, sr = torchaudio.load(path)
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
            speech_array = resampler(speech_array)
        processed = processor(
            speech_array.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        )
        input_features.append(processed.input_features[0])

    batch = processor.feature_extractor.pad(
        {"input_features": input_features},
        return_tensors="pt"
    )

    label_batch = processor.tokenizer(
        texts,
        padding=True,
        return_tensors="pt",
        truncation=True
    )
    labels = label_batch["input_ids"].masked_fill(label_batch["attention_mask"].ne(1), -100)

    batch["labels"] = labels

    #print("🚨 batch keys:", batch.keys())
    #print("✅ input_features shape:", batch["input_features"].shape)
    #print("✅ labels shape:", labels.shape)

    return batch


In [11]:
for row in train_dataset:
    if row["sentence"] is None or row["sentence"] == "":
        print("❌ Texte vide :", row)


In [10]:
train_dataset = train_dataset.filter(lambda x: x["sentence"] is not None and x["sentence"] != "")
dev_dataset = dev_dataset.filter(lambda x: x["sentence"] is not None and x["sentence"] != "")


Filter:   0%|          | 0/2109 [00:00<?, ? examples/s]

Filter: 100%|██████████| 2109/2109 [00:00<00:00, 30724.35 examples/s]
Filter: 100%|██████████| 1328/1328 [00:00<00:00, 142905.71 examples/s]


In [9]:


## 6. Entraînement avec HuggingFace Trainer


training_args = TrainingArguments(
    output_dir="./whisper-bassa-model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    learning_rate=1e-4,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
    report_to="none",
    remove_unused_columns=False 
)




from jiwer import wer, cer

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": wer(label_str, pred_str), "cer": cer(label_str, pred_str)}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=None,
    data_collator=whisper_data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [12]:
## 7. Lancer l'entraînement
trainer.train(resume_from_checkpoint=True)


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
3000,0.0605
3500,0.0336
4000,0.0148
4500,0.0041
5000,0.0008




TrainOutput(global_step=5280, training_loss=0.010810695371280115, metrics={'train_runtime': 13946.9808, 'train_samples_per_second': 1.512, 'train_steps_per_second': 0.379, 'total_flos': 5.192122871808e+17, 'train_loss': 0.010810695371280115, 'epoch': 10.0})

In [None]:

## 8. Évaluer le modèle
metrics = trainer.evaluate()
#trainer.evaluate(eval_dataset=dev_dataset)
print(metrics)


In [None]:
from jiwer import wer
from tqdm import tqdm

def evaluate_in_batches(dataset, batch_size=4):
    model.eval()
    all_preds = []
    all_labels = []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        input_features = []
        labels_text = []

        for j in range(len(batch["audio_path"])):
            path = batch["audio_path"][j]
            text = batch["sentence"][j]

            speech_array, sr = torchaudio.load(path)
            if sr != 16000:
                resampler = torchaudio.transforms.Resample(sr, 16000)
                speech_array = resampler(speech_array)

            inputs = processor(speech_array.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
            input_features.append(inputs.input_features[0])
            labels_text.append(text)

        batch_input = processor.feature_extractor.pad({"input_features": input_features}, return_tensors="pt")
        input_tensor = batch_input.input_features.to(model.device)

        with torch.no_grad():
            predicted_ids = model.generate(input_tensor)

        pred_str = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        all_preds.extend(pred_str)
        all_labels.extend(labels_text)

    return {"wer": wer(all_labels, all_preds)}


# ➤ Évaluation
metrics = evaluate_in_batches(dev_dataset, batch_size=2)
print("✅ WER sur dev_dataset :", metrics)


  0%|          | 0/664 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 664/664 [22:19<00:00,  2.02s/it]


✅ WER sur dev_dataset : {'wer': 0.5071082879612825}


: 

In [13]:
## 9. Sauvegarde du modèle
trainer.save_model("./API Model/model/model_final")

In [None]:
def transcribe_audio(audio_path):
    speech_array, sr = torchaudio.load(audio_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech_array = resampler(speech_array)
    
    inputs = processor(speech_array.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
    input_features = inputs.input_features.to(model.device)
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# Exemple d'utilisation :
transcribe_audio("C:/Users/GENIUS ELECTRONICS/STT-Bassa/Bassa/test/test1.wav")


  from .autonotebook import tqdm as notebook_tqdm


[]