In [None]:
!pip install datasets transformers noisereduce librosa evaluate
!pip install jiwer
!pip install transformers[torch]
!pip install accelerate -U
!pip install torch h5py transformers

In [None]:
# Mount Google Drive so the dataset archive and model files stored under
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the dataset
!tar -zxvf /content/drive/MyDrive/cv-corpus-12.0-delta-2022-12-07-en.tar.gz -C /content/

In [None]:
import h5py
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

def load_model_from_h5(model, h5_path):
    """Copy parameter values stored in an HDF5 file into ``model`` in place.

    The file is expected to contain one dataset per entry of
    ``model.named_parameters()``, keyed by the parameter name.

    Args:
        model: Module whose named parameters are overwritten.
        h5_path: Path of the HDF5 file to read.

    Raises:
        KeyError: If a parameter name has no dataset in the file.
        ValueError: If a stored array's shape differs from the parameter's
            shape (``copy_`` would otherwise broadcast it silently).
    """
    with h5py.File(h5_path, 'r') as h5file:
        params = dict(model.named_parameters())

        # Validate everything before touching the model so that a bad file
        # never leaves it half-loaded.
        missing = [name for name in params if name not in h5file]
        if missing:
            raise KeyError(f"Parameters missing from {h5_path}: {missing}")

        mismatched = [
            (name, tuple(h5file[name].shape), tuple(param.shape))
            for name, param in params.items()
            if tuple(h5file[name].shape) != tuple(param.shape)
        ]
        if mismatched:
            raise ValueError(
                f"Shape mismatch (name, stored, expected) in {h5_path}: {mismatched}"
            )

        # In-place copies on parameters must not be recorded by autograd.
        with torch.no_grad():
            for name, param in params.items():
                param.copy_(torch.as_tensor(h5file[name][...]))


# Identifier passed to from_pretrained for both the processor and the model.
# NOTE(review): "whisper-small-dv" is a relative identifier here, while a later
# cell loads from "/content/drive/MyDrive/whisper-small-dv" - confirm which one
# is intended.
model_name = "whisper-small-dv"
# HDF5 file holding one dataset per model parameter (read by load_model_from_h5).
h5_file_path = "/content/drive/MyDrive/whisper_model_weights_judy.h5"

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Overwrite the freshly loaded parameters with the values stored in the H5 file.
load_model_from_h5(model, h5_file_path)



In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Path to your fine-tuned model
# (a directory on the mounted Drive; processor and model are both read from it)
model_directory = "/content/drive/MyDrive/whisper-small-dv"

# Load the processor and model
# NOTE: this rebinds the global `processor` and `model`, replacing any objects
# created by an earlier cell.
processor = WhisperProcessor.from_pretrained(model_directory)
model = WhisperForConditionalGeneration.from_pretrained(model_directory)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Import necessary libraries
import pandas as pd
import os
from datasets import Dataset, DatasetDict, Audio, load_metric
import noisereduce as nr
import librosa
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
import evaluate
from functools import partial

In [None]:
# Define paths
metadata_path = '/content/cv-corpus-12.0-delta-2022-12-07/en/validated.tsv'
clips_path = '/content/cv-corpus-12.0-delta-2022-12-07/en/clips/'

# Subset configuration: how many examples to keep in total, how many of those
# are held out for evaluation, and the seed that makes both steps repeatable.
num_samples = 3500
num_test_samples = 600
split_seed = 42

# Load the metadata
metadata = pd.read_csv(metadata_path, sep='\t')

# Keep only the clip file name and its transcript, and expand the file name to
# a full path. `.assign` builds a new frame instead of writing into a slice of
# the original one (which triggers pandas' SettingWithCopyWarning).
metadata = metadata[['path', 'sentence']].assign(
    path=lambda frame: frame['path'].apply(lambda x: os.path.join(clips_path, x))
)

# Convert metadata to Hugging Face Dataset
dataset = Dataset.from_pandas(metadata)

# Reduce the dataset size to avoid crashing Colab RAM (optional).
# The cap is clamped so that a corpus smaller than `num_samples` still works.
dataset = dataset.shuffle(seed=split_seed).select(
    range(min(num_samples, len(dataset)))
)
# Split the dataset: with the defaults above this yields 2900 train and
# 600 test examples. The split is seeded so reruns give the same partition.
train_test_split = dataset.train_test_split(
    test_size=num_test_samples, seed=split_seed
)
common_voice = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})


In [None]:
def load_audio(batch):
    """Attach the decoded waveform of ``batch["path"]`` under ``batch["audio"]``.

    The clip is read with librosa at the sampling rate of the processor's
    feature extractor. The waveform and the rate reported by librosa are stored
    as a dict with the keys ``"array"`` and ``"sampling_rate"``; the same
    ``batch`` object is returned.
    """
    clip_path = batch["path"]
    target_rate = processor.feature_extractor.sampling_rate
    waveform, loaded_rate = librosa.load(clip_path, sr=target_rate)
    batch["audio"] = dict(array=waveform, sampling_rate=loaded_rate)
    return batch

In [None]:
# Load the audio files
# (runs load_audio on every example of each split, adding an "audio" column)
common_voice = common_voice.map(load_audio)

Map:   0%|          | 0/2900 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
# Set sampling rate
# The target rate is taken from the processor's feature extractor; the "audio"
# column is then cast to an Audio feature at that rate.
sampling_rate = processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))
# Noise-reduction helper
def reduce_noise(audio_array, sample_rate):
    """Return ``audio_array`` after noisereduce's ``reduce_noise`` at ``sample_rate``."""
    return nr.reduce_noise(y=audio_array, sr=sample_rate)

# Normalization helper
def normalize_audio(audio_array):
    """Return ``audio_array`` passed through ``librosa.util.normalize``."""
    return librosa.util.normalize(audio_array)

# Per-example audio preprocessing
def preprocess_audio(example):
    """Denoise and then normalize the waveform stored in ``example["audio"]``.

    Reads ``example["audio"]["array"]`` and ``example["audio"]["sampling_rate"]``,
    runs ``reduce_noise`` followed by ``normalize_audio``, writes the result
    back to ``example["audio"]["array"]`` and returns the same ``example``.
    """
    waveform = example["audio"]["array"]
    rate = example["audio"]["sampling_rate"]

    # Denoise first, then normalize the denoised signal.
    cleaned = normalize_audio(reduce_noise(waveform, rate))

    example["audio"]["array"] = cleaned
    return example
# Apply preprocessing to every split (both train and test are mapped)
common_voice = common_voice.map(preprocess_audio)
# Re-apply the Audio feature type at the target sampling rate after the map
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

Map:   0%|          | 0/2900 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
def prepare_dataset(example):
    """Convert one example into processor outputs plus its duration.

    The waveform and sampling rate from ``example["audio"]`` and the text from
    ``example["sentence"]`` are passed to the global ``processor``. The
    processor's result is returned with an extra ``"input_length"`` entry:
    the number of samples divided by the sampling rate.
    """
    audio = example["audio"]
    waveform, rate = audio["array"], audio["sampling_rate"]
    transcript = example["sentence"]

    features = processor(audio=waveform, sampling_rate=rate, text=transcript)
    features["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    return features
# Run prepare_dataset over each split, dropping the raw "audio" and "sentence"
# columns from the result; num_proc=1 keeps the map in a single process.
common_voice = common_voice.map(
    prepare_dataset, remove_columns=["audio", "sentence"], num_proc=1
)


Map:   0%|          | 0/2900 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2900 [00:00<?, ? examples/s]

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate per-example input features and label ids into one padded batch."""

    # Object exposing `feature_extractor` and `tokenizer`; both must offer `pad`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from ``features``.

        Each feature supplies ``"input_features"`` (only its first element is
        used) and ``"labels"``. Inputs are padded by the feature extractor and
        labels by the tokenizer. Label positions whose attention mask is not 1
        are set to -100, and a leading column is dropped when every row starts
        with the tokenizer's BOS id.
        """
        tokenizer = self.processor.tokenizer

        # Input side: unwrap the leading dimension, then pad into tensors.
        audio_items = []
        for feature in features:
            audio_items.append({"input_features": feature["input_features"][0]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Label side: pad, then mask out the padded positions with -100.
        token_items = [{"input_ids": feature["labels"]} for feature in features]
        padded_labels = tokenizer.pad(token_items, return_tensors="pt")
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the first column only when every row begins with BOS.
        all_start_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


# Collator instance handed to the trainer; it pads with the global `processor`.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# Word- and character-error-rate metrics loaded through `evaluate`, plus the
# text normalizer that compute_metrics applies before the normalized scores.
metric_wer = evaluate.load("wer")
metric_cer = evaluate.load("cer")
normalizer = BasicTextNormalizer()

In [None]:
# Compute metrics function
def compute_metrics(pred):
    """Compute orthographic and normalized WER/CER, each scaled by 100.

    Args:
        pred: Object with ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, where -100 marks positions
            to ignore).

    Returns:
        dict with the keys ``"wer_ortho"``, ``"cer_ortho"``, ``"wer"`` and
        ``"cer"``. The ``*_ortho`` values are computed on the decoded text as
        is; the other two on normalized text, after dropping pairs whose
        normalized reference is empty.
    """
    import copy  # local import: stdlib only, keeps this cell self-contained

    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = copy.deepcopy(pred.label_ids)

    # Replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Compute orthographic WER and CER
    wer_ortho = 100 * metric_wer.compute(predictions=pred_str, references=label_str)
    cer_ortho = 100 * metric_cer.compute(predictions=pred_str, references=label_str)

    # Normalize, then keep only the pairs whose normalized reference is
    # non-empty. Filtering the pairs together keeps both lists aligned.
    kept_pairs = [
        (prediction, reference)
        for prediction, reference in (
            (normalizer(p), normalizer(l)) for p, l in zip(pred_str, label_str)
        )
        if len(reference) > 0
    ]
    pred_str_norm = [prediction for prediction, _ in kept_pairs]
    label_str_norm = [reference for _, reference in kept_pairs]

    # Compute normalized WER and CER
    wer = 100 * metric_wer.compute(predictions=pred_str_norm, references=label_str_norm)
    cer = 100 * metric_cer.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "cer_ortho": cer_ortho, "wer": wer, "cer": cer}


In [None]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-dv-safouh",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective train batch of 16 per device
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    max_steps=1000,
    fp16=True,
    fp16_full_eval=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    # Cap generation at 448 tokens, the max_length the model's generation
    # config reports in the training logs. A larger cap (previously 16384)
    # gives no benefit and lets a run that never emits EOS overrun it.
    generation_max_length=448,
    # Save and evaluate on the same schedule so load_best_model_at_end can
    # pair every evaluation with a checkpoint.
    save_steps=500,
    eval_steps=500,
    logging_steps=50,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    # NOTE(review): presumably needs Hub credentials configured beforehand;
    # no visible cell logs in - confirm.
    push_to_hub=True,
    weight_decay=0.1,
)


In [None]:
from transformers import Seq2SeqTrainer, TrainerCallback
import shutil

class SaveModelCallback(TrainerCallback):
    """Copy the trainer's output directory to a second location when training ends.

    Args:
        output_dir: Destination directory. Defaults to the Drive folder
            "/content/drive/MyDrive/whisper-small-safouh", so the callback can
            still be constructed without arguments.
    """

    def __init__(self, output_dir="/content/drive/MyDrive/whisper-small-safouh"):
        self.output_dir = output_dir

    def on_train_end(self, args, state, control, **kwargs):
        """Mirror ``args.output_dir`` into ``self.output_dir`` and report it."""
        # exist_ok avoids a separate existence check before creating the folder.
        os.makedirs(self.output_dir, exist_ok=True)
        # dirs_exist_ok lets repeated runs overwrite or extend an earlier copy.
        shutil.copytree(args.output_dir, self.output_dir, dirs_exist_ok=True)
        print(f"Model saved to {self.output_dir}")


In [None]:
# Initialize trainer with the callback
# NOTE(review): SaveModelCallback is passed as a class rather than an instance;
# presumably the trainer instantiates it - confirm.
# NOTE(review): the processor is supplied through the `tokenizer` argument;
# verify that the installed transformers version still accepts that name.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[SaveModelCallback]
)


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Continue training
# (runs the loop configured on `trainer`, starting from the weights already
# loaded into `model`)
trainer.train()

# Persist model parameters as HDF5
def save_model_to_h5(model, h5_path):
    """Write every named parameter of ``model`` to the HDF5 file ``h5_path``.

    The file is opened in write mode. One dataset is created per parameter,
    keyed by the parameter name and holding its detached CPU values as a
    numpy array.
    """
    with h5py.File(h5_path, 'w') as out_file:
        for param_name, tensor in model.named_parameters():
            values = tensor.detach().cpu().numpy()
            out_file.create_dataset(param_name, data=values)

# Example usage after training
# NOTE: this is the same Drive path an earlier cell reads with
# load_model_from_h5, and the file is opened in write mode, so the previous
# weights file is overwritten.
h5_file_path = "/content/drive/MyDrive/whisper_model_weights_judy.h5"
save_model_to_h5(model, h5_file_path)



Step,Training Loss,Validation Loss,Wer Ortho,Cer Ortho,Wer,Cer
500,0.0639,0.202763,11.69241,6.071019,10.340314,5.517974


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


Step,Training Loss,Validation Loss,Wer Ortho,Cer Ortho,Wer,Cer
500,0.0639,0.202763,11.69241,6.071019,10.340314,5.517974
1000,0.0052,0.196555,10.181033,4.985545,8.835079,4.53961


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Model saved to /content/drive/MyDrive/whisper-small-dv


In [None]:
# Mount Google Drive again for the inference cells below, which read the
# fine-tuned checkpoint from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Define the path to the fine-tuned model checkpoint
fine_tuned_model_checkpoint = "/content/drive/MyDrive/whisper-small-dv/checkpoint-1000"  # Update with your actual path

# Load the fine-tuned model and processor
# Both are read from the same checkpoint directory; this rebinds the global
# `processor` and `model` that the transcription functions below rely on.
processor = WhisperProcessor.from_pretrained(fine_tuned_model_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(fine_tuned_model_checkpoint)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
!pip install pydub
!pip install SpeechRecognition pydub
from demucs import separate
!pip install ipython

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import math
import os
import tempfile

import librosa
import noisereduce as nr
import numpy as np
import torch
from pydub import AudioSegment


def split_audio(audio, segment_length_ms):
    """Split ``audio`` into consecutive segments of at most ``segment_length_ms``.

    Args:
        audio: Sliceable object supporting ``len()``; the slice bounds are
            computed in the same unit that ``len(audio)`` reports.
        segment_length_ms: Length of each segment; must be positive. The last
            segment is shorter when the total length is not a multiple of it.

    Returns:
        list: The segments in order; empty when ``audio`` has length 0.

    Raises:
        ValueError: If ``segment_length_ms`` is zero or negative.
    """
    if segment_length_ms <= 0:
        raise ValueError(
            f"segment_length_ms must be positive, got {segment_length_ms}"
        )

    total_length = len(audio)
    segment_count = math.ceil(total_length / segment_length_ms)

    segments = []
    for index in range(segment_count):
        start_ms = index * segment_length_ms
        # Clamp the end so the final segment never runs past the audio.
        end_ms = min(start_ms + segment_length_ms, total_length)
        segments.append(audio[start_ms:end_ms])
    return segments

def remove_background_music(audio_segment):
    """Return a noise-reduced copy of ``audio_segment``.

    Despite its name, this applies noisereduce's ``reduce_noise`` to the
    samples; it does not perform music source separation.

    Args:
        audio_segment: pydub ``AudioSegment`` to clean.

    Returns:
        AudioSegment: New segment with the same frame rate, sample width and
        channel count as the input.
    """
    channels = audio_segment.channels

    # Convert the audio segment to a numpy array
    samples = np.array(audio_segment.get_array_of_samples())
    if channels > 1:
        # pydub returns multi-channel samples interleaved in one flat array;
        # noisereduce expects (channels, frames), so de-interleave first.
        samples = samples.reshape((-1, channels)).T

    # Perform noise reduction
    reduced_noise = nr.reduce_noise(y=samples, sr=audio_segment.frame_rate)

    # tobytes() below must emit samples of the original width, so pin the
    # dtype explicitly (a no-op when it already matches).
    reduced_noise = np.asarray(reduced_noise).astype(samples.dtype, copy=False)
    if channels > 1:
        # Back to (frames, channels): tobytes() serialises in C order, which
        # restores the interleaved layout.
        reduced_noise = reduced_noise.T

    # Convert back to AudioSegment
    reduced_audio = AudioSegment(
        reduced_noise.tobytes(),
        frame_rate=audio_segment.frame_rate,
        sample_width=audio_segment.sample_width,
        channels=channels
    )

    return reduced_audio

def transcribe_audio(file_path, segment_length_ms):
    """Transcribe an audio file segment by segment.

    The file is loaded with pydub, cut by ``split_audio``, and each segment is
    noise-reduced, written to a temporary WAV, reloaded with librosa at the
    processor's sampling rate and decoded with the global ``model``.

    Args:
        file_path: Path of the audio file to transcribe.
        segment_length_ms: Segment length passed on to ``split_audio``.

    Returns:
        list[str]: One transcription per segment, in segment order.
    """
    # Load the audio file
    audio = AudioSegment.from_file(file_path)

    # Split the audio file into segments
    audio_segments = split_audio(audio, segment_length_ms)

    target_rate = processor.feature_extractor.sampling_rate
    transcriptions = []

    # Intermediate WAVs live in a temporary directory so they do not pile up
    # in the working directory and are removed even if a segment fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Process each segment
        for i, segment in enumerate(audio_segments):
            # Apply noise reduction to the segment
            clean_segment = remove_background_music(segment)

            # Export the segment to a temporary WAV file; export() returns the
            # open file handle, which is closed here instead of being leaked.
            segment_file_path = os.path.join(tmp_dir, f"segment_{i}.wav")
            clean_segment.export(segment_file_path, format="wav").close()

            # Load the audio segment using librosa
            audio_array, sampling_rate = librosa.load(segment_file_path, sr=target_rate)

            # Preprocess the audio file
            inputs = processor(audio=audio_array, sampling_rate=sampling_rate, return_tensors="pt")
            # Keep the inputs on the model's device (matters once it is on a GPU).
            input_features = inputs["input_features"].to(model.device)

            # Generate transcription
            with torch.no_grad():
                generated_ids = model.generate(input_features)

            # Decode the generated ids to text
            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription)

    return transcriptions

# Example usage
file_path = '/content/youtube_6669ad4d1fcae.wav'  # Change this to the path of your audio file
segment_length_ms = 10 * 1000  # Split into 10-second segments
# transcribe_audio returns one transcription string per segment, in order.
transcriptions = transcribe_audio(file_path, segment_length_ms)

# Print all transcriptions
for i, transcription in enumerate(transcriptions):
    print(f"Segment {i} transcription: {transcription}")


Segment 0 transcription: Technology has had a profound interest on our daily lives, performing the label community, worked and access information.
Segment 1 transcription: One area of appearance influence is particularly every as it is social media.
Segment 2 transcription: Constagraph have revolutionized the wavey connected and shares with others.
Segment 3 transcription: Volruvius and Conbino.
Segment 4 transcription: It has also opened up new Avenue's four interact, allowed us to connected with light-listed.
Segment 5 transcription: Actually communities.Moreover, social media has an label as the state format about current events and engaged.
Segment 6 transcription: Gushing's on various topics.
Segment 7 transcription: Other personnel relationships on the positive sign it allow us to state connected with France and France, especially those who need railway.
Segment 8 transcription: Rickenshire life upgrades celebrate male stocks and offer support through online platforms.
Segment 9 