# Preprocess Dataset: Segmentation and Train, Test, Val Split

In [18]:
import json, random
from pathlib import Path
from pydub import AudioSegment
import kagglehub
from preprocess_utils import segment_words
import os

# --- Disable parallelism for tokenizers to avoid warnings ---
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Download Kaggle dataset ---
path = kagglehub.dataset_download("etaifour/trump-speeches-audio-and-word-transcription")

# --- Settings ---
output_dir = Path("processed_dataset")
split_ratios = (0.8, 0.1, 0.1) # train, val, test
min_len, max_len = 5.0, 30.0  # max and min length of audio segements in seconds
random.seed(42)

output_dir.mkdir(parents=True, exist_ok=True)
meta = []
dataset_root = Path(path)

# --- Process each audio + transcript pair ---
for file in ["Trump_WEF_2018", "Trumps_speech_at_75th_d_day_anniversary_in_normandy_full_remarks_UhOMVlQxapY", "state of the union 2018", "state-of-the-union-trump_2019-02-05-225820-8225-0-0-0.64kmono"]:
    audio_file = dataset_root / f"{file}.mp3"
    json_file = audio_file.with_name(audio_file.name + ".json")

    if not json_file or not json_file.exists():
        print(f"⚠️ Skipping {audio_file.name}, no matching transcript.")
        continue
    audio = AudioSegment.from_file(audio_file)
    with open(json_file, "r", encoding="utf-8-sig") as f:
        transcript = json.load(f)

    segments = segment_words(transcript["words"], min_len=5.0, max_len=30.0)

    for i, (start, end, text) in enumerate(segments):
        clip_name = f"{audio_file.stem}_{i:04d}.wav"
        clip_path = output_dir / "audio" / clip_name
        clip_path.parent.mkdir(exist_ok=True)

        clip = audio[start * 1000 : end * 1000]
        clip.export(clip_path, format="wav", parameters=["-ar", "16000", "-ac", "1"])

        meta.append({
            "audio": f"audio/{clip_name}",  # relative path
            "text": text.strip(),
            "duration": round(end - start, 3)
        })

# --- Split into train/val/test ---
random.shuffle(meta)
n = len(meta)
n_train = int(split_ratios[0] * n)
n_val = int(split_ratios[1] * n)

splits = {
    "train": meta[:n_train],
    "validation": meta[n_train:n_train + n_val],
    "test": meta[n_train + n_val:]
}

# --- Save to JSONL files ---
for split, items in splits.items():
    with open(output_dir / f"{split}.json", "w", encoding="utf-8") as f:
        for item in items:
            json.dump(item, f)
            f.write("\n")  # JSONL format

print(f"\n✅ Created {len(meta)} total segments.")
print(f"Train: {len(splits['train'])}, Val: {len(splits['validation'])}, Test: {len(splits['test'])}")
print(f"Processed dataset saved at: {output_dir.resolve()}")



✅ Created 756 total segments.
Train: 604, Val: 75, Test: 77
Processed dataset saved at: /Users/johnm/ISE/AdvancedAI/AI/fine-tune-whisper/processed_dataset


# Setup Weights and Biases for Metrics, Whisper Processor, Import Datasets

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import wandb, torch
from jiwer import wer
from data_collator import DataCollatorSpeechSeq2SeqWithPadding
from prepare_dataset import AudioTextDataset

# Load the processor for feature extraction and tokenization
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# Initialize the data collator to pad variable-length audio/text inputs within a batch
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Initialize Weights & Biases for experiment tracking
wandb.init(
    project="whisper-fine-tune",  # Name of the project on wandb
)

# --- Decoder settings ---
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids("<|startoftranscript|>")

# --- Metrics computation (Word Error Rate) ---
def compute_metrics(pred):
    labels = pred.label_ids
    labels[labels == -100] = processor.tokenizer.pad_token_id

    preds = processor.batch_decode(pred.predictions, skip_special_tokens=True)
    refs = processor.batch_decode(labels, skip_special_tokens=True)

    return {"wer": wer(refs, preds)}

# --- Datasets ---
train_dataset = AudioTextDataset(json_path="processed_dataset/train.json", processor=processor)
val_dataset   = AudioTextDataset(json_path="processed_dataset/validation.json", processor=processor)
test_dataset  = AudioTextDataset(json_path="processed_dataset/test.json", processor=processor)

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Full Fine Tuning

In [None]:
# Full fine-tuning

learning_rate = 2e-5
warmup_steps = 50

training_args_full = Seq2SeqTrainingArguments(
    output_dir="checkpoints/full",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    save_strategy="steps",
    save_steps=100,
    num_train_epochs=8,   # reduce epochs for small dataset
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    save_total_limit=2,
    report_to=["wandb"],  # Log metrics to Weights & Biases
    fp16=False,
    bf16=True,
    predict_with_generate=True,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer",
    greater_is_better=False,  # lower WER is better
    generation_max_length=128,
    max_grad_norm=1.0,
 )

device = "mps" if torch.backends.mps.is_available() else "cpu"  # MPS (macOS) or CPU
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Unfreeze all model parameters for full fine-tuning
for param in model.parameters():
    param.requires_grad = True

# Configure generation and caching
model.config.use_cache = False
model.config.forced_decoder_ids = forced_decoder_ids
model.config.decoder_start_token_id = decoder_start_token_id

trainer_full = Seq2SeqTrainer(
    args=training_args_full,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
 )

# --- Evaluate before training ---
test_pre_eval_full = trainer_full.evaluate(eval_dataset=test_dataset)
print(f"[FULL] Before fine-tuning → Test Loss: {test_pre_eval_full['eval_loss']:.4f}, WER: {test_pre_eval_full['eval_wer']:.3f}")
val_pre_eval_full = trainer_full.evaluate(eval_dataset=val_dataset)
print(f"[FULL] Before fine-tuning → Val Loss: {val_pre_eval_full['eval_loss']:.4f}, WER: {val_pre_eval_full['eval_wer']:.3f}")

# --- Training ---
trainer_full.train()

# === Save the model after training ===
from datetime import datetime
from pathlib import Path

models_root = Path("models"); models_root.mkdir(exist_ok=True)
full_model_dir = models_root / f"fine_tuned_whisper_full_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
full_model_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(full_model_dir)
processor.save_pretrained(full_model_dir)
print(f"[FULL] Model and processor saved to {full_model_dir}")

# Keep in memory for the final comparison
full_model = model

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[FULL] Before fine-tuning → Loss: 2.6783, WER: 0.226


Step,Training Loss,Validation Loss,Model Preparation Time,Wer
50,2.1844,2.031796,0.0007,0.254913
100,1.0455,1.253105,0.0007,0.230208
150,0.9094,1.111961,0.0007,0.267827
200,0.762,1.027179,0.0007,0.262774
250,0.6622,0.936654,0.0007,0.262212
300,0.3555,0.698043,0.0007,0.236384
350,0.316,0.687892,0.0007,0.229085
400,0.2858,0.685406,0.0007,0.232454
450,0.2023,0.68595,0.0007,0.233015
500,0.2749,0.684588,0.0007,0.273442


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


[FULL] Model and processor saved to models/fine_tuned_whisper_full_20251022-140211


# Low Rank Adaptation Fine Tuning

In [None]:
# LoRA fine-tuning

learning_rate = 4e-5
warmup_steps = 50

training_args_lora = Seq2SeqTrainingArguments(
    output_dir="checkpoints/lora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    save_strategy="steps",
    save_steps=100,
    num_train_epochs=8,   # reduce epochs for small dataset
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    save_total_limit=2,
    report_to=["wandb"],  # Log metrics to Weights & Biases
    fp16=False,
    bf16=True,
    predict_with_generate=True,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer",
    greater_is_better=False,  # lower WER is better
    generation_max_length=128,
    max_grad_norm=1.0,
 )

# Start from a fresh base model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Prepare the model for LoRA-compatible k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA (Low-Rank Adaptation) for efficient fine-tuning
config = LoraConfig(
    r=32,  # Rank of LoRA decomposition
    lora_alpha=64,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention projections
    lora_dropout=0.05,  # Dropout applied to LoRA layers
    bias="none"  # Don't adapt bias terms
)

# Wrap the base model with LoRA
model = get_peft_model(model, config)
model.config.use_cache = False  # Disable caching during training
model.config.forced_decoder_ids = forced_decoder_ids
model.config.decoder_start_token_id = decoder_start_token_id

trainer_lora = Seq2SeqTrainer(
    args=training_args_lora,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
 )

# --- Evaluate before training ---
test_pre_eval_lora = trainer_lora.evaluate(eval_dataset=test_dataset)
print(f"[LoRA] Before fine-tuning → Test Loss: {test_pre_eval_lora['eval_loss']:.4f}, WER: {test_pre_eval_lora['eval_wer']:.3f}")
val_pre_eval_lora = trainer_lora.evaluate(eval_dataset=val_dataset)
print(f"[LoRA] Before fine-tuning → Val Loss: {val_pre_eval_lora['eval_loss']:.4f}, WER: {val_pre_eval_lora['eval_wer']:.3f}")

# --- Training ---
trainer_lora.train()

# === Save the model after training ===
from datetime import datetime
from pathlib import Path

models_root = Path("models"); models_root.mkdir(exist_ok=True)
lora_model_dir = models_root / f"fine_tuned_whisper_lora_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
lora_model_dir.mkdir(parents=True, exist_ok=True)
model.save_pretrained(lora_model_dir)
processor.save_pretrained(lora_model_dir)
print(f"[LoRA] Model and processor saved to {lora_model_dir}")

# Keep in memory for the final comparison
lora_model = model



[LoRA] Before fine-tuning → Loss: 2.6783, WER: 0.226


Step,Training Loss,Validation Loss,Model Preparation Time,Wer
50,2.2172,2.044517,0.0014,0.271196
100,1.1243,1.324261,0.0014,0.272319
150,1.043,1.237145,0.0014,0.244806
200,0.984,1.198608,0.0014,0.237507
250,0.9705,1.167904,0.0014,0.236384
300,0.8449,1.136174,0.0014,0.236384
350,0.8664,1.094966,0.0014,0.23863
400,0.779,0.996046,0.0014,0.238069
450,0.4039,0.736219,0.0014,0.239191
500,0.4611,0.717368,0.0014,0.24256




[LoRA] Model and processor saved to models/fine_tuned_whisper_lora_20251022-141355


# Test Resultant Models

In [13]:
# Evaluate both models on the test set

# Evaluation-only arguments
eval_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/eval",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    report_to=[],  # avoid logging evals
    fp16=False,
    bf16=True,
 )

# Full model evaluation
trainer_full_eval = Seq2SeqTrainer(
    args=eval_args,
    model=full_model,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
 )
test_res_full = trainer_full_eval.evaluate(eval_dataset=test_dataset)
print(f"[FULL] After fine-tuning → Test Loss: {test_res_full['eval_loss']:.4f}, WER: {test_res_full['eval_wer']:.3f}")
val_res_full = trainer_full_eval.evaluate(eval_dataset=val_dataset)
print(f"[FULL] After fine-tuning → Val Loss: {val_res_full['eval_loss']:.4f}, WER: {val_res_full['eval_wer']:.3f}")

# LoRA model evaluation
trainer_lora_eval = Seq2SeqTrainer(
    args=eval_args,
    model=lora_model,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
 )
test_res_lora = trainer_lora_eval.evaluate(eval_dataset=test_dataset)
print(f"[LoRA] After fine-tuning → Test Loss: {test_res_lora['eval_loss']:.4f}, WER: {test_res_lora['eval_wer']:.3f}")
val_res_lora = trainer_lora_eval.evaluate(eval_dataset=val_dataset)
print(f"[LoRA] After fine-tuning → Val Loss: {val_res_lora['eval_loss']:.4f}, WER: {val_res_lora['eval_wer']:.3f}")



[FULL] After fine-tuning → Test Loss: 0.9711, WER: 0.174
[FULL] After fine-tuning → Val Loss: 1.2531, WER: 0.230


[LoRA] After fine-tuning → Test Loss: 0.9189, WER: 0.177
[LoRA] After fine-tuning → Val Loss: 1.1986, WER: 0.238
