In [None]:
# !pip install --upgrade pip

In [None]:
# !pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio

In [None]:
# !pip install ipywidgets

In [None]:
# !huggingface-cli login


In [None]:
!huggingface-cli whoami

In [None]:
!huggingface-cli login --token "hf_xxx"  

In [None]:
%cd /ocean/projects/cis250085p/shared/B_track

In [None]:
!pwd

In [None]:
import torch
torch.cuda.is_available()


In [None]:
# import pandas as pd

# train_json_path = "/shared/B_track/train.json"
# train_df = pd.read_json(train_json_path).T
# train_df.head()

In [None]:

# dev_json_path = "/shared/B_track/dev_test.json"
# dev_df = pd.read_json(dev_json_path).T
# dev_df.head()

In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Audio

# Base path
base_path = "/shared/B_track"

# Load and convert split
def load_split(split_json, audio_folder):
    """Build a ``Dataset`` for one split from its JSON manifest.

    Reads ``<base_path>/<split_json>.json``, transposes the resulting frame,
    and adds an ``audio`` column containing
    ``<base_path>/<audio_folder>/<name>.wav``, where ``<name>`` is the
    ``audio_path`` value with a leading ``audio/`` removed.

    Args:
        split_json: Manifest file name without the ``.json`` extension.
        audio_folder: Folder under ``base_path`` that holds the split's WAV files.

    Returns:
        A ``Dataset`` built from the manifest rows, with the ``audio`` column
        cast to the ``Audio`` feature.
    """
    json_path = os.path.join(base_path, f"{split_json}.json")

    # Load JSON as DataFrame; transpose to fix orientation.
    df = pd.read_json(json_path).T

    # Strip only a *leading* "audio/". A plain substring replace would also
    # remove "audio/" from the middle of a path and silently corrupt it.
    relative_names = df["audio_path"].str.replace(r"^audio/", "", regex=True)
    df["audio"] = relative_names.apply(
        lambda name: os.path.join(base_path, audio_folder, name + ".wav")
    )

    # Convert to a list of row dicts and let `datasets` decode the audio column.
    records = df.to_dict(orient="records")
    return Dataset.from_list(records).cast_column("audio", Audio())

# Assemble the DatasetDict: split name -> (manifest stem, audio folder).
_split_sources = {
    "train": ("train", "train_audios_b"),
    "test": ("test", "test_audios_b"),
    "validation": ("dev_test", "val_audios_b"),
}
dataset = DatasetDict(
    {
        split_name: load_split(manifest_stem, folder)
        for split_name, (manifest_stem, folder) in _split_sources.items()
    }
)

#Inspect
# print(dataset)
# print(dataset["train"])


In [None]:
# Drop the listed metadata columns from every split of the DatasetDict.
_unused_columns = [
    "voice_creator_id",
    "image_path",
    "image_category",
    "image_sub_category",
    "age_group",
    "gender",
    "project_name",
    "locale",
    "year",
    "duration",
    "location",
]
dataset = dataset.remove_columns(_unused_columns)

In [None]:
dataset

## A feature extractor which pre-processes the raw audio-inputs

### Load WhisperFeatureExtractor


In [None]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v2")

### Load WhisperTokenizer


In [None]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2", task="translate")

In [None]:
# Round-trip the first training transcription through the tokenizer and
# compare the decoded text (with and without special tokens) to the input.
input_str = dataset["train"][0]["transcription"]
labels = tokenizer(input_str).input_ids  # list of token ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip_special)
    for skip_special in (False, True)
)

print(
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
    sep="\n",
)

### Combine To Create A WhisperProcessor


In [None]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2", task="translate")


### Prepare Data


In [None]:
display(dataset["train"][0].keys())

display(dataset["train"][0]["audio"])


In [None]:
from datasets import Audio

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
def prepare_dataset(batch):
    """Add Whisper model inputs and target ids to a single example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["transcription"]``, then stores the first feature-extractor
    output under ``"input_features"`` and the tokenized transcription under
    ``"labels"``. The example is modified in place and returned.
    """
    clip = batch["audio"]

    # Log-Mel features computed from the waveform at its reported sampling rate.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target text encoded as label ids.
    encoded_text = tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [None]:
sample_batch = dataset["train"][3]

print("--- Original Sample Batch (before processing) ---")
print(f"Keys: {sample_batch.keys()}")
print(f"Audio details: {sample_batch['audio']}")
print(f"Transcription: {sample_batch['transcription']}")
print("-" * 50)


In [None]:
processed_batch = prepare_dataset(sample_batch)
print("\n--- Processed Sample Batch (after prepare_dataset) ---")
print(f"Keys: {processed_batch.keys()}")

# Check the 'input_features'

print(f"Input Features (log-Mel spectrogram):")
print(f"  Type: {type(processed_batch['input_features'])}")
print(f"  Shape: {processed_batch['input_features'].shape}") # This will typically be (80, N_frames) for Whisper, where 80 is the number of Mel bins.
print(f"  Example values (first few): {processed_batch['input_features'].flatten()[:5]}") # Flatten to show linear values


In [None]:
processed_batch

In [None]:
# Check the 'labels'
print(f"Labels (tokenized text):")
print(f"  Type: {type(processed_batch['labels'])}")
print(f"  Length: {len(processed_batch['labels'])}")
print(f"  Example IDs (first few): {processed_batch['labels'][:10]}")
print(f"  Decoded labels: {tokenizer.decode(processed_batch['labels'])}")
print("-" * 50)

In [None]:
import matplotlib.pyplot as plt
import numpy as np # For numerical operations, though your features are already numpy arrays

# Assuming you have 'processed_batch' from your previous execution
# If you just ran the prepare_dataset function, processed_batch should be available.

# Pull the log-Mel features out of the processed example and report their shape.
mel_spectrogram = processed_batch["input_features"]

print(f"Shape of Mel Spectrogram: {mel_spectrogram.shape}")

# Draw the spectrogram through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))

# origin='lower' puts the first Mel bin at the bottom; aspect='auto' lets the
# image stretch to the figure instead of using square pixels.
image = ax.imshow(mel_spectrogram, origin='lower', aspect='auto', cmap='viridis')

ax.set_title('Log-Mel Spectrogram')
ax.set_ylabel('Mel Bins')
ax.set_xlabel('Time Frames')
fig.colorbar(image, ax=ax, label='Log-Mel Energy')  # value scale for the colours
fig.tight_layout()  # keep labels from overlapping
plt.show()

# Report the element type of the feature array as well.
print(f"Data type of Mel Spectrogram: {mel_spectrogram.dtype}")

In [None]:
dataset["train"]

In [None]:
!pwd

In [None]:
# Assuming 'dataset' is your DatasetDict (e.g., containing 'train', 'test', 'validation' splits)

# Process only the 'train' split for the first 100 examples
# You apply .select() to the specific split (dataset["train"])
# And then you apply .map() to that selected subset.

# It's good practice to create a new DatasetDict to store the processed splits
processed_dataset = DatasetDict()

# Cap the debug subset at the size of the train split: `select(range(100))`
# raises when fewer than 100 examples are available.
debug_subset_size = min(100, len(dataset["train"]))

processed_dataset["train"] = dataset["train"].select(range(debug_subset_size)).map(
    prepare_dataset,
    remove_columns=dataset["train"].column_names, # Use column_names from the specific split
    num_proc=1,
    desc="Debugging small subset"
)

# If you want to process other splits similarly, you'd do it for each one:
# processed_dataset["test"] = dataset["test"].select(range(50)).map( # Or just the full test set
#     prepare_dataset,
#     remove_columns=dataset["test"].column_names,
#     num_proc=4
# )
# And so on for 'validation' if you have it.

print("Processed DatasetDict structure:")
print(processed_dataset)

# To access the processed training data, you would now use processed_dataset["train"]
# It means "apply this specific function (prepare_dataset) to every single example (or 'row') in this dataset.

In [None]:
# Persist the processed subset, then read it back from the same directory.
processed_samples_dir = "/shared/B_track/processed_100_samples"
processed_dataset.save_to_disk(processed_samples_dir)

from datasets import load_from_disk

loaded_processed_dataset = load_from_disk(processed_samples_dir)
print(loaded_processed_dataset)

### Training and Evaluation


In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")


In [None]:
loaded_processed_dataset

In [None]:
import os
import torch
import torchaudio # Explicitly import torchaudio for audio loading
from datasets import load_dataset, Audio, DatasetDict
from transformers import WhisperForConditionalGeneration, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
import numpy as np
import logging

# Set logging level to info to see more details from Hugging Face.
# The `transformers` module itself must be bound for `transformers.logging`;
# the imports above only bring in individual names *from* it.
import transformers

logging.basicConfig(level=logging.INFO)
transformers.logging.set_verbosity_info()

# --- Configuration ---
# Paths for your TRAINING pipeline test data
FULL_TRAIN_JSON_FILE = "./kinyarwanda_train_test_data.json" # JSON file for your training subset
TRAIN_AUDIO_FOLDER = "./kinyarwanda_train_audio_data"     # Folder containing audio files referenced in FULL_TRAIN_JSON_FILE

# Paths for your VALIDATION pipeline test data
FULL_VALIDATION_JSON_FILE = "./validation.json"           # Your existing validation JSON file
VALIDATION_AUDIO_FOLDER = "./kinyarwanda_validation_audio_data" # Folder containing audio files referenced in FULL_VALIDATION_JSON_FILE

# Column names in your JSON files (ensure these match exactly!)
AUDIO_FILE_COLUMN_NAME = "audio_path"
SOURCE_TEXT_COLUMN_NAME = "kinyarwanda_text"     # Column in train JSON for Kinyarwanda text
TRANSLATION_TEXT_COLUMN_NAME = "english_translation" # Column in both JSONs for English translation

# Model and language settings
WHISPER_MODEL_NAME = "openai/whisper-large-v3"
TARGET_LANGUAGE_FOR_TRANSLATION = "English" # This is used for the processor's language setting
TARGET_LANGUAGE_CODE = "en"               # ISO 639-1 code for the target language (English)

# Expected number of samples for this pipeline test
EXPECTED_TRAIN_SAMPLES = 80
EXPECTED_VALIDATION_SAMPLES = 20

# Output directory for this pipeline test run
OUTPUT_DIR_PIPELINE_TEST = "./whisper-kinyarwanda-pipeline-test"

# --- 1. Load the Raw Datasets (for Pipeline Test) ---
# Both JSON files are expected to already contain the small sample counts.
def _load_raw_split(split_label, json_file):
    """Announce and load one JSON file as a single `datasets` split."""
    print(f"Loading raw {split_label} dataset from {json_file}...")
    # split="train" returns the whole file as one Dataset.
    return load_dataset("json", data_files=json_file, split="train")


raw_train_dataset = _load_raw_split("training", FULL_TRAIN_JSON_FILE)
raw_validation_dataset = _load_raw_split("validation", FULL_VALIDATION_JSON_FILE)

# Warn (without failing) when a loaded count differs from the expected one.
_count_checks = (
    ("training", raw_train_dataset, EXPECTED_TRAIN_SAMPLES, FULL_TRAIN_JSON_FILE),
    ("validation", raw_validation_dataset, EXPECTED_VALIDATION_SAMPLES, FULL_VALIDATION_JSON_FILE),
)
for _label, _loaded, _expected, _source in _count_checks:
    if len(_loaded) != _expected:
        print(f"WARNING: Expected {_expected} {_label} samples, but loaded {len(_loaded)} from {_source}.")


for _label, _loaded, _expected, _source in _count_checks:
    print(f"Raw {_label} dataset loaded: {len(_loaded)} samples")

# --- 2. Initialize Whisper Processor ---
print(f"Loading Whisper processor for {WHISPER_MODEL_NAME} with target language '{TARGET_LANGUAGE_FOR_TRANSLATION}' and task 'translate'...")
processor = WhisperProcessor.from_pretrained(
    WHISPER_MODEL_NAME,
    language=TARGET_LANGUAGE_FOR_TRANSLATION,
    task="translate"
)

# --- 3. Define Preprocessing Function ---
# This function will be applied to both training and validation datasets.
# It now takes an `audio_folder_base` argument to correctly locate audio files.
def prepare_dataset(batch, audio_folder_base):
    """Turn one raw example into Whisper inputs and translation labels.

    Args:
        batch: A single example; must contain the columns named by
            ``AUDIO_FILE_COLUMN_NAME`` and ``TRANSLATION_TEXT_COLUMN_NAME``.
        audio_folder_base: Folder that the example's audio file name is
            relative to.

    Returns:
        The example with ``"input_features"`` and ``"labels"`` added. If the
        audio cannot be loaded, a dict with empty ``"input_features"`` and
        ``"labels"`` lists is returned instead so that the data collator can
        drop the example.
    """
    # Construct full audio path using the provided base folder
    audio_file_name = batch[AUDIO_FILE_COLUMN_NAME]
    audio_path = os.path.join(audio_folder_base, audio_file_name)

    # Load, downmix and resample audio data using torchaudio
    try:
        speech_array, sampling_rate = torchaudio.load(audio_path)
        # torchaudio yields a (channels, frames) tensor. `squeeze(0)` only drops
        # the channel axis for mono files, so multi-channel audio is averaged
        # down to one channel first.
        if speech_array.dim() > 1 and speech_array.size(0) > 1:
            speech_array = speech_array.mean(dim=0, keepdim=True)
        # Ensure 16kHz sampling rate
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
        speech_array = speech_array.squeeze(0).numpy()  # 1-D mono numpy array
    except Exception as e:
        # Deliberate best-effort: report the failure and emit an empty example
        # rather than aborting the whole preprocessing run.
        print(f"Error loading audio file {audio_path}: {e}")
        return {"input_features": [], "labels": []}

    # Compute log-Mel spectrogram input features
    batch["input_features"] = processor.feature_extractor(
        speech_array,
        sampling_rate=16000  # Always 16000 here after resampling
    ).input_features[0]

    # Tokenize the English translation for the labels. Language and task are
    # already configured on the processor when it is created, and the tokenizer
    # call does not take them as arguments, so they are not repeated here.
    batch["labels"] = processor.tokenizer(
        batch[TRANSLATION_TEXT_COLUMN_NAME]
    ).input_ids

    return batch

# --- 4. Apply Preprocessing to Training and Validation Subsets ---
# os.cpu_count() may return None when the core count cannot be determined;
# fall back to a single worker in that case instead of failing on `None > 1`.
preprocessing_workers = max(1, os.cpu_count() or 1)

print("Applying preprocessing to training subset...")
# `fn_kwargs` forwards the split-specific audio folder to prepare_dataset
# without wrapping it in a lambda.
processed_train_dataset = raw_train_dataset.map(
    prepare_dataset,
    fn_kwargs={"audio_folder_base": TRAIN_AUDIO_FOLDER},
    remove_columns=raw_train_dataset.column_names,
    num_proc=preprocessing_workers, # Use multiple cores if available
    desc="Preprocessing Kinyarwanda training subset for translation"
)

print("Applying preprocessing to validation subset...")
processed_validation_dataset = raw_validation_dataset.map(
    prepare_dataset,
    fn_kwargs={"audio_folder_base": VALIDATION_AUDIO_FOLDER},
    remove_columns=raw_validation_dataset.column_names,
    num_proc=preprocessing_workers, # Use multiple cores if available
    desc="Preprocessing Kinyarwanda validation subset for translation"
)

print(f"Processed training dataset: {len(processed_train_dataset)} samples")
print(f"Processed validation dataset: {len(processed_validation_dataset)} samples")

# --- (Optional) Save Processed Subsets ---
# Write both processed subsets under the pipeline-test output directory.
os.makedirs(OUTPUT_DIR_PIPELINE_TEST, exist_ok=True)

PROCESSED_TRAIN_PATH, PROCESSED_VALIDATION_PATH = (
    os.path.join(OUTPUT_DIR_PIPELINE_TEST, subset_dir)
    for subset_dir in ("processed_train_subset", "processed_validation_subset")
)

for _subset, _destination in (
    (processed_train_dataset, PROCESSED_TRAIN_PATH),
    (processed_validation_dataset, PROCESSED_VALIDATION_PATH),
):
    _subset.save_to_disk(_destination)
print(f"Processed subsets saved to {PROCESSED_TRAIN_PATH} and {PROCESSED_VALIDATION_PATH}")

# --- 5. Define Data Collator ---
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad audio features and label ids of a list of examples into one batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (e.g. a ``WhisperProcessor``).
        decoder_start_token_id: Id of the token the model prepends to the
            decoder input. When left as ``None`` it is looked up from the
            tokenizer (``<|startoftranscript|>``), falling back to
            ``bos_token_id``.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    @staticmethod
    def _has_content(value: Any) -> bool:
        """Return True for a non-empty list/array/tensor.

        Uses ``len`` rather than truthiness, because the truth value of a
        multi-element array or tensor is ambiguous and raises.
        """
        return value is not None and len(value) > 0

    def _resolve_start_token_id(self) -> Union[int, None]:
        """Return the id of the leading label token that must be stripped."""
        if self.decoder_start_token_id is not None:
            return self.decoder_start_token_id
        tokenizer = self.processor.tokenizer
        lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
        start_id = lookup("<|startoftranscript|>") if callable(lookup) else None
        # An unknown token maps to the unk id (or None); use bos in that case.
        if start_id is None or start_id == getattr(tokenizer, "unk_token_id", None):
            start_id = tokenizer.bos_token_id
        return start_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded ``input_features`` and ``labels``.

        Examples whose ``input_features`` or ``labels`` are missing or empty
        (audio loading failed upstream) are dropped. Returns an empty dict
        when no usable example remains.
        """
        features = [
            f for f in features
            if self._has_content(f.get("input_features")) and self._has_content(f.get("labels"))
        ]
        if not features:
            return {}

        # pad input features (audio)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad labels (token ids)
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so the loss ignores those positions
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The model prepends the decoder start token itself when it builds the
        # decoder inputs from the labels, so a start token added during
        # tokenization must be removed here. For Whisper that token is
        # <|startoftranscript|>, which is not the tokenizer's bos token.
        start_id = self._resolve_start_token_id()
        if start_id is not None and labels.shape[1] > 0:
            if (labels[:, 0] == start_id).all().cpu().item():
                labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# --- 6. Define Metrics (for BLEU score in translation) ---
# For translation, BLEU is a more common metric than WER.
# You'll need to install sacrebleu: pip install sacrebleu
metric_bleu = evaluate.load("sacrebleu")
metric_wer = evaluate.load("wer") # Keep WER for reference if needed, though BLEU is primary for translation

def compute_metrics(pred):
    """Compute BLEU and WER for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        ``{"bleu": <sacrebleu score>, "wer": <WER in percent>}``.
    """
    pred_ids = pred.predictions
    # Some models return a tuple; the generated ids are its first element.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    pad_token_id = processor.tokenizer.pad_token_id

    # -100 cannot be decoded. Replace it with the pad id on copies, so the
    # arrays owned by the caller are not modified.
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_token_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # sacrebleu expects a list of reference lists (one list per prediction)
    cleaned_label_str = [[label] for label in label_str]

    # Compute BLEU score
    bleu_score = metric_bleu.compute(predictions=pred_str, references=cleaned_label_str)

    # Compute WER for comparison
    wer_score = metric_wer.compute(predictions=pred_str, references=label_str)

    return {"bleu": bleu_score["score"], "wer": 100 * wer_score}

# --- 7. Load Whisper Model ---
print(f"Loading Whisper model {WHISPER_MODEL_NAME} for fine-tuning...")
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_NAME)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# --- 8. Set Up Training Arguments ---
print("Setting up training arguments for pipeline test...")
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR_PIPELINE_TEST,
    per_device_train_batch_size=4, # Smaller batch size for small dataset
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=50, # Fewer warmup steps for fewer total steps
    max_steps=200,   # Very few steps for a quick pipeline test (e.g., 200-500)
                     # With batch size 4 this is 800 examples drawn from the training subset.
    gradient_checkpointing=True,
    # Mixed precision needs a CUDA device; enable it only when one is present,
    # matching the CUDA check used for the inference pipeline later in this cell.
    fp16=torch.cuda.is_available(),
    # NOTE(review): newer transformers releases name this argument `eval_strategy` -
    # confirm against the installed version.
    evaluation_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100, # Save checkpoint more frequently for a short test
    eval_steps=100, # Evaluate more frequently for a short test
    logging_steps=10, # Log training progress very frequently
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="bleu", # Monitor BLEU score for translation
    greater_is_better=True,       # Higher BLEU is better
    push_to_hub=False, # Set to True if you want to push this test model
)

# --- 9. Initialize the Trainer ---
print("Initializing Seq2SeqTrainer...")
# Seq2SeqTrainer has no `feature_extractor` keyword, and passing one raises a
# TypeError. The feature extractor is not needed by the trainer itself: it is
# applied during preprocessing and reached by the collator via `processor`.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,     # Use your pre-processed training subset
    eval_dataset=processed_validation_dataset, # Use your pre-processed validation subset
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

# --- 10. Start the Training! ---
print("\nStarting pipeline test training...")
trainer.train()
print("\nPipeline test training complete!")

# --- 11. (Optional) Final Evaluation and Inference Test ---
print("\nPerforming final evaluation on the processed validation dataset:")
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

print("\n--- Example Inference after Pipeline Test ---")
# Run one raw validation sample through an ASR pipeline built on the trained model.
if len(raw_validation_dataset) > 0:
    sample_index_for_inference = 0 # You can change this index to test different samples
    raw_sample = raw_validation_dataset[sample_index_for_inference]

    # Load the raw audio array for the pipeline
    audio_to_load_path = os.path.join(VALIDATION_AUDIO_FOLDER, raw_sample[AUDIO_FILE_COLUMN_NAME])
    try:
        input_audio_array, input_sampling_rate = torchaudio.load(audio_to_load_path)
        # torchaudio yields (channels, frames); average multi-channel audio down
        # to mono, because squeeze(0) alone leaves a 2-D array for stereo files.
        if input_audio_array.dim() > 1 and input_audio_array.size(0) > 1:
            input_audio_array = input_audio_array.mean(dim=0, keepdim=True)
        input_audio_array = input_audio_array.squeeze(0).numpy() # 1-D mono numpy array
    except Exception as e:
        print(f"Could not load audio for inference test from {audio_to_load_path}: {e}")
        input_audio_array = None

    if input_audio_array is not None:
        from transformers import pipeline
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=trainer.model, # Use the model held by the trainer
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=0 if torch.cuda.is_available() else -1 # Use GPU if available (0 is first GPU, -1 is CPU)
        )

        # Set generation configuration for translation
        gen_kwargs = {"language": TARGET_LANGUAGE_FOR_TRANSLATION, "task": "translate"}

        # Pass the file's own sampling rate along with the samples, so the
        # pipeline can resample when it differs from what the feature extractor
        # expects. A bare array would be assumed to be at that rate already.
        result = asr_pipeline(
            {"raw": input_audio_array, "sampling_rate": input_sampling_rate},
            generate_kwargs=gen_kwargs,
        )

        print(f"--- Sample {sample_index_for_inference} ---")
        print(f"Original Kinyarwanda Text: {raw_sample.get(SOURCE_TEXT_COLUMN_NAME, 'N/A')}") # Use .get() for safety
        print(f"True English Translation: {raw_sample[TRANSLATION_TEXT_COLUMN_NAME]}")
        print(f"Predicted English Translation: {result['text']}")
    else:
        print("Skipping inference test due to audio loading error.")
else:
    print("No validation samples to perform inference test.")



In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

# Define your data collator (copy-paste from previous steps if you still have it)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad audio features and label ids of a list of examples into one batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (e.g. a ``WhisperProcessor``).
        decoder_start_token_id: Id of the token the model prepends to the
            decoder input. When left as ``None`` it is looked up from the
            tokenizer (``<|startoftranscript|>``), falling back to
            ``bos_token_id``.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    @staticmethod
    def _has_content(value: Any) -> bool:
        """Return True for a non-empty list/array/tensor (length-based check)."""
        return value is not None and len(value) > 0

    def _resolve_start_token_id(self) -> Union[int, None]:
        """Return the id of the leading label token that must be stripped."""
        if self.decoder_start_token_id is not None:
            return self.decoder_start_token_id
        tokenizer = self.processor.tokenizer
        lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
        start_id = lookup("<|startoftranscript|>") if callable(lookup) else None
        # An unknown token maps to the unk id (or None); use bos in that case.
        if start_id is None or start_id == getattr(tokenizer, "unk_token_id", None):
            start_id = tokenizer.bos_token_id
        return start_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded ``input_features`` and ``labels``.

        Examples whose ``input_features`` or ``labels`` are missing or empty
        are dropped. Returns an empty dict when no usable example remains.
        """
        features = [
            f for f in features
            if self._has_content(f.get("input_features")) and self._has_content(f.get("labels"))
        ]
        if not features:
            return {}

        # pad input features (audio)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad labels (token ids)
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so the loss ignores those positions
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # The model prepends the decoder start token itself when it builds the
        # decoder inputs from the labels, so a start token added during
        # tokenization must be removed here. For Whisper that token is
        # <|startoftranscript|>, which is not the tokenizer's bos token.
        start_id = self._resolve_start_token_id()
        if start_id is not None and labels.shape[1] > 0:
            if (labels[:, 0] == start_id).all().cpu().item():
                labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Load the pre-trained Whisper model and its processor from the same checkpoint.
# NOTE(review): "Kinyarwanda" may not be among the languages the Whisper tokenizer
# accepts; if so, tokenization will fail with an unsupported-language error.
# The training cell earlier in this notebook passes "English" here - confirm which is intended.
model_name = "openai/whisper-large-v3" # Checkpoint used for both the model and the processor
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name, language="Kinyarwanda", task="translate")

# Instantiate the data collator with the processor loaded above
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Clear the forced decoder ids and the suppressed-token list on the model config
model.config.forced_decoder_ids = None # No decoder tokens are forced via the config
model.config.suppress_tokens = []