In [None]:
import json
import os
import random
import torch
import espnetez as EZ
from espnetez.config import from_yaml

# --- Configuration Section ---

# 1. Project and Data Paths
# This should be the root folder of your project.
PROJECT_ROOT = "." 
# The name of your JSON file with audio paths and transcripts.
TRAIN_JSON_PATH = os.path.join(PROJECT_ROOT, "train.json") 
# The folder where all your .wav files are stored.
WAV_DIR = os.path.join(PROJECT_ROOT, "wavs") 

# 2. Model and Training Hyperparameters
# The name for your experiment. ESPnet will save logs and models in `exp/{EXP_NAME}`.
EXP_NAME = "whisper_finetune_experiment"
# Choose the Whisper model you want to fine-tune.
# Options: "openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small",
# "openai/whisper-medium", "openai/whisper-large-v2", etc.
# Smaller models are faster to train but less powerful.
BASE_MODEL = "openai/whisper-small" 
# Training settings
BATCH_SIZE = 4      # Adjust based on your GPU memory.
MAX_EPOCHS = 10     # Number of times to iterate over the entire dataset.
LEARNING_RATE = 1e-5 # Learning rate for the optimizer.
VALIDATION_SPLIT = 0.1 # Use 10% of the data for validation.

# --- End of Configuration Section ---



Failed to import Flash Attention, using ESPnet default: No module named 'flash_attn'


  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)
  @torch.cuda.amp.autocast(enabled=False)


In [2]:
import json
import os
import random


def prepare_espnet_data(json_path, wav_dir, val_split=0.1):
    """
    Loads data from a JSON file and formats it for ESPnet-EZ.
    It also splits the data into training and validation sets.

    Args:
        json_path (str): Path to the train.json file.
        wav_dir (str): Path to the directory containing wav files.
        val_split (float): The fraction of data to use for validation.

    Returns:
        tuple: A tuple containing two dictionaries: (train_data, valid_data)
               Each dictionary is in the format expected by ESPnet-EZ.
    """
    print("Preparing data for ESPnet-EZ...")
    
    # 1. Load the JSON file (as dict, then to list)
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            all_data_dict = json.load(f)
            all_data = list(all_data_dict.values())
    except FileNotFoundError:
        print(f"Error: The file {json_path} was not found.")
        exit()
    except json.JSONDecodeError:
        print(f"Error: The file {json_path} is not a valid JSON file.")
        exit()

    # 2. Shuffle data for a random split
    random.shuffle(all_data)

    # 3. Split data into training and validation sets
    split_index = int(len(all_data) * (1 - val_split))
    train_list = all_data[:split_index]
    valid_list = all_data[split_index:]
    
    print(f"Total samples: {len(all_data)}")
    print(f"Training samples: {len(train_list)}")
    print(f"Validation samples: {len(valid_list)}")

    # 4. Format the data into the required dictionary structure
    def format_to_dict(data_list):
        data_dict = {}
        for i, item in enumerate(data_list):
            # Use 'audio_path' and 'transcription' keys
            audio_path = os.path.join(wav_dir, os.path.basename(item['audio_path'] + ".wav"))
            text = item['transcription']
            
            if not os.path.exists(audio_path):
                print(f"Warning: Audio file not found, skipping: {audio_path}")
                continue

            # Create a unique utterance ID
            utt_id = f"utt_{i:05d}"
            data_dict[utt_id] = {
                "wav": audio_path,
                "text": text
            }
        return data_dict

    train_data = format_to_dict(train_list)
    valid_data = format_to_dict(valid_list)

    return train_data, valid_data


train_data, valid_data = prepare_espnet_data(
    "/ocean/projects/cis250085p/shared/A_track/train.json",
    "/ocean/projects/cis250085p/shared/track_a_audio_files",
    val_split=0.01
)

Preparing data for ESPnet-EZ...


Total samples: 90163
Training samples: 89261
Validation samples: 902


In [3]:
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["NUMBA_NUM_THREADS"] = "1"
import torch
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

In [None]:
import soundfile as sf
# import torch
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

context_len_in_secs = 4   # left and right context when doing buffered inference
batch_size = 2   # depends on the GPU memory
s2t = Speech2TextGreedySearch.from_pretrained(
    "espnet/owsm_ctc_v4_1B",
    device='cuda' if torch.cuda.is_available() else 'cpu',
    generate_interctc_outputs=False,
    # lang_sym='<rwa>',
    task_sym='<asr>',
)


# Process each audio file in the training data
# for utt_id, data in valid_data.items():
#     audio_path = data['wav']
#     speech, rate = sf.read(audio_path)
    
#     text = s2t.decode_long_batched_buffered(
#         speech,
#         batch_size=batch_size,
#         context_len_in_secs=context_len_in_secs,
#     )
#     print(f"Utterance {utt_id}:")
#     print(f"Original text: {data['text']}")
#     print(f"Predicted text: {text}")
#     print("-" * 80)

Fetching 38 files:   0%|          | 0/38 [00:00<?, ?it/s]

  with autocast(False):


Utterance utt_00000:
Original text: Icyapa kigaragaza akarere ka muhanga umuhanda urimo isiganwa ry'amagare, abantu bitegereza isiganwa ry'amagare ibiti biriho amashami amapoto, n'insinga z' amashanyarazi ikirere kiza.
Predicted text: Icyapa kigaragaza Akarerere ka Muhanga umuhanda urimo isiganwa ry’amagare abantu bitegereza isiganwa ry’amagare ibiti iriho amashami, amafoto n’insinga z’amashanyarazi ikirere cyiza
--------------------------------------------------------------------------------
Utterance utt_00001:
Original text: Abana bato, bagomba kwitabwaho, kugira ngo ubuzima bwabo, burusheho kugenda neza, kandi nugaragaje ko afite ikibazo, yitabweho hakiri kare.
Predicted text: Abana bato bagomba kwitabwaho kugira ngo ubuzima bwabo burushaho kugenda neza kandi ntugaragaje ko afite ikibazo yitabweho hakiri kare
--------------------------------------------------------------------------------
Utterance utt_00002:
Original text: Inzu ikorerwamo umurimo w'ubwogoshi aho dukundaga kuja mu 

In [2]:
import pandas as pd
from jiwer import wer

# Create empty lists to store results
utterance_ids = []
original_texts = []
predicted_texts = []

# Process each audio file in the training data
for utt_id, data in valid_data.items():
    audio_path = data['wav']
    speech, rate = sf.read(audio_path)
    
    text = s2t.decode_long_batched_buffered(
        speech,
        batch_size=batch_size,
        context_len_in_secs=context_len_in_secs,
    )
    
    # Store results
    utterance_ids.append(utt_id)
    original_texts.append(data['text'])
    predicted_texts.append(text)

# Create DataFrame
results_df = pd.DataFrame({
    'utterance_id': utterance_ids,
    'original_text': original_texts,
    'predicted_text': predicted_texts
})

# Calculate WER for each utterance
results_df['wer'] = [wer(orig, pred) for orig, pred in zip(results_df['original_text'], results_df['predicted_text'])]

# Calculate average WER
avg_wer = results_df['wer'].mean()

print(f"Average WER: {avg_wer:.4f}")
print("\nDetailed Results:")
print(results_df)

NameError: name 'valid_data' is not defined

In [1]:
text

NameError: name 'text' is not defined

In [None]:
!pip install loralib

In [7]:
import torch
import os



# Configuration
method = "lora"  # or "full"
model_tag = "espnet/owsm_ctc_v4_1B"  # Using same model as inference
epochs = 3
batch_size = 4
learning_rate = 1e-5

# --- 1. Load the Pre-trained Model ---
print(f"Loading pre-trained model: {model_tag}")
s2t = Speech2TextGreedySearch.from_pretrained(
    model_tag,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    generate_interctc_outputs=False,
    task_sym='<asr>',
)

# --- 2. Apply Fine-Tuning Method ---
if method == "lora":
    print("Applying LoRA adapters to the model for parameter-efficient fine-tuning...")
    s2t.s2t_model = apply_lora_to_model(s2t.s2t_model, rank=8, alpha=16)
    print("LoRA applied. Only adapter weights will be trained.")
else:
    print("Proceeding with FULL fine-tuning. All model weights will be updated.")

# Print trainable parameters info
trainable_params = sum(p.numel() for p in s2t.s2t_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in s2t.s2t_model.parameters())
print(f"Fine-tuning Method: {method.upper()}")
print(f"Trainable parameters: {trainable_params:,} (~{trainable_params/total_params:.4%})")
print(f"Total parameters:     {total_params:,}")

# --- 3. Prepare Training Data ---
# Using valid_data from previous cell
train_data = valid_data  # Assuming valid_data contains the training examples

# --- 4. Training Loop ---
output_dir = f"./exp/{method}_finetune_{model_tag.replace('/', '_')}"
os.makedirs(output_dir, exist_ok=True)

optimizer = torch.optim.AdamW(
    [p for p in s2t.s2t_model.parameters() if p.requires_grad],
    lr=learning_rate
)

s2t.s2t_model.train()
print("\nStarting training...")

for epoch in range(epochs):
    total_loss = 0
    for utt_id, data in train_data.items():
        optimizer.zero_grad()
        
        # Load audio
        speech, rate = sf.read(data['wav'])
        speech = torch.tensor(speech).to(s2t.device)
        
        # Get target text
        target_text = data['text']
        
        # Forward pass
        output = s2t.s2t_model(speech)
        loss = output['loss']
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    avg_loss = total_loss / len(train_data)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
torch.save({
    'model_state_dict': s2t.s2t_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, f"{output_dir}/model.pt")

print("\nFine-tuning complete!")
print(f"Model saved to: {output_dir}/model.pt")


ImportError: cannot import name 'lora_state_dict' from 'espnet2.torch_utils' (/ocean/projects/cis250085p/shared/espnet_env/lib/python3.10/site-packages/espnet2/torch_utils/__init__.py)

In [2]:
import torch
import os
import soundfile as sf
from espnet2.bin.asr_inference import Speech2Text

# Configuration
method = "lora"  # or "full"
model_tag = "espnet/owsm_ctc_v4_1B"
epochs = 3
batch_size = 4
learning_rate = 1e-5

# 1. Load the Pre-trained Model
print(f"Loading pre-trained model: {model_tag}")
s2t = Speech2Text.from_pretrained(
    model_tag,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Add other necessary arguments as needed
)


Loading pre-trained model: espnet/owsm_ctc_v4_1B


Fetching 38 files:   0%|          | 0/38 [00:00<?, ?it/s]

TypeError: Speech2Text.__init__() got an unexpected keyword argument 's2t_train_config'

In [4]:
import torch
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

model_tag = "espnet/owsm_ctc_v4_1B"

s2t = Speech2TextGreedySearch.from_pretrained(
    model_tag,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    generate_interctc_outputs=False,
    task_sym='<asr>',
)


Fetching 38 files:   0%|          | 0/38 [00:00<?, ?it/s]

In [25]:
type(s2t.s2t_model)

espnet2.s2t.espnet_ctc_model.ESPnetS2TCTCModel

In [26]:

def find_lora_target_modules(model: torch.nn.Module):
    """
    Scans the model and prints the names of all linear layers, which are
    the typical targets for LoRA. This helps users identify which modules
    the `lora_state_dict` function will adapt.

    Args:
        model: The PyTorch model to inspect.
    """
    print("\n--- Finding Potential LoRA Target Modules ---")
    print("These are the names of the `torch.nn.Linear` layers in the model.")
    print("The `lora_state_dict` utility will adapt these by default.\n")
    
    found_targets = []
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            found_targets.append(name)
            
    if found_targets:
        for name in found_targets:
            print(name)
    else:
        print("No `torch.nn.Linear` layers found in the model.")
    
    print("\n---------------------------------------------\n")
    return found_targets

In [27]:
find_lora_target_modules(s2t.s2t_model)


--- Finding Potential LoRA Target Modules ---
These are the names of the `torch.nn.Linear` layers in the model.
The `lora_state_dict` utility will adapt these by default.

encoder.embed.out
encoder.encoders.0.attn.linear_q
encoder.encoders.0.attn.linear_k
encoder.encoders.0.attn.linear_v
encoder.encoders.0.attn.linear_out
encoder.encoders.0.cgmlp.channel_proj1.0
encoder.encoders.0.cgmlp.channel_proj2
encoder.encoders.0.feed_forward.w_1
encoder.encoders.0.feed_forward.w_2
encoder.encoders.0.feed_forward_macaron.w_1
encoder.encoders.0.feed_forward_macaron.w_2
encoder.encoders.0.merge_proj
encoder.encoders.1.attn.linear_q
encoder.encoders.1.attn.linear_k
encoder.encoders.1.attn.linear_v
encoder.encoders.1.attn.linear_out
encoder.encoders.1.cgmlp.channel_proj1.0
encoder.encoders.1.cgmlp.channel_proj2
encoder.encoders.1.feed_forward.w_1
encoder.encoders.1.feed_forward.w_2
encoder.encoders.1.feed_forward_macaron.w_1
encoder.encoders.1.feed_forward_macaron.w_2
encoder.encoders.1.merge_proj
e

['encoder.embed.out',
 'encoder.encoders.0.attn.linear_q',
 'encoder.encoders.0.attn.linear_k',
 'encoder.encoders.0.attn.linear_v',
 'encoder.encoders.0.attn.linear_out',
 'encoder.encoders.0.cgmlp.channel_proj1.0',
 'encoder.encoders.0.cgmlp.channel_proj2',
 'encoder.encoders.0.feed_forward.w_1',
 'encoder.encoders.0.feed_forward.w_2',
 'encoder.encoders.0.feed_forward_macaron.w_1',
 'encoder.encoders.0.feed_forward_macaron.w_2',
 'encoder.encoders.0.merge_proj',
 'encoder.encoders.1.attn.linear_q',
 'encoder.encoders.1.attn.linear_k',
 'encoder.encoders.1.attn.linear_v',
 'encoder.encoders.1.attn.linear_out',
 'encoder.encoders.1.cgmlp.channel_proj1.0',
 'encoder.encoders.1.cgmlp.channel_proj2',
 'encoder.encoders.1.feed_forward.w_1',
 'encoder.encoders.1.feed_forward.w_2',
 'encoder.encoders.1.feed_forward_macaron.w_1',
 'encoder.encoders.1.feed_forward_macaron.w_2',
 'encoder.encoders.1.merge_proj',
 'encoder.encoders.2.attn.linear_q',
 'encoder.encoders.2.attn.linear_k',
 'encode

In [None]:
lora_target_modules = [
            "*.attn.linear_q",
            "*.attn.linear_v",
        ]

In [9]:
s2t.s2t_model.enable_lora(rank=8, alpha=16)


create_lora_adapter(
    model: torch.nn.Module,
    rank: int = 8,
    alpha: int = 8,
    dropout_rate: float = 0.0,
    target_modules: List[str] = ["query"],
    bias_type: Optional[str] = "none",
)

AttributeError: 'ESPnetS2TCTCModel' object has no attribute 'enable_lora'

In [5]:

# 2. Apply Fine-Tuning Method
if method == "lora":
    print("Enabling LoRA adapters for parameter-efficient fine-tuning...")
    # LoRA is now enabled via config or CLI, not by manual injection.
    # If using CLI: add --use_lora true --lora_rank 8 --lora_alpha 16
    # If using code, check if your ESPnet2 exposes a method like enable_lora()
    if hasattr(s2t.model, "enable_lora"):
        s2t.model.enable_lora(rank=8, alpha=16)
    else:
        raise NotImplementedError("LoRA must be enabled via config or CLI in this ESPnet2 version.")
    print("LoRA enabled. Only adapter weights will be trained.")
else:
    print("Proceeding with FULL fine-tuning. All model weights will be updated.")


Enabling LoRA adapters for parameter-efficient fine-tuning...


AttributeError: 'Speech2TextGreedySearch' object has no attribute 'model'

In [11]:

# Print trainable parameters info
trainable_params = sum(p.numel() for p in s2t.s2t_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in s2t.s2t_model.parameters())
print(f"Fine-tuning Method: {method.upper()}")
print(f"Trainable parameters: {trainable_params:,} (~{trainable_params/total_params:.4%})")
print(f"Total parameters:     {total_params:,}")


Fine-tuning Method: LORA
Trainable parameters: 1,011,342,162 (~100.0000%)
Total parameters:     1,011,342,162


In [None]:

# 3. Prepare Training Data
# You must provide 'train_data' as a dict: {utt_id: {'wav': path, 'text': str}}
# Example: train_data = {'utt1': {'wav': 'audio1.wav', 'text': 'hello world'}, ...}

# 4. Training Loop
output_dir = f"./exp/{method}_finetune_{model_tag.replace('/', '_')}"
os.makedirs(output_dir, exist_ok=True)

optimizer = torch.optim.AdamW(
    [p for p in s2t.model.parameters() if p.requires_grad],
    lr=learning_rate
)

s2t.model.train()
print("\nStarting training...")

for epoch in range(epochs):
    total_loss = 0
    for utt_id, data in train_data.items():
        optimizer.zero_grad()
        speech, rate = sf.read(data['wav'])
        speech = torch.tensor(speech).to(s2t.device)
        target_text = data['text']
        # Forward pass (update as per your model's API)
        output = s2t.model(speech)
        loss = output['loss']
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_data)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
torch.save({
    'model_state_dict': s2t.model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, f"{output_dir}/model.pt")

print("\nFine-tuning complete!")
print(f"Model saved to: {output_dir}/model.pt")
