In [1]:
import jsonlines
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from pathlib import Path
import torch
import librosa
import IPython.display as ipd

In [23]:
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

current_directory = Path.cwd() # creates a relative path to the current directory gpt5
file_path = current_directory / '..' / '..' / 'novice' / 'audio' / 'audio_0.wav'
audio_file = file_path.resolve() # convert to absolute path

audio_input, sample_rate = librosa.load(audio_file, sr=16000)

input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

with torch.no_grad():
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

print("Transcription:", transcription)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription: THAT HE AS ONE FOINEFE OLD DOG IN THE PINGDOM OR TE LEPS OFF TOWARD OF THE BOYS OLECCOMECNETIC OF


In [3]:
audio_f = str(audio_file)

audio_data, sampling_rate = librosa.load(audio_f, sr=None)
waveform, sample_rate = torchaudio.load(audio_f)
ipd.Audio(waveform, rate=sampling_rate)

In [4]:
# Define the path to the directory
current_directory = Path.cwd()
file_path = current_directory / '..' / '..' / 'novice'
data_dir = file_path.resolve()

# Read data from a jsonl file and reformat it
data = {'key': [], 'audio': [], 'transcript': []}
with jsonlines.open(data_dir / "asr.jsonl") as reader:
    for obj in reader:
        if len(data['key']) < 100:  # Only keep the first 10 entries
            for key, value in obj.items():
                data[key].append(value)

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

In [5]:
# downloading the model (same code as 3rd box)

model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Note:
    The Wav2Vec2 processor is responsible for converting raw audio signals into input features that the Wav2Vec2 model can understand. It consists of a feature extractor and a tokenizer. The feature extractor processes the raw audio waveform, typically by performing preprocessing steps such as resampling and normalization. The tokenizer tokenizes the processed audio into input tokens suitable for the model. In our code, we use the Wav2Vec2Processor to instantiate the processor from the pretrained model.

In [6]:
'''
import whisper
model = whisper.load_model("base")
result = model.transcribe('audio.wav', fp16 = False)

result['text']
'''

'\nimport whisper\nmodel = whisper.load_model("base")\nresult = model.transcribe(\'audio.wav\', fp16 = False)\n\nresult[\'text\']\n'

In [7]:
'''
import noisereduce as nr
import librosa
import soundfile as sf

# Load audio file
audio_path = 'audio.wav'
audio, rate = librosa.load(audio_path, sr=None)

# Perform noise reduction
noisy_part = audio[0:int(rate*0.5)]  # Identify the noisy part
reduced_noise_audio = nr.reduce_noise(y=audio, sr=rate, y_noise=noisy_part)

# Save the cleaned audio
clean_audio_path = 'cleaned_audio.wav'
sf.write(clean_audio_path, reduced_noise_audio, rate)
result = model.transcribe('cleaned_audio.wav', fp16 = False)

result['text']
'''

"\nimport noisereduce as nr\nimport librosa\nimport soundfile as sf\n\n# Load audio file\naudio_path = 'audio.wav'\naudio, rate = librosa.load(audio_path, sr=None)\n\n# Perform noise reduction\nnoisy_part = audio[0:int(rate*0.5)]  # Identify the noisy part\nreduced_noise_audio = nr.reduce_noise(y=audio, sr=rate, y_noise=noisy_part)\n\n# Save the cleaned audio\nclean_audio_path = 'cleaned_audio.wav'\nsf.write(clean_audio_path, reduced_noise_audio, rate)\nresult = model.transcribe('cleaned_audio.wav', fp16 = False)\n\nresult['text']\n"

In [8]:
# Function to load and preprocess audio
audio_dir = data_dir / 'audio' # data_dir = novice folder

# Freezing all layers except the classifier layer
for param in model.parameters():
    param.requires_grad = False
for param in model.lm_head.parameters():
    param.requires_grad = True

def preprocess_data(examples):
    input_values = []
    attention_masks = []
    labels = []

    for audio_path, transcript in zip(examples['audio'], examples['transcript']): # zip() pairs each audio with its corresponding transcript
        speech_array, sampling_rate = torchaudio.load(audio_dir / audio_path) # Note: audio_dir used here
        processed = processor(speech_array.squeeze(0), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
        # speech_array.squeeze(0) removes any extra empty dimensions in speech_array that might result from torchaudio.load
        # return_tensors specifies the tensor type, in this case it is 'pt' for PyTorch
        # padding=True activates padding to match all the lengths of the audio sequences

        # Process labels with the same processor settings
        with processor.as_target_processor(): # this line ensures that 'processor' switches to a mode for handling target labels (i.e. the transcript). Once this block of code is done, 'processor' returns to its default mode for handling audio input.
            label = processor(transcript, return_tensors="pt", padding=True) # this line processes the transcript (actual words), and stores the text data as a PyTorch tensor in 'label'

        input_values.append(processed.input_values.squeeze(0))
        # Create attention masks based on the input values
        attention_mask = torch.ones_like(processed.input_values) # creates a mask of all ones with the same shape as the processed audio features
        attention_mask[processed.input_values == processor.tokenizer.pad_token_id] = 0  # Set padding tokens to 0
        # the above code sets elements in attention_mask to 0 wherever the corresponding element in the processed audio is the padding token ID processor.tokenizer.pad_token_id
        # Note: the padding token ID is a unique pre-defined value in the tokenizer's vocabulary (the tokenizer converts raw audio / transcript data into numerical tensors), which allows for the identification of padding tokens / elements in the processed audio sequence.
        # Elements which are 0 effectively mask out features such as padded elements in the audio sequence, allowing the model to focus its attention on more informative parts
        attention_masks.append(attention_mask.squeeze(0))

        # Ensure labels are padded to the same length as inputs if needed
        padded_label = torch.full(processed.input_values.shape[1:], -100, dtype=torch.long) # fills padded_label with -100, length is padded sequence length
        actual_length = label.input_ids.shape[1] # gets actual sequence length without padding (label.input_ids has shape (sequence_length,) representing word units in the transcript)
        padded_label[:actual_length] = label.input_ids.squeeze(0) # fills in the first part of padded_label up until the actual length with true labels, the rest of padded_label is left as -100 padding
        labels.append(padded_label)

    # Concatenate all batches
    # torch.stack stacks input_values (which is a list of input values for each audio sequence) into a single tensor by adding a dimension
    examples['input_values'] = torch.stack(input_values)
    examples['attention_mask'] = torch.stack(attention_masks)
    examples['labels'] = torch.stack(labels)

    return examples

In [9]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=test_dataset.column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=1,  # Reduce to one for simplicity
    num_train_epochs=10,
    weight_decay=0.005,
    save_steps=1000, #original 500
    eval_steps=1000, #original 500
    logging_steps=10,
    load_best_model_at_end=True
)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [10]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use the validation dataset for evaluation
    tokenizer=processor.feature_extractor
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=800, training_loss=1725.5667199707032, metrics={'train_runtime': 182.7524, 'train_samples_per_second': 4.378, 'train_steps_per_second': 4.378, 'total_flos': 5.44488489510048e+16, 'train_loss': 1725.5667199707032, 'epoch': 10.0})

In [11]:
# SAME AS CODE BLOCK ABOVE BUT WITH EXTRA RANDOM STUFF
'''
from transformers import DefaultDataCollator

# Initialize the dynamic padding data collator
data_collator = DefaultDataCollator(return_tensors="pt")

# Setup the trainer with the dynamic padding data collator for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor
)

# Train the model
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    '''

'\nfrom transformers import DefaultDataCollator\n\n# Initialize the dynamic padding data collator\ndata_collator = DefaultDataCollator(return_tensors="pt")\n\n# Setup the trainer with the dynamic padding data collator for evaluation\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=val_dataset,\n    data_collator=data_collator,\n    tokenizer=processor.feature_extractor\n)\n\n# Train the model\ntry:\n    trainer.train()\nexcept Exception as e:\n    print(f"Error during training: {e}")\n    '

In [14]:
'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # determines device runs on gpu
model.to(device)

audio_file = 'cleaned_audio.wav'
audio_input, sample_rate = librosa.load(audio_file, sr=16000) # load the audio file and returns an audio signal of sample rate 16000 (which is required by the model)

# converts the audio data to be transformed and returned as tensors 
input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

input_values = input_values.to(device) # ensures that input and model are on the same gpu / cpu

with torch.no_grad(): # ensure that no gradients is computed as this is the inference phase and not training phase
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

print("Transcription:", transcription)
'''

Transcription: AR EER OA OR IL OFOY O


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
model.to(device)

# Preprocess function to ensure padding and truncation
def preprocess_function(batch):
    audio = batch["audio"]
    inputs = processor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"], 
        padding="max_length", 
        truncation=True, 
        max_length=16000,  # Adjust max_length as needed
        return_tensors="pt"
    )
    batch["input_values"] = inputs.input_values[0]
    return batch

# Apply preprocessing to the entire dataset
val_dataset = val_dataset.map(preprocess_function)

# Define a collate function for DataLoader
def collate_fn(batch):
    input_values = torch.stack([item['input_values'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    return {
        'input_values': input_values,
        'attention_mask': attention_mask
    }

# Create DataLoader for batch processing
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn)

# Function to decode and compute metrics
def evaluate(model, dataloader, processor, device):
    model.eval()
    predictions = []
    for batch in dataloader:
        input_values = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            logits = model(input_values, attention_mask=attention_mask).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_transcriptions = processor.batch_decode(predicted_ids)
        predictions.extend(predicted_transcriptions)
    return predictions

# Run evaluation
predictions = evaluate(model, val_dataloader, processor, device)

# If you have ground truth transcriptions, you can compute metrics
# ground_truths = val_dataset['transcription']  # List of ground truth transcriptions
# metric = load_metric("wer")
# wer = metric.compute(predictions=predictions, references=ground_truths)
# print("WER:", wer)

# Print the transcriptions for inspection
for transcription in predictions:
    print("Transcription:", transcription)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

KeyError: 'audio'

In [16]:
val_dataset

Dataset({
    features: ['input_values', 'attention_mask', 'labels'],
    num_rows: 10
})