In [None]:
import os
from dotenv import load_dotenv
import huggingface_hub

# Pull environment variables (including the Hugging Face token) from the
# .env file located one directory above the current working directory.
env_file = os.path.join("../", ".env")
load_dotenv(env_file)
# Authenticate against the Hugging Face Hub with the HUGGINGFACE_TOKEN value.
huggingface_hub.login(token=os.environ.get("HUGGINGFACE_TOKEN"))

In [29]:
import datasets

# Open the English ("en") configuration of VoxPopuli in streaming mode.
# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading script to run locally — confirm this is acceptable for this repo.
voxpopuli = datasets.load_dataset(
    "facebook/voxpopuli",
    "en",
    streaming=True,
    trust_remote_code=True,
)

In [30]:
# Materialize the first five examples of the streamed training split.
train_stream = voxpopuli["train"]
voxpopuli_head = [example for example in train_stream.take(5)]
# Read the sampling rate recorded on the first example's audio entry.
first_audio = voxpopuli_head[0]["audio"]
SAMPLING_RATE = first_audio["sampling_rate"]
print(voxpopuli_head)

[{'audio_id': '20180418-0900-PLENARY-3-en_20180418-08:50:36_17', 'language': 0, 'audio': {'path': 'train_part_0/20180418-0900-PLENARY-3-en_20180418-08:50:36_17.wav', 'array': array([-0.00030518,  0.00119019,  0.00506592, ..., -0.00036621,
       -0.00027466, -0.00018311]), 'sampling_rate': 16000}, 'raw_text': 'If you do not address this problem, the ground is there for populist nationalist forces to go on growing all over Europe.', 'normalized_text': 'if you do not address this problem the ground is there for populist nationalist forces to go on growing all over europe.', 'gender': 'female', 'speaker_id': '124737', 'is_gold_transcript': True, 'accent': 'None'}, {'audio_id': '20170614-0900-PLENARY-5-en_20170614-10:03:08_5', 'language': 0, 'audio': {'path': 'train_part_0/20170614-0900-PLENARY-5-en_20170614-10:03:08_5.wav', 'array': array([-0.00036621, -0.00030518, -0.00042725, ...,  0.00012207,
        0.00119019,  0.00027466]), 'sampling_rate': 16000}, 'raw_text': 'they attacked and rem

In [None]:
from transformers import (
    AutoTokenizer,
    AutoFeatureExtractor,
    SpeechEncoderDecoderModel,
    SpeechEncoderDecoderConfig,
    AutoConfig,
)

import torch

encoder_id = "facebook/wav2vec2-base-960h"  # acoustic model encoder
decoder_id = "facebook/bart-base"  # text decoder

# feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id)
# tokenizer = AutoTokenizer.from_pretrained(decoder_id)


def apply_seq2seq_config(speech_model):
    """Apply the shared configuration overrides to an encoder-decoder model.

    Both the pretrained and the randomly initialised model below need the
    exact same settings, so they are defined once here instead of being
    copy-pasted per model.

    Args:
        speech_model: Object exposing ``.config`` (with an ``.encoder``
            sub-config) and ``.decoder.config``, e.g. a
            ``SpeechEncoderDecoderModel``.

    Returns:
        The same ``speech_model`` instance, with its config modified in place.
    """
    cfg = speech_model.config
    decoder_cfg = speech_model.decoder.config

    # NOTE(review): these overrides edit the config object after the model has
    # been constructed; whether already-built submodules pick them up is not
    # visible from this notebook — confirm if they must affect this instance.
    cfg.encoder.feat_proj_dropout = 0.0
    cfg.encoder.mask_time_prob = 0.0
    # Special-token ids are copied from the decoder's own config.
    cfg.decoder_start_token_id = decoder_cfg.bos_token_id
    cfg.pad_token_id = decoder_cfg.pad_token_id
    cfg.eos_token_id = decoder_cfg.eos_token_id
    cfg.max_length = 128
    cfg.encoder.layerdrop = 0.0
    cfg.use_cache = False
    cfg.processor_class = "Wav2Vec2Processor"
    return speech_model


# Model initialised from the pretrained encoder and decoder checkpoints.
model = apply_seq2seq_config(
    SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_id, decoder_id, encoder_add_adapter=True
    )
)

# Load model without pretrained weights
config_encoder = AutoConfig.from_pretrained(encoder_id)
config_encoder.add_adapter = True
config_decoder = AutoConfig.from_pretrained(decoder_id)
config_decoder.is_decoder = True
config_decoder.add_cross_attention = True

config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(
    config_encoder, config_decoder
)
# Built from the config only, so no checkpoint weights are loaded here.
model_no_weights = apply_seq2seq_config(SpeechEncoderDecoderModel(config=config))

# # Accessing the model configuration
# config_encoder = model_no_weights.config.encoder
# config_encoder.add_adapter = True
# config_decoder = model_no_weights.config.decoder
# # set decoder config to causal lm
# config_decoder.is_decoder = True
# config_decoder.add_cross_attention = True


# input_values = feature_extractor(
#     voxpopuli_head[0]["audio"]["array"], return_tensors="pt", sampling_rate=SAMPLING_RATE
# ).input_values

# generated_ids = model.generate(input_values, decoder_start_token_id=tokenizer.cls_token_id, )
# generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
# print(generated_text)

# # load its corresponding transcription and tokenize to generate labels
# labels = tokenizer(voxpopuli_head[0]["text"], return_tensors="pt").input_ids

In [2]:
def display_weight_stats(model):
    """Print the mean and standard deviation of each trainable parameter.

    Iterates over ``model.named_parameters()`` and, for every parameter whose
    ``requires_grad`` flag is set, prints one line containing the parameter
    name and its mean and std formatted to four decimal places. Parameters
    with ``requires_grad`` unset are skipped.

    Args:
        model: Object providing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.
    """
    trainable = (
        (layer_name, weights.data)
        for layer_name, weights in model.named_parameters()
        if weights.requires_grad
    )
    for layer_name, values in trainable:
        avg = values.mean().item()
        spread = values.std().item()
        print(f"Layer: {layer_name} | Mean: {avg:.4f} | Std: {spread:.4f}")

In [None]:
from transformers import AutoModelForSpeechSeq2Seq

# Directory of the saved checkpoint, relative to the working directory.
trained_checkpoint_dir = "../seq2seq_wav2vec2_bart-base_scratch"
# Rebinds `model` to the checkpoint loaded from that directory.
model = AutoModelForSpeechSeq2Seq.from_pretrained(trained_checkpoint_dir)

In [None]:
# Print per-parameter mean/std for whatever `model` is currently bound to.
display_weight_stats(model)

In [None]:
# FOR DEBUGGING TO INSPECT THE AUDIO
import matplotlib.pyplot as plt
import librosa.display

def show_waveform(audio_path, title):
    """Load an audio file with librosa and display its waveform.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.
        title: Title placed on the figure.

    Returns:
        The ``(audio, sample_rate)`` pair returned by ``librosa.load``.
    """
    # NOTE(review): no `sr` is passed, so librosa's default sample-rate
    # handling applies — confirm that is intended for this comparison.
    audio, sample_rate = librosa.load(audio_path)
    plt.figure(figsize=(14, 5))
    librosa.display.waveshow(audio, sr=sample_rate)
    plt.title(title)
    plt.show()
    return audio, sample_rate


# Load and display the original audio
audio_orig, sr_orig = show_waveform(
    "../tmp/debug/2277-149896-0000_orig.mp3", "Original Audio"
)

# Load and display the post-feature extracted audio
audio_post, sr_post = show_waveform(
    "../tmp/debug/2277-149896-0000.mp3", "Post-Feature Extracted Audio"
)