In [36]:
# Imports 
from datasets import load_dataset
import os
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration, MarianMTModel, MarianTokenizer


In [37]:
# Load the europarl dataset training split
dataset = load_dataset("tj-solergibert/Europarl-ST", split="train")


In [38]:
# Filter the dataset where the speech is english, there is a french transcription, and the audio file is present 
dataset = dataset.filter(lambda example: example['original_language'] == 'en' and 'fr' in example['transcriptions'].keys() and example["transcriptions"]['fr'] is not None and os.path.exists(example["audio_path"]))
print(dataset)

# Select a subset of the filtered dataset for training
dataset = dataset.select(range(20))
print(dataset)

# Display some examples from the filtered dataset
for example in dataset.select(range(5)):
    print(example)

Dataset({
    features: ['original_speech', 'original_language', 'audio_path', 'segment_start', 'segment_end', 'transcriptions'],
    num_rows: 31777
})
Dataset({
    features: ['original_speech', 'original_language', 'audio_path', 'segment_start', 'segment_end', 'transcriptions'],
    num_rows: 20
})
{'original_speech': 'Mr President, I know that I will not be popular for making a long speech at this time, but my two fellow-rapporteurs, with whom I have worked very closely as a team, have made short statements so I want to keep the team spirit together.', 'original_language': 'en', 'audio_path': 'en/audios/en.20080924.23.3-123.m4a', 'segment_start': 0.0, 'segment_end': 14.470000267028809, 'transcriptions': {'de': 'Herr Präsident! Ich weiß, dass ich mir keine Freunde mache, wenn ich um diese Uhrzeit eine lange Rede halte, doch meine beiden Mitberichterstatter, mit denen ich sehr eng im Team zusammengearbeitet habe, haben kurze Stellungnahmen abgegeben, sodass ich den Teamgeist zusammen

In [39]:
# Load models and processors/tokenizers for whisper and marian
whisper_model_name = "openai/whisper-tiny"
translation_model_name = "Helsinki-NLP/opus-mt-en-fr"

whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
print("Loaded whisper processor")
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
print("Loaded whisper conditional generator")
mt_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
print("Loaded mt tokenizer")
mt_model = MarianMTModel.from_pretrained(translation_model_name)
print("Loaded mt model")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded whisper processor
Loaded whisper conditional generator
Loaded mt tokenizer
Loaded mt model


In [40]:
# device config 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
whisper_model.to(device)
mt_model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [41]:
import librosa 

audio_directory = "."

# preprocesses the dataset for training 
def preprocess(batch):
    # gets audio meta data
    audio_path = batch["audio_path"]
    segment_start = batch["segment_start"]
    segment_end = batch["segment_end"]
    
    # load the audio
    audio, sr = librosa.load(audio_path, sr=16000, offset=segment_start, duration=segment_end-segment_start)
    
    # process the audio using whisper
    inputs = whisper_processor(audio, sampling_rate=sr, return_tensors="pt")
    
    # pad the input to be length 3000 for whisper 
    input_features = inputs.input_features.squeeze(0)
    # if input_features.shape[-1] < 3000:
    #     padding = torch.zeros((input_features.shape[0], 3000 - input_features.shape[-1]))
    #     input_features = torch.cat([input_features, padding], dim=-1)
    
    batch["input_features"] = input_features
    
    # tokenize the transcription of the original speech for whisper labels 
    original_transcription = batch["original_speech"]
    tokenized_originals = whisper_processor.tokenizer(
        original_transcription,
        return_tensors="pt",
        padding="longest"
    ).input_ids.squeeze(0)
    
    batch["english_text"] = tokenized_originals
    
    french_transcription = batch["transcriptions"]["fr"]
    
    # Tokenize the target French text translation for marian
    tokenized_labels = mt_tokenizer(
        text_target=french_transcription,
        return_tensors="pt",
        padding="longest"
    ).input_ids.squeeze(0)
    
    batch["labels"] = tokenized_labels
    
    return batch

# Apply the preprocessing function to the dataset
dataset = dataset.map(preprocess, remove_columns=["audio_path", "original_speech", "original_language", "segment_start", "segment_end", "transcriptions"])


In [67]:
# collate function for tensor and padding before dataloader
def collate_fn(batch):
    input_features = [item['input_features'] for item in batch]
    labels = [item['labels'] for item in batch]
    english_text = [item['english_text'] for item in batch]
    
    # Convert all inputs to tensors if they are not already
    input_features = [torch.tensor(f) if not isinstance(f, torch.Tensor) else f for f in input_features]
    labels = [torch.tensor(l) if not isinstance(l, torch.Tensor) else l for l in labels]
    english_text = [torch.tensor(l) if not isinstance(l, torch.Tensor) else l for l in english_text]
    
    
    # pad sequences
    input_features = pad_sequence(input_features, batch_first=True)
    labels = pad_sequence(labels, batch_first=True, padding_value=mt_tokenizer.pad_token_id)
    english_text = pad_sequence(english_text, batch_first=True)
    
    
    return {
        'input_features': input_features,
        'labels': labels, 
        'english_text': english_text
    }

# create data loader using collate function
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
print(dataloader)
# optimizers
whisper_optimizer = AdamW(whisper_model.parameters(), lr=5e-5)
mt_optimizer = AdamW(mt_model.parameters(), lr=5e-5)

# training loop
num_epochs = 3
output_dir = "./fine_tuned_models"

        
for epoch in range(3):
    whisper_model.train()
    mt_model.train()
    
    epoch_loss = 0.0
    for batch in tqdm(dataloader):
        
    
        input_features = (batch["input_features"])
        transcription = batch["english_text"]
        target_ids = batch["labels"]
    
        # forward pass through Whisper model with english transcription labels
        whisper_outputs = whisper_model(input_features, labels=transcription)
        
        predicted_ids = whisper_outputs.logits.argmax(dim=-1)
        
        # decode the predicted ids to text
        predicted_texts = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
        print(predicted_texts)

        # tokenize the decoded text for the translation model
        translation_inputs = mt_tokenizer(predicted_texts, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

        # forward pass through Translation model with Whisper's transcription as input
        translation_outputs = mt_model(input_ids=translation_inputs, labels=target_ids)

        # combine the losses for end-to-end training
        # combined_loss = whisper_outputs.loss + translation_outputs.loss
        # ombined_loss.backward()
       
        whisper_outputs.loss.backward()
        translation_outputs.loss.backward()
        
        whisper_optimizer.step()
        mt_optimizer.step()
        
        whisper_optimizer.zero_grad()
        mt_optimizer.zero_grad()

        epoch_loss += whisper_outputs.loss.item() + translation_outputs.loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

    # save the models
    whisper_model.save_pretrained(f"./fine_tuned_models/whisper_epoch_{epoch + 1}")
    whisper_processor.save_pretrained(f"./fine_tuned_models/whisper_epoch_{epoch + 1}")
    mt_model.save_pretrained(f"./fine_tuned_models/mt_epoch_{epoch + 1}")
    mt_tokenizer.save_pretrained(f"./fine_tuned_models/mt_epoch_{epoch + 1}")
    

<torch.utils.data.dataloader.DataLoader object at 0x13c3e5fa0>


  0%|          | 0/20 [00:00<?, ?it/s]

[' Presidentut,, you mentioned that word several times.']


  5%|▌         | 1/20 [00:02<00:56,  2.96s/it]

[' President, let us hope that the American proposals for purchases of toxic assets do work, because, if they do not, the contagion will almost certainly spread over here.']


 10%|█         | 2/20 [00:05<00:48,  2.72s/it]

[' would like a from Mr.iuya and Mr Almunia that we already do have other defensesences in place.']


 15%|█▌        | 3/20 [00:07<00:39,  2.33s/it]

[' sure sure that, in due course, we will have to review our regulatory defensesences, but this can and shouldn not be done reciproitateately in in the heat of the crisis.']


 20%|██        | 4/20 [00:09<00:38,  2.43s/it]

[' Monday, on Monday we were told by a McCreevy that hedge funds and private equity were not the cause of the turmoil turmoil.']


 25%|██▌       | 5/20 [00:12<00:37,  2.48s/it]

[' want like to the thank Alexander Alvaro and the Civil Liberties Committee with with the we worked very closely, for their part on the E-Privacy Directive.']


 30%|███       | 6/20 [00:14<00:33,  2.41s/it]

[' when the flames and embers are extinguished can we turn to post-mortems as to how this happened and what is needed to avoid it happening again.']


 35%|███▌      | 7/20 [00:17<00:30,  2.37s/it]

[' most vital thing which which we must bring about immediately is is the restoration of confidence.']


 40%|████      | 8/20 [00:19<00:26,  2.25s/it]

[' are moving from liquidity liquidity problem to sol solvency problem.']


 45%|████▌     | 9/20 [00:20<00:22,  2.08s/it]

[' just like want to say this you is more amendments on my reports, my committee has been more ambitious in the improvements it wanted to make to the Commission proposal.']


 50%|█████     | 10/20 [00:22<00:19,  1.99s/it]

[' would urge that the leaders of the world,�s most important economies of America, Europe, the Middle and Far East meet together within days and assure the world unequivocally that whatever is necessary to douse the flames will be provided unequivocally wherever those flames may erupt.']


 55%|█████▌    | 11/20 [00:24<00:17,  1.96s/it]

[" would like all of you to give us a huge majority for this, when when when we come to negotiate with Commissioner Commission and Council, we will do our very best for Europe's� Cons Cons."]


 60%|██████    | 12/20 [00:26<00:15,  1.96s/it]

[' us see how we can set a it.']


 65%|██████▌   | 13/20 [00:28<00:13,  1.90s/it]

[' is now up to the political leaders of the highest level, a summit indeed – of the free-- economies – come together – no excuses – no holding back – no arguments – – take full responsibility for restoring confidence.']


 70%|███████   | 14/20 [00:30<00:11,  1.89s/it]

[' we have our lenders of last resort ready to meet the very worst, might occur in such a circumstance.']


 75%|███████▌  | 15/20 [00:31<00:09,  1.85s/it]

[' would risk overreacting, imposing our, wrongly-, over-draconian conditions, would only harm the prospects for investment in our economies, our for the future.']


 80%|████████  | 16/20 [00:33<00:07,  1.84s/it]

[' President, well know that I will not be popular for making a long speech at this time, but my two other-rapporteur, with whom I worked worked, closely, a team, have made short statements, I want to keep the team spirit together.']


 85%|████████▌ | 17/20 [00:35<00:05,  1.84s/it]

[' would like thanked thank all my colleagues on the committee who worked with me to put together some really big compromise amendments which we will pass today.']


 90%|█████████ | 18/20 [00:37<00:03,  1.82s/it]

[' ask for your support to ensure that we have competent well well-in- consumers, electronic communications and and also also secure and knowing that their personal data is protected.']


 95%|█████████▌| 19/20 [00:39<00:01,  1.89s/it]

['idence is the vital base on which a vibrant financial system and a vibrant global economy rests.']


100%|██████████| 20/20 [00:41<00:00,  2.06s/it]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


Epoch 1, Loss: 2.895303186774254


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
  0%|          | 0/20 [00:00<?, ?it/s]

[' would like all thank all my colleagues on the committee who worked with me to put together some really big compromise amendments which we will pass today.']


  5%|▌         | 1/20 [00:02<00:52,  2.77s/it]

[' us see how we can set about it.']


 10%|█         | 2/20 [00:04<00:39,  2.19s/it]

[' most vital thing, which we must bring about immediately, is the restoration of confidence.']


 15%|█▌        | 3/20 [00:06<00:33,  1.96s/it]

[' we have our lenders of last resort ready to meet the very worst which might occur in such a circumstance?']


 20%|██        | 4/20 [00:08<00:30,  1.94s/it]

[' a is now up to the political leaders of the highest level – a summit indeed of of the free-market economies to come together – no excuses, no holding back, no arguments to to take full responsibility for restoring confidence.']


 25%|██▌       | 5/20 [00:09<00:28,  1.89s/it]

[' would like to to thank Alexander Alvaro and the Civil Liberties Committee for whom whom we worked very closely, for their part on the E-Privacy Directive.']


 30%|███       | 6/20 [00:11<00:25,  1.86s/it]

[' Almuniaia you mentioned the word several times.']


 35%|███▌      | 7/20 [00:13<00:24,  1.87s/it]

[' a Alm, on the we were told by Commissioner Creevy that hedge funds and private equity were not the cause of the current term.']


 40%|████      | 8/20 [00:16<00:24,  2.05s/it]

[' a would like assurance from Mr Jouyet and Mr Almunia that we really do have our defences in place.']


 45%|████▌     | 9/20 [00:18<00:22,  2.05s/it]

[' a would like all of you to give us a huge majority for this so that, when we come to negotiate with the Commission and Council, we will do our very best for Europe consumers� s consumers.']


 50%|█████     | 10/20 [00:19<00:19,  2.00s/it]

[' a President, I know that I will not be popular for making a long speech at this time, but my two fellow-rapporteurs, with whom I have worked very closely as a team, have made short statements so I want to keep the team spirit together.']


 55%|█████▌    | 11/20 [00:22<00:18,  2.01s/it]

[' aidence is the vital base on which a vibrant financial system and a vibrant global economy rests.']


 60%|██████    | 12/20 [00:23<00:15,  1.97s/it]

[' a would like like to say that there are more amendments for my report because my committee has been more ambitious in the improvements it wanted to make to the Commission proposal.']


 65%|██████▌   | 13/20 [00:25<00:13,  1.94s/it]

[' a is true that, in due course, we will have to review our regulatory defences, but this cannot and should not be done precipitately. in the heat of the crisis.']


 70%|███████   | 14/20 [00:27<00:11,  1.90s/it]

[' a would risk overreacting, imposing unnecessary, wrongly directed over over-recraconian conditions which would only harm the prospects for investment in our economies and jobs for the future.']


 75%|███████▌  | 15/20 [00:29<00:09,  1.94s/it]

[' a President, let us hope that the American proposals for purchases of toxic assets do work, because, if they do not, the contagion will almost certainly spread over here.']


 80%|████████  | 16/20 [00:31<00:07,  1.92s/it]

[' a would urge that the leaders of the world s� s most important economies of America, Europe, the Middle and Far East meet together within days and assure the world unequivocally that whatever is necessary to douse the flames will be provided unequivocally wherever those flames may erupt.']


 85%|████████▌ | 17/20 [00:34<00:06,  2.16s/it]

[' a when the flames and embers are extinguished can we turn to post-mortems as to how this happened and what is needed to avoid it happening again.']


 90%|█████████ | 18/20 [00:36<00:04,  2.25s/it]

[' a ask for your support to ensure that we have confident, well-informed consumers for electronic communications who who are also secure and know that their personal data is protected.']


 95%|█████████▌| 19/20 [00:38<00:02,  2.27s/it]

[' are moving from a a problem to a solvency problem.']


100%|██████████| 20/20 [00:41<00:00,  2.07s/it]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


Epoch 2, Loss: 1.3270757138729095


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
  0%|          | 0/20 [00:00<?, ?it/s]

[' a President, let us hope that the American proposals for purchases of toxic assets do work, because, if they do not, the contagion will almost certainly spread over here.']


  5%|▌         | 1/20 [00:02<00:46,  2.44s/it]

[" a would like all of you to give us a huge majority for this so that, when we come to negotiate with the Commission and Council, we will do our very best for Europe's� s consumers."]


 10%|█         | 2/20 [00:04<00:39,  2.18s/it]

[' a would like assurance from Mr Jouyet and Mr Almunia that we really do have our defences in place.']


 15%|█▌        | 3/20 [00:06<00:37,  2.20s/it]

[' most vital thing, which we must bring about immediately, is the restoration of confidence.']


 20%|██        | 4/20 [00:09<00:37,  2.34s/it]

[' a would just like to say that there are more amendments in my report because my committee has been more ambitious in the improvements it wanted to make to the Commission proposal.']


 25%|██▌       | 5/20 [00:11<00:33,  2.23s/it]

[' Almunia, you mentioned the word several times.']


 30%|███       | 6/20 [00:13<00:29,  2.12s/it]

[' a when the flames and embers are extinguished can we turn to post-mortems as to how this happened and what is needed to avoid it happening again.']


 35%|███▌      | 7/20 [00:15<00:28,  2.20s/it]

[' a is true that, in due course, we will have to review our regulatory defences, but this cannot and should not be done precipitately, in the heat of the crisis.']


 40%|████      | 8/20 [00:18<00:28,  2.36s/it]

[' a would like to thank all my colleagues on the committee who worked with me to put together some really big compromise amendments which we will pass today.']


 45%|████▌     | 9/20 [00:20<00:24,  2.25s/it]

[' a are moving from a liquidity problem to a solvency problem to To']


 50%|█████     | 10/20 [00:22<00:22,  2.22s/it]

['idence is the vital base on which a vibrant financial system and a vibrant global economy rests.']


 55%|█████▌    | 11/20 [00:24<00:20,  2.31s/it]

[' a President, I know that I will not be popular for making a long speech at this time, but my two fellow-rapporteurs, with whom I have worked very closely as a team, have made short statements so I want to keep the team spirit together.']


 60%|██████    | 12/20 [00:28<00:20,  2.56s/it]

[' a President, on Monday we were told by Commissioner McCreevy that hedge funds and private equity were not the cause of the current turmoil.']


 65%|██████▌   | 13/20 [00:30<00:17,  2.48s/it]

[' a would urge that the leaders of the world ’ s most important economies of America, Europe, the Middle and Far East meet together within days and assure the world unequivocally that whatever is necessary to douse the flames will be provided unequivocally wherever those flames may erupt.']


 70%|███████   | 14/20 [00:32<00:14,  2.34s/it]

[' would like particularly to thank Alexander Alvaro and the Civil Liberties Committee, with whom we worked very closely, for their part on the E-Privacy Directive.']


 75%|███████▌  | 15/20 [00:34<00:11,  2.20s/it]

[' we have our lenders of last resort ready to meet the very worst which might occur in such a circumstance?']


 80%|████████  | 16/20 [00:36<00:08,  2.11s/it]

[' is now up to the political leaders of the highest level – a summit indeed – of the free-market economies to come together, no excuses, no holding back, no arguments – to take full responsibility for restoring confidence.']


 85%|████████▌ | 17/20 [00:38<00:06,  2.28s/it]

[' would risk overreacting, imposing unnecessary, wrongly directed, over-draconian conditions which would only harm the prospects for investment in our economies and jobs for the future.']


 90%|█████████ | 18/20 [00:41<00:04,  2.38s/it]

[' us see how we can set about it.']


 95%|█████████▌| 19/20 [00:43<00:02,  2.43s/it]

[' ask for your support to ensure that we have confident, well-informed consumers for electronic communications, who are also secure and know that their personal data is protected.']


100%|██████████| 20/20 [00:46<00:00,  2.31s/it]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


Epoch 3, Loss: 0.706246136687696


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


In [68]:
test_dataset = load_dataset("tj-solergibert/Europarl-ST", split="test")
test_dataset = test_dataset.filter(lambda example: example['original_language'] == 'en' and 'fr' in example['transcriptions'].keys() and example["transcriptions"]['fr'] is not None)
print(len(test_dataset))
print(test_dataset)
test_dataset = test_dataset.select(range(10))
test_dataset = test_dataset.map(preprocess, remove_columns=["audio_path", "original_speech", "original_language", "segment_start", "segment_end", "transcriptions"])

# Create DataLoader with custom collate_fn
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)


# Paths to the saved models
whisper_model_path = "./fine_tuned_models/whisper_epoch_3"  # Update with your final epoch
mt_model_path = "./fine_tuned_models/mt_epoch_3"  # Update with your final epoch

# Load the models
# whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
# whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)
# mt_tokenizer = MarianTokenizer.from_pretrained(mt_model_path)
# mt_model = MarianMTModel.from_pretrained(mt_model_path)

whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
mt_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
mt_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")


# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
whisper_model.to(device)
mt_model.to(device)

whisper_model.eval()
mt_model.eval()
    
epoch_loss = 0.0
for batch in tqdm(test_dataloader):
    input_features = batch["input_features"].to(device)

    original_speech = whisper_processor.batch_decode(batch["english_text"].to(device), skip_special_tokens=True)[0]
    print("Original Speech", original_speech)
    # Generate transcription using Whisper model
    with torch.no_grad():
        whisper_outputs = whisper_model.generate(
            input_features,
            num_beams=5, 
            repetition_penalty=1.2, 
            no_repeat_ngram_size=2, 
            temperature=0.7,  
            top_k=50,  
            top_p=0.95  
        )
        
        transcription = whisper_processor.batch_decode(whisper_outputs, skip_special_tokens=True)[0]

    print("Generated Transcription:", transcription)

    # Translate the transcription using the translation model
    tokenized_transcription = mt_tokenizer(transcription, return_tensors="pt", padding="longest", truncation=True)
    tokenized_transcription = tokenized_transcription.input_ids.to(device)

    with torch.no_grad():
        translated_tokens = mt_model.generate(tokenized_transcription)
        translation = mt_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    print("Translation to French:", translation)

avg_loss = epoch_loss / len(dataloader)
print(f"Loss: {avg_loss}")
    


Using the latest cached version of the dataset since tj-solergibert/Europarl-ST couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/jacksonbrouwer/.cache/huggingface/datasets/tj-solergibert___europarl-st/default/0.0.0/9635c1dc32c9a45e70e4373cf82e0c2e8fbe12c3 (last modified on Wed Aug 14 20:26:41 2024).


1214
Dataset({
    features: ['original_speech', 'original_language', 'audio_path', 'segment_start', 'segment_end', 'transcriptions'],
    num_rows: 1214
})


  audio, sr = librosa.load(audio_path, sr=16000, offset=segment_start, duration=segment_end-segment_start)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Map: 100%|██████████| 10/10 [00:00<00:00, 14.74 examples/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /Helsinki-NLP/opus-mt-en-fr/resolve/main/model.safetensors (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x144106fc0>: Failed to resolve \'huggingface.co\' ([Errno 8] nodename nor servname provided, or not known)"))'), '(Request ID: 9aa9d87c-ba65-48f3-9884-ac863c24f870)')

In [None]:
# Calculate BLEU score of the original model and the trained model 
import sacrebleu
import jiwer

bleu_scores = []
wer_scores = []

for batch in tqdm(test_dataloader):
    input_features = batch["input_features"].to(device)
    target_ids = batch["labels"].to(device)

    original_speech = whisper_processor.batch_decode(batch["english_text"].to(device), skip_special_tokens=True)[0]
    
    print("Original Speech", original_speech)
    # Generate transcription using Whisper model
    with torch.no_grad():
        whisper_outputs = whisper_model.generate(
            input_features,
            num_beams=5, 
            repetition_penalty=1.2, 
            no_repeat_ngram_size=2, 
            temperature=0.7,  
            top_k=50,  
            top_p=0.95  
        )
        
        transcription = whisper_processor.batch_decode(whisper_outputs, skip_special_tokens=True)[0]

    # print("Generated Transcription:", transcription)

    # Translate the transcription using the translation model
    tokenized_transcription = mt_tokenizer(transcription, return_tensors="pt", padding="longest", truncation=True)
    tokenized_transcription = tokenized_transcription.input_ids.to(device)

    with torch.no_grad():
        translated_tokens = mt_model.generate(tokenized_transcription)
        translation = mt_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    # print("Translation to French:", translation)

    # Calculate BLEU score
    reference = mt_tokenizer.decode(target_ids[0], skip_special_tokens=True)
    hypothesis = translation 
    bleu = sacrebleu.corpus_bleu([hypothesis], [[reference]])
    bleu_score = bleu.score
    bleu_scores.append(bleu_score)
    wer = jiwer.wer(reference, hypothesis)
    wer_scores.append(wer)

    print(f"Transcription: {transcription}")
    print(f"Generated Translation: {translation}")
    print(f"Reference Translation: {reference}")
    print(f"BLEU Score: {bleu_score}\n")
    print(f"WER: {wer}\n")

avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
avg_wer_score = sum(wer_scores) / len(wer_scores)

print(f"Average BLEU Score: {avg_bleu_score}")
print(f"Average WER: {avg_wer_score}")

  0%|          | 0/10 [00:00<?, ?it/s]



 10%|█         | 1/10 [00:03<00:28,  3.12s/it]

Generated Translation: Pourquoi l'élaboration des politiques n'a-t-elle pas appris de la crise précédente?
Reference Translation: Pourquoi les décideurs politiques n'ont-ils pas tiré les leçons des précédentes crises, en dépit des avertissements qui leur étaient adressés?
BLEU Score: 2.2008038097333027

Original Speech What about the flaws of the originate-and-distribute model, which has enhanced systemic risk?


 20%|██        | 2/10 [00:05<00:22,  2.76s/it]

Transcription:  What about the flaws of the region at industry be model, which has enhanced systemic risk?
Generated Translation: Qu'en est-il des défauts de la région dans l'industrie, qui ont accru le risque systémique?
Reference Translation: Qu'en est-il des défaillances du modèle d '« octroi puis cession », qui a augmenté le risque systémique?
BLEU Score: 19.343786993461148

Original Speech Madam President, are only greed, euphoria and cheap money to be blamed for the whole mess?


 30%|███       | 3/10 [00:08<00:18,  2.69s/it]

Transcription:  He's only greed, euphoria, and cheap money to be blamed for the whole mess.
Generated Translation: Il n'est que cupidité, euphorie, et argent bon marché à blâmer pour tout le désordre.
Reference Translation: Madame la Présidente, l'avidité, l'euphorie et l'argent bon marché sont-ils les seuls à blâmer pour tout ce désordre?
BLEU Score: 15.378249972287636

Original Speech Just remember what Lamfalussy, Gramlich, Volcker and Buffett said years ago.


 40%|████      | 4/10 [00:10<00:16,  2.70s/it]

Transcription:  just remember what Lumpfaloo's e-gramming forker. Buff had said years ago.
Generated Translation: Souviens-toi de ce que Lumpfaloo a dit il y a des années.
Reference Translation: Souvenez-vous de ce que disaient Lamfalussy, Gramlich, Volcker et Buffett il y a quelques années.
BLEU Score: 14.957166830762677

Original Speech What about conflicts of interest?


 50%|█████     | 5/10 [00:12<00:10,  2.19s/it]

Transcription:  about conflicts of interest.
Generated Translation: sur les conflits d'intérêts.
Reference Translation: Et les conflits d'intérêts?
BLEU Score: 39.76353643835254

Original Speech What about skewed pay schemes with a lack of ethics, which have stimulated reckless risk-taking?


 60%|██████    | 6/10 [00:15<00:09,  2.45s/it]

Transcription:  What about skewed paste schemes with a lack of ethics, which has stimulated recklessly staking?
Generated Translation: Qu'en est-il des stratagèmes de pâte biaisés avec un manque d'éthique, qui a stimulé le jalonnement imprudent?
Reference Translation: Qu'en est-il du système de rémunération, biaisé et totalement dépourvu d'éthique, qui a encouragé la prise de risques inconsidérés?
BLEU Score: 13.830039740141478

Original Speech What about investment-grade values assigned to trash?


 70%|███████   | 7/10 [00:17<00:06,  2.26s/it]

Transcription:  What about investment grade values assigned to trash?
Generated Translation: Qu'en est-il des valeurs d'investissement assignées aux déchets?
Reference Translation: Qu'en est-il des valeurs d'investissement qui ont jetées à la poubelle?
BLEU Score: 33.47189874003769

Original Speech The argument that regulation stifles financial innovation I find ludicrous.


 80%|████████  | 8/10 [00:19<00:04,  2.27s/it]

Transcription:  The argument that regulations type of financial innovation I find ludicrous.
Generated Translation: L'argument selon lequel le type de réglementation d'innovation financière me paraît ridicule.
Reference Translation: L'argument selon lequel la réglementation étoufferait l'innovation financière est tout simplement grotesque.
BLEU Score: 13.67440667823257

Original Speech What about banks engaging in casino-type transactions?


 90%|█████████ | 9/10 [00:21<00:02,  2.12s/it]

Transcription:  What would banks engage in casino type transactions?
Generated Translation: Qu'est-ce que les banques effectueraient dans des transactions de type casino?
Reference Translation: Et les banques qui s'engagent dans des transactions de type casino?
BLEU Score: 57.067457770559976

Original Speech What about the ‘ shadow ’ banking sector, with its extreme leveraging and speculation?


100%|██████████| 10/10 [00:23<00:00,  2.32s/it]

Transcription:  What about the shadow banking sector with extreme leveraging and speculation?
Generated Translation: Qu'en est-il du secteur bancaire parallèle avec un effet de levier et des spéculations extrêmes?
Reference Translation: Et le secteur bancaire « caché », qui mise à l'extrême sur l'effet de levier et la spéculation?
BLEU Score: 9.330745616758765

Average BLEU Score: 21.901809259032778



