## Générer les données

In [2]:
from datasets.load import load_dataset
import numpy as np
from tqdm import tqdm
import pickle

librispeech_clean = load_dataset("librispeech_asr", "clean", split="test")
librispeech_other = load_dataset("librispeech_asr", "other", split="test")
sample_size = 50

X_clean = []
Y_clean = []
for ind in tqdm(np.random.randint(0, len(librispeech_clean['audio']), sample_size)):
    X_clean.append(librispeech_clean['audio'][ind]['array'])
    Y_clean.append(librispeech_clean['text'][ind])
    
X_other = []
Y_other = []
for ind in tqdm(np.random.randint(0, len(librispeech_other['audio']), sample_size)):
    X_other.append(librispeech_other['audio'][ind]['array'])
    Y_other.append(librispeech_other['text'][ind])
    
data = {
    "X_clean": X_clean,
    "Y_clean": Y_clean,
    "X_other": X_other,
    "Y_other": Y_other
}
with open('Data/data_librispeech.pkl', 'wb') as f:
    pickle.dump(data, f)


Reusing dataset librispeech_asr (C:\Users\Jayma\.cache\huggingface\datasets\librispeech_asr\clean\2.1.0\14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset librispeech_asr (C:\Users\Jayma\.cache\huggingface\datasets\librispeech_asr\other\2.1.0\14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
100%|██████████| 50/50 [05:17<00:00,  6.36s/it]
100%|██████████| 50/50 [05:22<00:00,  6.44s/it]


## Calculer les prédictions

In [2]:
from transformers import Wav2Vec2Processor, Data2VecAudioForCTC, Wav2Vec2ForCTC
import jiwer
import torch
import pickle

# load model and processor
processor_data2vec = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-large-960h")
processor_wav2vec = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
mdl_data2vec = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-large-960h")
mdl_wav2vec = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# load data
with open('Data/data_librispeech.pkl', 'rb') as f:
    data = pickle.load(f)


input_data2vec_clean = processor_data2vec(data['X_clean'], return_tensors="pt", sampling_rate=16000, padding="longest").input_values
input_wav2vec_clean = processor_wav2vec(data['X_clean'], return_tensors="pt", sampling_rate=16000, padding="longest").input_values
input_data2vec_other = processor_data2vec(data['X_other'], return_tensors="pt", sampling_rate=16000, padding="longest").input_values
input_wav2vec_other = processor_wav2vec(data['X_other'], return_tensors="pt", sampling_rate=16000, padding="longest").input_values

with torch.no_grad():
    logits_data2vec_clean = mdl_data2vec(input_data2vec_clean.float()).logits
    logits_wav2vec_clean = mdl_wav2vec(input_wav2vec_clean.float()).logits
    logits_data2vec_other = mdl_data2vec(input_data2vec_other.float()).logits
    logits_wav2vec_other = mdl_wav2vec(input_wav2vec_other.float()).logits
    
data['wav2vec_clean'] = processor_wav2vec.batch_decode(torch.argmax(logits_wav2vec_clean, dim=-1))
data['data2vec_clean'] = processor_data2vec.batch_decode(torch.argmax(logits_data2vec_clean, dim=-1))
data['wav2vec_other'] = processor_wav2vec.batch_decode(torch.argmax(logits_wav2vec_other, dim=-1))
data['data2vec_other'] = processor_data2vec.batch_decode(torch.argmax(logits_data2vec_other, dim=-1))

with open('Data/data_librispeech.pkl', 'wb') as f:
    pickle.dump(data, f)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Visualiser les résultats

In [1]:
# load data
import pickle
import jiwer
with open('Data/data_librispeech.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
print("WER wav2vec2 clean: {:.1f}% | CER wav2vec2 clean: {:.1f}%".format(jiwer.wer(data['Y_clean'], data['wav2vec_clean'])*100, jiwer.cer(data['Y_clean'], data['wav2vec_clean'])*100))
print("WER data2vec clean: {:.1f}% | CER data2vec clean: {:.1f}%".format(jiwer.wer(data['Y_clean'], data['data2vec_clean'])*100, jiwer.cer(data['Y_clean'], data['data2vec_clean'])*100))
print("WER wav2vec2 other: {:.1f}% | CER wav2vec2 other: {:.1f}%".format(jiwer.wer(data['Y_other'], data['wav2vec_other'])*100, jiwer.cer(data['Y_other'], data['wav2vec_other'])*100))
print("WER data2vec other: {:.1f}% | CER data2vec other: {:.1f}%".format(jiwer.wer(data['Y_other'], data['data2vec_other'])*100, jiwer.cer(data['Y_other'], data['data2vec_other'])*100))

WER wav2vec2 clean: 7.1% | CER wav2vec2 clean: 2.7%
WER data2vec clean: 1.7% | CER data2vec clean: 0.4%
WER wav2vec2 other: 7.4% | CER wav2vec2 other: 3.3%
WER data2vec other: 3.0% | CER data2vec other: 1.1%


In [15]:
score = []
for i in range(len(data['Y_clean'])):
    score.append(jiwer.wer(data['Y_other'][i], data['data2vec_other'][i]))
score = np.array(score)
data['Y_other'] = np.array(data['Y_other'])
data['Y_clean'] = np.array(data['Y_clean'])
data['wav2vec_other'] = np.array(data['wav2vec_other'])
data['wav2vec_clean'] = np.array(data['wav2vec_clean'])
data['data2vec_other'] = np.array(data['data2vec_other'])
data['data2vec_clean'] = np.array(data['data2vec_clean'])

In [30]:
i = 7
print(data['Y_other'][score > 0.1][i])
print(data['wav2vec_other'][score > 0.1][i])
print(data['data2vec_other'][score > 0.1][i])

BUT THIS SHALL BANISH IT UTTERLY
BUT THEY SOUBANISD IT UTTERLY
BUT THEY SHALL BANISH IT UTTERLY
