In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset
import pandas as pd
import librosa

In [2]:
# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")

In [3]:
import librosa

path = 'audio_samples/train_0023.wav'
input_speech, _ = librosa.load(path, sr=None)
input_features = processor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)



[' Bonsoir.']


In [None]:
transcription_df = pd.DataFrame(columns=['wav', 'transcription'])
for wav in ['audio_samples/train_0001.wav', 'audio_samples/train_0002.wav', 'audio_samples/train_0003.wav']:
    input_speech, _ = librosa.load(wav, sr=None)
    input_features = processor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
    # generate token ids
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    dict_metadata = {
        'wav': wav,
        'transcription': transcription
    }
    transcription_df = transcription_df.append(dict_metadata, ignore_index=True)

In [45]:
transcription_df.to_csv('recipes/requests/data/transcripts/train_transcriptions.csv', index=False)
print(transcription_df['transcription'][3])

KeyError: 3

In [38]:
df = pd.read_csv('recipes/requests/data/transcripts/train_transcriptions.csv')

In [None]:
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

In [47]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import torchaudio

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load model
model = AutoModelForSpeechSeq2Seq.from_pretrained("bofenghuang/whisper-large-v2-cv11-french").to(device)
processor = AutoProcessor.from_pretrained("bofenghuang/whisper-large-v2-cv11-french", language="french", task="transcribe")

# NB: set forced_decoder_ids for generation utils
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="fr", task="transcribe")

# 16_000
model_sample_rate = processor.feature_extractor.sampling_rate

# Load data
path = 'audio_samples/train_0003.wav'
input_speech, sr = librosa.load(path, sr=None)
sample_rate = sr

# Resample
if sample_rate != model_sample_rate:
    resampler = torchaudio.transforms.Resample(sample_rate, model_sample_rate)
    input_speech = resampler(input_speech)

# Get feat
inputs = processor(input_speech, sampling_rate=model_sample_rate, return_tensors="pt")
input_features = inputs.input_features
input_features = input_features.to(device)

# Generate
generated_ids = model.generate(inputs=input_features, max_new_tokens=225)  # greedy
# generated_ids = model.generate(inputs=input_features, max_new_tokens=225, num_beams=5)  # beam search

# Detokenize
generated_sentences = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_sentences)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Sur les semelles orthopédiques, on prend en charge parce que la Sécu reconnaît ce soin.
