In [1]:
# %load_ext autoreload
# %autoreload 2

In [19]:
import os
from collections import defaultdict

import pandas as pd
import torch
import torchaudio as ta
from data_utils import SpeechDataset, yt_data_to_df
from eval_utils import _normalize_text, _wer
from IPython.display import Audio
from tqdm import tqdm

In [3]:
# Sample rate (Hz) the audio is resampled to and that is passed on to the processors below.
target_sr = 16000

In [84]:
# Root folder that is handed to yt_data_to_df. It can be overridden through the
# STT_VIDEO_DIR environment variable so the notebook is not tied to one machine;
# the original location stays the default.
video_dir = os.environ.get('STT_VIDEO_DIR', r'F:/BIG_FILES/AI_DATA/2024_STT')
# isdir instead of exists: a plain file at this path is not a usable data folder.
if not os.path.isdir(video_dir):
    print('Video directory not found')
    raise FileNotFoundError(video_dir)

# Folder passed as cache_dir to the from_pretrained calls further down.
# exist_ok=True replaces the separate exists() check and its check-then-create race.
cache_dir = './cache'
os.makedirs(cache_dir, exist_ok=True)

In [85]:
# Build the per-video and per-segment frames from the data folder and show a short preview of each.
video_df, segment_df = yt_data_to_df(video_dir)
display(video_df.head(3), segment_df.head(3))
print(f'Number of videos: {len(video_df)}')
print(f'Number of segments: {len(segment_df)}')

Unnamed: 0,video_id,video_path,video_url,language,is_generated,num_segments,segment_durations
0,-9s7VmUX-10,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10,https://www.youtube.com/watch?v=-9s7VmUX-10,de,True,10,"[32.030000000000086, 32.19, 29.129999999999995..."
1,0jh_-wVKeEE,F:/BIG_FILES/AI_DATA/2024_STT\0jh_-wVKeEE,https://www.youtube.com/watch?v=0jh_-wVKeEE,de,True,10,"[30.460000000000036, 30.99000000000001, 31.098..."
2,1GsU56ryNPU,F:/BIG_FILES/AI_DATA/2024_STT\1GsU56ryNPU,https://www.youtube.com/watch?v=1GsU56ryNPU,de,True,10,"[30.69999999999999, 29.429999999999993, 30.710..."


Unnamed: 0,video_id,segment_id,segment_path,transcript_path,segment_duration
0,-9s7VmUX-10,0,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\segm...,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\tran...,32.03
1,-9s7VmUX-10,1,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\segm...,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\tran...,32.19
2,-9s7VmUX-10,2,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\segm...,F:/BIG_FILES/AI_DATA/2024_STT\-9s7VmUX-10\tran...,29.13


Number of videos: 64
Number of segments: 627


In [86]:
# Pick one segment at random and inspect its audio and transcript.
# No random_state is passed to sample(), so each run may show a different segment.
sample_segment = segment_df.sample(1).iloc[0]
audio_file_path = sample_segment['segment_path']
transcript_path = sample_segment['transcript_path']

# Look up the is_generated flag of the video this segment belongs to.
same_video = video_df['video_id'] == sample_segment['video_id']
is_generated = video_df.loc[same_video, 'is_generated'].iloc[0]

with open(transcript_path, 'r', encoding="utf-8") as f:
    transcript = f.read().strip()

wave, sr = ta.load(audio_file_path)
print(f'Is generated: {is_generated}')
print(f'Loaded audio file: {audio_file_path}')
print(f'Wave shape: {wave.shape}')
print(f'Sample rate: {sr}')
display(Audio(wave.numpy(), rate=sr))

# Resample from the file's native rate to target_sr and play the result for comparison.
resampler = ta.transforms.Resample(sr, target_sr)
res_wave = resampler(wave)
print(f'Wave shape after resampling: {res_wave.shape}')
display(Audio(res_wave.numpy(), rate=target_sr))

# Print the transcript in rows of at most `words_per_line` words.
words_per_line = 15
words = transcript.split()
for start in range(0, len(words), words_per_line):
    print(' '.join(words[start:start + words_per_line]))

Is generated: False
Loaded audio file: F:/BIG_FILES/AI_DATA/2024_STT\m7Sfmb-MPl0\segments\segment_02.mp3
Wave shape: torch.Size([1, 1421784])
Sample rate: 44100


Wave shape after resampling: torch.Size([1, 515840])


Ähnlich, wie Fridays for Future, wo wir die Klassenzimmer verlassen haben. Und dann losgelaufen sind
zu anderen Schulen, um andere Schüler abzuholen, um Demonstrationen zu veranstalten. Ich fühle mich sehr
erinnert bei Fridays for Future an meine Jugend und diesen Geist, das Entstehen eines Bewusstseins
für Zusammenhänge, politische, wie gesellschaftliche. Auch diese Dringlichkeit kann ich sehr nachempfinden, mit der man
diese politischen Diskussionen führt. Mich befriedigt, zu sehen, dass - anders als oft behauptet wird
-


In [87]:
def show_output(wave, sr, gt, decoded_output):
    """Play an audio clip and print the reference and predicted transcripts with their WER.

    Args:
        wave: Audio object exposing ``.numpy()``; the converted array is handed to the player.
        sr: Sample rate used for playback.
        gt: Reference (ground-truth) transcript.
        decoded_output: Transcript produced by a model. Surrounding whitespace is
            stripped only for the raw "Predicted" line.

    Returns:
        None. Everything is shown through ``display`` / ``print``.
    """
    player = Audio(wave.numpy(), rate=sr)
    display(player)

    # Raw texts first ...
    print(f"Ground Truth: {gt}")
    print(f"Predicted:    {decoded_output.strip()}")
    print("---")

    # ... then the normalised texts, i.e. what the WER helper is given as normaliser output.
    normalized_gt = _normalize_text(gt)
    print(f"Normalized Ground Truth: {normalized_gt}")
    normalized_prediction = _normalize_text(decoded_output)
    print(f"Normalized Predicted:    {normalized_prediction}")

    word_error_rate = _wer(gt, decoded_output, _normalize_text)
    print(f'WER: {word_error_rate}')

In [88]:
# Convert the is_generated column to real booleans.
# astype(str) first makes this cell safe to re-run: values that are already
# booleans become 'True'/'False' again and map cleanly, whereas mapping booleans
# directly through the string-keyed dict would turn every entry into NaN.
is_generated_flags = video_df['is_generated'].astype(str).map({'True': True, 'False': False})
unmapped = is_generated_flags.isna()
if unmapped.any():
    # Fail with a clear message instead of letting `~` blow up on NaN below.
    unexpected = video_df.loc[unmapped, 'is_generated'].unique().tolist()
    raise ValueError(f'Unexpected is_generated values: {unexpected}')
video_df['is_generated'] = is_generated_flags.astype(bool)

# Keep only the videos whose is_generated flag is False.
print(f'Original video count: {len(video_df)}')
manuel_trans_video_df = video_df.loc[~video_df['is_generated']].reset_index(drop=True)
print(f'Filtered video count: {len(manuel_trans_video_df)}')

# Keep only the segments that belong to one of the remaining videos.
print(f'Original segment count: {len(segment_df)}')
kept_video_ids = manuel_trans_video_df['video_id']
manuel_trans_segment_df = segment_df.loc[segment_df['video_id'].isin(kept_video_ids)].reset_index(drop=True)
print(f'Filtered segment count: {len(manuel_trans_segment_df)}')

Original video count: 64
Filtered video count: 42
Original segment count: 627
Filtered segment count: 408


## Whisper

In [89]:
from transformers import (AutoProcessor, WhisperConfig,
                          WhisperForConditionalGeneration)

# Checkpoint id that the Whisper config, model and processor are loaded from.
MODEL_ID = "openai/whisper-small"

In [90]:
# Load config, weights and processor for the checkpoint; cache_dir is passed to every loader.
whisper_config = WhisperConfig.from_pretrained(MODEL_ID, cache_dir=cache_dir)
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID,
    config=whisper_config,
    cache_dir=cache_dir,
)
# Inference only: switch to eval mode, then move the model to the GPU.
whisper_model.eval()
whisper_model.to('cuda')
whisper_processor = AutoProcessor.from_pretrained(MODEL_ID, cache_dir=cache_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [91]:
# Arguments handed to SpeechDataset alongside the processor.
processor_args = {
    "return_tensors": "pt",
    "sampling_rate": target_sr,
}


def _keep_batch_as_list(samples):
    """Collate function that returns the list of samples unchanged (no tensor stacking)."""
    return samples


# NOTE(review): the dataset is built from the unfiltered `segment_df`; the
# manually transcribed subset `manuel_trans_segment_df` is not used here —
# confirm that evaluating on all segments is intended.
whisper_dataset = SpeechDataset(segment_df, whisper_processor, processor_args, target_sr)
whisper_sample = whisper_dataset[0]
whisper_loader = torch.utils.data.DataLoader(
    whisper_dataset,
    batch_size=24,
    shuffle=False,
    collate_fn=_keep_batch_as_list,
)

In [92]:
def whisper_inferece(model, batch, processor, target_sr):
    """Transcribe one batch of samples with a Whisper-style generation model.

    Args:
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        batch: Sequence of dicts, each holding an ``"input_features"`` tensor.
            The tensors are stacked and dimension 1 is squeezed, so each one
            presumably carries a leading axis of size 1 — TODO confirm against
            SpeechDataset.
        processor: Object exposing ``batch_decode(ids, skip_special_tokens=...)``.
        target_sr: Unused here; kept so existing call sites stay valid.

    Returns:
        list[str]: One decoded string per sample; ``[]`` for an empty batch.
    """
    # torch.stack raises on an empty list, so answer the trivial case directly.
    if not batch:
        return []

    # Follow the model's own device instead of hard-coding 'cuda', so the
    # function also works when the model lives on the CPU or another GPU.
    device = model.device

    input_features = torch.stack([sample["input_features"] for sample in batch])
    input_features = input_features.squeeze(1).to(device)

    with torch.no_grad():
        # language="de" is passed through to generate() for every sample.
        generated_ids = model.generate(input_features, language="de")

    return processor.batch_decode(generated_ids, skip_special_tokens=True)

In [105]:
# Transcribe every batch and collect raw and normalised texts per segment.
whisper_results = defaultdict(list)
for batch in tqdm(whisper_loader):
    decoded_outputs = whisper_inferece(whisper_model, batch, whisper_processor, target_sr)
    for i, decoded_output in enumerate(decoded_outputs):
        sample = batch[i]  # outputs are matched to their inputs by position
        reference = sample['transcript']
        whisper_results['decoded_output'].append(decoded_output.strip())
        whisper_results['normalized_decoded_output'].append(_normalize_text(decoded_output))
        whisper_results['gt'].append(reference.strip())
        whisper_results['normalized_gt'].append(_normalize_text(reference))
        whisper_results['audio_path'].append(sample['audio_path'])

whisper_results_df = pd.DataFrame(whisper_results)

# Drop every row whose normalised reference is empty before scoring.
whisper_results_df['valid'] = whisper_results_df['normalized_gt'].apply(lambda text: len(text) > 0)
whisper_results_df = whisper_results_df.loc[whisper_results_df['valid']].reset_index(drop=True)


def _row_wer(row):
    """WER of one result row, computed from the raw texts with _normalize_text as normaliser."""
    return _wer(row['gt'], row['decoded_output'], _normalize_text)


whisper_results_df['wer'] = whisper_results_df.apply(_row_wer, axis=1)
whisper_results_df.head()

100%|██████████| 27/27 [02:16<00:00,  5.04s/it]


In [113]:
# Unweighted mean of the per-row WER values, expressed as a percentage.
# Every row counts equally, so a few very large per-row values pull this figure up.
whisper_wer = whisper_results_df['wer'].mean()*100
print(f'WHISPER WER: {whisper_wer:.2f}%')

WHISPER WER: 71.39%


In [114]:
# Summary statistics of the per-row WER (count, mean, std, quartiles, min/max).
whisper_results_df['wer'].describe()

count    622.000000
mean       0.713923
std        3.518447
min        0.020000
25%        0.178571
50%        0.278915
75%        0.453154
max       49.000000
Name: wer, dtype: float64

In [115]:
# Inspect the single result row with the largest WER: load its audio and
# show reference vs. prediction.
worst_wer_idx = whisper_results_df['wer'].idxmax()
worst_wer_row = whisper_results_df.loc[worst_wer_idx]
worst_wer_wave, sr = ta.load(worst_wer_row['audio_path'])
show_output(
    wave=worst_wer_wave,
    sr=sr,
    gt=worst_wer_row['gt'],
    decoded_output=worst_wer_row['decoded_output'],
)

Ground Truth: Він був «своєрідним» Берліном і недалеко від техно-клубу «Berghain». І звичайно смачний хумус. Я отримав повідомлення в Instagram від кастинг-директорки з Росії. «Вітаю, Даніелю, я бачила один проєкт з Вами та Ваше інтерв'ю для російської газети, у мене для вас пропозиція». Було дуже смішно, бо ця роль не могла б бути ще більш «ультраросійською». І вони вважали, що це дуже круто, що я розмовляю російською, але не граю і не рухаюся як російський актор.
Predicted:    Es ist irgendwie Berlin und ich wollte und es ist nicht so weit ins Berg heim und dennoch kriege ich Homos. Ich habe wirklich über Instagram eine Message bekommen von einer Casterin aus Russland. Haliba Daniel, habt ein Projekt mit dir gesehen und ein Interview in der russischen Zeit und würde mich gerne anfragen. Es war so lustig, weil es für eine Rolle, die so ultra russischer nicht sein könnte und sie fand es aber so cool, dass ich sozusagen zwar russisch spreche, aber überhaupt nicht mich bewege und gebe wi

## Wav2Vec2ForCTC

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Dedicated constant instead of rebinding the shared MODEL_ID: the Whisper load
# cell reads MODEL_ID, so overwriting it here would make a later re-run of that
# cell try to load this wav2vec2 checkpoint into the Whisper classes.
WAV2VEC_MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-german"

wav2vec_processor = Wav2Vec2Processor.from_pretrained(WAV2VEC_MODEL_ID, cache_dir=cache_dir)
wav2vec_model = Wav2Vec2ForCTC.from_pretrained(WAV2VEC_MODEL_ID, cache_dir=cache_dir)
# Inference only: eval mode on the GPU. Assigning to `_` keeps the model repr out of the cell output.
_ = wav2vec_model.eval().to('cuda')

In [None]:
# Transcribe the resampled sample clip with the wav2vec2 model.
input_values = wav2vec_processor(
    res_wave[0],
    return_tensors="pt",
    sampling_rate=target_sr,
).input_values
with torch.no_grad():
    logits = wav2vec_model(input_values.to('cuda')).logits
# argmax over the last logits dimension picks the highest-scoring token id at each position.
predicted_ids = logits.argmax(dim=-1)
predicted_sentences = wav2vec_processor.batch_decode(predicted_ids)[0]
show_output(res_wave, target_sr, transcript, predicted_sentences)

## SeamlessM4Tv2

In [None]:
from transformers import AutoProcessor, SeamlessM4Tv2Model

# Dedicated constant instead of rebinding the shared MODEL_ID: the Whisper load
# cell reads MODEL_ID, so overwriting it here would make a later re-run of that
# cell try to load this SeamlessM4T checkpoint into the Whisper classes.
SEAMLESS_MODEL_ID = "facebook/seamless-m4t-v2-large"

seamless_processor = AutoProcessor.from_pretrained(SEAMLESS_MODEL_ID, cache_dir=cache_dir)
seamless_model = SeamlessM4Tv2Model.from_pretrained(SEAMLESS_MODEL_ID, cache_dir=cache_dir)
# Inference only: eval mode on the GPU. Assigning to `_` keeps the model repr out of the cell output.
_ = seamless_model.eval().to('cuda')

In [None]:
# Use the model for ASR on the resampled sample clip: request text in the
# "deu" target language and disable speech generation.
inputs = seamless_processor(
    audios=res_wave[0],
    return_tensors="pt",
    sampling_rate=target_sr,
).to("cuda")
model_output = seamless_model.generate(**inputs, tgt_lang="deu", generate_speech=False)
# The first element of the generate() result is what gets decoded to text.
text_token_ids = model_output[0]
decoded_output = seamless_processor.batch_decode(text_token_ids, skip_special_tokens=True)[0]
show_output(res_wave, target_sr, transcript, decoded_output)