Notebook for generating plots after benchmarking French STT (speech-to-text) models

In [10]:
import os
import json
import pandas as pd
import numpy as np
import pickle

from asr_benchmark.visualization.plots import prepare_data, plot_violin_df, plot_wer_df, plot_bar_df
from asr_benchmark.utils.visualize import load_data

pd.set_option("future.no_silent_downcasting", True)

In [11]:
# Folder handed to load_data(), and root folder under which plots are stored.
INPUT_FOLDER = "output"
OUTPUT_FOLDER = "plots"
# On-disk cache of the loaded results. It is never invalidated automatically:
# delete this file to force a reload when the content of INPUT_FOLDER changes.
CACHE_PATH = "data.pkl"

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

if os.path.exists(CACHE_PATH):
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that was produced locally by this notebook.
    df = pd.read_pickle(CACHE_PATH)
else:
    # Cache miss: build the frame once and persist it for later runs
    # (no need to read it back from disk right after writing it).
    df = pd.DataFrame(load_data(INPUT_FOLDER))
    df.to_pickle(CACHE_PATH)

In [12]:
# Quick sanity check: show the first rows (at most five) of the loaded results.
df.head(n=5)

Unnamed: 0,backend,model,vad,precision,device,accurate,previous_text,input_manifest,compute_rtf,language,...,dataset,audio_duration,process_duration,wer,wer_details,audio_file,RAM usage,VRAM usage,GPU usage,decoder
0,faster-whisper,bofenghuang/whisper-large-v3-french-distil-dec8,False,float16,cuda,False,False,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.24683],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],7.18,3.66,0.0,
1,faster-whisper,whisper-large-v3-turbo,False,float16,cuda,False,False,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.2239],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],7.17,3.44,0.0,
2,nemo,stt-fr-conformer-ctc-large,False,,cuda,,,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.02727],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],5.54,1.87,0.0,ctc


In [13]:
# Keep only the rows that ran on the "cuda" device and have a non-null process_duration.
is_cuda = df["device"] == "cuda"
has_duration = df["process_duration"].notna()

selected_rows = prepare_data(df[is_cuda & has_duration], return_format="df")
selected_rows

Unnamed: 0,backend,model,vad,precision,device,accurate,previous_text,input_manifest,compute_rtf,language,...,dataset,audio_duration,process_duration,wer,wer_details,audio_file,RAM usage,VRAM usage,GPU usage,decoder
0,faster-whisper,bofenghuang/whisper-large-v3-french-distil-dec8,False,float16,cuda,False,False,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.24683],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],7.18,3.66,0.0,
1,faster-whisper,whisper-large-v3-turbo,False,float16,cuda,False,False,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.2239],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],7.17,3.44,0.0,
2,nemo,stt-fr-conformer-ctc-large,False,,cuda,,,examples/manifest_example.jsonl,True,fr,...,Example,[1.202],[0.02727],[0.0],"[{'wer': 0.0, 'del': 0.0, 'ins': 0.0, 'sub': 0...",[bonjour],5.54,1.87,0.0,ctc


In [14]:
# Dedicated "model" sub-folder of OUTPUT_FOLDER, created up front so it exists before plotting.
local_output_folder = os.path.join(OUTPUT_FOLDER, "model")
os.makedirs(local_output_folder, exist_ok=True)

# Violin plot of the "rtf" column for each model.
plot_violin_df(
    selected_rows,
    local_output_folder,
    x_column="rtf",
    y_column="model",
    title=None,
    limit=0.1,
    ylabel=None,
    save_fig=True,
)

In [15]:
# Both bar charts share every setting except the column being plotted.
bar_options = dict(y_column="model", title=None, save_fig=True)
plot_bar_df(selected_rows, local_output_folder, x_column="VRAM usage", **bar_options)
plot_bar_df(selected_rows, local_output_folder, x_column="GPU usage", **bar_options)

In [16]:
# Convert the selected rows to the "dict" return format before passing them to plot_wer_df.
# NOTE(review): selected_rows already went through prepare_data(return_format="df") in an
# earlier cell; confirm that running prepare_data on it a second time is intended.
rows_dict = prepare_data(selected_rows, return_format="dict")
# save_fig is a constant string, so a plain literal is used (the f-prefix had no placeholders).
plot_wer_df(rows_dict, local_output_folder, x_column="model", title="", save_fig="model_wer")