# Generating Whisper Transcriptions

## Set Up Environment 

In [1]:
from datasets import load_dataset, Dataset, Audio
import torch
from transformers import pipeline
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import os
import pandas as pd

## Gather Data for Preprocessing

In [2]:
# This is where Audio Files are stored in my ACCRE directory, but this path may need to be changed accordingly 

folder_path = "Audio_Files/"

In [3]:
import torch
torch.cuda.empty_cache()

In [4]:
def gather_file_paths(folder_path):
    data = []
    
    # Loop through the files in the folder
    for filename in os.listdir(folder_path):
        # Ensure it's a file and has an mp3 extension
        if filename.endswith(".mp3") | filename.endswith(".wav") :
            clip_name = os.path.splitext(filename)[0]  # Get the file name without extension
            file_path = os.path.join(folder_path, filename)
            data.append({"clip": clip_name, "path": file_path})
    
    # Create a DataFrame from the gathered data
    df = pd.DataFrame(data, columns=["clip", "path"])
    
    return df

In [5]:
df = gather_file_paths(folder_path)

audio_dataset = Dataset.from_dict({"audio": df['path']}).cast_column("audio", Audio())

In [6]:
df.shape

(131, 2)

## Define stable-ts timestamps

In [7]:
#!pip install -U stable-ts

In [3]:
import os
import gc
import torch
import pandas as pd
from tqdm import tqdm
import stable_whisper
import json

In [9]:
def process_audio_batch(model, audio_files, batch_size=4):
    for i in tqdm(range(0, len(audio_files), batch_size)):
        batch = audio_files[i:i+batch_size]
        
        for j, audio_file in enumerate(batch):
            result = model.transcribe(audio_file)
            
            # Save the result as JSON
            result.save_as_json(f'stable_ts_json/stable_ts_audio_{i + j}.json')
            
            # Extract word-level data
            words_data = []
            for segment in result.segments:
                for word in segment.words:
                    words_data.append({
                        'word': word.word,
                        'start': word.start,
                        'end': word.end,
                        'probability': word.probability
                    })
            # Create DataFrame for words
            df_words = pd.DataFrame(words_data)
            # Save the DataFrame as CSV
            df_words.to_csv(f'stable_ts_csv/stable_ts_{i + j}.csv', index=False)
        
        # Clear CUDA cache
        gc.collect()
        torch.cuda.empty_cache()

In [10]:
# Load the model
model = stable_whisper.load_model('large-v3')

# Ensure the output directories exist
os.makedirs('stable_ts_json', exist_ok=True)
os.makedirs('stable_ts_csv', exist_ok=True)

audio_folder = "Audio_Files/"
audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(('.mp3', '.wav'))]

In [6]:
audio_folder = "Audio_Files/"
audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(('.mp3', '.wav'))]

In [17]:
import os

# Paths
AUDIO_FILES_FOLDER = 'Audio_Files'
CSV_FILES_FOLDER = 'stable_ts_csv'

# Get the list of audio file paths and extract their corresponding keys
audio_files = os.listdir(AUDIO_FILES_FOLDER)  # Ensure consistent ordering
audio_keys = [os.path.splitext(filename)[0] for filename in audio_files if (filename.endswith(".mp3") | filename.endswith(".wav")) ]

# Get the list of CSV files in the stable_ts_csv folder
csv_files = sorted( [file for file in os.listdir(CSV_FILES_FOLDER) if file.endswith("csv")] )  # Ensure consistent ordering

# Check that the number of audio files matches the number of CSV files
if len(audio_keys) != len(csv_files):
    raise ValueError("The number of audio files does not match the number of CSV files!")


In [21]:
csv_files = sorted(
    [file for file in os.listdir(CSV_FILES_FOLDER) if file.endswith("csv")],
    key=lambda x: int(x.split('_')[2].split('.')[0])  # Extract the numeric part for sorting
)

In [23]:
# Rename each CSV file to match the corresponding audio file key
for csv_file, audio_key in zip(csv_files, audio_keys):
    # Define the old and new file paths
    old_csv_path = os.path.join(CSV_FILES_FOLDER, csv_file)
    new_csv_path = os.path.join(CSV_FILES_FOLDER, f"{audio_key}.csv")
    # Rename the CSV file
    os.rename(old_csv_path, new_csv_path)

print("CSV files renamed successfully.")

CSV files renamed successfully.


In [None]:
# Process the audio files in batches
BATCH_SIZE = 4
process_audio_batch(model, audio_files, batch_size=BATCH_SIZE)

  0%|          | 0/33 [00:00<?, ?it/s]
                                      [00:00<?, ?sec/s][A
  0%|          | 0/33 [00:01<?, ?it/s]                 
Transcribe:   0%|          | 0/420.49 [00:01<?, ?sec/s][A

Detected language: english



Transcribe:   7%|▋         | 28.0/420.49 [00:07<01:41,  3.88sec/s][A
Transcribe:  14%|█▍        | 58.0/420.49 [00:11<01:08,  5.31sec/s][A
Transcribe:  19%|█▊        | 78.0/420.49 [00:14<00:59,  5.76sec/s][A
Transcribe:  26%|██▌       | 107.48/420.49 [00:17<00:44,  7.00sec/s][A
Transcribe:  33%|███▎      | 137.48/420.49 [00:21<00:38,  7.27sec/s][A
Transcribe:  39%|███▉      | 164.08/420.49 [00:23<00:29,  8.75sec/s][A
Transcribe:  46%|████▌     | 194.08/420.49 [00:25<00:23,  9.47sec/s][A
Transcribe:  53%|█████▎    | 224.08/420.49 [00:27<00:18, 10.86sec/s][A
Transcribe:  60%|██████    | 254.08/420.49 [00:31<00:16,  9.95sec/s][A
Transcribe:  68%|██████▊   | 284.08/420.49 [00:34<00:14,  9.67sec/s][A
Transcribe:  75%|███████▍  | 314.08/420.49 [00:37<00:10,  9.87sec/s][A
Transcribe:  82%|████████▏ | 344.08/420.49 [00:40<00:08,  9.54sec/s][A
Transcribe:  89%|████████▉ | 374.08/420.49 [00:43<00:04,  9.81sec/s][A
Transcribe:  96%|█████████▌| 404.08/420.49 [00:45<00:01, 11.48sec/s]

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_0.json



                                      [00:00<?, ?sec/s][A
  0%|          | 0/33 [00:48<?, ?it/s]                 
Transcribe:   0%|          | 0/282.91 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  10%|█         | 28.74/282.91 [00:04<00:43,  5.86sec/s][A
Transcribe:  20%|█▉        | 55.34/282.91 [00:08<00:33,  6.70sec/s][A
Transcribe:  30%|██▉       | 83.88/282.91 [00:12<00:28,  6.90sec/s][A
Transcribe:  40%|████      | 113.88/282.91 [00:15<00:21,  7.90sec/s][A
Transcribe:  51%|█████     | 143.88/282.91 [00:18<00:16,  8.28sec/s][A
Transcribe:  61%|██████▏   | 173.88/282.91 [00:21<00:11,  9.46sec/s][A
Transcribe:  72%|███████▏  | 203.88/282.91 [00:24<00:08,  8.86sec/s][A
Transcribe:  83%|████████▎ | 233.88/282.91 [00:27<00:05,  9.23sec/s][A
Transcribe:  93%|█████████▎| 263.88/282.91 [00:31<00:02,  8.54sec/s][A
Transcribe: 100%|█████████▉| 282.75/282.91 [00:34<00:00,  8.09sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_1.json



                                      [00:00<?, ?sec/s][A
  0%|          | 0/33 [01:23<?, ?it/s]                 
Transcribe:   0%|          | 0/424.33 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   6%|▌         | 24.36/424.33 [00:04<01:06,  5.99sec/s][A
Transcribe:  13%|█▎        | 54.36/424.33 [00:07<00:48,  7.67sec/s][A
Transcribe:  19%|█▉        | 81.02/424.33 [00:11<00:48,  7.14sec/s][A
Transcribe:  26%|██▌       | 111.02/424.33 [00:15<00:43,  7.13sec/s][A
Transcribe:  33%|███▎      | 141.02/424.33 [00:19<00:40,  7.04sec/s][A
Transcribe:  40%|████      | 171.02/424.33 [00:23<00:33,  7.67sec/s][A
Transcribe:  47%|████▋     | 201.02/424.33 [00:27<00:29,  7.60sec/s][A
Transcribe:  54%|█████▍    | 230.02/424.33 [00:32<00:28,  6.94sec/s][A
Transcribe:  61%|██████▏   | 260.02/424.33 [00:35<00:21,  7.50sec/s][A
Transcribe:  68%|██████▊   | 290.02/424.33 [00:39<00:17,  7.59sec/s][A
Transcribe:  75%|███████▌  | 320.02/424.33 [00:43<00:14,  7.24sec/s][A
Transcribe:  82%|████████▏ | 350.02/424.33 [00:48<00:10,  7.02sec/s][A
Transcribe:  90%|████████▉ | 380.02/424.33 [00:52<00:06,  7.02sec/s][A
Transcribe:  97%|█████████▋| 410.02/424.33 [00:55<00:01,  7.96sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_2.json



                                      [00:00<?, ?sec/s][A
  0%|          | 0/33 [02:20<?, ?it/s]                 
Transcribe:   0%|          | 0/398.81 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   7%|▋         | 29.0/398.81 [00:03<00:47,  7.84sec/s][A
Transcribe:  13%|█▎        | 52.88/398.81 [00:06<00:40,  8.50sec/s][A
Transcribe:  21%|██        | 82.88/398.81 [00:09<00:36,  8.74sec/s][A
Transcribe:  28%|██▊       | 112.88/398.81 [00:13<00:35,  8.00sec/s][A
Transcribe:  36%|███▌      | 142.88/398.81 [00:17<00:30,  8.30sec/s][A
Transcribe:  43%|████▎     | 172.88/398.81 [00:19<00:24,  9.10sec/s][A
Transcribe:  51%|█████     | 202.88/398.81 [00:24<00:23,  8.22sec/s][A
Transcribe:  58%|█████▊    | 232.88/398.81 [00:28<00:20,  8.12sec/s][A
Transcribe:  66%|██████▌   | 262.88/398.81 [00:31<00:16,  8.26sec/s][A
Transcribe:  73%|███████▎  | 292.88/398.81 [00:33<00:11,  9.51sec/s][A
Transcribe:  81%|████████  | 322.88/398.81 [00:36<00:07,  9.54sec/s][A
Transcribe:  88%|████████▊ | 352.88/398.81 [00:40<00:05,  9.08sec/s][A
Transcribe:  96%|█████████▌| 382.88/398.81 [00:43<00:01,  8.97sec/s][A
Transcribe: 100%|█████████▉| 398.77/398.81 [00:45<00:00,  8.74sec/s

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_3.json


  3%|▎         | 1/33 [03:06<1:39:29, 186.56s/it]
                                                 ?sec/s][A
  3%|▎         | 1/33 [03:06<1:39:29, 186.56s/it]       
Transcribe:   0%|          | 0/1039.42 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 30.0/1039.42 [00:02<01:29, 11.23sec/s][A
Transcribe:   6%|▌         | 60.0/1039.42 [00:06<01:51,  8.78sec/s][A
Transcribe:   9%|▊         | 90.0/1039.42 [00:12<02:28,  6.38sec/s][A
Transcribe:  12%|█▏        | 120.0/1039.42 [00:17<02:24,  6.38sec/s][A
Transcribe:  14%|█▍        | 150.0/1039.42 [00:40<05:33,  2.67sec/s][A
Transcribe:  17%|█▋        | 180.0/1039.42 [00:55<05:55,  2.41sec/s][A
Transcribe:  20%|██        | 209.6/1039.42 [01:00<04:37,  2.99sec/s][A
Transcribe:  23%|██▎       | 239.6/1039.42 [01:06<03:58,  3.35sec/s][A
Transcribe:  26%|██▌       | 268.6/1039.42 [01:09<03:04,  4.17sec/s][A
Transcribe:  29%|██▊       | 298.6/1039.42 [01:27<04:16,  2.89sec/s][A
Transcribe:  32%|███▏      | 328.6/1039.42 [01:50<05:43,  2.07sec/s][A
Transcribe:  34%|███▍      | 357.9/1039.42 [01:55<04:22,  2.60sec/s][A
Transcribe:  37%|███▋      | 387.9/1039.42 [01:59<03:21,  3.23sec/s][A
Transcribe:  40%|████      | 417.9/1039.42 [02:11<03:26,  3.02sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_4.json



                                                 sec/s][A
  3%|▎         | 1/33 [09:33<1:39:29, 186.56s/it]      
Transcribe:   0%|          | 0/301.77 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   8%|▊         | 24.0/301.77 [00:04<00:51,  5.37sec/s][A
Transcribe:  16%|█▋        | 49.48/301.77 [00:22<02:08,  1.96sec/s][A
Transcribe:  26%|██▋       | 79.48/301.77 [00:26<01:10,  3.15sec/s][A
Transcribe:  36%|███▋      | 109.48/301.77 [00:29<00:42,  4.53sec/s][A
Transcribe:  46%|████▌     | 139.48/301.77 [00:33<00:30,  5.36sec/s][A
Transcribe:  56%|█████▌    | 169.48/301.77 [00:36<00:20,  6.38sec/s][A
Transcribe:  66%|██████▌   | 199.48/301.77 [00:39<00:14,  6.88sec/s][A
Transcribe:  76%|███████▌  | 229.48/301.77 [00:42<00:09,  7.85sec/s][A
Transcribe:  86%|████████▌ | 259.48/301.77 [00:45<00:04,  8.56sec/s][A
Transcribe:  96%|█████████▌| 289.48/301.77 [00:47<00:01,  9.62sec/s][A
Transcribe: 100%|█████████▉| 301.74/301.77 [00:48<00:00,  6.26sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_5.json



                                                 ?sec/s][A
  3%|▎         | 1/33 [10:22<1:39:29, 186.56s/it]       
Transcribe:   0%|          | 0/1282.91 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 24.16/1282.91 [00:02<02:14,  9.38sec/s][A
Transcribe:   4%|▍         | 54.16/1282.91 [00:05<02:13,  9.19sec/s][A
Transcribe:   7%|▋         | 84.16/1282.91 [00:09<02:17,  8.69sec/s][A
Transcribe:   8%|▊         | 102.44/1282.91 [00:12<02:30,  7.82sec/s][A
Transcribe:  10%|█         | 132.44/1282.91 [00:15<02:22,  8.07sec/s][A
Transcribe:  13%|█▎        | 162.44/1282.91 [00:18<02:00,  9.28sec/s][A
Transcribe:  15%|█▌        | 192.44/1282.91 [00:21<01:55,  9.44sec/s][A
Transcribe:  17%|█▋        | 222.44/1282.91 [00:24<01:53,  9.36sec/s][A
Transcribe:  20%|█▉        | 252.44/1282.91 [00:28<01:59,  8.65sec/s][A
Transcribe:  22%|██▏       | 282.44/1282.91 [00:31<01:50,  9.05sec/s][A
Transcribe:  24%|██▍       | 312.44/1282.91 [00:35<01:51,  8.68sec/s][A
Transcribe:  27%|██▋       | 342.44/1282.91 [00:53<04:12,  3.73sec/s][A
Transcribe:  29%|██▉       | 372.44/1282.91 [00:57<03:20,  4.55sec/s][A
Transcribe:  31%|███▏      | 402.44/1282.91 [00:59<02

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_6.json



                                                 sec/s][A
  3%|▎         | 1/33 [14:09<1:39:29, 186.56s/it]      
Transcribe:   0%|          | 0/520.78 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   6%|▌         | 30.0/520.78 [00:05<01:22,  5.92sec/s][A
Transcribe:  12%|█▏        | 60.0/520.78 [00:09<01:09,  6.61sec/s][A
Transcribe:  17%|█▋        | 90.0/520.78 [00:13<01:02,  6.89sec/s][A
Transcribe:  23%|██▎       | 120.0/520.78 [00:17<00:54,  7.36sec/s][A
Transcribe:  29%|██▉       | 150.0/520.78 [00:20<00:49,  7.57sec/s][A
Transcribe:  35%|███▍      | 180.0/520.78 [00:21<00:34,  9.93sec/s][A
Transcribe:  40%|████      | 210.0/520.78 [00:23<00:27, 11.46sec/s][A
Transcribe:  46%|████▌     | 240.0/520.78 [00:26<00:25, 10.93sec/s][A
Transcribe:  52%|█████▏    | 270.0/520.78 [00:30<00:25,  9.92sec/s][A
Transcribe:  58%|█████▊    | 300.0/520.78 [00:34<00:24,  9.05sec/s][A
Transcribe:  63%|██████▎   | 330.0/520.78 [00:38<00:22,  8.61sec/s][A
Transcribe:  69%|██████▉   | 360.0/520.78 [00:41<00:18,  8.73sec/s][A
Transcribe:  75%|███████▍  | 390.0/520.78 [00:44<00:14,  9.22sec/s][A
Transcribe:  81%|████████  | 420.0/520.78 [00:47<00:11,  9.15sec/s][A
Transcri

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_7.json


  6%|▌         | 2/33 [15:09<4:19:23, 502.04s/it]
                                                 ec/s][A
  6%|▌         | 2/33 [15:09<4:19:23, 502.04s/it]     
Transcribe:   0%|          | 0/385.1 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   6%|▋         | 24.68/385.1 [00:03<00:46,  7.80sec/s][A
Transcribe:  14%|█▍        | 54.68/385.1 [00:06<00:38,  8.67sec/s][A
Transcribe:  22%|██▏       | 84.68/385.1 [00:09<00:32,  9.18sec/s][A
Transcribe:  29%|██▉       | 112.68/385.1 [00:27<01:25,  3.20sec/s][A
Transcribe:  37%|███▋      | 142.68/385.1 [00:52<02:05,  1.94sec/s][A
Transcribe:  45%|████▍     | 172.68/385.1 [00:56<01:19,  2.68sec/s][A
Transcribe:  52%|█████▏    | 200.54/385.1 [01:00<00:55,  3.33sec/s][A
Transcribe:  60%|█████▉    | 230.54/385.1 [01:25<01:14,  2.07sec/s][A
Transcribe:  67%|██████▋   | 258.54/385.1 [01:29<00:48,  2.63sec/s][A
Transcribe:  73%|███████▎  | 282.96/385.1 [01:32<00:31,  3.26sec/s][A
Transcribe:  81%|████████▏ | 312.96/385.1 [01:35<00:16,  4.29sec/s][A
Transcribe:  89%|████████▉ | 342.96/385.1 [01:38<00:08,  5.10sec/s][A
Transcribe:  97%|█████████▋| 372.96/385.1 [01:42<00:02,  5.65sec/s][A
Transcribe: 100%|█████████▉| 385.07/385.1 [01:43<00:00,  3.72sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_8.json



                                                 sec/s][A
  6%|▌         | 2/33 [16:53<4:19:23, 502.04s/it]      
Transcribe:   0%|          | 0/935.34 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 23.22/935.34 [00:02<01:47,  8.50sec/s][A
Transcribe:   5%|▍         | 44.38/935.34 [00:05<01:45,  8.43sec/s][A
Transcribe:   8%|▊         | 74.38/935.34 [00:08<01:44,  8.27sec/s][A
Transcribe:  11%|█         | 104.38/935.34 [00:11<01:31,  9.12sec/s][A
Transcribe:  14%|█▍        | 134.38/935.34 [00:14<01:19, 10.07sec/s][A
Transcribe:  18%|█▊        | 164.38/935.34 [00:17<01:19,  9.74sec/s][A
Transcribe:  21%|██        | 194.38/935.34 [00:19<01:09, 10.62sec/s][A
Transcribe:  24%|██▎       | 221.68/935.34 [00:22<01:04, 11.00sec/s][A
Transcribe:  27%|██▋       | 251.68/935.34 [00:24<00:58, 11.71sec/s][A
Transcribe:  30%|███       | 281.68/935.34 [00:28<01:07,  9.72sec/s][A
Transcribe:  33%|███▎      | 311.68/935.34 [00:31<01:02,  9.92sec/s][A
Transcribe:  37%|███▋      | 341.68/935.34 [00:35<01:05,  9.11sec/s][A
Transcribe:  39%|███▉      | 368.08/935.34 [00:38<01:03,  8.87sec/s][A
Transcribe:  43%|████▎     | 398.08/935.34 [00:42<01:02,  8.53sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_9.json



                                                 sec/s][A
  6%|▌         | 2/33 [18:34<4:19:23, 502.04s/it]      
Transcribe:   0%|          | 0/963.29 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 15.04/963.29 [00:06<07:00,  2.25sec/s][A
Transcribe:   5%|▍         | 45.04/963.29 [00:07<02:14,  6.84sec/s][A
Transcribe:   8%|▊         | 75.04/963.29 [00:12<02:06,  7.05sec/s][A
Transcribe:  11%|█         | 105.04/963.29 [00:15<01:56,  7.35sec/s][A
Transcribe:  14%|█▍        | 135.04/963.29 [00:19<01:50,  7.50sec/s][A
Transcribe:  17%|█▋        | 163.54/963.29 [00:22<01:38,  8.09sec/s][A
Transcribe:  20%|██        | 193.54/963.29 [00:25<01:30,  8.47sec/s][A
Transcribe:  23%|██▎       | 223.54/963.29 [00:29<01:26,  8.52sec/s][A
Transcribe:  26%|██▋       | 253.54/963.29 [00:30<01:04, 11.01sec/s][A
Transcribe:  29%|██▉       | 283.54/963.29 [00:33<01:02, 10.96sec/s][A
Transcribe:  33%|███▎      | 313.54/963.29 [00:37<01:07,  9.69sec/s][A
Transcribe:  36%|███▌      | 343.54/963.29 [00:40<01:07,  9.24sec/s][A
Transcribe:  39%|███▉      | 373.54/963.29 [00:44<01:09,  8.46sec/s][A
Transcribe:  42%|████▏     | 403.54/963.29 [00:48<01:05,  8.53sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_10.json



                                                 sec/s][A
  6%|▌         | 2/33 [21:26<4:19:23, 502.04s/it]      
Transcribe:   0%|          | 0/474.59 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▌         | 26.0/474.59 [00:02<00:46,  9.61sec/s][A
Transcribe:  11%|█         | 51.28/474.59 [00:21<03:22,  2.09sec/s][A
Transcribe:  17%|█▋        | 81.28/474.59 [00:25<01:57,  3.36sec/s][A
Transcribe:  23%|██▎       | 111.28/474.59 [00:27<01:15,  4.82sec/s][A
Transcribe:  30%|██▉       | 141.28/474.59 [00:29<00:51,  6.44sec/s][A
Transcribe:  36%|███▌      | 171.28/474.59 [00:31<00:38,  7.96sec/s][A
Transcribe:  42%|████▏     | 201.28/474.59 [00:34<00:30,  8.83sec/s][A
Transcribe:  48%|████▊     | 227.36/474.59 [00:39<00:34,  7.25sec/s][A
Transcribe:  54%|█████▍    | 257.36/474.59 [00:43<00:30,  7.15sec/s][A
Transcribe:  61%|██████    | 287.36/474.59 [00:48<00:27,  6.87sec/s][A
Transcribe:  67%|██████▋   | 317.36/474.59 [00:51<00:21,  7.41sec/s][A
Transcribe:  73%|███████▎  | 347.36/474.59 [00:54<00:15,  8.29sec/s][A
Transcribe:  80%|███████▉  | 377.36/474.59 [00:57<00:11,  8.51sec/s][A
Transcribe:  86%|████████▌ | 407.36/474.59 [01:01<00:08,  8.32sec/s

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_11.json


  9%|▉         | 3/33 [22:36<3:58:24, 476.81s/it]
                                                 sec/s][A
  9%|▉         | 3/33 [22:36<3:58:24, 476.81s/it]      
Transcribe:   0%|          | 0/908.96 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 26.0/908.96 [00:04<02:21,  6.22sec/s][A
Transcribe:   6%|▌         | 56.0/908.96 [00:07<01:54,  7.47sec/s][A
Transcribe:   8%|▊         | 74.02/908.96 [00:10<01:53,  7.34sec/s][A
Transcribe:  11%|█▏        | 104.0/908.96 [00:20<03:06,  4.32sec/s][A
Transcribe:  15%|█▍        | 134.0/908.96 [00:23<02:15,  5.73sec/s][A
Transcribe:  18%|█▊        | 164.0/908.96 [00:28<02:08,  5.81sec/s][A
Transcribe:  21%|██▏       | 194.0/908.96 [00:32<01:59,  5.99sec/s][A
Transcribe:  25%|██▍       | 224.0/908.96 [00:36<01:42,  6.67sec/s][A
Transcribe:  28%|██▊       | 252.66/908.96 [00:40<01:34,  6.96sec/s][A
Transcribe:  31%|███       | 282.66/908.96 [01:03<03:38,  2.87sec/s][A
Transcribe:  34%|███▍      | 312.66/908.96 [01:21<04:12,  2.36sec/s][A
Transcribe:  38%|███▊      | 341.66/908.96 [01:26<03:18,  2.86sec/s][A
Transcribe:  41%|████      | 370.66/908.96 [02:10<06:16,  1.43sec/s][A
Transcribe:  44%|████▍     | 398.64/908.96 [02:14<04:31,  1.88sec/s][A
T

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_12.json



                                                 ec/s][A
  9%|▉         | 3/33 [26:52<3:58:24, 476.81s/it]     
Transcribe:   0%|          | 0/945.2 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   1%|▏         | 12.0/945.2 [00:02<02:40,  5.82sec/s][A
Transcribe:   4%|▍         | 37.66/945.2 [00:03<01:28, 10.27sec/s][A
Transcribe:   6%|▋         | 60.48/945.2 [00:06<01:32,  9.57sec/s][A
Transcribe:  10%|▉         | 90.48/945.2 [00:10<01:36,  8.84sec/s][A
Transcribe:  13%|█▎        | 120.48/945.2 [00:13<01:30,  9.13sec/s][A
Transcribe:  16%|█▌        | 150.48/945.2 [00:16<01:29,  8.87sec/s][A
Transcribe:  19%|█▉        | 179.92/945.2 [00:20<01:26,  8.89sec/s][A
Transcribe:  21%|██▏       | 203.14/945.2 [00:22<01:19,  9.28sec/s][A
Transcribe:  24%|██▍       | 230.28/945.2 [00:25<01:16,  9.30sec/s][A
Transcribe:  26%|██▌       | 247.04/945.2 [00:27<01:15,  9.20sec/s][A
Transcribe:  29%|██▊       | 271.1/945.2 [00:28<01:03, 10.57sec/s] [A
Transcribe:  32%|███▏      | 301.1/945.2 [00:31<00:58, 11.08sec/s][A
Transcribe:  34%|███▍      | 320.64/945.2 [00:32<00:54, 11.52sec/s][A
Transcribe:  37%|███▋      | 348.0/945.2 [00:34<00:50, 11.77sec/s] [A
Transcribe:

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_13.json



                                                 ec/s][A
  9%|▉         | 3/33 [28:12<3:58:24, 476.81s/it]     
Transcribe:   0%|          | 0/371.2 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   8%|▊         | 30.0/371.2 [00:06<01:11,  4.76sec/s][A
Transcribe:  16%|█▌        | 60.0/371.2 [00:11<00:56,  5.53sec/s][A
Transcribe:  24%|██▍       | 88.96/371.2 [00:14<00:44,  6.31sec/s][A
Transcribe:  32%|███▏      | 118.96/371.2 [00:19<00:38,  6.51sec/s][A
Transcribe:  40%|████      | 148.96/371.2 [00:22<00:31,  7.15sec/s][A
Transcribe:  48%|████▊     | 178.96/371.2 [00:27<00:28,  6.86sec/s][A
Transcribe:  56%|█████▋    | 208.96/371.2 [00:30<00:20,  7.86sec/s][A
Transcribe:  64%|██████▍   | 238.96/371.2 [00:32<00:15,  8.71sec/s][A
Transcribe:  72%|███████▏  | 268.96/371.2 [00:36<00:11,  8.56sec/s][A
Transcribe:  81%|████████  | 298.96/371.2 [00:39<00:07,  9.27sec/s][A
Transcribe:  89%|████████▊ | 328.96/371.2 [00:41<00:04, 10.17sec/s][A
Transcribe:  97%|█████████▋| 358.96/371.2 [00:43<00:01, 11.03sec/s][A
Transcribe: 100%|█████████▉| 371.09/371.2 [00:44<00:00,  8.32sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_14.json



                                                 ?sec/s][A
  9%|▉         | 3/33 [28:56<3:58:24, 476.81s/it]       
Transcribe:   0%|          | 0/1222.67 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 20.0/1222.67 [00:02<02:56,  6.80sec/s][A
Transcribe:   3%|▎         | 42.0/1222.67 [00:08<04:16,  4.61sec/s][A
Transcribe:   6%|▌         | 70.16/1222.67 [00:12<03:25,  5.61sec/s][A
Transcribe:   8%|▊         | 94.22/1222.67 [00:15<02:47,  6.74sec/s][A
Transcribe:  10%|▉         | 120.64/1222.67 [00:17<02:23,  7.66sec/s][A
Transcribe:  12%|█▏        | 144.32/1222.67 [00:21<02:23,  7.51sec/s][A
Transcribe:  13%|█▎        | 162.86/1222.67 [00:23<02:14,  7.91sec/s][A
Transcribe:  16%|█▌        | 192.86/1222.67 [00:25<01:53,  9.10sec/s][A
Transcribe:  18%|█▊        | 222.86/1222.67 [00:28<01:45,  9.44sec/s][A
Transcribe:  21%|██        | 252.86/1222.67 [00:31<01:34, 10.25sec/s][A
Transcribe:  23%|██▎       | 282.86/1222.67 [00:34<01:33, 10.05sec/s][A
Transcribe:  26%|██▌       | 312.86/1222.67 [00:37<01:32,  9.78sec/s][A
Transcribe:  28%|██▊       | 342.86/1222.67 [00:40<01:31,  9.67sec/s][A
Transcribe:  30%|███       | 372.86/1222.67 [00:43<01:22

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_15.json


 12%|█▏        | 4/33 [31:27<4:00:46, 498.15s/it]
                                                 sec/s][A
 12%|█▏        | 4/33 [31:27<4:00:46, 498.15s/it]      
Transcribe:   0%|          | 0/376.32 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   7%|▋         | 26.6/376.32 [00:04<00:54,  6.47sec/s][A
Transcribe:  15%|█▌        | 56.6/376.32 [00:06<00:33,  9.43sec/s][A
Transcribe:  15%|█▌        | 56.6/376.32 [00:22<00:33,  9.43sec/s][A
Transcribe:  22%|██▏       | 81.6/376.32 [00:24<01:46,  2.77sec/s][A
Transcribe:  29%|██▉       | 110.12/376.32 [00:27<01:08,  3.91sec/s][A
Transcribe:  36%|███▌      | 136.12/376.32 [00:29<00:48,  4.99sec/s][A
Transcribe:  44%|████▍     | 166.12/376.32 [00:32<00:33,  6.20sec/s][A
Transcribe:  52%|█████▏    | 196.12/376.32 [00:35<00:25,  7.10sec/s][A
Transcribe:  60%|██████    | 226.12/376.32 [00:39<00:20,  7.16sec/s][A
Transcribe:  68%|██████▊   | 256.12/376.32 [00:42<00:15,  7.58sec/s][A
Transcribe:  76%|███████▌  | 286.12/376.32 [00:46<00:11,  8.16sec/s][A
Transcribe:  84%|████████▍ | 316.12/376.32 [00:48<00:06,  9.34sec/s][A
Transcribe:  89%|████████▉ | 336.44/376.32 [00:51<00:04,  8.27sec/s][A
Transcribe:  97%|█████████▋| 366.44/376.32 [00:57<00:01,  7.01sec/s][A

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_16.json



                                                 ec/s][A
 12%|█▏        | 4/33 [32:25<4:00:46, 498.15s/it]     
Transcribe:   0%|          | 0/959.2 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 27.68/959.2 [00:02<01:40,  9.23sec/s][A
Transcribe:   6%|▌         | 56.68/959.2 [00:21<06:15,  2.40sec/s][A
Transcribe:   9%|▊         | 83.9/959.2 [00:24<04:18,  3.39sec/s] [A
Transcribe:  12%|█▏        | 113.9/959.2 [00:27<02:58,  4.73sec/s][A
Transcribe:  15%|█▌        | 143.9/959.2 [00:30<02:17,  5.94sec/s][A
Transcribe:  18%|█▊        | 173.9/959.2 [00:33<01:56,  6.75sec/s][A
Transcribe:  21%|██▏       | 203.9/959.2 [00:36<01:40,  7.52sec/s][A
Transcribe:  24%|██▍       | 233.9/959.2 [00:48<02:33,  4.71sec/s][A
Transcribe:  28%|██▊       | 263.9/959.2 [00:51<02:01,  5.74sec/s][A
Transcribe:  31%|███       | 293.9/959.2 [00:54<01:46,  6.27sec/s][A
Transcribe:  34%|███▍      | 323.9/959.2 [01:06<02:24,  4.41sec/s][A
Transcribe:  37%|███▋      | 353.9/959.2 [01:09<01:56,  5.20sec/s][A
Transcribe:  40%|████      | 383.9/959.2 [01:28<03:09,  3.04sec/s][A
Transcribe:  43%|████▎     | 411.6/959.2 [01:32<02:30,  3.63sec/s][A
Transcribe:  46%|██

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_17.json



                                                 sec/s][A
 12%|█▏        | 4/33 [35:08<4:00:46, 498.15s/it]      
Transcribe:   0%|          | 0/658.05 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▍         | 28.0/658.05 [00:02<00:45, 13.91sec/s][A
Transcribe:   8%|▊         | 54.0/658.05 [00:04<00:55, 10.97sec/s][A
Transcribe:   8%|▊         | 54.0/658.05 [00:21<00:55, 10.97sec/s][A
Transcribe:  13%|█▎        | 84.0/658.05 [00:30<04:24,  2.17sec/s][A
Transcribe:  16%|█▌        | 104.58/658.05 [00:33<03:16,  2.82sec/s][A
Transcribe:  20%|██        | 134.58/658.05 [00:35<02:07,  4.12sec/s][A
Transcribe:  24%|██▎       | 155.58/658.05 [00:37<01:39,  5.04sec/s][A
Transcribe:  27%|██▋       | 180.88/658.05 [00:39<01:14,  6.36sec/s][A
Transcribe:  32%|███▏      | 210.88/658.05 [00:40<00:51,  8.65sec/s][A
Transcribe:  37%|███▋      | 240.88/658.05 [00:43<00:45,  9.17sec/s][A
Transcribe:  41%|████      | 270.88/658.05 [00:45<00:36, 10.49sec/s][A
Transcribe:  46%|████▌     | 300.88/658.05 [00:48<00:35, 10.19sec/s][A
Transcribe:  50%|█████     | 330.88/658.05 [00:50<00:31, 10.50sec/s][A
Transcribe:  55%|█████▍    | 360.88/658.05 [00:52<00:25, 11.56sec/s][A

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_18.json



                                                 sec/s][A
 12%|█▏        | 4/33 [36:33<4:00:46, 498.15s/it]      
Transcribe:   0%|          | 0/995.75 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 30.0/995.75 [00:02<01:30, 10.67sec/s][A
Transcribe:   4%|▍         | 42.36/995.75 [00:04<01:39,  9.55sec/s][A
Transcribe:   7%|▋         | 71.38/995.75 [00:06<01:19, 11.68sec/s][A
Transcribe:  10%|█         | 101.38/995.75 [00:08<01:11, 12.53sec/s][A
Transcribe:  13%|█▎        | 131.38/995.75 [00:11<01:17, 11.15sec/s][A
Transcribe:  16%|█▌        | 159.6/995.75 [00:14<01:16, 10.91sec/s] [A
Transcribe:  19%|█▉        | 189.6/995.75 [00:16<01:12, 11.18sec/s][A
Transcribe:  22%|██▏       | 214.46/995.75 [00:18<01:05, 11.99sec/s][A
Transcribe:  22%|██▏       | 221.2/995.75 [00:19<01:10, 10.96sec/s] [A
Transcribe:  25%|██▌       | 250.0/995.75 [00:20<00:52, 14.19sec/s][A
Transcribe:  28%|██▊       | 276.08/995.75 [00:22<00:53, 13.39sec/s][A
Transcribe:  30%|███       | 298.84/995.75 [00:24<00:54, 12.82sec/s][A
Transcribe:  32%|███▏      | 321.04/995.75 [00:26<00:55, 12.16sec/s][A
Transcribe:  35%|███▌      | 351.04/995.75 [00:27<00:41, 15.64sec/s]

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_19.json


 15%|█▌        | 5/33 [37:57<3:34:20, 459.29s/it]
                                                 sec/s][A
 15%|█▌        | 5/33 [37:57<3:34:20, 459.29s/it]      
Transcribe:   0%|          | 0/866.01 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 27.6/866.01 [00:02<01:20, 10.40sec/s][A
Transcribe:   6%|▌         | 53.6/866.01 [00:04<01:11, 11.43sec/s][A
Transcribe:   9%|▉         | 81.84/866.01 [00:08<01:21,  9.57sec/s][A
Transcribe:  13%|█▎        | 109.44/866.01 [00:12<01:38,  7.69sec/s][A
Transcribe:  16%|█▌        | 139.44/866.01 [00:17<01:40,  7.23sec/s][A
Transcribe:  19%|█▉        | 165.88/866.01 [00:21<01:35,  7.30sec/s][A
Transcribe:  23%|██▎       | 195.88/866.01 [00:24<01:23,  8.04sec/s][A
Transcribe:  26%|██▌       | 225.88/866.01 [00:28<01:23,  7.65sec/s][A
Transcribe:  30%|██▉       | 255.88/866.01 [00:32<01:19,  7.68sec/s][A
Transcribe:  33%|███▎      | 285.88/866.01 [00:33<00:59,  9.77sec/s][A
Transcribe:  36%|███▋      | 315.88/866.01 [00:34<00:44, 12.36sec/s][A
Transcribe:  40%|███▉      | 345.88/866.01 [00:38<00:50, 10.29sec/s][A
Transcribe:  43%|████▎     | 375.88/866.01 [00:39<00:38, 12.61sec/s][A
Transcribe:  47%|████▋     | 405.88/866.01 [00:41<00:34, 13.17sec/s]

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_20.json



                                                 ec/s][A
 15%|█▌        | 5/33 [39:49<3:34:20, 459.29s/it]     
Transcribe:   0%|          | 0/813.9 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 25.0/813.9 [00:02<01:13, 10.70sec/s][A
Transcribe:   7%|▋         | 55.0/813.9 [00:06<01:26,  8.75sec/s][A
Transcribe:  10%|▉         | 78.24/813.9 [00:08<01:17,  9.48sec/s][A
Transcribe:  13%|█▎        | 108.24/813.9 [00:12<01:21,  8.68sec/s][A
Transcribe:  17%|█▋        | 138.24/813.9 [00:15<01:14,  9.08sec/s][A
Transcribe:  21%|██        | 168.24/813.9 [00:17<01:05,  9.88sec/s][A
Transcribe:  24%|██▍       | 198.24/813.9 [00:20<01:02,  9.92sec/s][A
Transcribe:  28%|██▊       | 228.24/813.9 [00:24<01:01,  9.46sec/s][A
Transcribe:  32%|███▏      | 258.24/813.9 [00:26<00:52, 10.65sec/s][A
Transcribe:  35%|███▌      | 288.24/813.9 [00:29<00:51, 10.15sec/s][A
Transcribe:  39%|███▉      | 318.24/813.9 [00:32<00:47, 10.40sec/s][A
Transcribe:  43%|████▎     | 348.24/813.9 [00:50<01:59,  3.89sec/s][A
Transcribe:  46%|████▋     | 378.24/813.9 [00:53<01:30,  4.81sec/s][A
Transcribe:  50%|█████     | 408.24/813.9 [00:57<01:14,  5.42sec/s][A
Transcribe

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_21.json



                                                 sec/s][A
 15%|█▌        | 5/33 [42:19<3:34:20, 459.29s/it]      
Transcribe:   0%|          | 0/913.87 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 28.44/913.87 [00:04<02:11,  6.73sec/s][A
Transcribe:   6%|▋         | 58.44/913.87 [00:08<02:09,  6.63sec/s][A
Transcribe:  10%|▉         | 87.44/913.87 [00:12<01:58,  6.98sec/s][A
Transcribe:  13%|█▎        | 117.44/913.87 [00:15<01:41,  7.88sec/s][A
Transcribe:  16%|█▌        | 147.44/913.87 [00:19<01:40,  7.59sec/s][A
Transcribe:  19%|█▉        | 177.44/913.87 [00:22<01:28,  8.28sec/s][A
Transcribe:  23%|██▎       | 207.44/913.87 [00:27<01:29,  7.89sec/s][A
Transcribe:  26%|██▌       | 237.44/913.87 [00:30<01:24,  8.03sec/s][A
Transcribe:  29%|██▉       | 267.44/913.87 [00:34<01:17,  8.31sec/s][A
Transcribe:  33%|███▎      | 297.44/913.87 [00:37<01:14,  8.27sec/s][A
Transcribe:  36%|███▌      | 327.44/913.87 [00:41<01:14,  7.90sec/s][A
Transcribe:  39%|███▉      | 357.44/913.87 [00:45<01:11,  7.78sec/s][A
Transcribe:  42%|████▏     | 387.44/913.87 [00:49<01:05,  8.00sec/s][A
Transcribe:  46%|████▌     | 417.44/913.87 [00:53<01:03,  7.81sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_22.json



                                                 sec/s][A
 15%|█▌        | 5/33 [44:21<3:34:20, 459.29s/it]      
Transcribe:   0%|          | 0/952.25 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 17.0/952.25 [00:02<02:06,  7.41sec/s][A
Transcribe:   5%|▍         | 47.0/952.25 [00:06<02:03,  7.34sec/s][A
Transcribe:   8%|▊         | 77.0/952.25 [00:11<02:17,  6.37sec/s][A
Transcribe:  11%|█         | 107.0/952.25 [00:15<02:03,  6.87sec/s][A
Transcribe:  14%|█▍        | 137.0/952.25 [00:17<01:33,  8.70sec/s][A
Transcribe:  18%|█▊        | 167.0/952.25 [00:21<01:31,  8.57sec/s][A
Transcribe:  21%|██        | 197.0/952.25 [00:25<01:36,  7.82sec/s][A
Transcribe:  24%|██▍       | 227.0/952.25 [00:29<01:35,  7.60sec/s][A
Transcribe:  27%|██▋       | 257.0/952.25 [00:34<01:37,  7.12sec/s][A
Transcribe:  30%|███       | 287.0/952.25 [00:37<01:27,  7.63sec/s][A
Transcribe:  33%|███▎      | 317.0/952.25 [00:40<01:17,  8.18sec/s][A
Transcribe:  36%|███▋      | 347.0/952.25 [00:44<01:10,  8.54sec/s][A
Transcribe:  40%|███▉      | 377.0/952.25 [00:57<02:04,  4.63sec/s][A
Transcribe:  43%|████▎     | 407.0/952.25 [01:00<01:37,  5.59sec/s][A
Transcri

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_23.json


 18%|█▊        | 6/33 [46:35<3:35:36, 479.11s/it]
                                                 sec/s][A
 18%|█▊        | 6/33 [46:36<3:35:36, 479.11s/it]      
Transcribe:   0%|          | 0/758.28 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▍         | 30.0/758.28 [00:03<01:25,  8.49sec/s][A
Transcribe:   8%|▊         | 60.0/758.28 [00:08<01:38,  7.06sec/s][A
Transcribe:  12%|█▏        | 90.0/758.28 [00:12<01:39,  6.74sec/s][A
Transcribe:  16%|█▌        | 120.0/758.28 [00:32<03:40,  2.89sec/s][A
Transcribe:  20%|█▉        | 150.0/758.28 [00:36<02:41,  3.76sec/s][A
Transcribe:  24%|██▎       | 180.0/758.28 [00:39<02:03,  4.67sec/s][A
Transcribe:  28%|██▊       | 210.0/758.28 [00:44<01:50,  4.97sec/s][A
Transcribe:  32%|███▏      | 240.0/758.28 [00:49<01:38,  5.25sec/s][A
Transcribe:  36%|███▌      | 270.0/758.28 [00:54<01:26,  5.65sec/s][A
Transcribe:  40%|███▉      | 300.0/758.28 [00:59<01:19,  5.73sec/s][A
Transcribe:  43%|████▎     | 328.5/758.28 [01:01<01:04,  6.69sec/s][A
Transcribe:  47%|████▋     | 358.34/758.28 [01:06<00:58,  6.83sec/s][A
Transcribe:  51%|█████     | 386.94/758.28 [01:08<00:49,  7.53sec/s][A
Transcribe:  55%|█████▍    | 416.94/758.28 [01:15<00:53,  6.35sec/s][A
Trans

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_24.json



                                                 sec/s][A
 18%|█▊        | 6/33 [50:17<3:35:36, 479.11s/it]      
Transcribe:   0%|          | 0/532.31 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▌         | 28.16/532.31 [00:04<01:23,  6.04sec/s][A
Transcribe:  10%|█         | 54.58/532.31 [00:07<01:01,  7.77sec/s][A
Transcribe:  15%|█▍        | 78.84/532.31 [00:09<00:53,  8.50sec/s][A
Transcribe:  20%|██        | 108.84/532.31 [00:13<00:49,  8.56sec/s][A
Transcribe:  26%|██▌       | 138.84/532.31 [00:16<00:42,  9.25sec/s][A
Transcribe:  32%|███▏      | 168.84/532.31 [00:19<00:38,  9.36sec/s][A
Transcribe:  37%|███▋      | 198.84/532.31 [00:23<00:39,  8.39sec/s][A
Transcribe:  43%|████▎     | 228.84/532.31 [00:26<00:34,  8.83sec/s][A
Transcribe:  49%|████▊     | 258.84/532.31 [00:30<00:32,  8.54sec/s][A
Transcribe:  54%|█████▍    | 288.84/532.31 [00:33<00:26,  9.13sec/s][A
Transcribe:  60%|█████▉    | 318.84/532.31 [00:35<00:21, 10.15sec/s][A
Transcribe:  66%|██████▌   | 348.84/532.31 [00:39<00:20,  9.04sec/s][A
Transcribe:  71%|███████   | 378.84/532.31 [00:43<00:17,  8.74sec/s][A
Transcribe:  77%|███████▋  | 408.84/532.31 [00:46<00:14,  8.57sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_25.json



                                                 sec/s][A
 18%|█▊        | 6/33 [51:29<3:35:36, 479.11s/it]      
Transcribe:   0%|          | 0/341.52 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   9%|▉         | 30.0/341.52 [00:02<00:27, 11.45sec/s][A
Transcribe:  17%|█▋        | 56.68/341.52 [00:06<00:34,  8.21sec/s][A
Transcribe:  22%|██▏       | 76.6/341.52 [00:09<00:34,  7.59sec/s] [A
Transcribe:  29%|██▊       | 97.72/341.52 [00:11<00:30,  8.00sec/s][A
Transcribe:  36%|███▌      | 122.9/341.52 [00:13<00:23,  9.44sec/s][A
Transcribe:  42%|████▏     | 144.82/341.52 [00:15<00:19, 10.09sec/s][A
Transcribe:  51%|█████     | 174.82/341.52 [00:17<00:13, 12.68sec/s][A
Transcribe:  60%|█████▉    | 204.82/341.52 [00:20<00:11, 11.50sec/s][A
Transcribe:  69%|██████▉   | 234.82/341.52 [00:22<00:08, 12.36sec/s][A
Transcribe:  78%|███████▊  | 264.82/341.52 [00:25<00:06, 11.65sec/s][A
Transcribe:  86%|████████▋ | 294.82/341.52 [00:27<00:03, 11.84sec/s][A
Transcribe:  95%|█████████▌| 324.82/341.52 [00:30<00:01, 11.30sec/s][A
Transcribe: 100%|█████████▉| 341.48/341.52 [00:32<00:00, 10.53sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_26.json



                                                 sec/s][A
 18%|█▊        | 6/33 [52:02<3:35:36, 479.11s/it]      
Transcribe:   0%|          | 0/593.73 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▎         | 22.18/593.73 [00:02<01:01,  9.36sec/s][A
Transcribe:   8%|▊         | 48.18/593.73 [00:04<00:56,  9.69sec/s][A
Transcribe:  13%|█▎        | 78.18/593.73 [00:08<00:54,  9.46sec/s][A
Transcribe:  18%|█▊        | 108.18/593.73 [00:11<00:54,  8.89sec/s][A
Transcribe:  23%|██▎       | 138.18/593.73 [00:15<00:51,  8.89sec/s][A
Transcribe:  28%|██▊       | 168.18/593.73 [00:18<00:48,  8.74sec/s][A
Transcribe:  33%|███▎      | 198.18/593.73 [00:53<03:02,  2.17sec/s][A
Transcribe:  38%|███▊      | 227.78/593.73 [00:56<02:08,  2.86sec/s][A
Transcribe:  43%|████▎     | 257.78/593.73 [00:59<01:29,  3.74sec/s][A
Transcribe:  48%|████▊     | 287.78/593.73 [01:02<01:05,  4.64sec/s][A
Transcribe:  54%|█████▎    | 317.78/593.73 [01:14<01:14,  3.72sec/s][A
Transcribe:  58%|█████▊    | 346.78/593.73 [01:25<01:14,  3.31sec/s][A
Transcribe:  63%|██████▎   | 376.78/593.73 [01:29<00:54,  4.01sec/s][A
Transcribe:  68%|██████▊   | 405.78/593.73 [01:55<01:24,  2.23sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_27.json


 21%|██        | 7/33 [55:03<3:31:43, 488.59s/it]
                                                 sec/s][A
 21%|██        | 7/33 [55:03<3:31:43, 488.59s/it]      
Transcribe:   0%|          | 0/163.32 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  17%|█▋        | 28.0/163.32 [00:03<00:19,  7.06sec/s][A
Transcribe:  28%|██▊       | 45.66/163.32 [00:06<00:16,  7.24sec/s][A
Transcribe:  43%|████▎     | 70.66/163.32 [00:10<00:13,  6.79sec/s][A
Transcribe:  62%|██████▏   | 100.66/163.32 [00:13<00:08,  7.51sec/s][A
Transcribe:  80%|████████  | 130.66/163.32 [00:16<00:03,  8.83sec/s][A
Transcribe:  98%|█████████▊| 160.66/163.32 [00:20<00:00,  8.21sec/s][A
Transcribe: 100%|█████████▉| 163.19/163.32 [00:20<00:00,  7.86sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_28.json



                                                 sec/s][A
 21%|██        | 7/33 [55:24<3:31:43, 488.59s/it]      
Transcribe:   0%|          | 0/1090.6 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 30.0/1090.6 [00:03<01:58,  8.95sec/s][A
Transcribe:   4%|▍         | 49.0/1090.6 [00:04<01:29, 11.66sec/s][A
Transcribe:   7%|▋         | 79.0/1090.6 [00:07<01:36, 10.54sec/s][A
Transcribe:  10%|▉         | 108.0/1090.6 [00:12<02:07,  7.69sec/s][A
Transcribe:  13%|█▎        | 137.0/1090.6 [00:16<02:06,  7.53sec/s][A
Transcribe:  15%|█▌        | 166.0/1090.6 [00:21<02:14,  6.88sec/s][A
Transcribe:  18%|█▊        | 195.0/1090.6 [00:25<02:08,  6.97sec/s][A
Transcribe:  21%|██        | 224.0/1090.6 [00:30<02:09,  6.67sec/s][A
Transcribe:  23%|██▎       | 254.0/1090.6 [00:37<02:25,  5.74sec/s][A
Transcribe:  26%|██▌       | 283.0/1090.6 [01:05<05:42,  2.36sec/s][A
Transcribe:  29%|██▊       | 311.7/1090.6 [01:10<04:27,  2.91sec/s][A
Transcribe:  31%|███▏      | 341.7/1090.6 [01:15<03:38,  3.43sec/s][A
Transcribe:  34%|███▍      | 371.7/1090.6 [01:35<04:51,  2.47sec/s][A
Transcribe:  37%|███▋      | 400.62/1090.6 [01:40<03:51,  2.97sec/s][A
Transcr

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_29.json



                                                 sec/s][A
 21%|██        | 7/33 [59:19<3:31:43, 488.59s/it]      
Transcribe:   0%|          | 0/385.99 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   7%|▋         | 27.0/385.99 [00:04<00:53,  6.73sec/s][A
Transcribe:  12%|█▏        | 46.0/385.99 [00:06<00:48,  6.97sec/s][A
Transcribe:  20%|█▉        | 76.0/385.99 [00:07<00:25, 12.19sec/s][A
Transcribe:  27%|██▋       | 106.0/385.99 [00:09<00:22, 12.48sec/s][A
Transcribe:  35%|███▌      | 136.0/385.99 [00:14<00:27,  9.12sec/s][A
Transcribe:  43%|████▎     | 166.0/385.99 [00:18<00:26,  8.27sec/s][A
Transcribe:  47%|████▋     | 182.66/385.99 [00:19<00:21,  9.29sec/s][A
Transcribe:  55%|█████▍    | 212.0/385.99 [00:27<00:28,  6.20sec/s] [A
Transcribe:  61%|██████▏   | 237.0/385.99 [00:30<00:21,  6.78sec/s][A
Transcribe:  69%|██████▊   | 265.0/385.99 [00:34<00:18,  6.69sec/s][A
Transcribe:  73%|███████▎  | 283.64/385.99 [00:48<00:29,  3.50sec/s][A
Transcribe:  81%|████████▏ | 313.64/385.99 [01:08<00:30,  2.37sec/s][A
Transcribe:  89%|████████▉ | 343.64/385.99 [01:10<00:13,  3.25sec/s][A
Transcribe:  95%|█████████▌| 368.1/385.99 [01:13<00:04,  3.89sec/s] [A
Tr

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_30.json



                                                 sec/s][A
 21%|██        | 7/33 [1:00:35<3:31:43, 488.59s/it]    
Transcribe:   0%|          | 0/690.45 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▍         | 29.44/690.45 [00:02<00:47, 14.06sec/s][A
Transcribe:   8%|▊         | 56.44/690.45 [00:04<00:52, 12.02sec/s][A
Transcribe:  13%|█▎        | 86.44/690.45 [00:06<00:49, 12.27sec/s][A
Transcribe:  17%|█▋        | 116.44/690.45 [00:09<00:49, 11.60sec/s][A
Transcribe:  21%|██        | 146.44/690.45 [00:19<01:35,  5.67sec/s][A
Transcribe:  26%|██▌       | 176.44/690.45 [00:53<04:17,  1.99sec/s][A
Transcribe:  30%|██▉       | 204.56/690.45 [00:56<03:03,  2.64sec/s][A
Transcribe:  32%|███▏      | 221.92/690.45 [00:57<02:26,  3.19sec/s][A
Transcribe:  36%|███▋      | 251.92/690.45 [01:00<01:41,  4.33sec/s][A
Transcribe:  41%|████      | 281.92/690.45 [01:03<01:16,  5.36sec/s][A
Transcribe:  45%|████▌     | 311.92/690.45 [01:06<00:59,  6.32sec/s][A
Transcribe:  50%|████▉     | 341.92/690.45 [01:09<00:49,  7.01sec/s][A
Transcribe:  54%|█████▍    | 371.92/690.45 [01:12<00:42,  7.52sec/s][A
Transcribe:  58%|█████▊    | 401.92/690.45 [01:15<00:34,  8.29sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_31.json



                                                   c/s][A
 24%|██▍       | 8/33 [1:02:15<3:16:01, 470.46s/it]    
Transcribe:   0%|          | 0/123.92 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  23%|██▎       | 28.5/123.92 [00:03<00:11,  8.20sec/s][A
Transcribe:  47%|████▋     | 58.5/123.92 [00:07<00:08,  8.05sec/s][A
Transcribe:  71%|███████▏  | 88.5/123.92 [00:09<00:03, 10.10sec/s][A
Transcribe:  96%|█████████▌| 118.5/123.92 [00:12<00:00,  9.82sec/s][A
Transcribe: 100%|█████████▉| 123.79/123.92 [00:13<00:00,  9.20sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_32.json



                                                   ec/s][A
 24%|██▍       | 8/33 [1:02:28<3:16:01, 470.46s/it]     
Transcribe:   0%|          | 0/1067.52 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 30.0/1067.52 [00:03<02:08,  8.10sec/s][A
Transcribe:   6%|▌         | 59.38/1067.52 [00:07<02:06,  7.95sec/s][A
Transcribe:   8%|▊         | 86.46/1067.52 [00:10<01:57,  8.35sec/s][A
Transcribe:  11%|█         | 116.46/1067.52 [00:15<02:08,  7.41sec/s][A
Transcribe:  14%|█▎        | 146.46/1067.52 [00:19<02:02,  7.53sec/s][A
Transcribe:  17%|█▋        | 176.46/1067.52 [00:22<01:54,  7.80sec/s][A
Transcribe:  19%|█▉        | 206.46/1067.52 [00:25<01:43,  8.28sec/s][A
Transcribe:  22%|██▏       | 236.46/1067.52 [00:27<01:24,  9.83sec/s][A
Transcribe:  25%|██▍       | 265.56/1067.52 [00:30<01:24,  9.50sec/s][A
Transcribe:  28%|██▊       | 295.56/1067.52 [00:33<01:20,  9.62sec/s][A
Transcribe:  30%|███       | 325.56/1067.52 [00:37<01:21,  9.10sec/s][A
Transcribe:  33%|███▎      | 355.56/1067.52 [00:40<01:12,  9.81sec/s][A
Transcribe:  36%|███▌      | 382.32/1067.52 [00:42<01:08, 10.03sec/s][A
Transcribe:  39%|███▊      | 412.32/1067.52 [00:45<01:

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_33.json



                                                   /s][A
 24%|██▍       | 8/33 [1:04:26<3:16:01, 470.46s/it]   
Transcribe:   0%|          | 0/820.8 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 24.0/820.8 [00:03<01:54,  6.98sec/s][A
Transcribe:   6%|▌         | 49.0/820.8 [00:06<01:44,  7.39sec/s][A
Transcribe:   9%|▉         | 75.0/820.8 [00:10<01:49,  6.82sec/s][A
Transcribe:  12%|█▏        | 101.0/820.8 [00:14<01:41,  7.11sec/s][A
Transcribe:  16%|█▌        | 131.0/820.8 [00:17<01:24,  8.21sec/s][A
Transcribe:  20%|█▉        | 161.0/820.8 [00:20<01:21,  8.05sec/s][A
Transcribe:  23%|██▎       | 191.0/820.8 [00:22<01:04,  9.75sec/s][A
Transcribe:  27%|██▋       | 221.0/820.8 [00:25<00:59, 10.01sec/s][A
Transcribe:  31%|███       | 251.0/820.8 [00:27<00:52, 10.82sec/s][A
Transcribe:  34%|███▍      | 281.0/820.8 [00:32<00:57,  9.34sec/s][A
Transcribe:  37%|███▋      | 303.7/820.8 [00:35<00:59,  8.72sec/s][A
Transcribe:  40%|████      | 330.24/820.8 [00:36<00:45, 10.81sec/s][A
Transcribe:  40%|████      | 330.24/820.8 [00:53<00:45, 10.81sec/s][A
Transcribe:  44%|████▍     | 360.22/820.8 [00:59<02:23,  3.21sec/s][A
Transcribe:  47%|██

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_34.json



                                                   c/s][A
 24%|██▍       | 8/33 [1:06:23<3:16:01, 470.46s/it]    
Transcribe:   0%|          | 0/392.65 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   6%|▋         | 24.96/392.65 [00:02<00:38,  9.49sec/s][A
Transcribe:  14%|█▍        | 54.96/392.65 [00:05<00:33,  9.99sec/s][A
Transcribe:  22%|██▏       | 84.96/392.65 [00:08<00:29, 10.27sec/s][A
Transcribe:  29%|██▉       | 114.96/392.65 [00:11<00:28,  9.87sec/s][A
Transcribe:  37%|███▋      | 144.96/392.65 [00:14<00:24, 10.00sec/s][A
Transcribe:  45%|████▍     | 174.96/392.65 [00:17<00:22,  9.86sec/s][A
Transcribe:  52%|█████▏    | 204.96/392.65 [00:20<00:17, 10.53sec/s][A
Transcribe:  60%|█████▉    | 234.96/392.65 [00:24<00:17,  8.85sec/s][A
Transcribe:  67%|██████▋   | 261.72/392.65 [00:28<00:16,  7.94sec/s][A
Transcribe:  74%|███████▍  | 291.72/392.65 [00:33<00:13,  7.70sec/s][A
Transcribe:  82%|████████▏ | 321.72/392.65 [00:35<00:07,  9.01sec/s][A
Transcribe:  90%|████████▉ | 351.72/392.65 [00:38<00:04,  9.25sec/s][A
Transcribe:  97%|█████████▋| 381.72/392.65 [00:40<00:01, 10.18sec/s][A
Transcribe: 100%|█████████▉| 392.59/392.65 [00:41<00:00,  9.45sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_35.json


 27%|██▋       | 9/33 [1:07:04<2:45:38, 414.10s/it]
                                                   c/s][A
 27%|██▋       | 9/33 [1:07:05<2:45:38, 414.10s/it]    
Transcribe:   0%|          | 0/248.79 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  12%|█▏        | 29.02/248.79 [00:05<00:39,  5.58sec/s][A
Transcribe:  24%|██▎       | 59.02/248.79 [00:09<00:28,  6.76sec/s][A
Transcribe:  36%|███▌      | 89.02/248.79 [00:35<01:17,  2.05sec/s][A
Transcribe:  47%|████▋     | 118.02/248.79 [00:38<00:43,  3.03sec/s][A
Transcribe:  57%|█████▋    | 142.98/248.79 [00:42<00:29,  3.63sec/s][A
Transcribe:  69%|██████▉   | 171.48/248.79 [00:45<00:17,  4.54sec/s][A
Transcribe:  81%|████████  | 201.48/248.79 [00:48<00:08,  5.57sec/s][A
Transcribe:  93%|█████████▎| 231.48/248.79 [00:51<00:02,  6.59sec/s][A
Transcribe: 100%|█████████▉| 248.68/248.79 [00:52<00:00,  4.76sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_36.json



                                                   c/s][A
 27%|██▋       | 9/33 [1:07:57<2:45:38, 414.10s/it]    
Transcribe:   0%|          | 0/367.73 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   8%|▊         | 29.38/367.73 [00:03<00:42,  8.04sec/s][A
Transcribe:  16%|█▌        | 59.38/367.73 [00:05<00:29, 10.38sec/s][A
Transcribe:  24%|██▍       | 89.38/367.73 [00:08<00:24, 11.37sec/s][A
Transcribe:  32%|███▏      | 119.38/367.73 [00:10<00:21, 11.35sec/s][A
Transcribe:  41%|████      | 149.38/367.73 [00:13<00:20, 10.79sec/s][A
Transcribe:  49%|████▉     | 179.38/367.73 [00:16<00:17, 11.01sec/s][A
Transcribe:  57%|█████▋    | 209.38/367.73 [00:19<00:14, 10.93sec/s][A
Transcribe:  65%|██████▌   | 239.38/367.73 [00:21<00:10, 12.27sec/s][A
Transcribe:  68%|██████▊   | 249.34/367.73 [00:22<00:10, 10.91sec/s][A
Transcribe:  75%|███████▍  | 275.68/367.73 [00:24<00:07, 12.93sec/s][A
Transcribe:  83%|████████▎ | 305.68/367.73 [00:26<00:05, 12.25sec/s][A
Transcribe:  91%|█████████▏| 335.68/367.73 [00:29<00:02, 11.16sec/s][A
Transcribe: 100%|█████████▉| 367.7/367.73 [00:32<00:00, 11.42sec/s] [A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_37.json



                                                   c/s][A
 27%|██▋       | 9/33 [1:08:30<2:45:38, 414.10s/it]    
Transcribe:   0%|          | 0/128.68 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  21%|██▏       | 27.62/128.68 [00:04<00:17,  5.93sec/s][A
Transcribe:  45%|████▍     | 57.62/128.68 [00:06<00:07,  8.98sec/s][A
Transcribe:  66%|██████▌   | 84.86/128.68 [00:09<00:04,  8.95sec/s][A
Transcribe:  89%|████████▉ | 114.86/128.68 [00:13<00:01,  8.57sec/s][A
Transcribe: 100%|█████████▉| 128.55/128.68 [00:14<00:00,  8.95sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_38.json



                                                   c/s][A
 27%|██▋       | 9/33 [1:08:44<2:45:38, 414.10s/it]    
Transcribe:   0%|          | 0/782.18 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▎         | 27.38/782.18 [00:04<01:51,  6.74sec/s][A
Transcribe:   7%|▋         | 54.14/782.18 [00:07<01:42,  7.10sec/s][A
Transcribe:  11%|█         | 84.14/782.18 [00:10<01:25,  8.20sec/s][A
Transcribe:  15%|█▍        | 114.14/782.18 [00:13<01:11,  9.30sec/s][A
Transcribe:  18%|█▊        | 144.14/782.18 [00:17<01:14,  8.56sec/s][A
Transcribe:  22%|██▏       | 169.7/782.18 [00:19<01:08,  8.96sec/s] [A
Transcribe:  26%|██▌       | 199.7/782.18 [00:23<01:08,  8.51sec/s][A
Transcribe:  29%|██▉       | 229.7/782.18 [00:27<01:05,  8.44sec/s][A
Transcribe:  33%|███▎      | 259.7/782.18 [00:30<01:01,  8.45sec/s][A
Transcribe:  37%|███▋      | 289.7/782.18 [00:34<01:00,  8.17sec/s][A
Transcribe:  41%|████      | 319.7/782.18 [00:37<00:50,  9.10sec/s][A
Transcribe:  45%|████▍     | 349.7/782.18 [00:39<00:43,  9.97sec/s][A
Transcribe:  49%|████▊     | 379.7/782.18 [00:42<00:41,  9.81sec/s][A
Transcribe:  52%|█████▏    | 409.7/782.18 [00:46<00:39,  9.50sec/s][A
Tr

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_39.json


 30%|███       | 10/33 [1:10:25<2:13:26, 348.13s/it]
                                                    c/s][A
 30%|███       | 10/33 [1:10:25<2:13:26, 348.13s/it]    
Transcribe:   0%|          | 0/1065.72 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 21.92/1065.72 [00:02<01:49,  9.52sec/s][A
Transcribe:   3%|▎         | 36.92/1065.72 [00:03<01:38, 10.43sec/s][A
Transcribe:   6%|▋         | 66.92/1065.72 [00:07<01:54,  8.69sec/s][A
Transcribe:   9%|▊         | 92.92/1065.72 [00:10<01:45,  9.22sec/s][A
Transcribe:  12%|█▏        | 122.92/1065.72 [00:12<01:35,  9.84sec/s][A
Transcribe:  14%|█▍        | 152.92/1065.72 [00:15<01:30, 10.13sec/s][A
Transcribe:  17%|█▋        | 182.92/1065.72 [00:18<01:28,  9.99sec/s][A
Transcribe:  20%|█▉        | 212.92/1065.72 [00:21<01:27,  9.77sec/s][A
Transcribe:  23%|██▎       | 242.92/1065.72 [00:24<01:20, 10.28sec/s][A
Transcribe:  26%|██▌       | 272.92/1065.72 [00:27<01:18, 10.08sec/s][A
Transcribe:  28%|██▊       | 302.92/1065.72 [00:30<01:17,  9.83sec/s][A
Transcribe:  31%|███       | 332.92/1065.72 [00:34<01:15,  9.65sec/s][A
Transcribe:  34%|███▍      | 362.92/1065.72 [00:37<01:13,  9.59sec/s][A
Transcribe:  37%|███▋      | 392.92/1065.72 [00:55<02:

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_40.json



                                                    /s][A
 30%|███       | 10/33 [1:12:32<2:13:26, 348.13s/it]   
Transcribe:   0%|          | 0/170.55 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  17%|█▋        | 29.0/170.55 [00:06<00:30,  4.68sec/s][A
Transcribe:  28%|██▊       | 48.18/170.55 [00:10<00:26,  4.59sec/s][A
Transcribe:  45%|████▍     | 76.28/170.55 [00:14<00:16,  5.59sec/s][A
Transcribe:  62%|██████▏   | 106.28/170.55 [00:19<00:11,  5.84sec/s][A
Transcribe:  80%|███████▉  | 136.28/170.55 [00:38<00:12,  2.83sec/s][A
Transcribe:  97%|█████████▋| 166.28/170.55 [00:41<00:01,  3.76sec/s][A
Transcribe: 100%|█████████▉| 170.52/170.55 [00:43<00:00,  3.96sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_41.json



                                                    /s][A
 30%|███       | 10/33 [1:13:16<2:13:26, 348.13s/it]   
Transcribe:   0%|          | 0/300.41 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   9%|▉         | 27.0/300.41 [00:04<00:49,  5.57sec/s][A
Transcribe:  18%|█▊        | 54.62/300.41 [00:06<00:28,  8.61sec/s][A
Transcribe:  25%|██▌       | 76.04/300.41 [00:08<00:24,  9.31sec/s][A
Transcribe:  35%|███▌      | 106.04/300.41 [00:13<00:23,  8.25sec/s][A
Transcribe:  45%|████▌     | 136.04/300.41 [00:16<00:18,  8.84sec/s][A
Transcribe:  55%|█████▌    | 166.04/300.41 [00:18<00:13,  9.99sec/s][A
Transcribe:  65%|██████▌   | 196.04/300.41 [00:21<00:10,  9.58sec/s][A
Transcribe:  75%|███████▌  | 226.04/300.41 [00:25<00:07,  9.39sec/s][A
Transcribe:  85%|████████▌ | 256.04/300.41 [00:28<00:04,  9.49sec/s][A
Transcribe:  95%|█████████▌| 286.04/300.41 [00:30<00:01, 10.41sec/s][A
Transcribe: 100%|█████████▉| 300.29/300.41 [00:31<00:00,  9.53sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_42.json



                                                    /s][A
 30%|███       | 10/33 [1:13:48<2:13:26, 348.13s/it]   
Transcribe:   0%|          | 0/585.01 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 26.86/585.01 [00:04<01:40,  5.53sec/s][A
Transcribe:   8%|▊         | 45.16/585.01 [00:07<01:21,  6.63sec/s][A
Transcribe:  13%|█▎        | 75.16/585.01 [00:09<01:00,  8.43sec/s][A
Transcribe:  18%|█▊        | 105.16/585.01 [00:12<00:52,  9.07sec/s][A
Transcribe:  23%|██▎       | 135.16/585.01 [00:16<00:50,  8.96sec/s][A
Transcribe:  28%|██▊       | 165.16/585.01 [00:34<02:01,  3.45sec/s][A
Transcribe:  33%|███▎      | 195.16/585.01 [00:37<01:27,  4.47sec/s][A
Transcribe:  38%|███▊      | 225.16/585.01 [00:41<01:10,  5.08sec/s][A
Transcribe:  44%|████▎     | 255.16/585.01 [00:44<00:55,  5.98sec/s][A
Transcribe:  49%|████▊     | 285.16/585.01 [00:49<00:48,  6.24sec/s][A
Transcribe:  54%|█████▍    | 315.16/585.01 [00:53<00:41,  6.51sec/s][A
Transcribe:  59%|█████▉    | 345.16/585.01 [00:56<00:33,  7.10sec/s][A
Transcribe:  64%|██████▍   | 375.16/585.01 [00:59<00:27,  7.72sec/s][A
Transcribe:  69%|██████▉   | 405.16/585.01 [01:04<00:25,  7.10sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_43.json


 33%|███▎      | 11/33 [1:15:27<2:02:30, 334.10s/it]
                                                    /s][A
 33%|███▎      | 11/33 [1:15:28<2:02:30, 334.10s/it]   
Transcribe:   0%|          | 0/603.25 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 28.72/603.25 [00:02<00:59,  9.59sec/s][A
Transcribe:   9%|▉         | 57.1/603.25 [00:06<01:07,  8.08sec/s] [A
Transcribe:  14%|█▍        | 87.1/603.25 [00:09<00:58,  8.81sec/s][A
Transcribe:  19%|█▉        | 117.1/603.25 [00:12<00:50,  9.59sec/s][A
Transcribe:  24%|██▍       | 147.1/603.25 [00:16<00:49,  9.31sec/s][A
Transcribe:  29%|██▉       | 177.1/603.25 [00:19<00:45,  9.33sec/s][A
Transcribe:  34%|███▍      | 207.1/603.25 [00:22<00:42,  9.32sec/s][A
Transcribe:  39%|███▉      | 237.1/603.25 [00:26<00:43,  8.47sec/s][A
Transcribe:  44%|████▍     | 267.1/603.25 [00:30<00:40,  8.35sec/s][A
Transcribe:  49%|████▉     | 297.1/603.25 [00:34<00:36,  8.31sec/s][A
Transcribe:  54%|█████▍    | 327.1/603.25 [00:36<00:29,  9.37sec/s][A
Transcribe:  59%|█████▉    | 357.1/603.25 [00:40<00:28,  8.78sec/s][A
Transcribe:  63%|██████▎   | 381.1/603.25 [00:43<00:27,  8.21sec/s][A
Transcribe:  68%|██████▊   | 411.1/603.25 [00:46<00:22,  8.59sec/s][A
Transc

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_44.json



                                                    /s][A
 33%|███▎      | 11/33 [1:16:35<2:02:30, 334.10s/it]   
Transcribe:   0%|          | 0/569.81 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▌         | 28.56/569.81 [00:02<00:51, 10.41sec/s][A
Transcribe:  10%|▉         | 54.56/569.81 [00:04<00:43, 11.92sec/s][A
Transcribe:  15%|█▍        | 84.56/569.81 [00:07<00:46, 10.50sec/s][A
Transcribe:  20%|██        | 114.56/569.81 [00:08<00:32, 14.12sec/s][A
Transcribe:  25%|██▌       | 144.56/569.81 [00:10<00:27, 15.20sec/s][A
Transcribe:  31%|███       | 174.56/569.81 [00:12<00:27, 14.41sec/s][A
Transcribe:  36%|███▌      | 204.56/569.81 [00:15<00:25, 14.22sec/s][A
Transcribe:  41%|████      | 234.56/569.81 [00:17<00:22, 14.73sec/s][A
Transcribe:  43%|████▎     | 247.74/569.81 [00:18<00:24, 13.35sec/s][A
Transcribe:  46%|████▌     | 262.74/569.81 [00:20<00:25, 12.15sec/s][A
Transcribe:  51%|█████▏    | 292.74/569.81 [00:22<00:24, 11.53sec/s][A
Transcribe:  57%|█████▋    | 322.74/569.81 [00:23<00:15, 16.07sec/s][A
Transcribe:  62%|██████▏   | 352.74/569.81 [00:25<00:13, 15.85sec/s][A
Transcribe:  65%|██████▌   | 371.06/569.81 [00:26<00:12, 15.99sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_45.json



                                                    /s][A
 33%|███▎      | 11/33 [1:17:49<2:02:30, 334.10s/it]   
Transcribe:   0%|          | 0/1198.5 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 29.0/1198.5 [00:04<03:21,  5.80sec/s][A
Transcribe:   4%|▍         | 48.48/1198.5 [00:07<02:50,  6.73sec/s][A
Transcribe:   6%|▌         | 74.48/1198.5 [00:10<02:20,  8.00sec/s][A
Transcribe:   9%|▊         | 104.48/1198.5 [00:12<01:56,  9.38sec/s][A
Transcribe:  11%|█         | 134.48/1198.5 [00:15<01:50,  9.64sec/s][A
Transcribe:  14%|█▎        | 164.48/1198.5 [00:19<01:55,  8.95sec/s][A
Transcribe:  16%|█▌        | 194.48/1198.5 [00:22<01:50,  9.06sec/s][A
Transcribe:  19%|█▊        | 224.48/1198.5 [00:26<01:50,  8.82sec/s][A
Transcribe:  21%|██        | 254.48/1198.5 [00:29<01:48,  8.67sec/s][A
Transcribe:  24%|██▎       | 284.48/1198.5 [00:32<01:38,  9.32sec/s][A
Transcribe:  26%|██▌       | 314.48/1198.5 [00:53<04:13,  3.48sec/s][A
Transcribe:  29%|██▊       | 344.48/1198.5 [01:03<04:24,  3.23sec/s][A
Transcribe:  31%|███       | 368.48/1198.5 [01:06<03:36,  3.84sec/s][A
Transcribe:  33%|███▎      | 398.48/1198.5 [01:08<02:40,  5.00sec/s

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_46.json



                                                    /s][A
 33%|███▎      | 11/33 [1:20:45<2:02:30, 334.10s/it]   
Transcribe:   0%|          | 0/580.34 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 28.58/580.34 [00:05<01:39,  5.52sec/s][A
Transcribe:   9%|▊         | 49.52/580.34 [00:08<01:25,  6.19sec/s][A
Transcribe:  13%|█▎        | 76.8/580.34 [00:10<01:03,  7.89sec/s] [A
Transcribe:  18%|█▊        | 106.8/580.34 [00:13<00:53,  8.77sec/s][A
Transcribe:  24%|██▎       | 136.8/580.34 [00:16<00:49,  8.92sec/s][A
Transcribe:  29%|██▊       | 166.8/580.34 [00:19<00:40, 10.11sec/s][A
Transcribe:  34%|███▍      | 196.8/580.34 [00:22<00:39,  9.60sec/s][A
Transcribe:  39%|███▊      | 224.46/580.34 [00:25<00:37,  9.54sec/s][A
Transcribe:  44%|████▍     | 254.46/580.34 [00:28<00:33,  9.85sec/s][A
Transcribe:  49%|████▉     | 284.46/580.34 [00:31<00:30,  9.75sec/s][A
Transcribe:  54%|█████▍    | 314.46/580.34 [00:34<00:27,  9.75sec/s][A
Transcribe:  59%|█████▉    | 344.46/580.34 [00:38<00:25,  9.32sec/s][A
Transcribe:  65%|██████▍   | 374.46/580.34 [00:41<00:23,  8.92sec/s][A
Transcribe:  69%|██████▉   | 399.44/580.34 [00:44<00:19,  9.08sec/s][

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_47.json


 36%|███▋      | 12/33 [1:21:46<2:01:41, 347.69s/it]
                                                    /s][A
 36%|███▋      | 12/33 [1:21:46<2:01:41, 347.69s/it]   
Transcribe:   0%|          | 0/297.43 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   8%|▊         | 23.14/297.43 [00:02<00:31,  8.76sec/s][A
Transcribe:  18%|█▊        | 53.14/297.43 [00:06<00:30,  7.98sec/s][A
Transcribe:  27%|██▋       | 80.68/297.43 [00:10<00:28,  7.69sec/s][A
Transcribe:  37%|███▋      | 110.68/297.43 [00:14<00:24,  7.59sec/s][A
Transcribe:  47%|████▋     | 140.68/297.43 [00:18<00:20,  7.68sec/s][A
Transcribe:  57%|█████▋    | 170.68/297.43 [00:21<00:15,  8.37sec/s][A
Transcribe:  67%|██████▋   | 200.68/297.43 [00:24<00:11,  8.54sec/s][A
Transcribe:  78%|███████▊  | 230.68/297.43 [00:28<00:08,  8.14sec/s][A
Transcribe:  88%|████████▊ | 260.68/297.43 [00:32<00:04,  8.24sec/s][A
Transcribe: 100%|█████████▉| 297.38/297.43 [00:34<00:00,  8.58sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_48.json



                                                    /s][A
 36%|███▋      | 12/33 [1:22:21<2:01:41, 347.69s/it]   
Transcribe:   0%|          | 0/516.99 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▌         | 27.72/516.99 [00:04<01:22,  5.94sec/s][A
Transcribe:  11%|█         | 57.72/516.99 [00:08<01:08,  6.67sec/s][A
Transcribe:  17%|█▋        | 87.72/516.99 [00:12<00:58,  7.33sec/s][A
Transcribe:  23%|██▎       | 117.72/516.99 [00:17<00:57,  6.93sec/s][A
Transcribe:  29%|██▊       | 147.72/516.99 [00:19<00:46,  8.00sec/s][A
Transcribe:  34%|███▍      | 177.72/516.99 [00:23<00:43,  7.88sec/s][A
Transcribe:  40%|████      | 207.72/516.99 [00:28<00:43,  7.13sec/s][A
Transcribe:  46%|████▌     | 237.72/516.99 [00:55<01:45,  2.65sec/s][A
Transcribe:  50%|█████     | 258.68/516.99 [00:58<01:24,  3.07sec/s][A
Transcribe:  56%|█████▌    | 288.68/516.99 [01:01<00:57,  3.94sec/s][A
Transcribe:  62%|██████▏   | 318.68/516.99 [01:06<00:43,  4.60sec/s][A
Transcribe:  67%|██████▋   | 348.68/516.99 [01:10<00:32,  5.25sec/s][A
Transcribe:  73%|███████▎  | 378.68/516.99 [01:13<00:23,  5.99sec/s][A
Transcribe:  79%|███████▉  | 408.68/516.99 [01:17<00:16,  6.62sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_49.json



                                                    /s][A
 36%|███▋      | 12/33 [1:23:52<2:01:41, 347.69s/it]   
Transcribe:   0%|          | 0/406.83 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   6%|▌         | 23.34/406.83 [00:01<00:28, 13.39sec/s][A
Transcribe:  12%|█▏        | 47.34/406.83 [00:04<00:39,  9.04sec/s][A
Transcribe:  19%|█▉        | 77.34/406.83 [00:08<00:37,  8.82sec/s][A
Transcribe:  26%|██▋       | 107.34/406.83 [00:11<00:31,  9.50sec/s][A
Transcribe:  34%|███▍      | 137.34/406.83 [00:14<00:27,  9.64sec/s][A
Transcribe:  39%|███▉      | 160.22/406.83 [00:17<00:29,  8.44sec/s][A
Transcribe:  47%|████▋     | 190.22/406.83 [00:20<00:23,  9.40sec/s][A
Transcribe:  54%|█████▍    | 220.22/406.83 [00:23<00:19,  9.56sec/s][A
Transcribe:  62%|██████▏   | 250.22/406.83 [00:25<00:15, 10.22sec/s][A
Transcribe:  69%|██████▉   | 280.22/406.83 [00:29<00:12,  9.76sec/s][A
Transcribe:  76%|███████▋  | 310.22/406.83 [00:32<00:09,  9.97sec/s][A
Transcribe:  84%|████████▎ | 340.22/406.83 [00:34<00:06, 10.93sec/s][A
Transcribe:  91%|█████████ | 370.22/406.83 [00:37<00:03, 10.05sec/s][A
Transcribe:  98%|█████████▊| 400.22/406.83 [00:41<00:00,  9.75sec/

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_50.json



                                                    /s][A
 36%|███▋      | 12/33 [1:24:35<2:01:41, 347.69s/it]   
Transcribe:   0%|          | 0/843.47 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▎         | 30.0/843.47 [00:03<01:24,  9.66sec/s][A
Transcribe:   7%|▋         | 60.0/843.47 [00:07<01:40,  7.79sec/s][A
Transcribe:  11%|█         | 90.0/843.47 [00:11<01:37,  7.73sec/s][A
Transcribe:  14%|█▍        | 120.0/843.47 [00:15<01:31,  7.93sec/s][A
Transcribe:  18%|█▊        | 150.0/843.47 [00:19<01:30,  7.69sec/s][A
Transcribe:  21%|██▏       | 180.0/843.47 [00:22<01:25,  7.74sec/s][A
Transcribe:  25%|██▍       | 210.0/843.47 [00:26<01:18,  8.02sec/s][A
Transcribe:  28%|██▊       | 240.0/843.47 [00:29<01:10,  8.58sec/s][A
Transcribe:  32%|███▏      | 270.0/843.47 [00:32<01:02,  9.12sec/s][A
Transcribe:  36%|███▌      | 300.0/843.47 [00:36<01:04,  8.47sec/s][A
Transcribe:  39%|███▉      | 330.0/843.47 [00:40<01:01,  8.29sec/s][A
Transcribe:  43%|████▎     | 360.0/843.47 [00:43<00:57,  8.43sec/s][A
Transcribe:  46%|████▌     | 390.0/843.47 [00:46<00:52,  8.56sec/s][A
Transcribe:  50%|████▉     | 420.0/843.47 [00:51<00:53,  7.90sec/s][A
Transcri

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_51.json


 39%|███▉      | 13/33 [1:26:20<1:48:26, 325.34s/it]
                                                    /s][A
 39%|███▉      | 13/33 [1:26:20<1:48:26, 325.34s/it]   
Transcribe:   0%|          | 0/315.45 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   8%|▊         | 25.44/315.45 [00:02<00:34,  8.49sec/s][A
Transcribe:  18%|█▊        | 55.44/315.45 [00:06<00:29,  8.95sec/s][A
Transcribe:  25%|██▌       | 79.02/315.45 [00:08<00:25,  9.10sec/s][A
Transcribe:  35%|███▍      | 109.02/315.45 [00:11<00:21,  9.71sec/s][A
Transcribe:  44%|████▍     | 139.02/315.45 [00:15<00:19,  9.01sec/s][A
Transcribe:  54%|█████▎    | 169.02/315.45 [00:17<00:14, 10.41sec/s][A
Transcribe:  62%|██████▏   | 194.34/315.45 [00:20<00:12,  9.84sec/s][A
Transcribe:  69%|██████▉   | 218.68/315.45 [00:23<00:10,  9.51sec/s][A
Transcribe:  79%|███████▉  | 248.68/315.45 [00:26<00:06,  9.57sec/s][A
Transcribe:  88%|████████▊ | 278.68/315.45 [00:29<00:03,  9.78sec/s][A
Transcribe:  98%|█████████▊| 308.68/315.45 [00:32<00:00,  9.38sec/s][A
Transcribe: 100%|█████████▉| 315.33/315.45 [00:33<00:00,  9.48sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_52.json



                                                    /s][A
 39%|███▉      | 13/33 [1:26:54<1:48:26, 325.34s/it]   
Transcribe:   0%|          | 0/585.22 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 28.0/585.22 [00:05<01:52,  4.96sec/s][A
Transcribe:  10%|▉         | 58.0/585.22 [00:10<01:35,  5.53sec/s][A
Transcribe:  15%|█▌        | 88.0/585.22 [00:13<01:11,  6.91sec/s][A
Transcribe:  20%|█▉        | 116.7/585.22 [00:19<01:15,  6.21sec/s][A
Transcribe:  25%|██▌       | 146.7/585.22 [00:23<01:07,  6.50sec/s][A
Transcribe:  30%|███       | 176.7/585.22 [00:26<00:57,  7.07sec/s][A
Transcribe:  35%|███▌      | 206.7/585.22 [00:32<00:59,  6.32sec/s][A
Transcribe:  40%|████      | 236.7/585.22 [00:37<00:56,  6.16sec/s][A
Transcribe:  46%|████▌     | 266.7/585.22 [00:43<00:53,  5.90sec/s][A
Transcribe:  51%|█████     | 296.7/585.22 [00:47<00:47,  6.07sec/s][A
Transcribe:  56%|█████▌    | 326.7/585.22 [00:53<00:43,  6.00sec/s][A
Transcribe:  61%|██████    | 356.7/585.22 [00:57<00:37,  6.12sec/s][A
Transcribe:  66%|██████▌   | 386.7/585.22 [01:02<00:32,  6.15sec/s][A
Transcribe:  71%|███████   | 416.7/585.22 [01:07<00:27,  6.08sec/s][A
Transcri

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_53.json



                                                    /s][A
 39%|███▉      | 13/33 [1:28:28<1:48:26, 325.34s/it]   
Transcribe:   0%|          | 0/641.97 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 30.0/641.97 [00:02<00:57, 10.74sec/s][A
Transcribe:   8%|▊         | 54.04/641.97 [00:04<00:53, 11.04sec/s][A
Transcribe:  12%|█▏        | 75.52/641.97 [00:06<00:51, 11.04sec/s][A
Transcribe:  16%|█▋        | 105.52/641.97 [00:08<00:42, 12.62sec/s][A
Transcribe:  21%|██        | 135.52/641.97 [00:11<00:43, 11.60sec/s][A
Transcribe:  26%|██▌       | 165.52/641.97 [00:14<00:40, 11.75sec/s][A
Transcribe:  29%|██▉       | 189.22/641.97 [00:16<00:40, 11.27sec/s][A
Transcribe:  34%|███▍      | 217.32/641.97 [00:19<00:37, 11.29sec/s][A
Transcribe:  39%|███▊      | 247.32/641.97 [00:21<00:36, 10.92sec/s][A
Transcribe:  43%|████▎     | 277.32/641.97 [00:25<00:35, 10.30sec/s][A
Transcribe:  48%|████▊     | 307.32/641.97 [00:27<00:31, 10.52sec/s][A
Transcribe:  53%|█████▎    | 337.32/641.97 [00:29<00:25, 11.80sec/s][A
Transcribe:  57%|█████▋    | 367.32/641.97 [00:32<00:24, 11.12sec/s][A
Transcribe:  62%|██████▏   | 397.32/641.97 [00:36<00:24, 10.17sec/s

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_54.json



                                                    /s][A
 39%|███▉      | 13/33 [1:29:29<1:48:26, 325.34s/it]   
Transcribe:   0%|          | 0/921.07 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   2%|▏         | 22.0/921.07 [00:02<01:59,  7.50sec/s][A
Transcribe:   6%|▌         | 52.0/921.07 [00:05<01:20, 10.79sec/s][A
Transcribe:   7%|▋         | 67.18/921.07 [00:07<01:32,  9.24sec/s][A
Transcribe:  11%|█         | 97.18/921.07 [00:07<00:53, 15.40sec/s][A
Transcribe:  14%|█▍        | 127.18/921.07 [00:09<00:52, 15.08sec/s][A
Transcribe:  17%|█▋        | 157.18/921.07 [00:13<01:04, 11.83sec/s][A
Transcribe:  20%|██        | 187.18/921.07 [00:17<01:15,  9.78sec/s][A
Transcribe:  24%|██▎       | 217.18/921.07 [00:20<01:09, 10.08sec/s][A
Transcribe:  27%|██▋       | 247.18/921.07 [00:24<01:16,  8.78sec/s][A
Transcribe:  27%|██▋       | 247.18/921.07 [00:41<01:16,  8.78sec/s][A
Transcribe:  30%|███       | 277.18/921.07 [00:52<03:56,  2.72sec/s][A
Transcribe:  33%|███▎      | 306.18/921.07 [00:57<03:12,  3.19sec/s][A
Transcribe:  36%|███▋      | 336.18/921.07 [01:03<02:39,  3.67sec/s][A
Transcribe:  40%|███▉      | 366.18/921.07 [01:09<02:21,  3.93sec/s]

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_55.json


 42%|████▏     | 14/33 [1:32:04<1:44:49, 331.03s/it]
                                                    /s][A
 42%|████▏     | 14/33 [1:32:04<1:44:49, 331.03s/it]   
Transcribe:   0%|          | 0/718.32 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▍         | 27.74/718.32 [00:03<01:33,  7.38sec/s][A
Transcribe:   8%|▊         | 55.6/718.32 [00:07<01:32,  7.17sec/s] [A
Transcribe:  12%|█▏        | 85.6/718.32 [00:11<01:21,  7.81sec/s][A
Transcribe:  16%|█▌        | 115.6/718.32 [00:15<01:17,  7.76sec/s][A
Transcribe:  20%|██        | 145.6/718.32 [00:19<01:14,  7.69sec/s][A
Transcribe:  24%|██▍       | 175.6/718.32 [00:22<01:09,  7.78sec/s][A
Transcribe:  29%|██▊       | 205.6/718.32 [00:27<01:12,  7.11sec/s][A
Transcribe:  33%|███▎      | 235.6/718.32 [00:31<01:05,  7.38sec/s][A
Transcribe:  37%|███▋      | 265.6/718.32 [00:34<00:58,  7.73sec/s][A
Transcribe:  41%|████      | 295.6/718.32 [00:38<00:52,  8.00sec/s][A
Transcribe:  45%|████▌     | 325.6/718.32 [00:42<00:48,  8.01sec/s][A
Transcribe:  50%|████▉     | 355.6/718.32 [00:46<00:46,  7.85sec/s][A
Transcribe:  54%|█████▎    | 385.6/718.32 [00:50<00:42,  7.83sec/s][A
Transcribe:  58%|█████▊    | 415.6/718.32 [00:54<00:39,  7.60sec/s][A
Transc

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_56.json



                                                    /s][A
 42%|████▏     | 14/33 [1:33:34<1:44:49, 331.03s/it]   
Transcribe:   0%|          | 0/566.02 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 18.5/566.02 [00:01<00:45, 11.94sec/s][A
Transcribe:   7%|▋         | 39.54/566.02 [00:03<00:45, 11.65sec/s][A
Transcribe:  12%|█▏        | 68.1/566.02 [00:06<00:46, 10.65sec/s] [A
Transcribe:  17%|█▋        | 98.1/566.02 [00:08<00:39, 11.70sec/s][A
Transcribe:  23%|██▎       | 128.1/566.02 [00:11<00:38, 11.35sec/s][A
Transcribe:  27%|██▋       | 155.5/566.02 [00:13<00:36, 11.22sec/s][A
Transcribe:  33%|███▎      | 185.5/566.02 [00:16<00:35, 10.79sec/s][A
Transcribe:  38%|███▊      | 215.5/566.02 [00:17<00:25, 13.73sec/s][A
Transcribe:  43%|████▎     | 245.5/566.02 [00:20<00:25, 12.67sec/s][A
Transcribe:  49%|████▊     | 275.5/566.02 [00:23<00:24, 11.90sec/s][A
Transcribe:  54%|█████▍    | 305.5/566.02 [00:25<00:22, 11.75sec/s][A
Transcribe:  59%|█████▉    | 335.5/566.02 [00:28<00:19, 11.74sec/s][A
Transcribe:  65%|██████▍   | 365.5/566.02 [00:31<00:17, 11.74sec/s][A
Transcribe:  70%|██████▉   | 395.5/566.02 [00:33<00:14, 11.89sec/s][A
Transcr

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_57.json



                                                    c/s][A
 42%|████▏     | 14/33 [1:34:28<1:44:49, 331.03s/it]    
Transcribe:   0%|          | 0/1843.61 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   1%|▏         | 27.0/1843.61 [00:04<04:41,  6.46sec/s][A
Transcribe:   3%|▎         | 54.82/1843.61 [00:07<04:06,  7.27sec/s][A
Transcribe:   5%|▍         | 83.48/1843.61 [00:12<04:12,  6.96sec/s][A
Transcribe:   6%|▌         | 113.48/1843.61 [00:18<05:05,  5.66sec/s][A
Transcribe:   8%|▊         | 142.08/1843.61 [00:24<05:10,  5.48sec/s][A
Transcribe:   9%|▉         | 171.68/1843.61 [00:30<05:26,  5.12sec/s][A
Transcribe:  11%|█         | 201.68/1843.61 [00:35<04:57,  5.52sec/s][A
Transcribe:  13%|█▎        | 231.68/1843.61 [00:40<04:42,  5.71sec/s][A
Transcribe:  14%|█▍        | 261.68/1843.61 [00:46<04:47,  5.50sec/s][A
Transcribe:  16%|█▌        | 291.68/1843.61 [00:52<04:52,  5.31sec/s][A
Transcribe:  17%|█▋        | 321.68/1843.61 [00:59<05:08,  4.93sec/s][A
Transcribe:  19%|█▉        | 351.68/1843.61 [01:05<04:58,  5.00sec/s][A
Transcribe:  21%|██        | 381.68/1843.61 [01:10<04:48,  5.06sec/s][A
Transcribe:  22%|██▏       | 411.68/1843.61 [01:15<04:

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_58.json



                                                    /s][A
 42%|████▏     | 14/33 [1:40:51<1:44:49, 331.03s/it]   
Transcribe:   0%|          | 0/214.49 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:  14%|█▎        | 29.26/214.49 [00:03<00:19,  9.59sec/s][A
Transcribe:  23%|██▎       | 50.18/214.49 [00:04<00:15, 10.48sec/s][A
Transcribe:  34%|███▍      | 73.5/214.49 [00:07<00:15,  9.35sec/s] [A
Transcribe:  48%|████▊     | 103.5/214.49 [00:10<00:11,  9.69sec/s][A
Transcribe:  62%|██████▏   | 133.5/214.49 [00:13<00:08,  9.88sec/s][A
Transcribe:  76%|███████▌  | 163.5/214.49 [00:17<00:05,  9.09sec/s][A
Transcribe:  90%|█████████ | 193.5/214.49 [00:20<00:02,  9.47sec/s][A
Transcribe: 100%|█████████▉| 214.42/214.49 [00:22<00:00,  9.56sec/s][A


Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_59.json


 45%|████▌     | 15/33 [1:41:13<1:59:02, 396.80s/it]
                                                    /s][A
 45%|████▌     | 15/33 [1:41:14<1:59:02, 396.80s/it]   
Transcribe:   0%|          | 0/597.37 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   5%|▍         | 27.0/597.37 [00:03<01:16,  7.47sec/s][A
Transcribe:  10%|▉         | 57.0/597.37 [00:08<01:23,  6.46sec/s][A
Transcribe:  14%|█▍        | 83.0/597.37 [00:12<01:17,  6.64sec/s][A
Transcribe:  19%|█▉        | 113.0/597.37 [00:15<01:00,  7.97sec/s][A
Transcribe:  24%|██▍       | 143.0/597.37 [00:16<00:45,  9.94sec/s][A
Transcribe:  29%|██▉       | 173.0/597.37 [00:19<00:42,  9.95sec/s][A
Transcribe:  34%|███▍      | 203.0/597.37 [00:23<00:40,  9.78sec/s][A
Transcribe:  39%|███▉      | 233.0/597.37 [00:25<00:33, 11.01sec/s][A
Transcribe:  44%|████▍     | 263.0/597.37 [00:27<00:30, 11.05sec/s][A
Transcribe:  49%|████▉     | 293.0/597.37 [00:30<00:27, 10.89sec/s][A
Transcribe:  54%|█████▍    | 323.0/597.37 [00:34<00:27,  9.81sec/s][A
Transcribe:  59%|█████▉    | 353.0/597.37 [00:36<00:22, 10.70sec/s][A
Transcribe:  64%|██████▍   | 383.0/597.37 [00:40<00:21,  9.87sec/s][A
Transcribe:  69%|██████▉   | 413.0/597.37 [00:42<00:17, 10.44sec/s][A
Transcri

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_60.json



                                                    /s][A
 45%|████▌     | 15/33 [1:42:14<1:59:02, 396.80s/it]   
Transcribe:   0%|          | 0/799.72 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   4%|▎         | 29.3/799.72 [00:04<01:51,  6.90sec/s][A
Transcribe:   7%|▋         | 57.72/799.72 [00:08<01:42,  7.21sec/s][A
Transcribe:  11%|█         | 87.72/799.72 [00:11<01:28,  8.09sec/s][A
Transcribe:  14%|█▍        | 113.14/799.72 [00:14<01:22,  8.37sec/s][A
Transcribe:  18%|█▊        | 143.14/799.72 [00:16<01:11,  9.12sec/s][A
Transcribe:  22%|██▏       | 173.14/799.72 [00:20<01:10,  8.86sec/s][A
Transcribe:  25%|██▌       | 203.14/799.72 [00:24<01:11,  8.35sec/s][A
Transcribe:  29%|██▉       | 233.14/799.72 [00:29<01:13,  7.70sec/s][A
Transcribe:  33%|███▎      | 263.14/799.72 [00:33<01:10,  7.62sec/s][A
Transcribe:  37%|███▋      | 293.14/799.72 [00:36<01:01,  8.22sec/s][A
Transcribe:  40%|████      | 323.14/799.72 [00:39<00:58,  8.10sec/s][A
Transcribe:  44%|████▍     | 353.14/799.72 [00:42<00:51,  8.72sec/s][A
Transcribe:  48%|████▊     | 383.14/799.72 [00:46<00:50,  8.30sec/s][A
Transcribe:  52%|█████▏    | 413.14/799.72 [00:50<00:47,  8.18sec/s

Saved: /home/arveloic/MASCOT/stable_ts_json/stable_ts_audio_61.json



                                                    /s][A
 45%|████▌     | 15/33 [1:43:59<1:59:02, 396.80s/it]   
Transcribe:   0%|          | 0/819.89 [00:00<?, ?sec/s][A

Detected language: english



Transcribe:   3%|▎         | 23.0/819.89 [00:03<02:08,  6.20sec/s][A
Transcribe:   6%|▌         | 51.0/819.89 [00:06<01:38,  7.80sec/s][A
Transcribe:  10%|▉         | 81.0/819.89 [00:11<01:47,  6.87sec/s][A
Transcribe:  14%|█▎        | 111.0/819.89 [00:15<01:35,  7.39sec/s][A
Transcribe:  17%|█▋        | 141.0/819.89 [00:18<01:25,  7.93sec/s][A
Transcribe:  21%|██        | 169.0/819.89 [00:23<01:30,  7.17sec/s][A
Transcribe:  24%|██▍       | 198.0/819.89 [00:27<01:29,  6.92sec/s][A
Transcribe:  28%|██▊       | 228.0/819.89 [00:31<01:21,  7.23sec/s][A
Transcribe:  31%|███▏      | 257.0/819.89 [00:35<01:17,  7.26sec/s][A
Transcribe:  35%|███▌      | 287.0/819.89 [00:39<01:10,  7.51sec/s][A
Transcribe:  39%|███▊      | 316.0/819.89 [00:43<01:08,  7.36sec/s][A
Transcribe:  42%|████▏     | 345.0/819.89 [00:47<01:07,  6.99sec/s][A
Transcribe:  46%|████▌     | 374.0/819.89 [00:52<01:04,  6.92sec/s][A
Transcribe:  49%|████▉     | 404.0/819.89 [01:04<01:35,  4.37sec/s][A
Transcri

In [7]:
audio_folder = "Audio_Files/"
audio_files = [os.path.join(audio_folder, f) for f in os.listdir(audio_folder) if f.endswith(('.mp3', '.wav'))]

In [8]:
audio_files[0]

'Audio_Files/008-7.mp3'

In [13]:
# Directory where your JSON files are saved
json_folder = 'stable_ts_json'

# List to hold all the data
data = []

# Loop through all JSON files in the directory
for i, filename in enumerate(os.listdir(json_folder)):
    if filename.endswith('.json'):
        clip_name = audio_files[i].split("/")[1].split(".")[0]
        file_path = os.path.join(json_folder, filename)
        
        # Open and load the JSON file
        with open(file_path, 'r') as f:
            result = json.load(f)
            
            # Check if 'segments' are in the JSON
            if 'segments' in result:
                for segment in result['segments']:
                    # Extract the required fields
                    text = segment.get('text', '')
                    start = segment.get('start', None)
                    end = segment.get('end', None)
                    
                    # Append to the data list
                    data.append({
                        'clip_name': clip_name,
                        'text': text,
                        'start': start,
                        'end': end
                    })

In [14]:
# Create a DataFrame from the data list
df = pd.DataFrame(data)

In [15]:
df.head()

Unnamed: 0,clip_name,text,start,end
0,008-7,Good morning.,2.36,2.76
1,008-7,"Today is Tuesday,",2.82,4.7
2,008-7,April 8th.,5.08,6.64
3,008-7,"Okay, you guys are doing a wonderful job sitt...",10.55,17.86
4,008-7,But we know Terry is going to be himself toge...,18.16,21.06


In [17]:
df["clip_name"].nunique()

131

In [19]:
df.to_csv("stable_ts_utterances.csv", index =  False)

## Define Whisper Pipeline

In [12]:
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=BATCH_SIZE,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "english"},
)

def iterate_data(dataset):
    for i, item in enumerate(dataset):
        yield item["audio"]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# 8 works fine, but larger batches could be tested 
BATCH_SIZE = 8

In [14]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=BATCH_SIZE,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "english"},
)

In [15]:
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

In [16]:
def iterate_data(dataset):
    for i, item in enumerate(dataset):
        yield item["audio"]

In [21]:
ts_predictions = []

In [23]:
audio_dataset[-1]

{'audio': {'path': 'Audio_Files/008-12.mp3',
  'array': array([0.00228008, 0.00481903, 0.00688817, ..., 0.        , 0.        ,
         0.        ]),
  'sampling_rate': 44100}}

In [24]:
len(ts_predictions)

0

In [None]:
# run streamed inference
for out in pipe(iterate_data(audio_dataset), 
                batch_size=BATCH_SIZE, 
                return_timestamps = True, 
               ):
    ts_predictions.append(out["chunks"])
    gc.collect()
    torch.cuda.empty_cache()

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the m

In [39]:
len(ts_predictions)

131

In [None]:
import pickle

In [40]:
with open('whisper_largev3_transcriptions_with_timestamps.pickle', 'wb') as wlv3_transcriptions_ts:
    pickle.dump(ts_predictions, wlv3_transcriptions_ts, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# with open('no_student_whisper_largev3_transcriptions_with_timestamps.pickle', 'wb') as wlv3_transcriptions_ts:
#     pickle.dump(ts_predictions, wlv3_transcriptions_ts, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
len(ts_predictions)

131

## Converting Transcription Output to Dataframes

In [43]:
order_of_clips = df["clip"].tolist()

In [44]:
whisper_transcription_dfs = {}
whisper_speaker_segs = {}

for transcription, clip_name in zip(ts_predictions, order_of_clips):
    whisper_transcription_df = pd.DataFrame.from_dict(transcription)
    whisper_transcription_df["start"] = whisper_transcription_df["timestamp"].apply(lambda x: x[0])
    whisper_transcription_df["end"] = whisper_transcription_df["timestamp"].apply(lambda x: x[1])
    whisper_transcription_df = whisper_transcription_df.drop("timestamp", axis=1)

    whisper_speaker_seg = whisper_transcription_df.copy()
    whisper_speaker_seg["text"] = "Teacher"
    whisper_speaker_seg = whisper_speaker_seg[["start", "end", "text"]]
    
    whisper_transcription_dfs[clip_name] = whisper_transcription_df
    whisper_speaker_segs[clip_name] = whisper_speaker_seg

In [45]:
audio_dataset[0]["audio"]

{'path': 'Audio_Files/131-9.mp3',
 'array': array([ 1.08676904e-05, -7.46108009e-04, -1.82057801e-03, ...,
         8.41914911e-08,  2.24942141e-07,  4.38241244e-07]),
 'sampling_rate': 11025}

In [46]:
from typing import Dict, List, Any

def process_transcriptions(
    whisper_transcription_dfs: Dict[str, pd.DataFrame],
    audio_dataset: List[Dict[str, Any]],
    whisper_speaker_segs: Dict[str, Any]
) -> tuple[Dict[str, pd.DataFrame], Dict[str, Any]]:
    
    # Create a dictionary for quick audio file lookup
    audio_dict = {audio["audio"]['path'].split('/')[-1].split('.')[0]: audio for audio in audio_dataset}

    for key, df in whisper_transcription_dfs.items():
        audio_file = audio_dict.get(key)
        if not audio_file:
            print(f"Warning: No matching audio file found for key {key}")
            continue

        audio_length = len(audio_file["audio"]['array']) / audio_file["audio"]['sampling_rate']
        
        # Check and update the last row
        last_index = df.index[-1]
        last_end = df.at[last_index, 'end']

        if pd.isna(last_end) or last_end > audio_length:
            df.at[last_index, 'end'] = audio_length
            whisper_speaker_segs[key][last_index, 'end'] = audio_length

    return whisper_transcription_dfs, whisper_speaker_segs

In [48]:
processed_transcription_dfs, processed_whisper_speaker_segs = process_transcriptions(whisper_transcription_dfs, audio_dataset, whisper_speaker_segs)

In [51]:
with open('whisper_largev3_transcriptions_dict.pickle', 'wb') as wm_transcriptions_dict:
    pickle.dump(processed_transcription_dfs, wm_transcriptions_dict, protocol=pickle.HIGHEST_PROTOCOL)

with open('whisper_largev3_speaker_segs_dict.pickle', 'wb') as wm_ss_dict:
    pickle.dump(processed_whisper_speaker_segs , wm_ss_dict, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('no_student_whisper_largev3_transcriptions_dict.pickle', 'wb') as wm_transcriptions_dict:
#     pickle.dump(whisper_transcription_dfs, wm_transcriptions_dict, protocol=pickle.HIGHEST_PROTOCOL)

# with open('no_student_whisper_largev3_speaker_segs_dict.pickle', 'wb') as wm_ss_dict:
#     pickle.dump(whisper_speaker_segs, wm_ss_dict, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('whisper_medium_transcriptions.pickle', 'wb') as wm_transcriptions:
#     pickle.dump(predictions, wm_transcriptions, protocol=pickle.HIGHEST_PROTOCOL)