# Part 1: Automatic Speech Recognition, Diarize and Label

Environment = "whisperx"

* Performance Benchmarks on local
* GPU Benchmark: 0.09961056709289551 seconds
* Memory Bandwidth Benchmark: 0.2920224666595459 seconds
* CPU Benchmark: 13.046526432037354 seconds
* Disk Write Benchmark: 2.3364615440368652 seconds
* Disk Read Benchmark: 0.05882525444030762 seconds
  
**Note:** All benchmarks are much faster than Google Colab, with the exception of disk write.

## Setup ⚙️
Tested for PyTorch 2.0, Python 3.10 (use other versions at your own risk!)
GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be installed on the system. Please refer to the CTranslate2 documentation.

1.  Create Python3.10 environment

`conda create --name whisperx python=3.10`

`conda activate whisperx`

2. Install PyTorch, e.g. for Linux and Windows CUDA11.8:
   
`conda install pytorch==2.0.0 torchaudio==2.0.0 pytorch-cuda=11.8 -c pytorch -c nvidia`

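See other installation methods in the [PyTorch previous-versions guide](https://pytorch.org/get-started/previous-versions/).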

3. Install this repo

`pip install git+https://github.com/m-bain/whisperx.git`

If already installed, update package to most recent commit

`pip install git+https://github.com/m-bain/whisperx.git --upgrade`

## Post Setup - REQUIRED for DIARIZATION
https://github.com/m-bain/whisperX/issues/499

`pip install pyannote.audio==3.0.1`

`pip uninstall onnxruntime`

`pip install --force-reinstall onnxruntime-gpu`

In [None]:
import ffmpeg

## 1 - Convert Mp3 to WAV.

def convert_m4a_to_mp3(input_file, output_file):
    """Convert one audio file to another format via ffmpeg-python.

    Despite its name, nothing in this helper is specific to m4a or mp3: both
    paths are handed to ffmpeg unchanged, and this notebook uses it to turn an
    MP3 into a WAV.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the file to write; an existing file is
            overwritten (``overwrite_output=True``).

    Returns:
        None. Success or failure is reported on stdout only; an
        ``ffmpeg.Error`` is caught and printed rather than re-raised.
    """
    # Build the ffmpeg pipeline step by step, then execute it.
    source = ffmpeg.input(input_file)
    job = source.output(output_file)
    try:
        job.run(overwrite_output=True)
    except ffmpeg.Error as e:
        print("An error occurred:", e)
    else:
        print(f"Successfully converted {input_file} to {output_file}")

# Source and destination paths for the conversion step
input_mp3 = './audio/Botswana_2007_Audio.mp3'  # Change this to your mp3 file path
output_wav = './data/Botswana_2007_Audio.wav'  # Change this to your desired output wav file path

# Despite its name, the helper simply forwards both paths to ffmpeg,
# so here it performs the MP3 -> WAV conversion.
convert_m4a_to_mp3(input_file=input_mp3, output_file=output_wav)

In [None]:
import whisperx
import gc
import os
import torch

# Prefer the GPU but fall back to the CPU, using the same selection rule as
# the batch-processing cell further down, so this cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./data/Intro.wav"

batch_size = 16  # reduce if low on GPU mem
# On GPU, change to "int8" if low on GPU mem (may reduce accuracy).
compute_type = "float16" if device == "cuda" else "float32"
# A real boolean: the previous string 'True' only worked by being truthy.
without_timestamps = True

## Verify that the converted WAV file is present before the heavy steps run.
## A missing file is reported, not raised, so the notebook keeps going.
if os.path.isfile(audio_file):
    print(f"Successfully accessed the audio file: {audio_file}")
else:
    print(f"The file '{audio_file}' does not exist.")

  if ismodule(module) and hasattr(module, '__file__'):
  from speechbrain.pretrained import (


Successfully accessed the audio file: ./data/Botswana_2007_Audio.wav


## Load the Audio File

In [None]:
import whisperx
import gc
import os
import torch

# NOTE(review): this cell repeats the configuration of the preceding code
# cell; consider keeping a single copy.

# Prefer the GPU but fall back to the CPU, using the same selection rule as
# the batch-processing cell further down, so this cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Full file should be the input (2007 or 2024 file..)
audio_file = "./data/Botswana_2007_Audio.wav"

## DEBUGGING, use a small file
# audio_file = "./data/Intro.wav"

batch_size = 16  # reduce if low on GPU mem
# On GPU, change to "int8" if low on GPU mem (may reduce accuracy).
compute_type = "float16" if device == "cuda" else "float32"
# A real boolean: the previous string 'True' only worked by being truthy.
without_timestamps = True

## Verify that the converted WAV file is present before the heavy steps run.
## A missing file is reported, not raised, so the notebook keeps going.
if os.path.isfile(audio_file):
    print(f"Successfully accessed the audio file: {audio_file}")
else:
    print(f"The file '{audio_file}' does not exist.")

  if ismodule(module) and hasattr(module, '__file__'):
  from speechbrain.pretrained import (


Successfully accessed the audio file: ./data/Botswana_2007_Audio.wav


## Transcription in Batches

### Split the Audio file into smaller pieces


In [None]:
## 3 - Split up large files in <10min


import sqlite3
import librosa
import soundfile as sf
import math


# TODO: change max duration to 300 seconds
# TODO: update the target database folder
# TODO: check input filenames
# TODO: Update the output folder

# Function to split audio and save to database
def split_audio(audio_file, max_duration=300, db_path='./data/Audio_clips.db',
                clip_template="./data/Botswana2007_clip_{index}.wav"):
    """Split an audio file into consecutive clips and log each clip in SQLite.

    The audio is cut into back-to-back pieces of at most ``max_duration``
    seconds. Each piece is written as its own WAV file and recorded as a row
    (start_time, end_time, filename) in the ``clips`` table, which is created
    on first use.

    Args:
        audio_file: Path of the audio file to split.
        max_duration: Maximum clip length in seconds (300 for production;
            use e.g. 60 for quick tests). Must be greater than zero.
        db_path: SQLite database file that receives the clip metadata.
        clip_template: ``str.format`` template for clip paths; ``{index}``
            is replaced by the zero-based clip number.

    Returns:
        A list of dicts with keys ``"start_time"``, ``"end_time"`` and
        ``"filename"``, one per clip that was written successfully. The list
        is empty when the audio file cannot be loaded.

    Raises:
        ValueError: If ``max_duration`` is not positive.
    """
    if max_duration <= 0:
        raise ValueError(f"max_duration must be positive, got {max_duration}")

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS clips
                         (id INTEGER PRIMARY KEY AUTOINCREMENT, start_time REAL, end_time REAL, filename TEXT)''')

        try:
            # NOTE: librosa.load is called with its defaults, which resample
            # to 22,050 Hz mono; pass sr=None there if the native sample rate
            # must be preserved.
            y, sr = librosa.load(audio_file)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return []

        total_duration = librosa.get_duration(y=y, sr=sr)
        num_splits = math.ceil(total_duration / max_duration)
        results = []

        for i in range(num_splits):
            start_time = i * max_duration
            # The final clip is usually shorter than max_duration.
            end_time = min((i + 1) * max_duration, total_duration)

            start_sample = int(start_time * sr)
            end_sample = int(end_time * sr)

            clip = y[start_sample:end_sample]
            filename = clip_template.format(index=i)

            try:
                sf.write(filename, clip, sr)
                cursor.execute("INSERT INTO clips (start_time, end_time, filename) VALUES (?, ?, ?)",
                               (start_time, end_time, filename))
                # Commit per clip so earlier clips stay recorded if a later one fails.
                conn.commit()
                results.append({"start_time": start_time, "end_time": end_time, "filename": filename})
            except Exception as e:
                # Best effort: report the failed clip and carry on with the rest.
                print(f"Error processing clip {i}: {e}")

        return results
    finally:
        # Close the connection on every exit path, including the early
        # return when the audio file cannot be loaded.
        conn.close()
# `results` is a list of dicts, one per clip, with keys "start_time", "end_time" and "filename"
results = split_audio(audio_file)

# BATCH PROCESS: Transcribe - Align - Diarize

### Approach 1 - use python

In [None]:
import os
import glob
import json
import gc
import torch
import whisperx
from HF_token import TOKEN_ID

# Directory containing .wav files
wav_directory = './data/Testing'

# Get a list of all .wav files in the directory
wav_files = glob.glob(os.path.join(wav_directory, '*.wav'))

# Collects the aligned result of every file that reaches the alignment step
aligned_results_full = []

# Set device and compute type
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"

print(f"TRANSCRIBING & ALIGNING using device: {device}")
print(f"Compute type is {compute_type}")

# Ensure the model directory exists
model_dir = "./model/"
os.makedirs(model_dir, exist_ok=True)

# Ensure the output directory exists up front, so the combined JSON written
# after the loop has somewhere to go even when no .wav file was found.
output_dir = "./outputs/Testing"
os.makedirs(output_dir, exist_ok=True)

# Define batch size
batch_size = 16  # Adjust as needed

# The three models are loaded lazily on first use and then reused for every
# file, instead of being reloaded per file. Loading stays inside the per-file
# try block, so a load failure is still reported per file and retried on the
# next one, and a transcript is still saved before later stages are attempted.
model = None
model_a = None
metadata = None
diarize_model = None

# Iterate through each .wav file and process it
for wav_file in wav_files:
    print(f"Processing file: {wav_file}")
    try:
        # Load the audio file
        audio = whisperx.load_audio(wav_file)

        # splitext strips only the final extension; str.replace('.wav', '')
        # would also remove a '.wav' occurring in the middle of a name.
        base_name = os.path.splitext(os.path.basename(wav_file))[0]

        #####################     TRANSCRIPTION  #################
        if model is None:
            # Load the model and save it to the local path
            try:
                model = whisperx.load_model("large-v2", device=device, compute_type=compute_type, download_root=model_dir)
            except Exception as e:
                print(f"Error loading model: {e}")
                raise

        print(f"STARTING Transcription on {wav_file}")

        # Transcribe the audio file
        transcribe_result = model.transcribe(audio, batch_size=batch_size)
        print(transcribe_result["segments"])  # before alignment

        # Save the transcription result to a JSON file
        with open(os.path.join(output_dir, f"{base_name}_transcript.json"), 'w') as json_file:
            json.dump(transcribe_result, json_file, indent=4)

        #####################     ALIGNMENT #################
        print(f"STARTING ALIGNMENT on {wav_file}")

        if model_a is None:
            # Load the alignment model with the specified device
            model_a, metadata = whisperx.load_align_model(language_code="en", device=device)

        # Perform alignment using the specified device
        aligned_result = whisperx.align(transcribe_result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

        # Save the alignment result to a JSON file
        with open(os.path.join(output_dir, f"{base_name}_aligned.json"), 'w') as json_file:
            json.dump(aligned_result, json_file, indent=4)

        # Append the aligned result to the combined list
        aligned_results_full.append(aligned_result)

        #####################     DIARIZE #################
        print(f"STARTING DIARIZE on {wav_file}")

        if diarize_model is None:
            # Load the diarization pipeline
            diarize_model = whisperx.DiarizationPipeline(use_auth_token=TOKEN_ID, device=device)

        # Run the diarization model once. Previously a second call with
        # min_speakers=1, max_speakers=3 was made and its result discarded,
        # doubling the diarization time without affecting the output.
        # To constrain the speaker count, pass the bounds to this call:
        #   diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=3)
        diarize_segments = diarize_model(audio)

        # Assign speaker labels to words
        diarize_result = whisperx.assign_word_speakers(diarize_segments, aligned_result)

        # Save the diarized transcript
        with open(os.path.join(output_dir, f"{base_name}_diarized.json"), 'w') as json_file:
            json.dump(diarize_result, json_file, indent=4)

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing file {wav_file}: {e}")
    finally:
        # Clean up memory after each file, including files that failed
        gc.collect()
        torch.cuda.empty_cache()

# Optionally, save the full results to a single JSON file
with open(os.path.join(output_dir, 'full_alignment.json'), 'w') as json_file:
    json.dump(aligned_results_full, json_file, indent=4)



TRANSCRIBING & ALIGNING using device: cuda
Compute type is float16
Processing file: ./data/Testing\Intro_clip_0.wav
No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\jamie\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
STARTING Transcription on ./data/Testing\Intro_clip_0.wav
Detected language: en (0.99) in first 30s of audio...
[{'text': ' And welcome! Thank you very much. Now, as you know, the producers on this show like to give us challenges. Specifically, where they give us a very small amount of money and tell us to buy a used car. Then they set unbelievably hard tasks to do to see which one of us got the best deal. Yeah. This week, for a Top Gear special, they came up with a real humdinger. They gave each of us 1,500 quid and told us to go to Africa and buy a car.', 'start': 0.009, 'end': 28.302}, {'text': " Yeah, there were just two conditions. It mustn't be four-wheel drive and it mustn't be built in any way to go off-road. The meeting point was the border

### Approach 2 - use the Terminal and CLI 
This seemed to work on a single file VERY fast! 

In [None]:
import os
from HF_token import TOKEN_ID
import subprocess  # imported here because this cell's import lines only bring in os and TOKEN_ID

# Set the path to your directory
directory = "./data/Testing/"

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if not filename.endswith(".wav"):  # Only .wav files are processed
        continue
    filepath = os.path.join(directory, filename)

    # Build the whisperx command as an argument list instead of a shell
    # string, so file names containing spaces or shell metacharacters are
    # passed through intact.
    command = [
        "whisperx", filepath,
        "--model", "large-v2",
        "--diarize",
        "--highlight_words", "True",
        "--hf_token", str(TOKEN_ID),
    ]
    completed = subprocess.run(command)

    # os.system silently ignored failures; report a non-zero exit status.
    if completed.returncode != 0:
        print(f"whisperx exited with code {completed.returncode} for {filepath}")

 ### Approach 2B - add some subprocess to monitor progress

 This method is best because it a) actually worked and b) provided insight into what is going on!

In [None]:
import os
import subprocess
from HF_token import TOKEN_ID

# Set the path to your directory
directory = "data/Testing/"

# Iterate through each file in the directory
for filename in os.listdir(directory):
    if not filename.endswith(".wav"):  # Only .wav files are processed
        continue
    filepath = os.path.join(directory, filename)

    # Print the filename to show progress
    print(f"Processing file: {filename}")

    # Build the whisperx command as an argument list instead of a shell
    # string, so file names containing spaces are passed through intact.
    command = [
        "whisperx", filepath,
        "--model", "large-v2",
        "--diarize",
        "--highlight_words", "True",
        "--hf_token", str(TOKEN_ID),
        "--output_dir", "./outputs",
    ]

    # stderr is merged into stdout. With a separate stderr pipe that is never
    # read, the child can block once that pipe's buffer fills, hanging the
    # loop; merging also makes warnings and progress visible here.
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
        # Display real-time output from the command
        for line in process.stdout:
            # errors="replace" keeps the loop alive on non-UTF-8 console output
            print(line.decode(errors="replace").strip())

        return_code = process.wait()  # Wait for process to finish

    # Confirm completion, or report the failure, for each file
    if return_code == 0:
        print(f"Completed file: {filename}\n")
    else:
        print(f"whisperx exited with code {return_code} for file: {filename}\n")


In [None]:
import os
import pandas as pd
import json
import glob

# Directory containing the JSON files
json_directory = 'outputs/Testing/'


def load_segment_frames(json_paths):
    """Load the "segments" list of each JSON file into its own DataFrame.

    Args:
        json_paths: Iterable of paths to JSON files.

    Returns:
        A list of DataFrames, one per file whose top level is an object with
        a "segments" key, in the order the paths were given. Files without
        that key are skipped with a printed notice instead of raising; this
        covers the combined full_alignment.json, which holds a list.
    """
    frames = []
    for json_path in json_paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not isinstance(data, dict) or "segments" not in data:
            print(f"Skipping {json_path}: no top-level 'segments' key")
            continue
        # Convert the "segments" part of the JSON data to a DataFrame
        frames.append(pd.DataFrame(data["segments"]))
    return frames


# Get a list of all JSON files in the directory; sorted so that the row order
# of the merged frame is reproducible between runs.
json_files = sorted(glob.glob(os.path.join(json_directory, '*.json')))

# One DataFrame per usable JSON file
df_list = load_segment_frames(json_files)

# Concatenate all DataFrames into a single DataFrame
if df_list:
    diarized_df = pd.concat(df_list, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    print(f"No JSON files with segments found in {json_directory}")
    diarized_df = pd.DataFrame()

# Display the consolidated DataFrame
diarized_df.head()

Unnamed: 0,start,end,text,words,speaker
0,0.349,0.609,And welcome!,"[{'word': 'And', 'start': 0.349, 'end': 0.409,...",SPEAKER_00
1,0.749,1.69,Thank you very much.,"[{'word': 'Thank', 'start': 0.749, 'end': 1.00...",SPEAKER_00
2,3.07,7.893,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 3.07, 'end': 3.13, ...",SPEAKER_00
3,8.193,11.874,"Specifically, where they give us a very small ...","[{'word': 'Specifically,', 'start': 8.193, 'en...",SPEAKER_00
4,12.195,17.257,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 12.195, 'end': 12.3...",SPEAKER_00


In [None]:
import pandas as pd
import json

# Path to the JSON file
json_file_path = 'Intro_clip_0.json'

# Open and load the JSON file. JSON is UTF-8 by specification, so the encoding
# is stated explicitly: the platform default (e.g. cp1252 on Windows) would
# garble or reject non-ASCII characters in the transcript text.
with open(json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the "segments" part of the JSON data to a DataFrame
intro_clip_0_df = pd.DataFrame(data["segments"])

# Display the DataFrame
intro_clip_0_df.head()

Unnamed: 0,start,end,text,words,speaker
0,0.349,0.609,And welcome!,"[{'word': 'And', 'start': 0.349, 'end': 0.409,...",SPEAKER_00
1,0.749,1.69,Thank you very much.,"[{'word': 'Thank', 'start': 0.749, 'end': 1.00...",SPEAKER_00
2,3.07,7.893,"Now, as you know, the producers on this show l...","[{'word': 'Now,', 'start': 3.07, 'end': 3.13, ...",SPEAKER_00
3,8.193,11.874,"Specifically, where they give us a very small ...","[{'word': 'Specifically,', 'start': 8.193, 'en...",SPEAKER_00
4,12.195,17.257,Then they set unbelievably hard tasks to do to...,"[{'word': 'Then', 'start': 12.195, 'end': 12.3...",SPEAKER_00
