In [9]:
import os
import gc
import re
import pickle

import whisperx
import torch
import pandas as pd

# Access token read from the environment (never hard-coded); it is handed to
# the diarization pipeline as `use_auth_token` further down.
HF_TOKEN = os.environ['HF_TOKEN']

# Fall back to CPU so the notebook still runs on machines without a CUDA GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_file = "audios/video5.mp3"
# Base name without directory or extension: audios/video3.mp3 -> video3.
# Unlike splitting on '/' and '.', this also works for nested directories
# and for file names that contain extra dots.
audio_name = os.path.splitext(os.path.basename(audio_file))[0]
batch_size = 8  # reduce if low on GPU mem
dialogues_dir = 'dialogues'
diarized_outputs_dir = 'diarized_outputs'

# exist_ok=True keeps the cell idempotent: re-running it (or racing with
# another process) does not fail when the directory is already there.
for directory in [dialogues_dir, diarized_outputs_dir]:
    os.makedirs(directory, exist_ok=True)

## Transcription

In [2]:
# Transcribe the audio in batches; the result holds the segment list and the
# detected language used by the alignment step.
model = whisperx.load_model("large-v2", device, compute_type="float32")  # change to "int8" if low on GPU mem (may reduce accuracy)
audio = whisperx.load_audio(audio_file)
transcription = model.transcribe(audio, batch_size=batch_size)

# Drop the last reference to the model *before* collecting garbage and
# emptying the CUDA cache; while `model` is still bound its memory cannot be
# released, so the previous order (collect, empty cache, then del) freed nothing.
del model
gc.collect()
torch.cuda.empty_cache()

transcription["segments"] # before alignment

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../home/enzo/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...


[{'text': " I can record. And we don't have a ton of items to get to. And I might be able to do one that might be fun if we have a little bit of time. So corporate events, they",
  'start': 0.009,
  'end': 22.295},
 {'text': " I think I saw a little I put this in slack and I saw a little bit of kind of noise around it, which was good. You know that the nutshell here is as we've kind of restructured and try different things. The event support that we need isn't as nailed down as it needs to be. So the",
  'start': 22.875,
  'end': 46.254},
 {'text': " Current tactic that we're going with is go-to-market team signs up and kind of sponsors that event. So you support as a PMM, your campaign manager does the campaigns for that event, et cetera, et cetera, et cetera. Uh, I don't see anyone in the, uh, maybe there are comments in the issue. I don't see the, the header updated yet. I thought we had in Slack sort of farmed each one of them out.",
  'start': 46.578,
  'end': 75.981},
 {'text': "

## Output Alignment

In [3]:
# Align the transcription against the audio to obtain word-level timestamps,
# using the alignment model that matches the detected language.
model_a, metadata = whisperx.load_align_model(language_code=transcription["language"], device=device)
alignment = whisperx.align(
    transcription["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False
)

# Release the alignment model first, then collect and empty the CUDA cache;
# doing it the other way round leaves the model's memory allocated.
del model_a
gc.collect()
torch.cuda.empty_cache()

alignment["segments"]  # after alignment

[{'start': 0.189,
  'end': 1.489,
  'text': ' I can record.',
  'words': [{'word': 'I', 'start': 0.189, 'end': 0.249, 'score': 0.718},
   {'word': 'can', 'start': 0.289, 'end': 0.469, 'score': 0.91},
   {'word': 'record.', 'start': 0.529, 'end': 1.489, 'score': 0.817}]},
 {'start': 3.13,
  'end': 7.291,
  'text': "And we don't have a ton of items to get to.",
  'words': [{'word': 'And', 'start': 3.13, 'end': 3.57, 'score': 0.893},
   {'word': 'we', 'start': 3.65, 'end': 3.75, 'score': 0.992},
   {'word': "don't", 'start': 3.79, 'end': 3.99, 'score': 0.906},
   {'word': 'have', 'start': 4.05, 'end': 4.31, 'score': 0.823},
   {'word': 'a', 'start': 4.33, 'end': 4.35, 'score': 0.0},
   {'word': 'ton', 'start': 4.57, 'end': 5.01, 'score': 0.707},
   {'word': 'of', 'start': 5.991, 'end': 6.111, 'score': 0.792},
   {'word': 'items', 'start': 6.251, 'end': 6.531, 'score': 0.87},
   {'word': 'to', 'start': 6.591, 'end': 6.691, 'score': 0.972},
   {'word': 'get', 'start': 6.751, 'end': 6.931, '

## Assigning Speaker Labels (Diarization)

In [4]:
diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)

# add min/max number of speakers if known a priori
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
audio = whisperx.load_audio(audio_file)
diarize_segments = diarize_model(audio)

# Attach the diarization speaker IDs to the aligned words/segments.
result = whisperx.assign_word_speakers(diarize_segments, alignment)

# Release the pipeline first, then collect and empty the CUDA cache; with the
# reference still alive the cache cannot give the memory back.
del diarize_model
gc.collect()
torch.cuda.empty_cache()

# Persist both raw diarization and labelled transcript so later cells can be
# re-run without repeating the expensive steps. Uses the directory created in
# the configuration cell instead of repeating its name as a literal.
with open(os.path.join(diarized_outputs_dir, f'{audio_name}_segments.pkl'), 'wb') as file:
    pickle.dump((diarize_segments, result), file)

print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs

                               segment label     speaker        start  \
0    [ 00:00:00.008 -->  00:00:00.025]     A  SPEAKER_00     0.008489   
1    [ 00:00:00.042 -->  00:00:00.636]     B  SPEAKER_00     0.042445   
2    [ 00:00:01.095 -->  00:00:01.553]     C  SPEAKER_00     1.095076   
3    [ 00:00:02.928 -->  00:00:05.118]     D  SPEAKER_00     2.928693   
4    [ 00:00:05.916 -->  00:00:07.495]     E  SPEAKER_00     5.916808   
..                                 ...   ...         ...          ...   
764  [ 00:42:25.831 -->  00:42:26.477]   ACK  SPEAKER_07  2545.831919   
765  [ 00:42:27.088 -->  00:42:32.979]   ACL  SPEAKER_00  2547.088285   
766  [ 00:42:34.100 -->  00:42:36.205]   ACM  SPEAKER_00  2554.100170   
767  [ 00:42:34.202 -->  00:42:35.254]   ACN  SPEAKER_07  2554.202037   
768  [ 00:42:38.582 -->  00:42:40.263]   ACO  SPEAKER_07  2558.582343   

             end  intersection        union  
0       0.025467  -2559.656533  2559.814511  
1       0.636672  -2559.045328 

## Processing Diarization Output

In [5]:
# Display the segments together with the speaker IDs attached to them.
result["segments"]

[{'start': 0.189,
  'end': 1.489,
  'text': ' I can record.',
  'words': [{'word': 'I',
    'start': 0.189,
    'end': 0.249,
    'score': 0.718,
    'speaker': 'SPEAKER_00'},
   {'word': 'can',
    'start': 0.289,
    'end': 0.469,
    'score': 0.91,
    'speaker': 'SPEAKER_00'},
   {'word': 'record.',
    'start': 0.529,
    'end': 1.489,
    'score': 0.817,
    'speaker': 'SPEAKER_00'}],
  'speaker': 'SPEAKER_00'},
 {'start': 3.13,
  'end': 7.291,
  'text': "And we don't have a ton of items to get to.",
  'words': [{'word': 'And',
    'start': 3.13,
    'end': 3.57,
    'score': 0.893,
    'speaker': 'SPEAKER_00'},
   {'word': 'we',
    'start': 3.65,
    'end': 3.75,
    'score': 0.992,
    'speaker': 'SPEAKER_00'},
   {'word': "don't",
    'start': 3.79,
    'end': 3.99,
    'score': 0.906,
    'speaker': 'SPEAKER_00'},
   {'word': 'have',
    'start': 4.05,
    'end': 4.31,
    'score': 0.823,
    'speaker': 'SPEAKER_00'},
   {'word': 'a',
    'start': 4.33,
    'end': 4.35,
    

In [6]:
def generate_dialogue_from_segments_fifo(segments: list[dict], speakers: dict = None) -> str:
    """Build a dialogue from the word-level speaker labels assigned by WhisperX.

    Words are read in order and grouped into turns: a new turn starts whenever
    a word carries a ``'speaker'`` different from the current one. Words
    without a ``'speaker'`` key are kept in the turn that is currently open;
    unlabeled words that appear before any labeled word are attributed to the
    first speaker encountered.

    Args:
        segments: Dicts with a ``'words'`` list; each word is a dict with a
            ``'word'`` string and, optionally, a ``'speaker'`` label.
        speakers: Optional mapping from speaker label to display name. When
            empty or ``None`` the raw labels are used.

    Returns:
        One ``"<speaker>: <words>"`` line per turn, joined with newlines, with
        the words of a turn separated by single spaces. An empty string when
        there are no words at all.

    Raises:
        KeyError: If words are present but none of them carries a
            ``'speaker'``, or if ``speakers`` lacks an encountered label.
    """
    dialogue = []
    turn_words = []
    current_speaker = None

    for segment in segments:
        for word in segment['words']:
            word_speaker = word.get('speaker')
            if current_speaker is None and word_speaker is not None:
                # First labeled word: it also claims any unlabeled words
                # buffered so far.
                current_speaker = word_speaker
            if word_speaker is None or word_speaker == current_speaker:
                turn_words.append(word['word'])
            else:
                label = current_speaker if not speakers else speakers[current_speaker]
                dialogue.append(f"{label}: {' '.join(turn_words)}")
                turn_words = [word['word']]
                current_speaker = word_speaker

    # Flush the turn that is still open; without this the last speaker's
    # words (or the whole text of a single-speaker recording) are lost.
    if turn_words:
        if current_speaker is None:
            # No word had a label, so there is nothing to attribute the text to.
            raise KeyError('speaker')
        label = current_speaker if not speakers else speakers[current_speaker]
        dialogue.append(f"{label}: {' '.join(turn_words)}")

    return '\n'.join(dialogue)


def generate_dialogue_from_segments_most_frequent(segments: list[dict], speakers: dict = None) -> str:
    """Label each segment with the speaker that occurs most often among its words.

    Args:
        segments: Dicts with a ``'text'`` string and a ``'words'`` list; each
            word is a dict that may carry a ``'speaker'`` label. Words without
            a label are ignored when counting.
        speakers: Optional mapping from speaker label to display name. When
            empty or ``None`` the raw labels are used.

    Returns:
        One ``"<speaker>: <text>"`` line per segment, joined with newlines.

    Raises:
        KeyError: If a segment has no labeled word and no segment-level
            ``'speaker'`` either, or if ``speakers`` lacks the chosen label.
    """
    dialogue = []
    for segment in segments:
        words = pd.DataFrame(segment['words'])
        # DataFrame.get returns None when no word carried a 'speaker' key.
        word_speakers = words.get('speaker')
        counts = word_speakers.value_counts() if word_speakers is not None else None
        if counts is not None and not counts.empty:
            # Take the label from the index of the counts. The previous
            # to_frame().reset_index().loc[0, 'speaker'] lookup depended on how
            # pandas names the value_counts columns and returned the count
            # instead of the label on pandas < 2.0.
            most_frequent_speaker = counts.idxmax()
        else:
            # No word-level labels: fall back to the segment-level label.
            most_frequent_speaker = segment['speaker']
        label = most_frequent_speaker if not speakers else speakers[most_frequent_speaker]
        dialogue.append(f"{label}: {segment['text']}")
    return '\n'.join(dialogue)


def generate_dialogue_from_segments(segments: list[dict], speakers: dict = None) -> str:
    """Render one line per segment using the segment-level speaker label.

    Each line has the form ``"<speaker>: <text>"``. When ``speakers`` is a
    non-empty mapping, the segment's ``'speaker'`` label is looked up in it;
    otherwise the raw label is printed. Lines are joined with newlines.
    """
    def display_name(raw_label):
        # An empty or missing mapping means "print the label unchanged".
        if speakers:
            return speakers[raw_label]
        return raw_label

    return '\n'.join(
        f"{display_name(entry['speaker'])}: {entry['text']}"
        for entry in segments
    )

In [8]:
# To print real names instead of diarization IDs, pass a mapping, e.g.
#   speakers={'SPEAKER_00': 'Lauren', 'SPEAKER_01': 'John'}
dialogue = generate_dialogue_from_segments(segments=result['segments'])

# Write into the directory created in the configuration cell, and fix the
# encoding so transcripts with non-ASCII characters are saved the same way
# regardless of the platform's default.
with open(os.path.join(dialogues_dir, f'{audio_name}_dialogue.txt'), 'w', encoding='utf-8') as file:
    file.write(dialogue)

print(dialogue)

SPEAKER_00:  I can record.
SPEAKER_00: And we don't have a ton of items to get to.
SPEAKER_00: And I might be able to do one that might be fun if we have a little bit of time.
SPEAKER_00: So corporate events, they
SPEAKER_00:  I think I saw a little I put this in slack and I saw a little bit of kind of noise around it, which was good.
SPEAKER_00: You know that the nutshell here is as we've kind of restructured and try different things.
SPEAKER_00: The event support that we need isn't as nailed down as it needs to be.
SPEAKER_00: So the
SPEAKER_00:  Current tactic that we're going with is go-to-market team signs up and kind of sponsors that event.
SPEAKER_00: So you support as a PMM, your campaign manager does the campaigns for that event, et cetera, et cetera, et cetera.
SPEAKER_00: Uh, I don't see anyone in the, uh, maybe there are comments in the issue.
SPEAKER_00: I don't see the, the header updated yet.
SPEAKER_05: I thought we had in Slack sort of farmed each one of them out.
SPEA