In [1]:
import os
import subprocess
from typing import Optional, List, Dict, Any
import time
import psutil
import GPUtil
import whisper
from whisperx import load_align_model, align
from whisperx.diarize import DiarizationPipeline, assign_word_speakers
import pandas as pd

The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


In [2]:
def convert_to_wav(input_file: str, output_file: Optional[str] = None) -> None:
    """
    Convert an audio file to mono, 16-bit PCM WAV at 44.1 kHz using FFmpeg.

    The FFmpeg invocation is passed as an argument list (no shell), so paths containing quotes, spaces or shell
    metacharacters reach FFmpeg verbatim instead of being interpreted by a shell.

    Args:
        input_file: The path of the input audio file to convert.
        output_file: The path of the output WAV file. If None (or empty), the output path is built by replacing the
        input file extension with ".wav".

    Returns:
        None. Success or failure is reported on stdout; a failing FFmpeg run or a missing FFmpeg executable is
        printed, not re-raised.
    """
    if not output_file:
        output_file = os.path.splitext(input_file)[0] + ".wav"

    command = [
        "ffmpeg",
        "-i", input_file,
        "-vn",                    # ignore any video stream in the input
        "-acodec", "pcm_s16le",   # 16-bit little-endian PCM
        "-ar", "44100",           # sample rate in Hz
        "-ac", "1",               # single (mono) channel
        output_file,
    ]

    try:
        subprocess.run(command, check=True)
        print(f'Successfully converted "{input_file}" to "{output_file}"')
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the ffmpeg executable cannot be started at all
        # (without a shell this is raised directly rather than surfacing as a non-zero exit code).
        print(f'Error: {e}, could not convert "{input_file}" to "{output_file}"')


In [3]:
def transcribe(audio_file: str, model_name: str, device: str = "cuda") -> Dict[str, Any]:
    """
    Run a Whisper speech-to-text model over an audio file.

    Args:
        audio_file: Path to the audio file to transcribe.
        model_name: Name of the Whisper model to load.
        device: The device the model is loaded on (e.g., "cpu" or "cuda").

    Returns:
        A dictionary with two keys: "segments" (the "segments" entry of the model output) and "language_code"
        (the "language" entry of the model output).
    """
    whisper_output = whisper.load_model(model_name, device).transcribe(audio_file)

    detected_language = whisper_output["language"]
    return dict(
        segments=whisper_output["segments"],
        language_code=detected_language,
    )

In [4]:
def align_segments(
    segments: List[Dict[str, Any]],
    language_code: str,
    audio_file: str,
    device: str = "cuda",
) -> Dict[str, Any]:
    """
    Run whisperx alignment over transcript segments.

    An alignment model is loaded for the given language on every call and then applied to the segments together
    with the audio file.

    Args:
        segments: List of transcript segments to align.
        language_code: Language code used to pick the alignment model.
        audio_file: Path to the audio file the segments were transcribed from.
        device: The device to use for inference (e.g., "cpu" or "cuda").

    Returns:
        The value returned by whisperx `align`, passed through unchanged.
    """
    alignment_model, alignment_metadata = load_align_model(
        language_code=language_code, device=device
    )
    return align(segments, alignment_model, alignment_metadata, audio_file, device)

In [5]:
def diarize(audio_file: str, hf_token: str) -> Dict[str, Any]:
    """
    Run the whisperx speaker-diarization pipeline on an audio file.

    A new pipeline is constructed on every call, authenticated with the given token.

    Args:
        audio_file: Path to the audio file to diarize.
        hf_token: Hugging Face authentication token handed to the pipeline as `use_auth_token`.

    Returns:
        The output of the diarization pipeline, passed through unchanged. It is annotated as a dictionary here;
        the concrete type is decided by whisperx (TODO confirm).
    """
    return DiarizationPipeline(use_auth_token=hf_token)(audio_file)

In [10]:
def assign_speakers(
    diarization_result: Dict[str, Any], aligned_segments: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """
    Attach a speaker label to every aligned transcript segment.

    Args:
        diarization_result: Output of the diarization step.
        aligned_segments: Output of the alignment step.

    Returns:
        One dictionary per segment with the keys "start", "end", "text" and "speaker". Segments that carry no
        "speaker" entry get a single space (" ") as their speaker.
    """
    labelled = assign_word_speakers(diarization_result, aligned_segments)
    return [
        {
            "start": item["start"],
            "end": item["end"],
            "text": item["text"],
            # Fall back to a single-space placeholder when no speaker was assigned.
            "speaker": item.get("speaker", " "),
        }
        for item in labelled["segments"]
    ]

In [11]:
def transcribe_and_diarize(
    audio_file: str,
    hf_token: str,
    model_name: str,
    device: str = "cuda",
) -> pd.DataFrame:
    """
    Transcribe an audio file and perform speaker diarization to determine which segments were spoken by each speaker.

    Runs the four pipeline stages in order (transcribe, align, diarize, assign speakers) and tabulates the result.

    Args:
        audio_file: Path to the audio file to transcribe and diarize.
        hf_token: Authentication token for accessing the Hugging Face API.
        model_name: Name of the model to use for transcription.
        device: The device to use for inference (e.g., "cpu" or "cuda").

    Returns:
        A DataFrame with one row per transcript segment and the columns "Segment" (1-based position),
        "Start time", "End time", "Speaker" and "Transcript". With no segments the frame is empty but still has
        these columns.
    """
    transcript = transcribe(audio_file, model_name, device)
    aligned_segments = align_segments(
        transcript["segments"], transcript["language_code"], audio_file, device
    )
    diarization_result = diarize(audio_file, hf_token)
    results_segments_w_speakers = assign_speakers(diarization_result, aligned_segments)

    columns = ["Segment", "Start time", "End time", "Speaker", "Transcript"]
    # One tuple per segment: the frame must be built row-wise. Passing the five per-column lists as `data`
    # makes pandas treat each list as a row, which mismatches the five column names.
    rows = [
        (number, segment["start"], segment["end"], segment["speaker"], segment["text"])
        for number, segment in enumerate(results_segments_w_speakers, start=1)
    ]
    return pd.DataFrame(rows, columns=columns)

In [8]:
# One-off conversion of the source .aac recording to .wav; the output path is passed explicitly.
# NOTE(review): both paths are absolute and machine-specific.
convert_to_wav("F:/28 Jul, 1.10 pm_ Pushkar Naath MotoGP.aac", "F:/28 Jul, 1.10 pm_ Pushkar Naath MotoGP.wav")

Successfully converted "F:/28 Jul, 1.10 pm_ Pushkar Naath MotoGP.aac" to "F:/28 Jul, 1.10 pm_ Pushkar Naath MotoGP.wav"


In [12]:
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment before HF_TOKEN is read below.
load_dotenv()

# Inputs consumed by the pipeline cells that follow.
audio_file = "F:/28 Jul, 1.10 pm_ Pushkar Naath MotoGP.wav"
model_name, device = "medium", "cuda"
hf_token = os.getenv("HF_TOKEN")  # None when HF_TOKEN is not set

In [13]:
# Full pipeline run: transcription, alignment, diarization and speaker assignment.
df = transcribe_and_diarize(
    audio_file=audio_file,
    hf_token=hf_token,
    model_name=model_name,
    device=device,
)

Failed to align segment (" So technically I am holding my close to 60000 people, 6 to 70000 people."): backtrack failed, resorting to original...
Failed to align segment (" 6..."): no characters in this segment found in model dictionary, resorting to original...
Failed to align segment (" 4."): no characters in this segment found in model dictionary, resorting to original...
Failed to align segment (" 106..."): no characters in this segment found in model dictionary, resorting to original...
Failed to align segment (" So, album music event or something, that's part of your ticket?"): backtrack failed, resorting to original...
Failed to align segment (" There are smaller countries, there are larger countries."): backtrack failed, resorting to original...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file C:\Users\neeraj\.cache\torch\pyannote\models--pyannote--segmentation\snapshots\c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu118. Bad things might happen unless you revert torch to 1.x.


In [None]:
# Display the table returned by transcribe_and_diarize.
# NOTE(review): this renders the whole frame; df.head() would keep the output short for long recordings.
df

# Trying out each module independently

In [None]:
# Stage 1 in isolation: speech-to-text only.
transcript = transcribe(audio_file=audio_file, model_name=model_name, device=device)
transcript

In [11]:
# Concatenate the text of every transcript segment into one string; no separator is inserted.
text = ''.join(segment['text'] for segment in transcript['segments'])

In [12]:
# Display the concatenated transcript text.
text

" I don't think so we'll be allowed but we'll be able to share some photographs with you. Last night we shared a fantastic post on Instagram and the dark night, some port of dark night. So essentially we work in 24 hours. Weather's been very bad. So whatever time we can, whenever you want to try, sunlight, more sunlight, rain, day, whatever, a lot more. I think the main work is to fix the draft. No, it's just art. Like I was mentioning that he might be a chopper, but he's doing a good job. And that's the passion of the man. Very good. So we're not filming this. We have some other team on the other end. Everyone has their specific skills here. And if one falls off the plane, you get it. Everyone is there since day T minus 30 days or T minus 365 days. So let's start with the design and where the idea really came from. How did you get the concept? So we have been in racing 2011-12. 12, 13, 14 we did World Superbike. Amit Chandel, my director of racing, my cousin right now, he has a family

In [None]:
# Stage 2 in isolation: align the transcript segments against the audio.
aligned_segments = align_segments(
    segments=transcript["segments"],
    language_code=transcript["language_code"],
    audio_file=audio_file,
    device=device,
)
aligned_segments

In [None]:
# Stage 3 in isolation: speaker diarization of the audio file.
diarization_result = diarize(audio_file=audio_file, hf_token=hf_token)

In [None]:
# Display the raw output returned by diarize().
diarization_result

In [None]:
def assign_speakers(
    diarization_result: Dict[str, Any], aligned_segments: Dict[str, Any]
) -> List[Dict[str, Any]]:
    """
    Assign speakers to each transcript segment based on the speaker diarization result.

    Note: this cell redefines the function of the same name defined earlier in the notebook, with the same
    behavior.

    Args:
        diarization_result: Output of the diarization step.
        aligned_segments: Dictionary representing the aligned transcript segments.

    Returns:
        A list of dictionaries, one per segment, holding the "start" and "end" times, the spoken "text" and the
        "speaker" ID. Segments without a "speaker" entry get a single space (" ") as their speaker.
    """
    result_segments = assign_word_speakers(
        diarization_result, aligned_segments
    )
    results_segments_w_speakers: List[Dict[str, Any]] = []
    for result_segment in result_segments['segments']:
        # Only the speaker lookup is allowed to be missing; start/end/text are required keys.
        try:
            speaker = result_segment["speaker"]
        except KeyError:
            speaker = " "
        results_segments_w_speakers.append(
            {
                "start": result_segment["start"],
                "end": result_segment["end"],
                "text": result_segment["text"],
                "speaker": speaker,
            }
        )

    return results_segments_w_speakers

In [66]:
# Stage 4 in isolation: merge the diarization output with the aligned segments.
results_segments_w_speakers = assign_speakers(
    diarization_result=diarization_result,
    aligned_segments=aligned_segments,
)

{'segments': [{'start': 0.122, 'end': 0.994, 'text': " What's your background?", 'words': [{'word': "What's", 'start': 0.122, 'end': 0.264, 'score': 0.483}, {'word': 'your', 'start': 0.284, 'end': 0.507, 'score': 0.413, 'speaker': 'SPEAKER_00'}, {'word': 'background?', 'start': 0.629, 'end': 0.974, 'score': 0.52, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 0.994, 'end': 1.298, 'text': 'Like, give me...', 'words': [{'word': 'Like,', 'start': 0.994, 'end': 1.095, 'score': 0.173, 'speaker': 'SPEAKER_00'}, {'word': 'give', 'start': 1.115, 'end': 1.217, 'score': 0.341, 'speaker': 'SPEAKER_00'}, {'word': 'me...', 'start': 1.237, 'end': 1.298, 'score': 0.607, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 1.581, 'end': 6.54, 'text': ' Yeah, I mean, yeah, complete lower middle class background in a place in Mumbai called Grant Road.', 'words': [{'word': 'Yeah,', 'start': 1.581, 'end': 1.801, 'score': 0.576, 'speaker': 'SPEAKER_00'}, {'word': 'I', 'start': 1.8