In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive to disk.
CHUNK_SIZE = 40960
# Comma-separated list of "<target directory>:<percent-encoded download URL>" pairs.
# Each URL is a signed Google Cloud Storage link; per its own query string it was
# issued at X-Goog-Date=20240901T151608Z with X-Goog-Expires=259200 seconds, so the
# download is expected to fail once that window has passed.
DATA_SOURCE_MAPPING = 'fileurl:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5520015%2F9139952%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240901%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240901T151608Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D40dc225863cacf26d17827750afa30013536a887695cc2b615fa0780bb9eec291edfc159906932cb5ffa13995e66acb55159973b478a2468589ab8f06518c7e3a27d9a19f8c556bf3b87b2a6048e484a7a41cad8958880140e394f9578cb73a47ea43b6ea209677a375192bde869b23e13392000fcf86e990f6b232d39ef8ddfe0890e97e45839f3e12ef029217394086534f1706af9888dcf03c4140481394b3b065e79802f7b87e21d7a996fa16578472e7e1aa1a76de54d468f8844a442b04d00ae9de375fa83d7d375b8e005297bece73da2d85fa16aa86d1c36783873b8014b6af04da995d14966c38f015bda8846903318963f1a6518915802e771c552,hindiaudio:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5520275%2F9140275%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240901%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240901T151608Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D565b66893b7827df42b58b6c28c257358f8d50a8eaf154cbda050f7ee3b02e931df7d45db8786c60301195067e8d4c8c9829f3ddcb3dbc07ff0dbbc3488ba0cca0649ac1b031d3fc6a914cbb8a543fa17099d92fe7aa9a3e4efe3b3862a3c798cc343de36755fb26775399a30281c17e06adc2992bd99aa438207ed01007fc023e40dbf3fa16e3c65827d48e7e42c447b23ca832215c4dfc2a48c3ab3275e99d50a3b79db3dd6c0a025ca5eab3878dc9d68d0fca5140a088cb3bde6dd15274f49dc809469f76c6fc49d2ffc50756eafd459ac5a5e9b2e527598b0d09dd732f3d1864d6c84f3ac7073f37621064a4f64f7b0d2c5a40c0896c4f4e942b36fcbae8'

# Directory that each data source is extracted into (one sub-directory per source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook output.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible cells — confirm before removing.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
# Each entry of DATA_SOURCE_MAPPING has the form "<directory>:<percent-encoded URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only so a stray ':' in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be absent (e.g. chunked responses); in that case the
            # progress bar stays empty instead of crashing on int(None) or a
            # division by zero.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # rebind the imported module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install cog

In [None]:
pip install pyannote.audio

In [None]:
pip install faster-whisper

In [None]:
!pip install torch



In [None]:
from typing import Any, List
import base64
import datetime
import subprocess
import os
import requests
import time
import torch
import re

from cog import BasePredictor, BaseModel, Input, Path
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
import torchaudio


# Value returned by Predictor.predict.
class Output(BaseModel):
    # Transcript segments as produced by Predictor.speech_to_text.
    segments: list
    # Language reported by the transcription step; defaults to None.
    language: str = None
    # Number of distinct speaker labels found by diarization; defaults to None.
    num_speakers: int = None


class Predictor(BasePredictor):
    """Transcribe an audio file with faster-whisper and label segments with pyannote speakers."""

    def setup(self):
        """Load the model into memory to make running multiple predictions efficient

        Loads the "medium" Whisper model (on CUDA when available, otherwise CPU)
        and the "pyannote/speaker-diarization-3.1" pipeline.

        The Hugging Face access token is read from the ``HF_TOKEN`` environment
        variable so that no credential is stored in the notebook.

        Raises:
            RuntimeError: if the diarization pipeline could not be loaded.
        """
        model_name = "medium"
        self.model = WhisperModel(
            model_name,
            device="cuda" if torch.cuda.is_available() else "cpu",
            compute_type="float32",
        )

        # Never hard-code the token: anyone who can read the notebook could reuse it.
        # When the variable is unset, None is passed through unchanged.
        hf_token = os.environ.get("HF_TOKEN")
        self.diarization_model = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
        )

        if self.diarization_model is None:
            raise RuntimeError(
                "Failed to load the diarization model. Please check your token or model access permissions."
                " Set the HF_TOKEN environment variable to a valid Hugging Face access token."
            )

    def predict(
        self,
        file_string: str = Input(
            description="Either provide: Base64 encoded audio file", default=None
        ),
        file_path: Path = Input(description="Or an audio file path", default=None),
        group_segments: bool = Input(
            description="Group segments of same speaker shorter apart than 2 seconds",
            default=True,
        ),
        transcript_output_format: str = Input(
            description="Specify the format of the transcript output: individual words with timestamps, full text of segments, or a combination of both.",
            default="both",
            choices=["words_only", "segments_only", "both"],
        ),
        num_speakers: int = Input(
            description="Number of speakers, leave empty to autodetect.",
            ge=1,
            le=50,
            default=None,
        ),
        translate: bool = Input(
            description="Translate the speech into English.",
            default=False,
        ),
        language: str = Input(
            description="Language of the spoken words as a language code like 'en'. Leave empty to auto detect language.",
            default=None,
        ),
        prompt: str = Input(
            description="Vocabulary: provide names, acronyms and loanwords in a list. Use punctuation for best accuracy.",
            default=None,
        ),
        offset_seconds: int = Input(
            description="Offset in seconds, used for chunked inputs", default=0, ge=0
        ),
    ) -> Output:
        """Run a single prediction on the model

        The input audio (``file_path`` takes precedence over ``file_string``) is
        converted with ffmpeg to a 16 kHz mono 16-bit PCM WAV file in the current
        directory, transcribed and diarized, and the temporary files are removed
        again.

        NOTE(review): when this method is called directly rather than through
        cog, omitted arguments are the ``Input(...)`` objects themselves, not
        their defaults — pass every argument that is read explicitly.

        Returns:
            Output with the segments, the detected language and the number of
            detected speakers.

        Raises:
            RuntimeError: wraps any error raised while preparing the audio or
                running inference, including a missing input and a failed
                ffmpeg conversion.
        """
        # Named before the try block so the cleanup in `finally` can always refer to it.
        temp_wav_filename = f"temp-{time.time_ns()}.wav"
        try:
            if file_path is not None:
                # Convert the provided file path to WAV format
                self._convert_to_wav(file_path, temp_wav_filename)

            elif file_string is not None:
                # Accept both a bare base64 payload and a "data:...;base64,<payload>" URI.
                audio_data = base64.b64decode(
                    file_string.split(",")[1] if "," in file_string else file_string
                )
                temp_audio_filename = f"temp-{time.time_ns()}.audio"
                try:
                    with open(temp_audio_filename, "wb") as f:
                        f.write(audio_data)
                    self._convert_to_wav(temp_audio_filename, temp_wav_filename)
                finally:
                    # Remove the decoded upload even when the conversion fails.
                    if os.path.exists(temp_audio_filename):
                        os.remove(temp_audio_filename)

            else:
                # Without this check the failure only surfaces later as a
                # missing-file error from the transcription step.
                raise ValueError("Either file_path or file_string must be provided")

            segments, detected_num_speakers, detected_language = self.speech_to_text(
                temp_wav_filename,
                num_speakers,
                prompt,
                offset_seconds,
                group_segments,
                language,
                word_timestamps=True,
                transcript_output_format=transcript_output_format,
                translate=translate,
            )

            print("done with inference")
            # Return the results as a JSON object
            return Output(
                segments=segments,
                language=detected_language,
                num_speakers=detected_num_speakers,
            )

        except Exception as e:
            raise RuntimeError("Error Running inference with local model", e) from e

        finally:
            # Clean up
            if os.path.exists(temp_wav_filename):
                os.remove(temp_wav_filename)

    @staticmethod
    def _convert_to_wav(source, destination):
        """Convert ``source`` to a 16 kHz mono 16-bit PCM WAV file at ``destination`` using ffmpeg.

        Raises:
            subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        """
        subprocess.run(
            [
                "ffmpeg",
                "-i",
                str(source),
                "-ar",
                "16000",
                "-ac",
                "1",
                "-c:a",
                "pcm_s16le",
                destination,
            ],
            # Fail here instead of handing a missing or partial WAV file to the models.
            check=True,
        )

    def convert_time(self, secs, offset_seconds=0):
        """Return ``secs`` rounded to whole seconds plus ``offset_seconds`` as a ``datetime.timedelta``."""
        return datetime.timedelta(seconds=(round(secs) + offset_seconds))

    @staticmethod
    def _assign_speakers(segments, diarization_turns, offset_seconds=0, margin=0.1):
        """Attach a speaker label to every segment that has at least one word inside a speaker turn.

        Args:
            segments: transcript segment dicts with "avg_logprob", "start", "end"
                and "words"; all timestamps already include ``offset_seconds``.
            diarization_turns: ``(turn, track, speaker)`` tuples in the order
                yielded by ``itertracks(yield_label=True)``; ``turn.start`` and
                ``turn.end`` are compared against times relative to the audio file.
            offset_seconds: offset that was added to the transcript timestamps.
            margin: tolerance in seconds applied on both sides of each word.

        Returns:
            A list of new segment dicts containing only the matched words, their
            joined text and the speaker of the last matched word. Segments
            without any matched word are omitted.
        """
        labelled_segments = []
        turn_idx = 0
        turn_count = len(diarization_turns)

        for segment in segments:
            matched_text = []
            matched_words = []
            segment_speaker = None

            for word in segment["words"]:
                # Word times carry the chunk offset while the turns do not, so the
                # offset is removed (not added a second time) before comparing.
                word_start = word["start"] - offset_seconds - margin
                word_end = word["end"] - offset_seconds + margin

                while turn_idx < turn_count:
                    turn, _, speaker = diarization_turns[turn_idx]

                    if turn.start <= word_end and turn.end >= word_start:
                        # Keep the original spacing for the joined segment text ...
                        matched_text.append(word["word"])
                        # ... but store the individual word stripped.
                        word["word"] = word["word"].strip()
                        matched_words.append(word)
                        # Record the speaker only on a real overlap; a turn that was
                        # merely inspected must not label the segment.
                        segment_speaker = speaker

                        if turn.end <= word_end:
                            turn_idx += 1

                        break
                    elif turn.end < word_start:
                        # Turn lies entirely before this word: move on to the next turn.
                        turn_idx += 1
                    else:
                        # Turn starts after this word: leave the word unmatched.
                        break

            if matched_text:
                # Collapse every run of spaces, not only the first pair of each run.
                cleaned_text = re.sub(r" {2,}", " ", "".join(matched_text)).strip()
                labelled_segments.append(
                    {
                        "avg_logprob": segment["avg_logprob"],
                        "start": segment["start"],
                        "end": segment["end"],
                        "speaker": segment_speaker,
                        "text": cleaned_text,
                        "words": matched_words,
                    }
                )

        return labelled_segments

    @staticmethod
    def _group_by_speaker(segments, group_segments=True, max_gap_seconds=2):
        """Merge consecutive segments of the same speaker that start less than ``max_gap_seconds`` after the previous end.

        Returns the segments unchanged in count when ``group_segments`` is false,
        and an empty list for empty input.
        """
        if not segments:
            return []

        grouped = [segments[0]]
        for segment in segments[1:]:
            current_group = grouped[-1]
            if (
                group_segments
                and segment["speaker"] == current_group["speaker"]
                and segment["start"] - current_group["end"] < max_gap_seconds
            ):
                # Extend the end time and append the new text and words to the current group
                current_group["end"] = segment["end"]
                current_group["text"] += " " + segment["text"]
                current_group["words"].extend(segment["words"])
            else:
                # Otherwise, start a new group
                grouped.append(segment)

        return grouped

    def speech_to_text(
        self,
        audio_file_wav,
        num_speakers=None,
        prompt="",
        offset_seconds=0,
        group_segments=True,
        language=None,
        word_timestamps=True,
        transcript_output_format="both",
        translate=False,
    ):
        """Transcribe and diarize ``audio_file_wav`` and merge both results.

        Args:
            audio_file_wav: path of the WAV file to process.
            num_speakers: passed to the diarization pipeline; None lets it decide.
            prompt: passed to Whisper as both ``initial_prompt`` and ``hotwords``.
            offset_seconds: added to every returned start/end timestamp.
            group_segments: merge nearby segments of the same speaker.
            language: language code passed to Whisper; None lets it detect.
            word_timestamps: passed to Whisper; the merge step relies on word
                timestamps being present.
            transcript_output_format: "both" keeps text and words,
                "words_only" drops each segment's "text", "segments_only"
                drops each segment's "words".
            translate: use Whisper's "translate" task instead of "transcribe".

        Returns:
            Tuple ``(segments, detected_num_speakers, language)``; ``segments``
            is empty when no word could be matched to a speaker turn.
        """
        time_start = time.time()

        # Transcribe audio
        print("Starting transcribing")
        options = dict(
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=1000),
            initial_prompt=prompt,
            word_timestamps=word_timestamps,
            language=language,
            task="translate" if translate else "transcribe",
            hotwords=prompt
        )
        raw_segments, transcript_info = self.model.transcribe(audio_file_wav, **options)
        segments = [
            {
                "avg_logprob": s.avg_logprob,
                "start": float(s.start + offset_seconds),
                "end": float(s.end + offset_seconds),
                "text": s.text,
                "words": [
                    {
                        "start": float(w.start + offset_seconds),
                        "end": float(w.end + offset_seconds),
                        "word": w.word,
                        "probability": w.probability,
                    }
                    for w in s.words
                ],
            }
            for s in raw_segments
        ]

        time_transcribing_end = time.time()
        print(
            f"Finished with transcribing, took {time_transcribing_end - time_start:.5} seconds"
        )

        print("Starting diarization")
        waveform, sample_rate = torchaudio.load(audio_file_wav)
        diarization = self.diarization_model(
            {"waveform": waveform, "sample_rate": sample_rate},
            num_speakers=num_speakers,
        )

        time_diarization_end = time.time()
        print(
            f"Finished with diarization, took {time_diarization_end - time_transcribing_end:.5} seconds"
        )

        print("Starting merging")

        # Materialise the tracks once and reuse the list for the speaker count.
        diarization_list = list(diarization.itertracks(yield_label=True))
        detected_num_speakers = len({speaker for _, _, speaker in diarization_list})

        final_segments = self._assign_speakers(
            segments, diarization_list, offset_seconds, margin=0.1
        )

        time_merging_end = time.time()
        print(
            f"Finished with merging, took {time_merging_end - time_diarization_end:.5} seconds"
        )

        print("Starting cleaning")
        # Safe for an empty list (e.g. silent audio), which previously raised IndexError.
        segments = self._group_by_speaker(final_segments, group_segments)

        # Apply the requested output format; "both" leaves the segments untouched.
        if transcript_output_format == "words_only":
            for segment in segments:
                segment.pop("text", None)
        elif transcript_output_format == "segments_only":
            for segment in segments:
                segment.pop("words", None)

        time_cleaning_end = time.time()
        print(
            f"Finished with cleaning, took {time_cleaning_end - time_merging_end:.5} seconds"
        )

        print(
            f"Finished with all, total time: {time_cleaning_end - time_start:.5} seconds"
        )

        return segments, detected_num_speakers, transcript_info.language

# Local path of the recording to transcribe (inside the downloaded "hindiaudio" data source)
file_path = "/kaggle/input/hindiaudio/Audio 2.2.mp3"

# Create the predictor and load both models before the first prediction
predictor = Predictor()
predictor.setup()

# Every option is passed explicitly, in the same order as the predict() signature
prediction_options = dict(
    file_path=file_path,
    group_segments=True,
    transcript_output_format="both",
    num_speakers=None,                # let the diarization step decide
    translate=False,
    language="hi",                    # Hindi
    prompt="नमस्ते, भारत, हिंदी",
    offset_seconds=0,
)
result = predictor.predict(**prediction_options)


# Show the returned Output object
print(result)
