<a href="https://colab.research.google.com/github/kishan-1432-rk/realtime-translator-demo/blob/main/Deep_ASR_Demo_with_AI4Bharat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required libraries:
# pip install torch transformers datasets soundfile accelerate

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from datasets import load_dataset
import torch
import soundfile as sf
import io

# --- 1. ASR Model Class ---
# This class encapsulates the ASR logic for different Indian languages.
class IndicASR:
    """Thin wrapper that loads a per-language speech-to-text checkpoint and
    transcribes audio files with it.

    Attributes:
        language_map (dict): Maps a language code to the Hugging Face model id
            that is loaded for it.
        device (str): "cuda:0" when CUDA is available, otherwise "cpu".
        torch_dtype (torch.dtype): float16 on CUDA, float32 on CPU.
        model_id (str): Model id selected for the requested language.
        model: Model returned by ``AutoModelForSpeechSeq2Seq.from_pretrained``.
        processor: Processor returned by ``AutoProcessor.from_pretrained``.
    """

    def __init__(self, language="hi"):
        """
        Initializes the ASR model for a specific language.

        Args:
            language (str): The language code ('hi' for Hindi, 'ta' for Tamil, 'gu' for Gujarati).

        Raises:
            ValueError: If ``language`` is not a key of ``language_map``.
        """
        self.language_map = {
            'hi': 'ai4bharat/indic-whisper-v2-hi',
            'ta': 'ai4bharat/indic-whisper-v2-ta',
            'gu': 'ai4bharat/indic-whisper-v2-gu',
        }

        # Validate first so an unsupported code fails fast, before any device
        # probing or model download is attempted.
        if language not in self.language_map:
            raise ValueError(f"Unsupported language: {language}. Please choose from {list(self.language_map.keys())}")

        # Half precision is only used together with a CUDA device.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        self.model_id = self.language_map[language]
        print(f"Loading model for {language} from {self.model_id}...")

        # Load the pre-trained model and processor from Hugging Face.
        # The processor handles tokenization and feature extraction.
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            self.model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(self.model_id)

    def transcribe(self, audio_path):
        """
        Transcribes the speech from an audio file.

        The file's native sampling rate is forwarded to the processor
        unchanged; no resampling is performed here.

        Args:
            audio_path (str): Path to the audio file (e.g., WAV format).

        Returns:
            str: The transcribed text. If any step fails, the exception is not
            propagated; a string of the form "An error occurred: <message>"
            is returned instead.
        """
        try:
            # Read the audio file
            audio_data, sampling_rate = sf.read(audio_path, dtype='float32')

            # Multi-channel files are read as a 2-D (frames, channels) array.
            # Average the channels into a single mono signal so the processor
            # receives one 1-D waveform rather than a 2-D array it could
            # mistake for a batch of separate clips.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Process the audio to get input features for the model
            input_features = self.processor(
                audio_data,
                sampling_rate=sampling_rate,
                return_tensors="pt"
            ).input_features.to(self.device, dtype=self.torch_dtype)

            # Generate the transcription using the model's generate method
            predicted_ids = self.model.generate(
                input_features,
                max_new_tokens=128
            )

            # Decode the predicted IDs to get the text transcript; a single
            # clip was submitted, so only the first decoded entry is used.
            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            return transcription

        except Exception as e:
            # Best-effort contract kept for existing callers: report the
            # failure as text instead of raising.
            return f"An error occurred: {e}"

# --- 2. Example Usage ---
# The following code demonstrates how to use the IndicASR class.

if __name__ == "__main__":
    # Demo driver: builds one IndicASR instance per supported language and
    # prints the result of transcribing one audio file with each of them.
    #
    # Note: the paths below are placeholders. Replace them with paths to local
    # audio files in the corresponding language before running.
    hindi_audio_path = "path/to/your/hindi_audio.wav"
    tamil_audio_path = "path/to/your/tamil_audio.wav"
    gujarati_audio_path = "path/to/your/gujarati_audio.wav"
    english_audio_path = "path/to/your/english_audio.wav" # Passed to the Hindi model below; NOTE(review): English support of that checkpoint is assumed here, not verified

    # All three models are constructed up front, before any transcription runs,
    # so a failure to load any one of them skips every transcription below.
    try:
        hindi_asr = IndicASR(language='hi')
        tamil_asr = IndicASR(language='ta')
        gujarati_asr = IndicASR(language='gu')

        # transcribe() reports its own failures as an "An error occurred: ..."
        # string rather than raising, so such a message may be printed below
        # in place of a transcript.
        print("\n--- Transcribing Hindi Audio ---")
        transcribed_hindi_text = hindi_asr.transcribe(hindi_audio_path)
        print(f"Transcription (Hindi): {transcribed_hindi_text}")

        print("\n--- Transcribing Tamil Audio ---")
        transcribed_tamil_text = tamil_asr.transcribe(tamil_audio_path)
        print(f"Transcription (Tamil): {transcribed_tamil_text}")

        print("\n--- Transcribing Gujarati Audio ---")
        transcribed_gujarati_text = gujarati_asr.transcribe(gujarati_audio_path)
        print(f"Transcription (Gujarati): {transcribed_gujarati_text}")

        # The English clip is run through the Hindi model instance.
        # NOTE(review): whether that checkpoint handles English well is not
        # established by this file - confirm against the model card.
        print("\n--- Transcribing English Audio with Hindi Model ---")
        transcribed_english_text = hindi_asr.transcribe(english_audio_path)
        print(f"Transcription (English): {transcribed_english_text}")

    except Exception as e:
        # In practice this catches constructor failures (unsupported language
        # code, model id that cannot be resolved or downloaded, ...).
        print(f"An error occurred during initialization or transcription: {e}")

Loading model for hi from ai4bharat/indic-whisper-v2-hi...
An error occurred during initialization or transcription: ai4bharat/indic-whisper-v2-hi is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
