In [None]:
# Install/upgrade OpenAI Whisper directly from its GitHub repository
# (-q: quiet output, -U: upgrade if already installed).
!pip install git+https://github.com/openai/whisper.git -q -U

In [None]:
# Install/upgrade yt-dlp (-q: quiet output, -U: upgrade if already installed).
!pip install yt-dlp -q -U

In [None]:
# Download one YouTube video in m4a format; the output template names the
# file /content/<video id>.<extension>.
!yt-dlp https://youtu.be/zmf1Kujygt8 --format m4a -o "/content/%(id)s.%(ext)s"
# Run the Whisper command-line tool on the downloaded file with the "small"
# model and the language set to English.
!whisper "/content/zmf1Kujygt8.m4a" --model small --language English

# LIBRARIES

In [None]:
!pip install gTTS
!pip install SpeechRecognition
!pip install pydub
!pip install translate

In [None]:
from pydub import AudioSegment
import speech_recognition as sr
from translate import Translator
from gtts import gTTS

In [None]:
def convert_mp3_to_wav(input_mp3, output_wav):
    """Convert an MP3 file to a WAV file.

    Args:
        input_mp3: Path of the MP3 file to read.
        output_wav: Path the WAV file is written to.
    """
    # pydub's AudioSegment offers no `from_m4a` loader; `from_mp3` is the
    # loader that matches this function's name and parameter.
    audio = AudioSegment.from_mp3(input_mp3)
    audio.export(output_wav, format="wav")

In [None]:
from pydub import AudioSegment

def convert_m4a_to_wav(input_file, output_file):
    """Decode an .m4a file and write it back out as .wav.

    Args:
        input_file: Path of the .m4a source file.
        output_file: Destination path of the .wav result.
    """
    # Load and export in one chained expression.
    AudioSegment.from_file(input_file, format='m4a').export(output_file, format='wav')




In [None]:
def recognize_speech(audio_path, language="en-US"):
    """Transcribe an audio file with Google's speech recognition service.

    Args:
        audio_path: Path of the audio file, opened through ``sr.AudioFile``.
        language: Language tag of the speech in the recording. Defaults to
            ``"en-US"``, which is what ``recognize_google`` uses when no
            language is passed, so existing callers behave as before.

    Returns:
        The recognised text. Failures are not raised: when the audio cannot be
        understood or the request fails, an explanatory message string is
        returned instead.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data, language=language)
    except sr.UnknownValueError:
        return "Speech recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results from Google's Speech Recognition API; {e}"

In [None]:
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language``.

    Args:
        text: Text to translate, or ``None`` when there is nothing to translate.
        target_language: Language code handed to ``Translator`` as ``to_lang``.

    Returns:
        The translation, or the string ``"No text to translate"`` when
        ``text`` is ``None``.
    """
    # Guard clause: handle the "nothing to do" case first.
    if text is None:
        return "No text to translate"
    return Translator(to_lang=target_language).translate(text)

In [None]:
def convert_text_to_speech(text, lang_code, output_path):
    """Synthesise ``text`` with gTTS and save the audio to ``output_path``.

    Nothing is written when ``text`` equals the string
    ``"No text to translate"``.

    Args:
        text: Text to speak.
        lang_code: Language code passed to gTTS as ``lang``.
        output_path: File path the synthesised audio is saved to.
    """
    # Skip synthesis for the "nothing to translate" marker string.
    if text == "No text to translate":
        return
    gTTS(text=text, lang=lang_code).save(output_path)

In [None]:
def speech_to_speech_pipeline(input_mp3, output_mp3, target_language='hi'):
    """Run recognition -> translation -> text-to-speech on one audio file.

    Args:
        input_mp3: Path of the input recording. Despite the parameter name,
            the file is decoded with ``convert_m4a_to_wav``.
        output_mp3: Path the synthesised speech is saved to.
        target_language: Language code used for both the translation and the
            speech synthesis.

    Returns:
        The synthesised speech, loaded from ``output_mp3`` with
        ``AudioSegment.from_mp3``.
    """
    import os
    import tempfile

    # Step 1: convert the input to WAV inside a private scratch directory, so
    # no intermediate file is left behind and separate runs never share one.
    with tempfile.TemporaryDirectory() as scratch_dir:
        wav_file = os.path.join(scratch_dir, "temp_speech.wav")
        convert_m4a_to_wav(input_mp3, wav_file)

        # Step 2: recognise speech while the intermediate WAV still exists.
        recognized_text = recognize_speech(wav_file)
    print("Recognized Speech:")
    print(recognized_text)

    # Step 3: translate the recognised text.
    translated_text = translate_text(recognized_text, target_language)
    print("Translated Text:")
    print(translated_text)

    # Step 4: synthesise the translation and load the result for the caller.
    convert_text_to_speech(translated_text, target_language, output_mp3)
    audio = AudioSegment.from_mp3(output_mp3)
    return audio

In [None]:
# Run the gTTS-based pipeline on one clip from the Hindi validation folder.
input_audio_file = "/kaggle/input/indicsuperb/kb_data_clean_m4a/hindi/valid/audio/844424930501806-229-f.m4a"
# "kn" is the ISO 639-1 code for Kannada; the previous value "ka" is Georgian
# and is expected to be rejected by gTTS as an unsupported language.
# NOTE(review): the old output name said "ta" - switch to "ta" if Tamil was the intended target.
target_language_code = "kn"
output_audio_file = f"translated_{target_language_code}.mp3"
speech_to_speech_pipeline(input_audio_file, output_audio_file, target_language=target_language_code)

# WHISPER

In [None]:
import torch
from transformers import pipeline

# Use the first CUDA GPU when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Automatic-speech-recognition pipeline backed by the openai/whisper-base checkpoint.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

In [None]:
from datasets import load_dataset
# Open the validation split of the "it" configuration of facebook/voxpopuli
# in streaming mode and take its first example.
dataset = load_dataset("facebook/voxpopuli", "it", split="validation", streaming=True)
sample = next(iter(dataset))

In [None]:
from IPython.display import Audio
# Show an audio player for the sample, using the sampling rate stored with it.
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [None]:
def translate(audio):
    """Run the Whisper pipeline on ``audio`` with the "translate" task.

    Args:
        audio: Audio input forwarded unchanged to ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    return pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "translate"},
    )["text"]

In [None]:
# Translate the streamed sample. A copy of the audio dict is passed
# (presumably so the pipeline cannot modify the original - confirm).
translate(sample["audio"].copy())

In [None]:
# Display the "raw_text" field stored with the same sample.
sample["raw_text"]

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Load the SpeechT5 text-to-speech processor and model, plus the HiFi-GAN
# vocoder checkpoint that is passed to `generate_speech` later on.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [None]:
# Move the TTS model and the vocoder to the selected device.
model.to(device)
vocoder.to(device)

In [None]:
# Load the validation split of the x-vector dataset and use entry 7306 as the
# speaker embedding; `unsqueeze(0)` adds a leading dimension of size 1.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [None]:
def synthesise(text):
    """Generate speech for ``text`` with the SpeechT5 model.

    Args:
        text: Text to synthesise.

    Returns:
        The result of ``model.generate_speech`` moved to the CPU.
    """
    # Tokenise and move the ids to the model's device in one step.
    token_ids = processor(text=text, return_tensors="pt")["input_ids"].to(device)
    waveform = model.generate_speech(
        token_ids,
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return waveform.cpu()

In [None]:
import numpy as np

# Output sample format and the largest value it can represent.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

def speech_to_speech_translation(audio):
    """Translate speech and synthesise the translation as integer PCM audio.

    Args:
        audio: Audio input forwarded to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple: the constant ``16000`` and the
        synthesised waveform as an array of ``target_dtype``.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()
    # Clip before scaling: samples outside [-1, 1] cannot be represented in
    # the integer range and would be corrupted by the cast.
    pcm = (np.clip(waveform, -1.0, 1.0) * max_range).astype(target_dtype)
    return 16000, pcm

In [None]:
# Run the Whisper + SpeechT5 chain on one clip from the Tamil validation
# folder, then show an audio player for the synthesised result.
sampling_rate, synthesised_speech = speech_to_speech_translation("/kaggle/input/indicsuperb/kb_data_clean_m4a/tamil/valid/audio/844424930305399-797-m.m4a")
Audio(synthesised_speech, rate=sampling_rate)

#  AI4BHARAT (MACHINE TRANSLATION)

In [None]:
# Force-reinstall NumPy pinned to version 1.26.4, bypassing pip's cache.
# NOTE(review): NumPy is imported earlier in this notebook; the kernel
# presumably needs a restart for the reinstalled version to be used - confirm.
!pip install --upgrade --force-reinstall --no-cache-dir numpy==1.26.4

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

In [None]:
import importlib.util

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Translate from Hindi (Devanagari script) to Kannada (Kannada script).
src_lang, tgt_lang = "hin_Deva", "kan_Knda"
model_name = "ai4bharat/indictrans2-indic-indic-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Half precision and FlashAttention-2 are only requested on a GPU, and
# FlashAttention-2 only when the `flash_attn` package is installed.
# Otherwise the library defaults are used, so the CPU fallback above can load.
model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    model_kwargs["torch_dtype"] = torch.float16
    if importlib.util.find_spec("flash_attn") is not None:
        model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **model_kwargs).to(DEVICE)

ip = IndicProcessor(inference=True)

# Example Hindi sentences to translate.
input_sentences = [
    "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
    "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
    "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
    "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
]

In [None]:
# Preprocess the input sentences with the IndicProcessor for the chosen
# source and target languages.
batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

# Use the GPU when one is available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenise the batch (truncated, padded to the longest entry) into PyTorch
# tensors and move them to the selected device.
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
).to(DEVICE)


# Generate with gradient tracking disabled: beam search with 5 beams, one
# returned sequence per input, and a maximum length of 256.
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs,
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )


In [None]:
# Decode into a new name instead of overwriting `generated_tokens`, so this
# cell can be re-run without trying to decode already-decoded strings.
decoded_sentences = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

# Post-process the decoded text for the target language.
translations = ip.postprocess_batch(decoded_sentences, lang=tgt_lang)

# Print each source sentence next to its translation.
for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")


# AI4BHARAT TTS

In [None]:
! pip install git+https://github.com/huggingface/parler-tts.git

In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

In [None]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the Indic Parler-TTS model and two tokenizers: one from the model
# repository (used for the prompt) and one from the checkpoint named in the
# model's text-encoder config (used for the voice description).
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

In [None]:
# `prompt` is the text to be spoken; `description` describes the desired voice.
prompt = "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।"
description = "A calm, warm female voice with medium pitch and a steady speed."

# Tokenise both texts into PyTorch tensors on the selected device.
description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

# Generate audio, move it to the CPU as a NumPy array with size-1 dimensions
# removed, and write it to a WAV file at the model's configured sampling rate.
generation = model.generate(input_ids=description_input_ids.input_ids, attention_mask=description_input_ids.attention_mask, prompt_input_ids=prompt_input_ids.input_ids, prompt_attention_mask=prompt_input_ids.attention_mask)
audio_arr = generation.cpu().numpy().squeeze()
sf.write("indic_hindi.wav", audio_arr, model.config.sampling_rate)

# AI4BHARAT STT

In [None]:
!pip install onnxruntime
# Install dependencies
!pip install transformers torchaudio

In [3]:

import torch
from transformers import AutoModel
import torchaudio

# Load model
model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load audio (replace with your file)
# The sample rate is stored as `sample_rate`, not `sr`: `sr` is the alias of the
# `speech_recognition` module imported earlier in this notebook, and rebinding
# it to an integer would break `recognize_speech`.
wav, sample_rate = torchaudio.load("/kaggle/input/tamil-dataset-90/New-Recording.wav")
wav = torch.mean(wav, dim=0, keepdim=True)  # average the channels into one
target_sr = 16000
if sample_rate != target_sr:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
    wav = resampler(wav)
wav = wav.to(device)

# Transcribe (choose the language code of the recording, e.g. "hi" for Hindi).
# NOTE(review): the file comes from a Tamil dataset path and decoding it as
# "hi" printed empty transcriptions, so "ta" is used here - confirm the
# language actually spoken in the recording.
language_code = "ta"
transcription_ctc = model(wav, language_code, "ctc")
print("CTC Transcription:", transcription_ctc)
transcription_rnnt = model(wav, language_code, "rnnt")
print("RNNT Transcription:", transcription_rnnt)


Fetching 403 files:   0%|          | 0/403 [00:00<?, ?it/s]

CTC Transcription: 
RNNT Transcription: 


# INDIC SEAMLESS

In [None]:
pip install torch torchaudio transformers datasets


In [1]:
import os

from huggingface_hub import login

# Never commit access tokens to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, `login` prompts for one.
# The token that used to be hard-coded here must be treated as leaked and revoked.
login(token=os.environ.get("HF_TOKEN"))

In [2]:
import torchaudio
from transformers import SeamlessM4Tv2ForSpeechToText
from transformers import SeamlessM4TTokenizer, SeamlessM4TFeatureExtractor

import torch

# Use the GPU when one is available instead of hard-coding "cuda", matching
# the device selection used in the other sections of this notebook.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the speech-to-text model, its feature extractor and its tokenizer.
model = SeamlessM4Tv2ForSpeechToText.from_pretrained("ai4bharat/indic-seamless").to(device)
processor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless")
tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless")

2025-05-19 08:03:59.625472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747641839.817068      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747641839.871395      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/139k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/9.91M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

In [18]:
# Load one clip from the Tamil folder of the dataset and resample it to 16 kHz.
audio, orig_freq = torchaudio.load("/kaggle/input/indian-languages-audio-dataset/Indian_Languages_Audio_Dataset/Tamil/10192.mp3")
audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)
# Place the features on whatever device the model lives on rather than a
# hard-coded "cuda", so the inputs always match the model.
audio_inputs = processor(audio, sampling_rate=16_000, return_tensors="pt").to(model.device)

# Generate with "hin" as the target language and decode the first sequence.
text_out = model.generate(**audio_inputs, tgt_lang="hin")[0].cpu().numpy().squeeze()
print(tokenizer.decode(text_out, clean_up_tokenization_spaces=True, skip_special_tokens=True))


कुंडुई की आंखें आज़वार्क कड़िया के साथ कुछ संगीतमय रूप में हैं।
