In [1]:
import torch
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import re
import gradio as gr
import os
import torchvision
import random
import cProfile
import subprocess
import edge_tts
import snakeviz
import asyncio
import gprof2dot
import whisper
import nest_asyncio
import transformers
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoTokenizer, AutoModelForCausalLM
# Raw string is required here: in a normal literal "\b" and "\f" are the
# backspace and form-feed escapes, which silently corrupted this Windows path.
# NOTE(review): confirm which library actually reads FFMPEG_PATH.
os.environ['FFMPEG_PATH'] = r"C:\FFmpeg\bin\ffmpeg.exe"
# Patch asyncio so an event loop can be re-entered; presumably needed because
# the notebook kernel already runs its own loop — TODO confirm.
nest_asyncio.apply()

In [2]:
# Hugging Face model id of the causal language model used to generate replies.
model_name = "microsoft/phi-2"


# Load the tokenizer and the model; model files are cached under ./model_docs.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # Allows model code shipped with the hub repo to run locally; only
        # appropriate for repositories you trust.
        trust_remote_code=True, 
        # Let the loader decide where each weight is placed.
        device_map="auto", 
        cache_dir="./model_docs"
       
)
# Put the model in evaluation (inference) mode.
model.eval() 
# NOTE(review): this dict is not referenced by any visible cell (the calls
# above and below pass the literal "auto") — looks like a leftover; confirm
# before removing.
device_map = {"module": "cuda:0", "module.encoder": "cuda:1"}
# Sampling text-generation pipeline: at most 100 new tokens, temperature 0.7,
# top-k 10, one sequence per prompt.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=100,
    temperature=0.7,
    truncation=True,
    do_sample=True, 
    top_k=10, 
    num_return_sequences=1, 
    eos_token_id=tokenizer.eos_token_id, 
    # The EOS token doubles as the padding token.
    pad_token_id=tokenizer.eos_token_id,
    # NOTE(review): evaluates to 0 on a machine without CUDA devices — confirm
    # that is the intended worker count.
    num_workers=torch.cuda.device_count(),
    clean_up_tokenization_spaces=True
)


# Prompt wrapper with a single variable, `input`, holding the user's request.
template = PromptTemplate(input_variables=['input'], 
template="Generate a response for the following input: {input}"
       )

# Expose the transformers pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipeline)

# Chain invoked by LLM_Response: renders the template, then runs the LLM.
chain = LLMChain(llm=llm, prompt=template)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu and disk.
  warn_deprecated(
  warn_deprecated(


In [3]:
def VAD(audio_file):
    """Keep only the speech portions of an audio file (Silero VAD).

    The audio is loaded at its native sample rate, resampled to 16 kHz and
    passed through the Silero voice-activity detector. The detected speech
    segments are concatenated, written to 'compiled_resampled_audio.wav',
    plotted alongside the original and resampled signals, and returned.

    Args:
        audio_file: Path of the audio to process (anything ``librosa.load``
            accepts).

    Returns:
        numpy.ndarray: The concatenated speech samples at 16 kHz.

    Raises:
        ValueError: If the detector finds no speech in the audio.
    """
    target_sr = 16000

    # Silero recommends a single thread, but the setting is process-wide:
    # restore the previous value so later CPU inference is not throttled.
    previous_threads = torch.get_num_threads()
    torch.set_num_threads(1)
    try:
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
        # Only the first helper of the utils tuple is needed here.
        get_speech_timestamps = utils[0]

        wav, original_sample_rate = librosa.load(audio_file, sr=None)
        print("Original sample rate:", original_sample_rate)

        resampled_wav = librosa.resample(wav, orig_sr=original_sample_rate, target_sr=target_sr)
        print("Resampled to 16 kHz")

        speech_timestamps = get_speech_timestamps(resampled_wav, model, sampling_rate=target_sr)
    finally:
        torch.set_num_threads(previous_threads)
    print("Speech timestamps:", speech_timestamps)

    if not speech_timestamps:
        raise ValueError("No speech detected in the audio file.")

    # Timestamps are sample indices into the 16 kHz signal. Slicing and
    # concatenating avoids building a Python list one sample at a time.
    compiled_wav_resampled = np.concatenate(
        [resampled_wav[stamp['start']:stamp['end']] for stamp in speech_timestamps]
    )

    # The signal is already at 16 kHz, so no further resampling is needed.
    sf.write('compiled_resampled_audio.wav', compiled_wav_resampled, target_sr)
    print("Compiled and resampled audio saved as 'compiled_resampled_audio.wav'")

    panels = (
        (wav, 'Original Audio'),
        (resampled_wav, 'Resampled Audio (16 kHz)'),
        (compiled_wav_resampled, 'Compiled and Resampled Audio (16 kHz)'),
    )
    fig, axes = plt.subplots(3, 1, figsize=(15, 8))
    for ax, (signal, title) in zip(axes, panels):
        ax.plot(signal)
        ax.set_title(title)
    fig.tight_layout()
    plt.show()
    # Release the figure; otherwise every request leaves one open in memory.
    plt.close(fig)
    return compiled_wav_resampled

In [4]:
# Loaded Whisper models keyed by model name, so repeated calls reuse the
# weights instead of reloading them on every request.
_WHISPER_MODEL_CACHE = {}


def STT(audio_file, model_name="base"):
    """Transcribe speech to text with Whisper.

    Args:
        audio_file: Audio to transcribe; forwarded unchanged to
            ``model.transcribe`` (the pipeline passes the array returned by
            VAD).
        model_name: Whisper model to use. Defaults to "base", the model the
            notebook originally hard-coded.

    Returns:
        str: The transcription text, exactly as Whisper returns it.
    """
    model = _WHISPER_MODEL_CACHE.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODEL_CACHE[model_name] = model

    transcript = model.transcribe(audio_file)["text"]
    print(transcript)
    return transcript

In [5]:
def LLM_Response(prompt):
    """Ask the language-model chain for a short answer and clean it for TTS.

    Periods are removed from the prompt, a brevity instruction is appended,
    and the chain is invoked. The chain's text echoes the rendered prompt, so
    the prompt, the template wording and a fixed set of labels/symbols are
    removed. Whitespace is then normalised and leading punctuation left
    behind by those removals is stripped, so the synthesizer does not receive
    residue such as " .  The capital ...".

    Args:
        prompt: The user's request; converted with ``str``.

    Returns:
        str: The cleaned response text.
    """
    prompt = str(prompt).replace('.', '')
    promptl = prompt + " Answer in two sentence or less"

    response = chain.invoke(promptl)

    text_data = response['text']
    print(text_data)

    # Removed in this order: the echoed prompt first, then template wording,
    # then labels and symbols the model tends to emit.
    fragments_to_remove = (
        promptl,
        prompt,
        'Generate a response for the following input',
        ':',
        '-',
        'Input',
        'Output',
        'INPUT',
        'OUTPUT',
        'Answer',
        'Response',
        '#',
        'Generating an output of ',
    )
    for fragment in fragments_to_remove:
        text_data = text_data.replace(fragment, '')

    # Collapse the gaps the removals leave behind, then drop any punctuation
    # stranded at the very start of the answer.
    text_data = ' '.join(text_data.split()).lstrip('.,;!? ')

    print(text_data)
    return text_data

In [6]:
async def STT_Audio(TEXT, gender, rate, volume, pitch, max_retries=3):
    """Synthesize TEXT to speech with edge-tts and save it to a file.

    A voice is picked at random from the list for the requested gender. When
    edge-tts reports that no audio was received, the request is retried up to
    ``max_retries`` times, with a one-second pause between attempts.

    Args:
        TEXT: Text to speak.
        gender: "male" or "female".
        rate: Speaking-rate offset such as "+40%"; empty/None means "+0%".
        volume: Volume offset such as "-28%"; empty/None means "+0%".
        pitch: Pitch offset such as "-39Hz"; empty/None means "+0Hz".
        max_retries: Maximum number of synthesis attempts.

    Returns:
        str: Path of the saved audio file ("output_final.wav").

    Raises:
        ValueError: If ``gender`` is not "male" or "female".
        RuntimeError: If every attempt ends without audio.
    """
    voices = {
        "male": ["en-US-ChristopherNeural", "en-US-GuyNeural", "en-US-SteffanNeural"],
        "female": ["en-US-EmmaNeural", "en-US-JennyNeural", "en-US-MichelleNeural"]
    }

    if gender not in tuple(voices):
        raise ValueError("Invalid gender. Please enter 'male' or 'female'.")

    def _or_default(value, default):
        # edge-tts rejects empty strings, which is what an untouched form
        # field delivers; fall back to the neutral setting instead.
        cleaned = "" if value is None else str(value).strip()
        return cleaned or default

    rate = _or_default(rate, "+0%")
    volume = _or_default(volume, "+0%")
    pitch = _or_default(pitch, "+0Hz")

    voice = random.choice(voices[gender])
    # NOTE(review): confirm the format edge-tts writes matches the .wav
    # extension used here.
    output_file = "output_final.wav"

    last_error = None
    for attempt in range(max_retries):
        try:
            communicate = edge_tts.Communicate(TEXT, voice, rate=rate, volume=volume, pitch=pitch)
            await communicate.save(output_file)
            print(f"Audio saved to {output_file}")
            return output_file
        except edge_tts.exceptions.NoAudioReceived as e:
            last_error = e
            if attempt + 1 < max_retries:
                print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
                await asyncio.sleep(1)
            else:
                # Final attempt: nothing left to wait for.
                print(f"Attempt {attempt + 1} failed: {e}.")
    raise RuntimeError("Failed to generate audio after multiple attempts") from last_error
    

In [7]:



async def pipeline_m(audio_file, gender, rate, volume, pitch):
    """Run the whole voice pipeline for one recording.

    Stages, in order: VAD trims the audio to speech, STT transcribes it,
    LLM_Response generates a reply, and STT_Audio speaks that reply using the
    requested gender, rate, volume and pitch.

    Returns:
        Whatever STT_Audio returns (the path of the synthesized audio).
    """
    speech_samples = VAD(audio_file)
    transcript = STT(speech_samples)
    reply_text = LLM_Response(transcript)
    return await STT_Audio(reply_text, gender, rate, volume, pitch)


async def process_audio(audio, gender, rate, volume, pitch):
    """Gradio callback: forward the form values to pipeline_m and return its result."""
    result_path = await pipeline_m(audio, gender, rate, volume, pitch)
    return result_path


# Build and launch the web UI. Every control starts with a valid value:
# an unselected dropdown yields None and untouched textboxes yield "", and
# edge-tts rejects empty rate/volume/pitch strings, so a form submitted
# as-is would otherwise always fail.
# NOTE(review): share=True publishes the app on a public gradio.live URL;
# confirm that exposure is intended.
gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Record Audio"),  # Accepts audio input from microphone or file
        gr.Dropdown(choices=["male", "female"], value="male", label="gender"),  # Gender selection
        gr.Textbox(value="+0%", label="rate (+40%, -18%)"),  # Rate adjustment as signed percentage
        gr.Textbox(value="+0%", label="volume (+25%, -28%)"),  # Volume adjustment as signed percentage
        gr.Textbox(value="+0Hz", label="pitch (-39Hz)")  # Pitch adjustment in signed Hz
    ],
    outputs=gr.Audio(type="filepath"),  # Plays the synthesized audio file
    theme='freddyaboulton/dracula_revamped'  # Optional theme
).launch(server_port=6003, share=True)



Running on local URL:  http://127.0.0.1:6003
Running on public URL: https://5fa9024e89d1d2bbcd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\uvicorn\protocols\http\h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\uvicorn\middleware\proxy_headers.py", line 70, in __call__
    return await self.app(scope, receive, send)
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\starlette\applica

Original sample rate: 44100
Resampled to 16 kHz
Speech timestamps: [{'start': 28704, 'end': 56288}]
Compiled and resampled audio saved as 'compiled_resampled_audio.wav'


  plt.show()


 What is the capital of India?
Generate a response for the following input:  What is the capital of India? Answer in two sentence or less. Response: The capital of India is New Delhi. It is a city located in the northern part of the country.

 .  The capital of India is New Delhi. It is a city located in the northern part of the country.

Audio saved to output_final.wav
