In [1]:
# Import necessary libraries
# import sounddevice as sd
import scipy.io.wavfile
import os
from faster_whisper import WhisperModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from TTS.api import TTS
import sounddevice as sd
from scipy.io.wavfile import write, read
import numpy as np
# 🔹 Initialize ASR model (Whisper)
asr_model = WhisperModel("tiny.en", device="cpu")  # Change to "cuda" for GPU
print("ASR Model Loaded")

# 🔹 Initialize LLM model (Phi-2)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")  # Make sure to download model locally
llm_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", device_map="cpu")
print("LLM Model Loaded")

# 🔹 Initialize TTS model (Coqui TTS)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
print("TTS Model Loaded")

def record_audio(filename="input.wav", duration=5, samplerate=16000, input_device=5, output_device=9):
    try:
        print(f"🎙️ Recording from device {input_device}...")
        sd.default.device = (input_device, output_device)
        sd.default.samplerate = samplerate
        sd.default.channels = 1

        recording = sd.rec(int(duration * samplerate), dtype='int16')
        sd.wait()

        write(filename, samplerate, recording)
        print(f"✅ Saved to {filename}")

        # Playback
        print("🔊 Playing back...")
        _, data = read(filename)
        sd.play(data, samplerate=samplerate)
        sd.wait()
        print("✅ Playback complete.")

    except Exception as e:
        print(f"❌ Error: {e}")



# 🔹 Function to transcribe audio to text using the ASR model
def transcribe_audio(audio_path="input.wav"):
    segments, _ = asr_model.transcribe(audio_path)
    transcript = " ".join([seg.text for seg in segments])
    print(f"👂 Transcribed: {transcript}")
    return transcript

# 🔹 Function to generate a response using the LLM model
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")  # Use "cuda" for GPU
    outputs = llm_model.generate(inputs.input_ids, max_new_tokens=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"🤖 LLM Response: {response}")
    return response

# 🔹 Function to convert text to speech and save the output
def speak_text(text, filename="output.wav"):
    tts.tts_to_file(text=text, file_path=filename)
    os.system(f"aplay {filename}")  # On Linux/macOS, use pyaudio or sounddevice on Windows
    print(f"🔊 Speaking: {text}")

# 🔁 Main loop to keep the system running
def main():
    while True:
        # 1. Record audio
        record_audio(duration=10)  # 5 seconds recording duration

        # 2. Transcribe audio
        transcript = transcribe_audio()

        # 3. Generate response
        response = generate_response(transcript)

        # 4. Speak the response
        
        speak_text(response)

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


ASR Model Loaded


Loading checkpoint shards: 100%|██████████████████| 2/2 [00:07<00:00,  3.90s/it]


LLM Model Loaded
 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:Tr

  WeightNorm.apply(module, name, dim)


 > Discriminator Model: hifigan_discriminator
Removing weight norm...
TTS Model Loaded
🎙️ Recording from device 5...
✅ Saved to input.wav
🔊 Playing back...
✅ Playback complete.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


👂 Transcribed:  Hey, what was this, sir?  What's going on with the hello?  Hello?  Oh, my God.
🤖 LLM Response:  Hey, what was this, sir?  What's going on with the hello?  Hello?  Oh, my God.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so
 > Text splitted to sentences.
['Hey, what was this, sir?', "What's going on with the hello?", 'Hello?', 'Oh, my God.', "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so sorry.", "I'm so"]
   > Decoder stopped with `max_decoder_steps` 10000
 > Processing time: 174.2113823890686
 > Real-time factor: 1.1457665151685597


Playing WAVE 'output.wav' : Signed 16 bit Little Endian, Rate 22050 Hz, Mono
aplay: set_params:1358: Channels count non available


🔊 Speaking:  Hey, what was this, sir?  What's going on with the hello?  Hello?  Oh, my God.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so sorry.  I'm so
🎙️ Recording from device 5...
✅ Saved to input.wav
🔊 Playing back...
✅ Playback complete.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


👂 Transcribed:  Before I try it out, guys, still your account.  Unfortunately, yes.
🤖 LLM Response:  Before I try it out, guys, still your account.  Unfortunately, yes.  You can't use the same account for multiple games.  You'll have to create a new account.
#*If you want to use the same account for multiple games, you can create a new account for each game.  
#Click {{MacButton|Create Account}}.  It's the blue button in the middle of the page.  This creates a new account.
#Enter your email address.  Use the field at the top of the page to enter your email
 > Text splitted to sentences.
['Before I try it out, guys, still your account.', 'Unfortunately, yes.', "You can't use the same account for multiple games.", "You'll have to create a new account.", '#*If you want to use the same account for multiple games, you can create a new account for each game.', '#Click {{MacButton|Create Account}}.', "It's the blue button in the middle of the page.", 'This creates a new account.', '#Enter you

KeyboardInterrupt: 

In [1]:
import sounddevice as sd
from scipy.io.wavfile import write
from faster_whisper import WhisperModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# 🔹 Load models once
asr_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
llm_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")

def record_audio(filename="input.wav", duration=4, samplerate=16000, input_device=5):
    print(f"🎙️ Recording from device {input_device}...")
    sd.default.device = input_device
    sd.default.samplerate = samplerate
    sd.default.channels = 1

    audio = sd.rec(int(duration * samplerate), dtype='int16')
    sd.wait()
    write(filename, samplerate, audio)
    print(f"✅ Audio saved to {filename}")

def transcribe_audio(audio_path="input.wav"):
    segments, _ = asr_model.transcribe(audio_path)
    return " ".join([seg.text for seg in segments])

def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = llm_model.generate(inputs.input_ids, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    while True:
        record_audio(duration=5)
        prompt = transcribe_audio()
        print(f"👂 You said: {prompt}")
        reply = generate_response(prompt)
        print(f"🤖 LLM: {reply}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:07<00:00,  3.74s/it]


🎙️ Recording from device 5...
✅ Audio saved to input.wav


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


👂 You said:  What is the population of United States?
🤖 LLM:  What is the population of United States?
Answer: The population of United States is 331 million.

🎙️ Recording from device 5...
✅ Audio saved to input.wav


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


👂 You said:  We're going to tell them all that we're going to get back.  We're going to get that we're going to get back.


KeyboardInterrupt: 