In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the GPU-enabled DialoGPT-large model.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model_llm = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large").to(device)
print("DialoGPT-large loaded on device:", device)

def generate_response(prompt):
    """
    Generate a response using DialoGPT-large on GPU with sampling parameters
    tuned for coherent output, and strip the prompt if it is repeated.
    """
    input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt").to(device)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
    output_ids = model_llm.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=100,
        do_sample=True,
        temperature=0.7,       # Lower temperature for more coherent, less random output
        top_p=0.9,             # Top-p sampling to focus on high-probability tokens
        repetition_penalty=1.1, # Moderate repetition penalty to avoid loops
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if response.lower().startswith(prompt.lower()):
        response = response[len(prompt):].strip()
    return response

print("generate_response function defined.")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
DialoGPT-large loaded on device: cuda
generate_response function defined.


In [2]:
import pyttsx3

# Initialize two TTS engines.
engine1 = pyttsx3.init()
engine2 = pyttsx3.init()

# List available voices.
voices = engine1.getProperty('voices')
print("Available voices:")
for idx, voice in enumerate(voices):
    print(f"{idx}: {voice.name} ({voice.id})")

# Set voices. If you have at least two voices, use them; otherwise, manually override Model 2's voice.
if len(voices) >= 2:
    engine1.setProperty('voice', voices[0].id)
    engine2.setProperty('voice', voices[1].id)
else:
    engine1.setProperty('voice', voices[0].id)
    # Example manual override (adjust as needed):
    # engine2.setProperty('voice', "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\TTS_MS_EN-US_ZIRA_11.0")
    engine2.setProperty('voice', voices[0].id)

print("Model 1 voice ID:", engine1.getProperty('voice'))
print("Model 2 voice ID:", engine2.getProperty('voice'))


Available voices:
0: Microsoft David Desktop - English (United States) (HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0)
1: Microsoft Zira Desktop - English (United States) (HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0)
Model 1 voice ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0
Model 2 voice ID: HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0


In [3]:
import time
import os
import tempfile
import uuid
from playsound import playsound

def speak_text(engine, text):
    """Speak the given text using the provided pyttsx3 engine."""
    engine.say(text)
    engine.runAndWait()

def capture_tts(speaker_engine, text):
    """
    Generate TTS output using speaker_engine by saving to a unique file,
    then transcribe that file using Whisper (without playing it).
    Returns the transcription.
    """
    unique_filename = f"tts_output_{uuid.uuid4().hex}.wav"
    speaker_engine.save_to_file(text, unique_filename)
    speaker_engine.runAndWait()
    time.sleep(1)  # Allow time for the file to be released.
    transcription = transcribe_audio(unique_filename)
    try:
        os.remove(unique_filename)
    except Exception as e:
        print(f"Warning: could not remove temporary file {unique_filename}: {e}")
    return transcription

print("speak_text and capture_tts functions defined.")


speak_text and capture_tts functions defined.


In [4]:
import wave
import pyaudio
import whisper

def record_audio(duration=10, rate=16000, chunk_size=1024):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=rate, input=True, frames_per_buffer=chunk_size)
    frames = []
    print("Recording...")
    for i in range(0, int(rate / chunk_size * duration)):
        data = stream.read(chunk_size)
        frames.append(data)
    print("Finished recording.")
    stream.stop_stream()
    stream.close()
    p.terminate()
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    wf = wave.open(temp_file.name, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(rate)
    wf.writeframes(b''.join(frames))
    wf.close()
    return temp_file.name

print("record_audio function defined.")

print("Loading Whisper model...")
whisper_model = whisper.load_model("base")
print("Whisper model loaded.")

def transcribe_audio(audio_filename):
    result = whisper_model.transcribe(audio_filename, language="en")
    return result["text"]

print("transcribe_audio function defined.")


record_audio function defined.
Loading Whisper model...
Whisper model loaded.
transcribe_audio function defined.


In [5]:
import numpy as np

def calculate_rms(audio_buffer):
    audio_buffer = audio_buffer.astype(np.float32)
    audio_buffer[np.isnan(audio_buffer) | np.isinf(audio_buffer)] = 0
    return np.sqrt(np.mean(np.square(audio_buffer)))

def save_audio(filename, audio_data, rate=16000):
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(rate)
        wf.writeframes(audio_data)
    return os.path.abspath(filename)

def partner_record_audio(duration, rate=16000, chunk_size=1600):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=rate,
                    input=True,
                    frames_per_buffer=chunk_size)
    frames = []
    for _ in range(0, int(rate / chunk_size * duration)):
        data = stream.read(chunk_size)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    audio_data = b''.join(frames)
    return audio_data

def monitor_and_record(scalar=0.5, duration_for_average=8, rate=16000, chunk_size=16000):
    print("Monitoring loudness...")
    initial_audio = partner_record_audio(duration_for_average, rate, chunk_size)
    average_loudness = calculate_rms(np.frombuffer(initial_audio, dtype=np.int16).astype(np.float32))
    print(f"Average loudness (RMS) in the room: {average_loudness}")
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=rate,
                    input=True,
                    frames_per_buffer=chunk_size)
    frames = []
    is_recording = False
    upThresh = (1.0 + scalar) * average_loudness
    lowThresh = (1.0 - scalar) * average_loudness
    print("upThresh:", upThresh)
    print("lowThresh:", lowThresh)
    print("Listening for loudness change...")
    try:
        while True:
            data = stream.read(chunk_size)
            audio_data = np.frombuffer(data, dtype=np.int16).copy()
            audio_data[np.isnan(audio_data) | np.isinf(audio_data)] = 32767
            current_loudness = calculate_rms(audio_data)
            print("Current loudness:", current_loudness)
            if not np.isnan(current_loudness):
                if current_loudness > upThresh and not is_recording:
                    print("Loudness exceeded threshold. Starting recording...")
                    is_recording = True
                    frames = [data]
                elif current_loudness < lowThresh and is_recording:
                    print("Loudness dropped below threshold. Stopping recording...")
                    is_recording = False
                    break
                elif is_recording:
                    frames.append(data)
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
    if frames:
        print("Saving recorded audio...")
        audio_data = b''.join(frames)
        saved_file = save_audio("recorded_audio.wav", audio_data, rate)
        print("Saved recorded audio to:", saved_file)
        return saved_file

print("Partner's dynamic recording functions defined.")


Partner's dynamic recording functions defined.


In [None]:
# ----- Main Conversation Loop -----

# Step 1: Capture the initial spoken prompt dynamically.
print("Please speak your initial prompt. Wait for the system to detect and record your speech...")
initial_audio_file = monitor_and_record(scalar=0.5, duration_for_average=8, rate=16000, chunk_size=16000)
initial_prompt = transcribe_audio(initial_audio_file)
os.remove(initial_audio_file)
print("Transcribed initial prompt:", initial_prompt)

# Step 2: Model 1 generates its first reply from the transcribed initial prompt.
response_text_1 = generate_response(initial_prompt)
print("Model 1 says:", response_text_1)
speak_text(engine1, response_text_1)
time.sleep(2)

# Use Model 1's first reply as the conversation context.
current_context = response_text_1

# Step 3: Conversation Loop: Exchange responses using file-captured TTS.
num_rounds = 10  # Adjust as desired.
for i in range(num_rounds):
    print(f"\n--- Conversation Round {i+1} ---")
    
    # Model 2 captures Model 1's current reply via file capture.
    transcription_model1 = capture_tts(engine1, current_context)
    print("Model 2 hears:", transcription_model1)
    
    # Model 2 generates its reply based on the captured text.
    response_text_2 = generate_response(transcription_model1)
    print("Model 2 says:", response_text_2)
    speak_text(engine2, response_text_2)
    time.sleep(2)
    
    # Model 1 captures Model 2's reply via file capture.
    transcription_model2 = capture_tts(engine2, response_text_2)
    print("Model 1 hears:", transcription_model2)
    
    # Model 1 generates its new reply.
    response_text_1 = generate_response(transcription_model2)
    print("Model 1 says:", response_text_1)
    speak_text(engine1, response_text_1)
    time.sleep(2)
    
    # Update conversation context for the next round.
    current_context = response_text_1


Please speak your initial prompt. Wait for the system to detect and record your speech...
Monitoring loudness...
Average loudness (RMS) in the room: 52.58523941040039
upThresh: 78.87785911560059
lowThresh: 26.292619705200195
Listening for loudness change...
Current loudness: 88.90449
Loudness exceeded threshold. Starting recording...
Current loudness: 97.04795
Current loudness: 417.6359
Current loudness: 1627.999
Current loudness: 679.0152
Current loudness: 145.56215
Current loudness: 38.05188
Current loudness: 37.442146
Current loudness: 36.069153
Current loudness: 502.60876
Current loudness: 978.72437
Current loudness: 624.9519
Current loudness: 581.0121
Current loudness: 597.4029
Current loudness: 413.85434
Current loudness: 127.341835
Current loudness: 41.100338
Current loudness: 55.496235
Current loudness: 8.75051
Loudness dropped below threshold. Stopping recording...
Saving recorded audio...
Saved recorded audio to: d:\usuhackathon2025\Datathon\CelestialChoreography\CelestialCho

  a = scaled_dot_product_attention(


Transcribed initial prompt:  Yeah, we're all on 312. Use a pip on, use pip to uninstall whisper again if you're having trouble.
Model 1 says: , I'm pretty sure that's how it works.'Tis the best way of doing things for me anyway! 306 Warlock with high charisma and low luck... haha :D psn is same as username? oops lolwutupooguymfwocuteymee 8pypi michiganen 1st werido1s

--- Conversation Round 1 ---
Model 2 hears:  I'm pretty sure that's how it works Tis the best way of doing things for me anyway. 306 Warlock with high charisma and low luck, ha ha, laughing face, PSN is same as username? Oops low you to put Gimphwakity me 8PYP I Michigan in first where I don't want S.
Model 2 says: .? edit : spelling mistakes are bad, btw! sry'thx m8s lt 3 u 2o7m4mee
Model 1 hears:  Edit, spelling mistakes are bad, BTW. SRED-THX-MH-S-L-T-3-U-207-M for me.
Model 1 says: some of us don't know the difference. s EDIT : Thanks! I didn t notice that it was a typo and not just my phone keyboard haha...? HAHaha s