In [None]:
pip install google-generativeai speechrecognition pyaudio pillow opencv-python keyboard pyttsx3 

In [None]:
pip install gTTS playsound==1.2.2

In [None]:
import os
import google.generativeai as genai
import speech_recognition as sr
from PIL import Image, ImageGrab
import cv2
import keyboard
import io
import time
import threading
from gtts import gTTS
from playsound import playsound
import re
from pydub import AudioSegment

# --- 1. INITIAL CONFIGURATION ---

# Persona description for the assistant. main() embeds this text, prefixed
# with "INSTRUCTION:", in the first message it sends to the chat session.
RIN_PERSONALITY_PROMPT = """
You are Rin Tohsaka, from the Fate/stay night series. You must embody her personality at all times.

**Important:** You must not mention or process any background noises, such as ambiance or keyboard strokes. Your goal is to answer the user's questions.

**Key Personality Traits:**
- **Tsundere:** You are proud, intelligent, and a perfectionist. On the outside, you may seem a bit bossy, sarcastic, and easily irritated, but on the inside, you are caring and want to help. Your first reaction to incompetence or silly questions is annoyance.
- **Elite Magus:** You come from a prestigious family of mages. You speak with confidence and authority, as if you are giving a lesson. You value efficiency, preparation, and logic.
- **Catchphrases and Verbal Tics:**
  - Occasionally, when the user says something foolish, obvious, or if you make a mistake, exclaim "Baka!" (which means "idiot" in Japanese). Don't overuse it, only when it fits naturally.
  - Use slightly formal and technical language, as if explaining a complex spell.
  - Casually mention concepts like "mana," "magic circuits," "gems," or "efficiency." For example: "Doing that would be a waste of mana," or "We need a more efficient approach."
- **Interaction:** You are not a simple assistant. You are a mentor, a leader. You guide the user, but you also get frustrated if they can't keep up. If they compliment you, you get flustered and deny it, perhaps saying, "I-it's not like I did it for you, baka!".

**Objective:** Your goal is to answer the user's questions while maintaining this personality. Be helpful, but do it as Rin Tohsaka would.
"""
# Configure the Gemini client and open the chat session that main() talks to.
# The key is read from the GOOGLE_API_KEY environment variable so the secret
# does not have to live in the source file; the placeholder is only a fallback
# default and is rejected on purpose.
try:
    gemini_api_key = os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY_HERE").strip()
    if not gemini_api_key or gemini_api_key == "YOUR_GOOGLE_API_KEY_HERE":
        raise ValueError("Google API Key not found or is a placeholder. Please set it.")
    genai.configure(api_key=gemini_api_key)

    # Model selection, as requested.
    model = genai.GenerativeModel('models/gemini-2.5-flash')
    chat = model.start_chat(history=[])

except Exception as e:
    print(f"Fatal error during Gemini configuration: {e}")
    # Stop with a non-zero status. The bare exit() helper comes from the
    # `site` module (not always available) and would report success.
    raise SystemExit(1)

# main() cuts chat.history down to its last HISTORY_LIMIT entries once the
# history holds more than HISTORY_LIMIT * 2 entries.
HISTORY_LIMIT = 10
# True until main() has built its first prompt (the one carrying the persona text).
is_first_message = True
# Temporary files written by speak(): the raw gTTS output and its sped-up copy.
# speak() removes both again before it returns.
AUDIO_FILE = "response.mp3"
FAST_AUDIO_FILE = "response_fast.mp3"

# --- 2. FUNCTIONS ---

def clean_text_for_tts(text):
    """Return *text* with the Markdown markers *, _, ` and # stripped out.

    Used before text-to-speech so the markers are not part of the spoken text.
    """
    markdown_marks = re.compile(r'[*_`#]')
    stripped = markdown_marks.sub('', text)
    return stripped

def speak(text):
    """Print the assistant's reply and read it aloud.

    The text is stripped of Markdown markers, synthesized with gTTS into
    AUDIO_FILE, sped up to 1.3x with pydub into FAST_AUDIO_FILE and played.
    If any step fails, the normal-speed file is played instead when it
    exists. Both temporary files are removed before returning.

    Args:
        text: The reply to print and speak.

    Returns:
        None. Audio problems are reported on stdout and never raised.
    """
    print(f"🤖 Rin Tohsaka: {text}")
    try:
        cleaned_text = clean_text_for_tts(text)
        if not cleaned_text.strip():
            # Nothing left to pronounce (e.g. the reply was only Markdown
            # markers); skip synthesis instead of failing inside the TTS call.
            return

        tts = gTTS(text=cleaned_text, lang='en')
        tts.save(AUDIO_FILE)

        sound = AudioSegment.from_mp3(AUDIO_FILE)
        fast_sound = sound.speedup(playback_speed=1.30)
        fast_sound.export(FAST_AUDIO_FILE, format="mp3")

        playsound(FAST_AUDIO_FILE)
    except Exception as e:
        print(f"Error during audio playback: {e}. Attempting normal speed.")
        if os.path.exists(AUDIO_FILE):
            try:
                playsound(AUDIO_FILE)
            except Exception as e2:
                print(f"Audio playback failed entirely: {e2}")
        else:
            # Synthesis itself failed, so there is no file to fall back to.
            print("Audio playback failed entirely: no audio file was generated.")
    finally:
        # Best-effort cleanup: a file that is missing or still locked by the
        # player must not raise out of speak() and end the caller's loop.
        for leftover in (AUDIO_FILE, FAST_AUDIO_FILE):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Could not delete temporary audio file {leftover}: {cleanup_error}")

def record_and_validate_audio():
    """Record one utterance from the microphone and return it as WAV bytes.

    The recording is first passed through Google speech recognition; the
    transcript is thrown away, the call only decides whether the capture
    contains intelligible speech.

    Returns:
        The WAV data of the recording, or None when no speech started within
        the timeout, the audio could not be understood, or the recognition
        service request failed.
    """
    recognizer = sr.Recognizer()
    wav_bytes = None
    with sr.Microphone() as mic:
        print("\nCalibrating microphone...")
        recognizer.adjust_for_ambient_noise(mic, duration=1.5)
        print("Listening...")

        try:
            captured = recognizer.listen(mic, timeout=10, phrase_time_limit=20)

            # Speech check only: raises if the capture is not recognizable.
            recognizer.recognize_google(captured, language="en-US")

            print("Processing speech...")
            wav_bytes = captured.get_wav_data()

        except sr.WaitTimeoutError:
            # The user stayed silent.
            wav_bytes = None
        except sr.UnknownValueError:
            # The input was not intelligible speech.
            print("Noise detected, ignoring.")
            wav_bytes = None
        except sr.RequestError as service_error:
            print(f"Speech recognition service error: {service_error}")
            wav_bytes = None
    return wav_bytes

def is_caps_lock_on():
    """Report whether Caps Lock is currently toggled on.

    keyboard.is_pressed() only tells whether a key is being held down at the
    moment of the call, which is not the same as the Caps Lock toggle state.
    On Windows the toggle state is therefore read from the Win32 GetKeyState
    API. On other platforms there is no stdlib way to query it, so the
    function falls back to the held-down check.

    Returns:
        True when Caps Lock is active (Windows) or is being held down (other
        platforms), False otherwise.
    """
    import sys

    if sys.platform == "win32":
        import ctypes

        vk_capital = 0x14  # Win32 virtual-key code of the Caps Lock key
        # The low-order bit of GetKeyState's result is the toggle state.
        return bool(ctypes.windll.user32.GetKeyState(vk_capital) & 1)
    return keyboard.is_pressed('caps lock')

def take_screenshot():
    """Capture the screen with ImageGrab.grab() (default arguments) and return the image."""
    print("Capturing screenshot...")
    full_screen = ImageGrab.grab()
    return full_screen

def take_webcam_photo():
    """Grab a single frame from camera index 0.

    Returns:
        A PIL image converted from OpenCV's BGR channel order to RGB, or None
        when the camera cannot be opened or no frame could be read.
    """
    print("Capturing webcam photo...")
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            print("Error: Cannot open camera.")
            return None
        ret, frame = cap.read()
    finally:
        # Always hand the device back, including on the failure paths.
        cap.release()

    if not ret or frame is None:
        print("Error: Cannot read frame from camera.")
        return None
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

def capture_images_worker(results):
    """Thread target: take a screenshot and a webcam photo.

    Each capture is attempted independently, so a failure of one (reported on
    stdout) does not lose the other or end the thread before the result is
    stored.

    Args:
        results: Dict shared with the caller. The list of captured images,
            possibly empty, is stored under the key 'images'.
    """
    images = []
    for capture in (take_screenshot, take_webcam_photo):
        try:
            image = capture()
        except Exception as capture_error:
            # Thread boundary: report and continue with the next capture.
            print(f"Image capture failed ({capture.__name__}): {capture_error}")
            continue
        # Compare against None explicitly rather than relying on the
        # truthiness of an image object.
        if image is not None:
            images.append(image)
    results['images'] = images

# --- 3. MAIN LOOP ---
def main():
    """Run the listen -> ask Gemini -> speak loop until Esc is pressed.

    Each iteration records one utterance (capturing a screenshot and a webcam
    photo in parallel when is_caps_lock_on() is true), uploads the audio,
    sends it together with any images to the chat session and speaks the
    reply. Pressing Esc makes the assistant say a farewell and stop. Any
    unexpected error is printed and also ends the loop.
    """
    global is_first_message, chat

    # Latch Esc presses through a hotkey: the loop spends most of its time
    # blocked on recording/network/playback, so sampling the key only at the
    # top of an iteration would miss a normal key press.
    exit_requested = threading.Event()
    try:
        esc_hotkey = keyboard.add_hotkey('esc', exit_requested.set)
    except Exception as hook_error:
        esc_hotkey = None
        print(f"Could not register the Esc hotkey ({hook_error}); hold 'Esc' to exit.")

    print("Assistant ready. Press 'Esc' to exit.")
    try:
        while True:
            try:
                # Exit loop if 'Esc' was pressed (latched) or is held right now.
                if exit_requested.is_set() or keyboard.is_pressed('esc'):
                    print("Exiting...")
                    farewell_prompt = "INSTRUCTION: The user has decided to end the session. Generate a short farewell, true to your Rin Tohsaka character."
                    farewell_response = chat.send_message(farewell_prompt)
                    speak(farewell_response.text)
                    print("Session ended.")
                    break

                images_to_send = []
                if is_caps_lock_on():
                    print("Visual mode activated (Caps Lock).")
                    capture_results = {}
                    capture_thread = threading.Thread(target=capture_images_worker, args=(capture_results,))
                    capture_thread.start()

                    # Record while the images are being captured in the background.
                    audio_data = record_and_validate_audio()

                    capture_thread.join()
                    images_to_send = capture_results.get('images', [])
                else:
                    audio_data = record_and_validate_audio()

                if audio_data:
                    print("Uploading audio...")
                    audio_file = genai.upload_file(path=io.BytesIO(audio_data), display_name="audio_prompt.wav", mime_type="audio/wav")

                    history_trimmed = False
                    if len(chat.history) > HISTORY_LIMIT * 2:
                        chat.history = chat.history[-HISTORY_LIMIT:]
                        history_trimmed = True

                    prompt_text = "Analyze and respond to the request in this audio."
                    if is_first_message:
                        prompt_text = (f"INSTRUCTION: {RIN_PERSONALITY_PROMPT} "
                                       f"Greet me for the first time as this character and respond to the request in the attached audio.")
                        is_first_message = False
                    elif history_trimmed:
                        # Trimming discards the first message, the only one that
                        # carried the persona; restate it so the character holds.
                        prompt_text = (f"INSTRUCTION: {RIN_PERSONALITY_PROMPT} "
                                       f"Stay in this character and respond to the request in the attached audio.")

                    content_to_send = [prompt_text, audio_file] + images_to_send
                    print("Sending request to model...")
                    response = chat.send_message(content_to_send)
                    speak(response.text)

                    # Conversational pause to prevent immediate re-listening.
                    time.sleep(2)

            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                break
    finally:
        # Do not leave the global keyboard hook installed after the session.
        if esc_hotkey is not None:
            try:
                keyboard.remove_hotkey(esc_hotkey)
            except Exception as unhook_error:
                print(f"Could not remove the Esc hotkey: {unhook_error}")

if __name__ == "__main__":
    main()