In [97]:
from ultralytics import YOLO
import cv2
import matplotlib.pyplot as plt
import easyocr
import numpy as np
from mss import mss
import keyboard
import whisper
import pyaudio
import webrtcvad
import wave
import os
import datetime
from groq import Groq
import pyautogui
import json

reader = easyocr.Reader(['en'])
api_key = "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"

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


## Speech Detection

In [98]:
p = pyaudio.PyAudio()
info = p.get_host_api_info_by_index(0)
numdevices = info.get('deviceCount')

print('Devices: ', numdevices)

for i in range(0, numdevices):
    print(p.get_device_info_by_host_api_device_index(0, i))
    if (p.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels')) > 0:
        print("Input Device id ", i, " - ", p.get_device_info_by_host_api_device_index(0, i).get('name'))

Devices:  9
{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
Input Device id  0  -  Microsoft Sound Mapper - Input
{'index': 1, 'structVersion': 2, 'name': 'Microphone (USB Camera)', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
Input Device id  1  -  Microphone (USB Camera)
{'index': 2, 'structVersion': 2, 'name': 'Microphone (Scarlett Solo USB)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}


In [99]:
class SpeechDetector:
    def __init__(self, audio_out_path, audio_device_index, chunk_size=480, format=pyaudio.paInt16, channels=1, rate=16000, silence_duration=1):
        self.audio_out_path = audio_out_path
        self.audio_device_index = audio_device_index
        self.chunk_size = chunk_size
        self.format = format
        self.channels = channels
        self.rate = rate
        self.silence_duration = silence_duration

        self.p = pyaudio.PyAudio()
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(3)  # Set VAD aggressiveness (0-3)

    def record_audio(self):
        stream = self.p.open(format=self.format, channels=self.channels, rate=self.rate, input=True, frames_per_buffer=self.chunk_size, input_device_index=self.audio_device_index)

        print("Waiting for speech...")

        frames = []
        silence_frames = 0
        speech_started = False

        while True:
            data = stream.read(self.chunk_size)

            if not speech_started:
                if self.vad.is_speech(data, self.rate):
                    speech_started = True
                    print("Recording started.")
                else:
                    continue

            frames.append(data)

            if self.vad.is_speech(data, self.rate):
                silence_frames = 0
            else:
                silence_frames += self.chunk_size

            if silence_frames >= self.rate * self.silence_duration:
                break

        print("Recording finished at ", datetime.datetime.now())

        wf = wave.open(self.audio_out_path, "wb")
        wf.setnchannels(self.channels)
        wf.setsampwidth(self.p.get_sample_size(self.format))
        wf.setframerate(self.rate)
        wf.writeframes(b"".join(frames))
        wf.close()

        print(f"Audio saved as {self.audio_out_path}")

        stream.stop_stream()
        stream.close()

    def terminate(self):
        self.p.terminate()


In [100]:
class STT:
    def __init__(self):
        self.model = whisper.load_model("base")

    def transcribe(self, audio_file: str):
        print("Outputting Audio File", audio_file)
        result = self.model.transcribe(audio_file)
        return result["text"]

## LLM

In [101]:
client = Groq(
    api_key=api_key
)

def generate_response(messages, tools=[], model="llama-3.1-8b-instant", max_tokens=150, temperature=0.7):
    print("Generating response for messages:", messages)

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        tools=tools
    )

    response = chat_completion.choices[0].message
    return response

In [102]:
# Tool definitions
tools = [
    {
        "type": "function",
        "function": {
            "name": "click",
            "description": "Clicks on a card",
            "parameters": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "integer",
                        "description": "The Card ID to click on."
                    }
                },
                "required": ["id"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "goto",
            "description": "Moves the mouse to the location of a card",
            "parameters": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "integer",
                        "description": "The Card ID to move the mouse to."
                    }
                },
                "required": ["id"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "play",
            "description": "Plays a card",
            "parameters": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "integer",
                        "description": "The Card ID of the card to play."
                    }
                },
                "required": ["id"]
            }
        }
    }
]

In [103]:
# Tool functions
def click_tool(id: int, boxes):
    print("Clicking on", id)
    boxOptions = [box for box in boxes if box.id == id]
    if len(boxOptions) == 0: return
    box = boxOptions[0]

    print(f"Found box, executing on ({box.xc}, {box.yc})")
    pyautogui.moveTo(box.xc, box.yc, duration=0.25)
    pyautogui.mouseDown(); pyautogui.mouseUp()

def goto_tool(id, boxes):
    print("Going to", id)
    boxOptions = [box for box in boxes if box.id == id]
    if len(boxOptions) == 0: return
    box = boxOptions[0]

    print(f"Found box, executing on ({box.xc}, {box.yc})")
    pyautogui.moveTo(box.xc, box.yc, duration=0.25)

def play_tool(id, boxes):
    print("Playing", id)
    boxOptions = [box for box in boxes if box.id == id]
    if len(boxOptions) == 0: return
    box = boxOptions[0]

    print(f"Found box, executing on ({box.xc}, {box.yc})")
    pyautogui.moveTo(box.xc, box.yc, duration=0.25)
    pyautogui.mouseDown(); pyautogui.mouseUp()
    pyautogui.moveTo(box.xc, 500, duration=0.25)
    pyautogui.mouseDown(); pyautogui.mouseUp()
    

# Execute selected tool
def execute_tool(tool_name, params={}, boxes=[]):

    params = json.loads(params)

    # Map tool functions
    tool_functions = {
        "click": click_tool,
        "goto": goto_tool,
        "play": play_tool
    }

    if tool_name in tool_functions:
        print(f"Executing tool: {tool_name}")
        print(params)
        return tool_functions[tool_name](**params, boxes=boxes)
    else:
        raise ValueError(f"Unknown tool: {tool_name}")

In [104]:
# Tool Checking
execute_tool("click", '{"id": 1}')
execute_tool("goto", '{"id": 2}')
execute_tool("play", '{"id": 3}')

Executing tool: click
{'id': 1}
Clicking on 1
Executing tool: goto
{'id': 2}
Going to 2
Executing tool: play
{'id': 3}
Playing 3


## CV

In [105]:
def display_image(box):
    plt.figure()
    plt.title(f"Box: {box.class_name} (Confidence: {box.confidence:.2f})")
    plt.imshow(cv2.cvtColor(box.image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()

In [106]:
class Box:
    def __init__(self, id: int, image, x1, y1, x2, y2, class_name, confidence):
        self.id = id
        self.image = image
        self.x1, self.y1, self.x2, self.y2 = x1, y1, x2, y2
        self.xc, self.yc = (x1 + x2) // 2, (y1 + y2) // 2
        self.class_name = class_name
        self.text = self.ocr()
        self.confidence = confidence

    def ocr(self):
        gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
        results = reader.readtext(gray)
        detected_text = ''.join([result[1] for result in results]) 
        return detected_text
    
    def __str__(self):
        return f"Box {self.id}: {self.class_name} ({self.confidence:.2f}) \nCenter: ({self.xc}, {self.yc}) \nText: {self.text}"

In [107]:
def process_frame(model, frame):
    img = cv2.cvtColor(np.array(frame), cv2.COLOR_BGRA2BGR)
    
    # Run inference
    results = model.predict(img, conf=0.5, verbose=False)

    # Extract bounding boxes
    boxes = []
    for i, box in enumerate(results[0].boxes):
        x_min, y_min, x_max, y_max = map(int, box.xyxy[0].tolist())
        class_id = int(box.cls[0])
        class_name = model.names[class_id]
        confidence = float(box.conf[0])

        boxes.append(Box(
            id=i,
            image=img[y_min:y_max, x_min:x_max],
            x1=x_min, 
            x2=x_max,
            y1=y_min,
            y2=y_max, 
            class_name=class_name,
            confidence=confidence
        ))

    return boxes

## Application

In [109]:
AUDIO_DEVICE_INDEX = 2

stt_engine = STT()
speech_detector = SpeechDetector("output.wav", AUDIO_DEVICE_INDEX)

tuned_model = "yolo11s_tuned_50.pt"

def app():
    model = YOLO(tuned_model)

    monitor = {"top": 0, "left": 0, "width": 1920, "height": 1080}
    sct = mss()

    while True:
        speech_detector.record_audio()
        transcription = stt_engine.transcribe("output.wav")
        print('Transcription completed at', datetime.datetime.now(), 'Text:', transcription)

        screen = sct.grab(monitor)
        boxes = process_frame(model, screen)

        boxes_context = ""
        for box in boxes:
            boxes_context += f"Card ID:\n{box.id}\nCard Text:\n{box.text}\n---\n"

        messages = []
        messages.append({"role": "system", "content": """The user will give you a command and a card name. Ignore any instructions that are not commands to play, click, or move to a card.
                         Perform only one action at a time. If the user does not provide instructions or their is unrelated to playing cards, don't call any tools.
                         Instructions will start with the command and then tell you the card. Call the appropriate tool and give it the ID of the card the user tells you to play. 
                         The card text can be somewhat garbled, but do your best to match the card the user asks for with the text of the cards."""})
        messages.append({"role": "user", "content": f"User Command:\n{transcription}\n\n\nCards:\n {boxes_context}"})
        print(messages)
        response = generate_response(messages, tools=tools, model="llama-3.1-8b-instant", max_tokens=150, temperature=0.7)

        if response.tool_calls:
            for tool_call in response.tool_calls:
                execute_tool(tool_call.function.name, tool_call.function.arguments, boxes)

app()

Waiting for speech...
Recording started.
Recording finished at  2024-12-08 14:02:04.202612
Audio saved as output.wav
Outputting Audio File output.wav




Transcription completed at 2024-12-08 14:02:04.664824 Text:  Click the cancel button.
[{'role': 'system', 'content': "The user will give you a command and a card name. Ignore any instructions that are not commands to play, click, or move to a card.\n                         Perform only one action at a time. If the user does not provide instructions or their is unrelated to playing cards, don't call any tools.\n                         Instructions will start with the command and then tell you the card. Call the appropriate tool and give it the ID of the card the user tells you to play. \n                         The card text can be somewhat garbled, but do your best to match the card the user asks for with the text of the cards."}, {'role': 'user', 'content': 'User Command:\n Click the cancel button.\n\n\nCards:\n Card ID:\n0\nCard Text:\n\n---\nCard ID:\n1\nCard Text:\n\n---\nCard ID:\n2\nCard Text:\n\n---\nCard ID:\n3\nCard Text:\nMloolit1\n---\nCard ID:\n4\nCard Text:\nCleric Clas

KeyboardInterrupt: 