# Jarvis
This notebook implements a multi-modal chatbot, following a YouTube tutorial, that uses Groq for chat completions and Google Gemini for describing images.

In [39]:
from groq import Groq
import cv2
import pyperclip
from PIL import ImageGrab, Image
import google.generativeai as genai
import os

# Open video capture device 0 once when this cell runs; web_cam_capture() reads
# frames from this shared handle.
web_cam = cv2.VideoCapture(0)

# System prompt placed at the start of the chat history sent to the Groq chat model.
# NOTE(review): the text contains typos ('hihgly', 'vocie propmt', 'usefil',
# 'generated test'). It is sent to the model verbatim, so correcting it changes
# what the model receives -- confirm before editing.
sys_msg = (
    'You are a multi-modal AI voice assistant. Your user may or may not have attached a photo for context '
    '(either a screenshot or a webcam capture). Any photo has already been processed into a hihgly detailed '
    'text prompt that will be attached to their transcribed vocie propmt. Generate the most usefil and '
    'factual response possible, carefully considering all previous generated test in your response before '
    'adding new tokens to the response. Do not expect or request images, just use the context if added. '
    'Use all of the context of this conversation so your response is relevant to the conversation. Make '
    'your responses clear and concise, avoiding any verbosity.'
)

# Running conversation history; groq_prompt() appends every user message and
# every model reply to this list, so it grows for the lifetime of the kernel.
convo = [{'role': 'system', 'content': sys_msg}]
# Configure the Gemini SDK with the key read from the GOOGLE_API_KEY
# environment variable (os.environ.get yields None when it is unset).
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Generation options handed to the vision model below.
generation_config = dict(
    temperature=0.7,
    top_p=1,
    top_k=1,
    max_output_tokens=2048,
)

# Every listed harm category is given the same 'BLOCK_NONE' threshold.
safety_settings = [
    {'category': harm_category, 'threshold': 'BLOCK_NONE'}
    for harm_category in (
        'HARM_CATEGORY_HARASSMENT',
        'HARM_CATEGORY_HATE_SPEECH',
        'HARM_CATEGORY_SEXUALLY_EXPLICIT',
        'HARM_CATEGORY_DANGEROUS_CONTENT',
    )
]

# Model used by vision_prompt() to turn an image into a text description.
vision_model = genai.GenerativeModel(
    'gemini-1.5-flash-latest',
    generation_config=generation_config,
    safety_settings=safety_settings,
)

# Client used by groq_prompt() and function_call(); constructed with no arguments.
groq_client = Groq()

def groq_prompt(prompt, img_context=None):
    """Send one user turn to the Groq chat model and return the reply text.

    Args:
        prompt: The user's message.
        img_context: Optional text describing an image. When truthy, it is
            combined with ``prompt`` into a single labelled message.

    Returns:
        The ``content`` attribute of the first choice's message.

    Side effects:
        Appends the user message (as a dict) and then the returned message
        object to the module-level ``convo`` history.
    """
    user_text = (
        f'USER_PROMPT: {prompt}\n\n   IMAGE CONTEXT: {img_context}'
        if img_context
        else prompt
    )
    convo.append({'role': 'user', 'content': user_text})

    # The whole history is sent on every call so the model sees prior turns.
    completion = groq_client.chat.completions.create(
        messages=convo,
        model="llama3-70b-8192",
    )
    assistant_message = completion.choices[0].message
    convo.append(assistant_message)
    return assistant_message.content

In [2]:
# Smoke-test groq_prompt(): read one prompt from the user, send it without any
# image context, and print the model's reply.
prompt = input('USER: ')
response = groq_prompt(prompt)
print(response)

Who is David Lawrence's favorite comedian? Bill Ba-ray-dee. I'll see myself out.


In [36]:


def function_call(prompt):
    """Ask a Groq-hosted model which context-gathering action fits the prompt.

    The model is instructed to answer with exactly one of "extract clipboard",
    "take screenshot", "capture webcam" or "None". This call uses its own
    two-message conversation and does not touch the shared ``convo`` history.

    Args:
        prompt: The user's message to classify.

    Returns:
        The raw text of the model's reply. The sample outputs in this notebook
        show it can arrive with or without surrounding quotes, so callers
        should match by substring rather than equality.
    """
    # Each instruction below must name an action exactly as it appears in the
    # list: jarvis() looks for the substring 'take screenshot', so the earlier
    # wording "take screen shot" could never match when followed literally.
    sys_msg = (
        'You are an AI function calling model. You will determine which function from this list: '
        '["extract clipboard", "take screenshot", "capture webcam", "None"] should be called to gain the context '
        'for another AI to respond to the user. You are not responding to the user. You should only return one value from the list. '
        'If the user asks for help with something on their screen you should respond with "take screenshot". '
        'If the user asks for help on something in their environment or their physical appearance you should respond with "capture webcam". '
        'If the user asks for help with their clipboard content you should respond with "extract clipboard". '
        'If none of the previous options are appropriate respond with "None". Only respond woth one value from the list and do not provide any explanation.'
    )

    function_convo = [{'role': 'system', 'content': sys_msg},
                      {'role': 'user', 'content': prompt}]

    chat_completion = groq_client.chat.completions.create(messages=function_convo, model="mixtral-8x7b-32768")
    return chat_completion.choices[0].message.content

def take_screenshot():
    """Grab the screen and write it to 'screenshot.jpg' in the working directory.

    The grab is converted to RGB mode before saving, and saved with
    ``quality=15``. Returns None.
    """
    output_file = 'screenshot.jpg'
    ImageGrab.grab().convert('RGB').save(output_file, quality=15)


def web_cam_capture():
    """Read one frame from the shared ``web_cam`` handle and save it to 'webcam.jpg'.

    Prints an error and returns without writing anything when the camera is
    not open or when no frame could be read. Returns None in every case.
    """
    if not web_cam.isOpened():
        print('Error: Camera did not open successfully')
        return

    path = 'webcam.jpg'
    ret, frame = web_cam.read()
    # read() reports failure through its flag and then yields no usable frame;
    # handing that to imwrite would raise instead of giving a clear message.
    if not ret or frame is None:
        print('Error: Could not read a frame from the camera')
        return
    cv2.imwrite(path, frame)


def get_clipboard_text():
    """Return what ``pyperclip.paste()`` yields when it is a string.

    For any non-string value an error line is printed and None is returned.
    """
    pasted = pyperclip.paste()
    if not isinstance(pasted, str):
        print('Error: Clipboard content is not a string')
        return None
    return pasted

def vision_prompt(prompt, photo_path):
    """Ask the vision model to describe an image in light of the user's prompt.

    Args:
        prompt: The user's message; it is embedded at the end of the
            instructions sent to the vision model.
        photo_path: Path of the image file to open and send.

    Returns:
        The ``text`` attribute of the vision model's response.
    """
    # The last fragment is an f-string so the user's prompt is actually
    # interpolated; as a plain string the model received the literal text
    # '{prompt}'. A separate name also avoids shadowing the parameter.
    vision_instructions = (
        'You are the vision analysis AI that provides semantic meaning from images to provide context '
        'to send to another AI that will create a response to the user. Do not respond as the AI assistant '
        'to the user. Instead take the user prompt input and try to extract all meaning from the photo '
        'relevant to the user prompt. Then generate as much objective data about the image for the AI '
        f'assistant who will respond to the user. \nUSER PROMPT: {prompt}'
    )
    # Keep the image open only for the duration of the request so the file
    # handle is released afterwards.
    with Image.open(photo_path) as img:
        response = vision_model.generate_content([vision_instructions, img])
    return response.text

In [3]:
# Check the routing model on two sample prompts: a clipboard request first...
prompt = 'I want you to analyse the code I put on the clipboard.'
response = function_call(prompt)
print(response)
# ...then a webcam request, passed inline rather than through prompt/response.
print(function_call("I am holding amy dog up to the webcam. What is on his nose?"))

extract clipboard
"capture webcam"


In [10]:

# Exercise each context-gathering helper once: write webcam.jpg, write
# screenshot.jpg, then print whatever text is currently on the clipboard.
web_cam_capture()
take_screenshot()
print(get_clipboard_text())

web_cam_capture()
take_screenshot()
print(get_clipboard_text())


In [41]:
def jarvis():
    """Run the interactive assistant loop until the user's input contains 'bye'.

    Each turn:
      1. Reads a prompt from the user.
      2. Asks function_call() which context source applies and prints the
         lower-cased answer.
      3. Gathers that context: a screenshot or webcam frame is described by
         vision_prompt(); clipboard text is appended to the prompt itself.
      4. Sends the prompt (plus any image description) to groq_prompt() and
         prints the reply.

    Returns None.
    """
    while True:
        prompt = input('USER: ')
        # Substring match, case-insensitive: any input containing 'bye' ends the loop.
        if 'bye' in prompt.lower():
            print('Bye')
            break

        action = function_call(prompt).lower()
        print(action)

        # Stays None unless one of the two image branches fills it in.
        image_description = None

        # Substring checks tolerate replies wrapped in quotes or extra text.
        if 'take screenshot' in action:
            print('taking screenshot')
            take_screenshot()
            image_description = vision_prompt(prompt, photo_path='screenshot.jpg')
            print(f'screenshot: {image_description}')

        elif 'capture webcam' in action:
            print('capturing webcam')
            web_cam_capture()
            image_description = vision_prompt(prompt, photo_path='webcam.jpg')
            print(f'webcam: {image_description}')

        elif 'extract clipboard' in action:
            print('Copying clipboard text')
            clipboard_text = get_clipboard_text()
            prompt = f'{prompt}\n\n CLIPBOARD CONTENT: {clipboard_text}'
            print(f'clipboard: {clipboard_text}')

        reply = groq_prompt(prompt=prompt, img_context=image_description)
        print(reply)

In [42]:
# Start the interactive assistant; this cell blocks on input() until the
# user's text contains 'bye'.
jarvis()

"none"
Hello! It's nice to meet you. I'm here to help with any questions or topics you'd like to discuss. What's on your mind today?
none
I'm happy to help! However, I don't have any visual information about you, so I can't tell what color jumper you're wearing. If you'd like to share a photo or describe what you're wearing, I'd be happy to try and help you with that!
"take screenshot"
taking screenshot
screenshot: ```json
{
  "code": "jarvis.ipynb",
  "code_language": "python",
  "code_snippet": "print('capturing webcam')\nweb_cam_capture()\nvisual_context = vision_prompt(prompt, photo_path='webcam.jpg')\nprint(f'webcam: {visual_context}')",
  "code_description": "This code snippet is part of a larger Python program that takes a user prompt and attempts to extract visual context from an image. It first captures an image from the webcam and then calls a function, `vision_prompt`, which is likely responsible for analyzing the image and extracting relevant information. The extracted visu