In [None]:
import cv2
import base64
import os
import requests
import time
from openai import AzureOpenAI
import openai
from IPython.display import display, clear_output, Image, Audio

from collections import deque
from datetime import datetime
from threading import Thread

import azure.cognitiveservices.speech as speechsdk


# Set Configuration

In [None]:
# --- Azure OpenAI (GPT-4 Vision) -------------------------------------------
# Endpoint and key come from the environment; each is None when the variable
# is not set.
# NOTE(review): the original cell carried a bare "West US" comment here,
# presumably the region of the Azure OpenAI resource -- confirm.
GPT_4V_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
GPT_4V_KEY = os.environ.get("AZURE_OPENAI_KEY")

# --- Azure Speech ----------------------------------------------------------
SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
SPEECH_REGION = "westeurope"
# NOTE(review): not referenced by any other cell visible here, and the value
# does not look like a locale-prefixed voice id such as the ones the main loop
# passes to the speaker -- confirm before relying on it.
VOICE_NAME = "JennyMultilingualV2Neural2"

# --- Prompts ---------------------------------------------------------------
# Candidate questions to ask about the captured image; PROMPT_NUMBER is the
# index of the one that is actually used.
PRMTS = [
    "Describe this image",
    "What is the number of the parking space containing the car? explain why.",
    "Solve this 9 by 9 Sudoku puzzle",
]
PROMPT_NUMBER = 1

# Capture the video

In [None]:
class CameraCapture:
    """Read webcam frames on a background thread and preview them inline.

    Attributes:
        stopped: set to True by stop(); the capture loop exits when it sees it.
        started: True while the capture loop has the camera open.
        grabbed: success flag returned by the most recent stream.read().
        frame:   the most recent successfully JPEG-encoded frame (the buffer
                 returned by cv2.imencode), or None before the first frame.
        screen:  IPython display handle that is updated with each new frame.
        stream:  the cv2.VideoCapture object, or None when no camera is open.
    """

    def __init__(self):
        self.stopped = False
        self.started = False
        self.grabbed = False
        self.frame = None
        self.screen = None
        self.stream = None

    def start(self):
        """Launch the capture loop on a background thread and return self."""
        # Daemon thread: a forgotten capture loop must not keep the process alive.
        Thread(target=self.get, args=(), daemon=True).start()
        return self

    def get(self):
        """Capture loop: read, resize, JPEG-encode and display frames until stopped.

        Opens camera device 0 and the display handle on the first iteration.
        The camera is always released when the loop ends, whether it ends
        through stop() or through an exception.
        """
        try:
            while not self.stopped:
                if not self.started:
                    self.screen = display(None, display_id=True)
                    self.stream = cv2.VideoCapture(0)
                    self.started = True

                grabbed, raw_frame = self.stream.read()
                self.grabbed = grabbed
                if not grabbed or raw_frame is None:
                    # Failed read (camera busy, unplugged, not opened): keep
                    # the last good frame and retry instead of crashing in resize.
                    time.sleep(0.05)
                    continue

                resized = cv2.resize(raw_frame, (480, 280))
                success, encoded = cv2.imencode('.jpg', resized)
                if not success:
                    continue

                # Publish only the finished JPEG buffer, in a single assignment,
                # so a reader on another thread never sees the raw or resized
                # intermediate image.
                self.frame = encoded
                self.screen.update(Image(data=encoded.tobytes()))
        finally:
            if self.stream is not None:
                self.stream.release()
                self.stream = None
            self.started = False

    def stop(self):
        """Ask the capture loop to exit; it releases the camera on its way out."""
        self.stopped = True


# GPT-4 Chat Completion

In [None]:
def gpt_complete(client, previous_texts, frame, model="gpt-4-vision-preview", max_tokens=600):
    """Ask the vision model the currently selected prompt about one image.

    Args:
        client: object exposing ``chat.completions.create(...)`` (the notebook
            passes an AzureOpenAI client).
        previous_texts: iterable of strings from earlier turns; joined with
            spaces and sent as context ahead of the question.
        frame: bytes-like object holding the encoded image. It is base64-encoded
            and embedded in a ``data:image/jpeg`` URL.
        model: value passed as ``model`` to the completion call.
            NOTE(review): on Azure this is presumably the deployment name --
            confirm it matches the deployment in use.
        max_tokens: upper bound on the length of the generated answer.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        ValueError: if ``frame`` is None (no image has been captured yet).
        Any exception raised by ``client.chat.completions.create`` propagates.
    """
    if frame is None:
        raise ValueError("No camera frame available yet")

    question = PRMTS[PROMPT_NUMBER]
    prompt_context = ' '.join(previous_texts)
    if prompt_context:
        prompt_message = f"Context: {prompt_context}. {question} "
    else:
        # First turn: no history, so send the bare question instead of an
        # empty "Context: ." prefix.
        prompt_message = question

    base64_image = base64.b64encode(frame).decode('utf-8')

    # The "unsure" instruction previously told the model to send users to the
    # IRS website, a leftover from an unrelated sample; it is dropped here.
    system_instructions = (
        "You are a helpful assistant. Instructions: "
        "- Only answer questions related to visual brain teasers. "
        "- If you're unsure of an answer, you can say I don't know or I'm not sure. "
    )
    system_context = (
        "Context: - At Microsoft, we're committed to the advancement of AI "
        "driven by principles that put people first. "
    )

    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        n=1,
        messages=[
            {"role": "system", "content": system_instructions},
            {"role": "system", "content": system_context},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_message},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                ],
            },
        ],
    )
    return response.choices[0].message.content


# Text to Speech

In [None]:
class Speaker:
    """Text-to-speech helper built on the Azure Speech SDK.

    Attributes:
        speech_config: the SpeechConfig created in start(); None before that.
        audio_config: the AudioOutputConfig created in start(); None before that.
        speech_synthesizer: the most recently built SpeechSynthesizer.
    """

    def __init__(self):
        # Nothing is created until start() is called.
        self.speech_config = None
        self.audio_config = None
        self.speech_synthesizer = None

    def _build_synthesizer(self):
        """Create a synthesizer from the current speech and audio configs."""
        return speechsdk.SpeechSynthesizer(
            speech_config=self.speech_config,
            audio_config=self.audio_config,
        )

    def start(self):
        """Create the SDK configuration objects and a first synthesizer.

        Uses SPEECH_KEY / SPEECH_REGION, routes audio to the default speaker
        and sets the synthesis language to "en-US".

        Returns:
            self, so the call can be chained after construction.
        """
        self.speech_config = speechsdk.SpeechConfig(
            subscription=SPEECH_KEY,
            region=SPEECH_REGION,
        )
        self.audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
        self.speech_config.speech_synthesis_language = "en-US"
        self.speech_synthesizer = self._build_synthesizer()
        return self

    def speak(self, text, voice):
        """Synthesize ``text`` with the given voice and wait for completion.

        The voice name is written to the speech config and a new synthesizer is
        built from that config on every call.
        NOTE(review): presumably the synthesizer only picks up the voice at
        construction time, which is why it is rebuilt -- confirm.

        Args:
            text: the text to speak.
            voice: voice name assigned to ``speech_synthesis_voice_name``.

        Returns:
            Whatever ``speak_text_async(text).get()`` returns.
        """
        self.speech_config.speech_synthesis_voice_name = voice
        synthesizer = self._build_synthesizer()
        self.speech_synthesizer = synthesizer
        return synthesizer.speak_text_async(text).get()
        


# Main Loop

In [None]:

# Voices used to read the question aloud and to speak the model's answer.
QUESTION_VOICE = "en-US-AvaNeural"
ANSWER_VOICE = "en-US-AndrewNeural"

camera = CameraCapture().start()
text_speaker = Speaker().start()

client = AzureOpenAI(
    api_key=GPT_4V_KEY,
    api_version="2023-12-01-preview",
    azure_endpoint=GPT_4V_ENDPOINT,
)

# Only the five most recent answers are kept as context for later questions.
previous_texts = deque(maxlen=5)
print("")

try:
    while True:
        if camera.started:
            # Block until the user presses Enter to trigger one question.
            input()
            # Stamp the turn after Enter is pressed; input() can block for an
            # arbitrary time, so a timestamp taken before it would be stale.
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            try:
                # Speaking the question is inside the try so that a speech
                # failure is reported like any other error instead of ending
                # the loop.
                text_speaker.speak(PRMTS[PROMPT_NUMBER], QUESTION_VOICE)
                text = gpt_complete(client, previous_texts, camera.frame)
                print(f"\r {timestamp}: {text}", end="")
                text_speaker.speak(text, ANSWER_VOICE)
                previous_texts.append(f"[{timestamp}] {text}")

            except Exception as inst:
                print(inst)
                print("Please try again")

        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the cell is the normal way to end the session.
    print("Stopped")
finally:
    # Signal the capture thread to stop however the loop ended.
    camera.stop()

