In [12]:
!pip install openai requests python-dotenv matplotlib librosa ipyaudioworklet gradio Pillow pydantic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel
import json
import gradio as gr
import io
import librosa
from IPython.display import Audio, display
from PIL import Image
from io import BytesIO
import requests
import matplotlib.pyplot as plt

load_dotenv()
client = OpenAI()

In [2]:
class CrowdCounter(BaseModel):
    count: int
    analysis: str

# Constants for homework
PROMPT = "How many people are in this image?"
OPENAI_MODEL = "gpt-4o-2024-08-06"
CROWD_THRESHOLD = 10
VOICE_PROMPT = "how-many-people.m4a"
SAVE_PATH = "audio/gpt-response.mp3"

In [13]:
# Define a function that takes an image URL and returns the number of people inside the image.
# If the number of people counted is greater than the CROWD_THRESHOLD (set to 10), then it is considered overcrowded.
def input_image(image_url):
    response = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        }
                    }
                ]
            }
        ],
        response_format=CrowdCounter
    )
    data = json.loads(response.choices[0].message.content)

    number_of_people = data['count']

    if number_of_people < CROWD_THRESHOLD:
        return f"There are {number_of_people} people."
    else:
        return f"There are {number_of_people} people, which is {number_of_people - CROWD_THRESHOLD} too many. This place is overcrowded."

demo = gr.Interface(
    fn=input_image,
    inputs=[ 
        gr.Text()
    ],  
    outputs=[
        gr.Text()
    ],
)

# Launch the Gradio app
demo.launch()

* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




In [10]:
# Takes audio file and returns text
def transcript_speech(speech_filename):
    with open(speech_filename, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="json",
            language="en"
        )
    return transcription.text

# Takes text and plays speech file
def speak_prompt(speech_prompt, autoplay=True, speech_file_path="speech.mp3"):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=speech_prompt
    )

    response.stream_to_file(speech_file_path)

    if autoplay:
        play_speech(speech_file_path)

# Plays the speech at file path
def play_speech(file_path):
    y, sr = librosa.load(file_path)
    audio = Audio(data=y, rate=sr, autoplay=True)
    display(audio)

def generate_situation_image(dalle_prompt):
    response = client.images.generate(
        model="dall-e-3",
        prompt=dalle_prompt,
        size="1024x1024",
        n=1
    )

    image_url = response.data[0].url
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))

    return img

# Show the image
def display_image(img):
    plt.imshow(img)
    plt.axis('off')
    plt.show()

# For a given image url and prompt, figure out the recommended action to take based on crowd size. 
# `prompt` must be related to the count of humans in the image
def input_image_and_prompt(image_url, prompt):
    response = client.beta.chat.completions.parse(
        model=OPENAI_MODEL,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        }
                    }
                ]
            }
        ],
        response_format=CrowdCounter
    )
    data = json.loads(response.choices[0].message.content)

    number_of_people = data['count']

    if number_of_people < CROWD_THRESHOLD:
        return f"There are {number_of_people} people. There are no actions to take."
    else:
        return f"There are {number_of_people} people, which is {number_of_people - CROWD_THRESHOLD} too many. This place is overcrowded. Show a sign to disperse the crowd"

In [11]:
def conversation_generation(image_url, audio_path):
    transcripted_text = transcript_speech(audio_path)

    openai_response = input_image_and_prompt(image_url, transcripted_text)
    output_audio_file = "speak_speech.mp3"
    speak_prompt(openai_response, False, output_audio_file)
    image = generate_situation_image(openai_response)
    
    return output_audio_file, image

In [15]:
voice_app = gr.Interface(
    conversation_generation,
    inputs=[
        gr.Text(),
        gr.Audio(sources=["microphone"], type="filepath"),
    ],
    outputs=[
        gr.Audio(type="filepath"),
        gr.Image()
    ],
    title="Crowd Situation Interpreter",
    description="This is an event where people are in the main hall"
)

voice_app.launch()

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




  response.stream_to_file(speech_file_path)
