### YOLO를 활용해서 Object Detection을 진행하고, 인식한 Object를 박스로 그리기

In [None]:
import gradio as gr
import cv2
import numpy as np
from PIL import Image, ImageDraw

# Files for the YOLOv3 model, relative to the working directory:
# network weights, network configuration, and the class-name list.
weights_path = "yolo3/yolov3.weights"
config_path = "yolo3/yolov3.cfg"
names_path = "yolo3/coco_korean.names"

# Build the OpenCV DNN network once at import time; detect_objects reuses it.
net = cv2.dnn.readNet(weights_path, config_path)

# One class label per line; a label's position in the list is its class index.
with open(names_path, "r", encoding='utf-8') as file:
    label_list = file.read().strip().split("\n")

# Debug aid: print(net, label_list)

def stream_webcam(image):
    """Pass the incoming webcam frame through unchanged."""
    frame = image
    return frame

def detect_objects(image):
    """Run YOLOv3 on one frame and draw every candidate box on a copy of it.

    No non-maximum suppression is applied, so overlapping candidates for the
    same object are all drawn.

    Args:
        image: Frame as a NumPy array whose first two axes are height and
            width, or None when no frame is available yet. Assumed to be RGB
            (the layout of Gradio's NumPy images); convert first if the array
            came from cv2.imread, which yields BGR.

    Returns:
        A PIL copy of the frame with a green rectangle and a confidence
        caption per candidate, or None when image is None.
    """
    # A streaming callback can fire before the webcam has produced a frame.
    if image is None:
        return None

    drawn_image = Image.fromarray(image.copy())
    draw = ImageDraw.Draw(drawn_image)

    height, width = image.shape[:2]

    # The frame is already RGB, so swapRB stays off; swapping here would feed
    # the network BGR data.
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=False, crop=False)
    net.setInput(blob=blob)

    # Ask OpenCV for the output layer names directly instead of indexing
    # getLayerNames() by hand; the shape of the index array differs between
    # OpenCV releases.
    detection_list = net.forward(net.getUnconnectedOutLayersNames())

    # Box coordinates are multiplied by the frame size to obtain pixels.
    scale = np.array([width, height, width, height])

    for output in detection_list:
        # output: the predictions of one output layer.
        for detection in output:
            # Row layout: centre x, centre y, width, height, objectness,
            # followed by one score per class.
            score_list = detection[5:]
            class_index = np.argmax(score_list)
            confidence = score_list[class_index]

            if not confidence > 0:
                continue

            center_x, center_y, w, h = (detection[:4] * scale).astype('int')
            # Convert the centre-based box to a top-left corner.
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)

            draw.rectangle((x, y, x + int(w), y + int(h)), outline=(0, 255, 0), width=3)
            draw.text((x + 5, y + 5), text="Confidence : {:.2f}%".format(confidence*100), fill=(255, 0, 0))

    return drawn_image

# Minimal UI: a streaming webcam input and an image that shows the detections.
with gr.Blocks() as demo:

    webcam_input = gr.Image(label="카메라", sources="webcam", streaming=True, width=480, height=270, mirror_webcam=False)
    output_image = gr.Image(label="검출 화면", type="pil")

    # Alternative wiring that shows the raw frames without detection:
    # webcam_input.stream(stream_webcam, inputs=[webcam_input], outputs=[output_image])
    # Each streamed frame goes through detect_objects; the result fills output_image.
    webcam_input.stream(detect_objects, inputs=[webcam_input], outputs=[output_image])

demo.launch(server_port=8083)

# Offline check with a still image instead of the webcam:
# image = cv2.imread("C:/Users/jooeu/Desktop/git/ms-ai-school/250327_yolo3/test_image_2.jpg")
# detect_objects(image)

* Running on local URL:  http://127.0.0.1:8083

To create a public link, set `share=True` in `launch()`.




### 중복되어 인식되는 값이 많아서 그 값 중에서 일부만 가져와서 중복 제거

In [None]:
import gradio as gr
import cv2
import numpy as np
import random
import platform
 
from PIL import Image, ImageDraw, ImageFont

# Files for the YOLOv3 model, relative to the working directory:
# network weights, network configuration, and the class-name list.
weights_path = "yolo3/yolov3.weights"
config_path = "yolo3/yolov3.cfg"
names_path = "yolo3/coco_korean.names"

# Build the OpenCV DNN network once at import time; detect_objects reuses it.
net = cv2.dnn.readNet(weights_path, config_path)

# One class label per line; a label's position in the list is its class index.
with open(names_path, "r", encoding='utf-8') as file:
    label_list = file.read().strip().split("\n")

# Debug aid: print(net, label_list)

def stream_webcam(image):
    """Pass the incoming webcam frame through unchanged."""
    frame = image
    return frame

def random_color():
    """Return a random colour as a 3-tuple of integers, each in 0..255."""
    return tuple(random.randint(0, 255) for _ in range(3))

def get_font():
    """Return the font used for detection captions (size 20 where supported).

    On macOS ("Darwin") and Windows a named TrueType font is tried first
    (AppleGothic.ttf / malgun.ttf). If that file cannot be loaded, or on any
    other platform, Pillow's built-in default font is used instead.

    Returns:
        A Pillow font object accepted by ImageDraw.text.
    """
    font_size = 20
    font_file_by_system = {
        "Darwin": "AppleGothic.ttf",
        "Windows": "malgun.ttf",
    }

    font_file = font_file_by_system.get(platform.system())
    if font_file is not None:
        try:
            return ImageFont.truetype(font_file, size=font_size)
        except OSError:
            # The font file is not installed; fall back to the default font
            # instead of failing the whole frame.
            pass

    try:
        return ImageFont.load_default(size=font_size)
    except TypeError:
        # Pillow releases before 10.1 do not accept a size argument.
        return ImageFont.load_default()

def detect_objects(image):
    """Detect objects in one frame with YOLOv3 and draw the NMS-filtered boxes.

    Args:
        image: Frame as a NumPy array whose first two axes are height and
            width, or None when no frame is available yet. Assumed to be RGB
            (the layout of Gradio's NumPy images); convert first if the array
            came from cv2.imread, which yields BGR.

    Returns:
        A PIL copy of the frame with a randomly coloured rectangle and a
        "label : confidence%" caption per kept detection, or None when image
        is None.
    """
    # A streaming callback can fire before the webcam has produced a frame.
    if image is None:
        return None

    drawn_image = Image.fromarray(image.copy())
    draw = ImageDraw.Draw(drawn_image)

    height, width = image.shape[:2]

    # The frame is already RGB, so swapRB stays off; swapping here would feed
    # the network BGR data.
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=False, crop=False)
    net.setInput(blob=blob)

    # Ask OpenCV for the output layer names directly instead of indexing
    # getLayerNames() by hand; the shape of the index array differs between
    # OpenCV releases.
    detection_list = net.forward(net.getUnconnectedOutLayersNames())

    bounding_box_list = []
    confidence_list = []
    class_index_list = []

    # Box coordinates are multiplied by the frame size to obtain pixels.
    scale = np.array([width, height, width, height])

    for output in detection_list:
        # output: the predictions of one output layer.
        for detection in output:
            # Row layout: centre x, centre y, width, height, objectness,
            # followed by one score per class.
            score_list = detection[5:]
            class_index = int(np.argmax(score_list))
            confidence = float(score_list[class_index])

            if not confidence > 0:
                continue

            center_x, center_y, w, h = (detection[:4] * scale).astype('int')
            raw_left = int(center_x - w / 2)
            raw_top = int(center_y - h / 2)

            # Clip the box to the frame. Moving only the corner to 0 while
            # keeping the full width/height would shift the far edge outward.
            left = max(raw_left, 0)
            top = max(raw_top, 0)
            right = min(raw_left + int(w), width)
            bottom = min(raw_top + int(h), height)
            if right <= left or bottom <= top:
                continue

            bounding_box_list.append([left, top, right - left, bottom - top])
            confidence_list.append(confidence)
            class_index_list.append(class_index)

    # Non-maximum suppression: drop boxes scoring below 0.5, then drop boxes
    # that overlap a higher-scoring box (0.4 is the overlap threshold).
    extracted_index_list = cv2.dnn.NMSBoxes(bounding_box_list, confidence_list, 0.5, 0.4)
    # Depending on the OpenCV release the result is a flat array, an N x 1
    # array, or an empty tuple; normalise to a flat integer array.
    extracted_index_list = np.array(extracted_index_list, dtype=int).reshape(-1)
    print(class_index_list, extracted_index_list)

    if len(extracted_index_list) == 0:
        return drawn_image

    # Load the font once per frame rather than once per box.
    font = get_font()

    for extracted_index in extracted_index_list:
        x, y, w, h = bounding_box_list[extracted_index]
        confidence = confidence_list[extracted_index]
        label = label_list[class_index_list[extracted_index]]

        color = random_color()
        print(label, x, y, w, h, confidence)

        draw.rectangle((x, y, x + w, y + h), outline=color, width=3)
        draw.text((x + 5, y + 5), text="{} : {:.2f}%".format(label, confidence*100), fill=color, font=font)

    return drawn_image

# Minimal UI: a streaming webcam input and an image that shows the detections.
with gr.Blocks() as demo:

    webcam_input = gr.Image(label="카메라", sources="webcam", streaming=True, width=480, height=270, mirror_webcam=False)
    output_image = gr.Image(label="검출 화면", type="pil")

    # Each streamed frame goes through detect_objects; the result fills output_image.
    webcam_input.stream(detect_objects, inputs=[webcam_input], outputs=[output_image])

demo.launch(server_port=8083)

# Offline check with a still image instead of the webcam:
# image = cv2.imread("C:/Users/jooeu/Desktop/git/ms-ai-school/250327_yolo3/test_image_2.png")
# detect_objects(image)

* Running on local URL:  http://127.0.0.1:8083

To create a public link, set `share=True` in `launch()`.




[np.int64(0), np.int64(0), np.int64(0), np.int64(57), np.int64(57), np.int64(57)] [1 3]
사람 637 193 609 590 0.98044986
소파 138 339 1697 593 0.5562487
[np.int64(0), np.int64(0), np.int64(62), np.int64(62), np.int64(62), np.int64(62), np.int64(62)] [0 6]
사람 632 195 624 586 0.9781089
TV 모니터 250 343 1644 712 0.77255416
[np.int64(0), np.int64(0), np.int64(62), np.int64(0), np.int64(0), np.int64(62), np.int64(62), np.int64(62), np.int64(62), np.int64(62), np.int64(62)] [4 7]
사람 573 207 695 616 0.9314702
TV 모니터 179 250 1637 752 0.8806646
[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(62), np.int64(62), np.int64(62), np.int64(62), np.int64(62)] [1 8]
사람 595 197 646 577 0.9619559
TV 모니터 273 360 1607 673 0.735822
[np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(62), np.int64(0), np.int64(62), np.int64(0), np.int64(62), np.int64(62), np.int64(72)] [0 9]
사람 661 192 419 592 0.9901688
사람 980 226 474 554 0.98761785
[np.int64(0), np.int64(0), np.int6

### GPT와 음성 받아와서 gradio 인터페이스 구성

In [None]:
import io
import gradio as gr
import cv2
import numpy as np
import random
import platform
import requests
import base64
import re
 
from PIL import Image, ImageDraw, ImageFont
import os
from dotenv import load_dotenv

# Read service settings from a .env file / the process environment.
# os.getenv returns None for any variable that is not set.
load_dotenv()
OPENAI_ENDPOINT = os.getenv("OPENAI_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DEPLOYMENT_NAME = os.getenv("DEPLOYMENT_NAME")

# Endpoint and key used by request_tts.
SPEECH_ENDPOINT = os.getenv("SPEECH_ENDPOINT")
SPEECH_API_KEY = os.getenv("SPEECH_API_KEY")

# Files for the YOLOv3 model, relative to the working directory:
# network weights, network configuration, and the class-name list.
weights_path = "yolo3/yolov3.weights"
config_path = "yolo3/yolov3.cfg"
names_path = "yolo3/coco_korean.names"

# Build the OpenCV DNN network once at import time; detect_objects reuses it.
net = cv2.dnn.readNet(weights_path, config_path)

# One class label per line; a label's position in the list is its class index.
with open(names_path, "r", encoding='utf-8') as file:
    label_list = file.read().strip().split("\n")

# Debug aid: print(net, label_list)

def request_gpt(image_array):
    """Send an annotated image to the chat-completions deployment and return its description.

    The image is PNG-encoded, embedded as a base64 data URL, and posted to the
    deployment built from OPENAI_ENDPOINT and DEPLOYMENT_NAME together with a
    system and a user prompt.

    Args:
        image_array: Image as a NumPy array accepted by PIL's Image.fromarray.

    Returns:
        The assistant's reply text on success. For a non-200 response, or a
        200 response without the expected JSON structure, the raw response
        body is returned. If the request itself fails (timeout, connection
        error, invalid URL) a short error message is returned instead.
    """
    endpoint = "{}/openai/deployments/{}/chat/completions?api-version=2025-01-01-preview".format(OPENAI_ENDPOINT, DEPLOYMENT_NAME)
    headers = {
        "Content-Type": "application/json",
        "api-key": OPENAI_API_KEY
    }

    # NumPy array -> PIL image -> in-memory PNG bytes -> base64 text.
    image = Image.fromarray(image_array)
    buffered_io = io.BytesIO()
    image.save(buffered_io, format="png")
    base64_image = base64.b64encode(buffered_io.getvalue()).decode("utf-8")

    message_list = []

    # System message: fixes the assistant's role and the answer language.
    message_list.append({
        "role": "system",
        "content": [{
            "type": "text",
            "text": """
                너는 사진 속에서 감지된 물체를 분석하는 봇이야.
                무조건 분석결과를 한국어로 답변해줘.
                """
        }]
    })

    # User message: the instruction text plus the image as a data URL.
    message_list.append({
        "role": "user",
        "content": [{
            "type": "text",
            "text": """
                너는 물체를 감지하는 YOLO 모델이야.
                이 사진에서 감지된 물체에 대해 감지확률과 함께 자세한 설명을 붙여줘.
                반드시 감지된 물체, 바운딩 박스 안에 있는 물체에 대해서만 설명해줘.
                부연 설명 필요없고 감지된 물체에 대해서만 설명해줘.
                """
        }, {
            "type": "image_url",
            "image_url": {
                "url": "data:image/png;base64,{}".format(base64_image),
                "caption": "물체 감지 결과"
            }
        }]
    })

    body = {
        "messages": message_list,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 16000,
    }

    try:
        # Without a timeout a stalled connection would block the UI callback
        # indefinitely.
        response = requests.post(endpoint, headers=headers, json=body, timeout=120)
    except requests.RequestException as error:
        return f"GPT 요청 실패: {error}"

    if response.status_code != 200:
        return response.text

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        # 200 with a body that is not the expected JSON shape: surface the
        # raw text, the same way as for an error status.
        return response.text

def request_tts(text):
    """Convert text to speech through SPEECH_ENDPOINT and save it as a WAV file.

    The text is wrapped in SSML (voice ko-KR-JiMinNeural, prosody rate 20%)
    and posted to the speech endpoint. The audio is written to
    "response_audio.wav" in the working directory, overwriting any previous
    file of that name.

    Args:
        text: Plain text to speak.

    Returns:
        The path of the written WAV file, or None when text is empty, the
        request fails, or the service answers with a non-200 status.
    """
    # Local import: only this function needs XML escaping.
    from xml.sax.saxutils import escape

    # Nothing to speak: skip the service call entirely.
    if not text or not text.strip():
        return None

    endpoint = SPEECH_ENDPOINT
    headers = {
        "Ocp-Apim-Subscription-Key": SPEECH_API_KEY,
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": "riff-44100hz-16bit-mono-pcm"
    }

    # Escape &, < and > so arbitrary text cannot break the SSML document.
    body = f"""
        <speak version='1.0' xml:lang='ko-KR'>
            <voice name='ko-KR-JiMinNeural'>
                <prosody rate="20%">
                    {escape(text)}
                </prosody>
            </voice>
        </speak>
    """

    try:
        # Send explicit UTF-8 bytes: a str body may be encoded as Latin-1 by
        # the HTTP layer, which fails for Korean text.
        response = requests.post(endpoint, headers=headers, data=body.encode("utf-8"), timeout=60)
    except requests.RequestException as error:
        print(f"TTS 요청 실패: {error}")
        return None

    if response.status_code != 200:
        print(f"TTS 요청 실패: {response.status_code}, {response.text}")
        return None

    file_name = "response_audio.wav"
    with open(file_name, "wb") as audio_file:
        audio_file.write(response.content)
    return file_name

def stream_webcam(image):
    """Pass the incoming webcam frame through unchanged.

    Note: a second stream_webcam defined further down in this cell replaces
    this one before the UI is wired up.
    """
    frame = image
    return frame

def random_color():
    """Return a random colour as a 3-tuple of integers, each in 0..255."""
    return tuple(random.randint(0, 255) for _ in range(3))

def get_font():
    """Return the font used for detection captions (size 20 where supported).

    On macOS ("Darwin") and Windows a named TrueType font is tried first
    (AppleGothic.ttf / malgun.ttf). If that file cannot be loaded, or on any
    other platform, Pillow's built-in default font is used instead.

    Returns:
        A Pillow font object accepted by ImageDraw.text.
    """
    font_size = 20
    font_file_by_system = {
        "Darwin": "AppleGothic.ttf",
        "Windows": "malgun.ttf",
    }

    font_file = font_file_by_system.get(platform.system())
    if font_file is not None:
        try:
            return ImageFont.truetype(font_file, size=font_size)
        except OSError:
            # The font file is not installed; fall back to the default font
            # instead of failing the whole frame.
            pass

    try:
        return ImageFont.load_default(size=font_size)
    except TypeError:
        # Pillow releases before 10.1 do not accept a size argument.
        return ImageFont.load_default()

def detect_objects(image):
    """Detect objects in one frame with YOLOv3 and draw the NMS-filtered boxes.

    Args:
        image: Frame as a NumPy array whose first two axes are height and
            width, or None when no frame is available yet. Assumed to be RGB
            (the layout of Gradio's NumPy images); convert first if the array
            came from cv2.imread, which yields BGR.

    Returns:
        A PIL copy of the frame with a randomly coloured rectangle and a
        "label : confidence%" caption per kept detection, or None when image
        is None.
    """
    # A streaming callback can fire before the webcam has produced a frame.
    if image is None:
        return None

    drawn_image = Image.fromarray(image.copy())
    draw = ImageDraw.Draw(drawn_image)

    height, width = image.shape[:2]

    # The frame is already RGB, so swapRB stays off; swapping here would feed
    # the network BGR data.
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=False, crop=False)
    net.setInput(blob=blob)

    # Ask OpenCV for the output layer names directly instead of indexing
    # getLayerNames() by hand; the shape of the index array differs between
    # OpenCV releases.
    detection_list = net.forward(net.getUnconnectedOutLayersNames())

    bounding_box_list = []
    confidence_list = []
    class_index_list = []

    # Box coordinates are multiplied by the frame size to obtain pixels.
    scale = np.array([width, height, width, height])

    for output in detection_list:
        # output: the predictions of one output layer.
        for detection in output:
            # Row layout: centre x, centre y, width, height, objectness,
            # followed by one score per class.
            score_list = detection[5:]
            class_index = int(np.argmax(score_list))
            confidence = float(score_list[class_index])

            if not confidence > 0:
                continue

            center_x, center_y, w, h = (detection[:4] * scale).astype('int')
            raw_left = int(center_x - w / 2)
            raw_top = int(center_y - h / 2)

            # Clip the box to the frame. Moving only the corner to 0 while
            # keeping the full width/height would shift the far edge outward.
            left = max(raw_left, 0)
            top = max(raw_top, 0)
            right = min(raw_left + int(w), width)
            bottom = min(raw_top + int(h), height)
            if right <= left or bottom <= top:
                continue

            bounding_box_list.append([left, top, right - left, bottom - top])
            confidence_list.append(confidence)
            class_index_list.append(class_index)

    # Non-maximum suppression: drop boxes scoring below 0.5, then drop boxes
    # that overlap a higher-scoring box (0.4 is the overlap threshold).
    extracted_index_list = cv2.dnn.NMSBoxes(bounding_box_list, confidence_list, 0.5, 0.4)
    # Depending on the OpenCV release the result is a flat array, an N x 1
    # array, or an empty tuple; normalise to a flat integer array.
    extracted_index_list = np.array(extracted_index_list, dtype=int).reshape(-1)

    if len(extracted_index_list) == 0:
        return drawn_image

    # Load the font once per frame rather than once per box.
    font = get_font()

    for extracted_index in extracted_index_list:
        x, y, w, h = bounding_box_list[extracted_index]
        confidence = confidence_list[extracted_index]
        label = label_list[class_index_list[extracted_index]]

        color = random_color()

        draw.rectangle((x, y, x + w, y + h), outline=color, width=3)
        draw.text((x + 5, y + 5), text="{} : {:.2f}%".format(label, confidence*100), fill=color, font=font)

    return drawn_image

def stream_webcam(image):
    """Streaming handler: run detect_objects on the frame and return its result."""
    return detect_objects(image)

def click_capture(image):
    """Capture-button handler: return the given image unchanged."""
    captured = image
    return captured

def click_send_gpt(image_array, histories):
    """Send the captured image to GPT and append the exchange to the chat history.

    Args:
        image_array: The captured image, or None when nothing has been
            captured yet.
        histories: Current chat history as a list of message dicts, or None.

    Returns:
        A new history list. When an image was supplied it ends with a user
        message holding the image and an assistant message holding the GPT
        reply; when image_array is None the history is returned unchanged.
    """
    # Work on a copy so the caller's list is never modified in place.
    updated_histories = list(histories) if histories else []

    # No capture yet: there is nothing to analyse.
    if image_array is None:
        return updated_histories

    content = request_gpt(image_array)
    updated_histories.append({"role": "user", "content": gr.Image(label="감지화면", value=image_array)})
    updated_histories.append({"role": "assistant", "content": content})

    return updated_histories

def change_chatbot(histories):
    """Speak the latest chat message: clean its text and convert it to audio.

    Args:
        histories: Chat history as a list of message dicts with a 'content'
            key; may be empty or None.

    Returns:
        The value returned by request_tts for the cleaned text, or None when
        there is no message, the last message's content is not a string, or
        nothing speakable remains after cleaning.
    """
    # The change event also fires for an empty history.
    if not histories:
        return None

    content = histories[-1]['content']
    # Non-text content (for example an image component) cannot be spoken.
    if not isinstance(content, str):
        return None

    # Keep only Hangul syllables, Latin letters, digits, whitespace and the
    # characters % , . -- everything else (markdown symbols, brackets, ...)
    # is removed before the text is sent to speech synthesis.
    pattern = r'[^가-힣a-zA-Z\s%,\.\d]'

    cleaned_content = re.sub(pattern, '', content)
    if not cleaned_content.strip():
        return None

    print(cleaned_content)
    return request_tts(cleaned_content)  # text -> audio file

# UI layout: webcam / detection / capture images, two buttons, then the chat
# transcript next to an auto-playing audio player.
with gr.Blocks() as demo:

    with gr.Row():
        webcam_input = gr.Image(label="카메라", sources="webcam", streaming=True, width=480, height=270, mirror_webcam=False)
        output_image = gr.Image(label="검출 화면", type="pil", interactive=False)
        output_capture_image = gr.Image(label="캡쳐 화면", interactive=False)

    with gr.Row():
        capture_button = gr.Button("캡쳐")
        send_gpt_button = gr.Button("GPT로 전송")

    with gr.Row():
        chatbot = gr.Chatbot(label="분석결과", type="messages")
        chatbot_audio = gr.Audio(label="GPT", interactive=False, autoplay=True)

    # Event wiring:
    #   webcam frame    -> stream_webcam  -> detection image
    #   capture button  -> click_capture  -> copies the detection image to the capture slot
    #   send button     -> click_send_gpt -> appends the image and the GPT reply to the chat
    #   chat changed    -> change_chatbot -> audio for the latest message
    webcam_input.stream(stream_webcam, inputs=[webcam_input], outputs=[output_image])
    capture_button.click(click_capture, inputs=[output_image], outputs=[output_capture_image])
    send_gpt_button.click(click_send_gpt, inputs=[output_capture_image, chatbot], outputs=[chatbot])
    chatbot.change(change_chatbot, inputs=[chatbot], outputs=[chatbot_audio])

demo.launch(server_port=8062)

# Offline check with a still image instead of the webcam:
# image = cv2.imread("C:/Users/jooeu/Desktop/git/ms-ai-school/250327_yolo3/test_image_2.png")
# request_gpt(image)

* Running on local URL:  http://127.0.0.1:8062

To create a public link, set `share=True` in `launch()`.




1. 감지된 물체 고양이  
    감지 확률 93.46%  
    설명 갈색 털을 가진 귀여운 고양이가 바닥에 앉아 있는 모습입니다.  

2. 감지된 물체 고양이  
    감지 확률 94.95%  
    설명 흰 털과 갈색 털을 가진 고양이가 바깥쪽을 바라보고 있는 모습입니다.  

3. 감지된 물체 고양이  
    감지 확률 95.30%  
    설명 작은 크기의 새끼 고양이가 바닥 위에 앉아 있는 모습입니다.  

4. 감지된 물체 고양이  
    감지 확률 97.00%  
    설명 어린 새끼 고양이가 풀밭에 앉아 있는 모습입니다.  

5. 감지된 물체 고양이  
    감지 확률 81.97%  
    설명 회색 줄무늬를 가진 고양이가 뒷발로 서 있는 모습입니다.  
