In [1]:
import cv2
import numpy as np
import base64
from dotenv import load_dotenv
import os
import requests
import time
import json
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import pyautogui  # 用于获取屏幕尺寸
from skimage.restoration import denoise_tv_chambolle

# Configuration
load_dotenv()
# Keep the API key in a .env file (or the process environment) and read it via os,
# so it is never hardcoded in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast on both a missing variable (None) and an empty one (""): an empty key
# would otherwise pass this check and only surface later as HTTP 401 on every request.
if not OPENAI_API_KEY:
    raise ValueError("API key not found! Set OPENAI_API_KEY environment variable.")

OPENAI_API_ENDPOINT = "https://api.openai.com/v1/chat/completions"
ROI_SAVE_PATH = Path("temp_roi.jpg")

# Module-level state holding the mouse position and the analysis status
mouse_x, mouse_y = 0, 0
last_result = "Hover & press 's' to analyze"
last_analysis_time = 0   # timestamp of the most recent analysis
bubble_alpha = 0.0       # bubble opacity (drives the fade-in animation)

# Load a font that can render Chinese text (the font file must be present on disk)
FONT_PATH = "assets/font/DouyinSansBold.ttf"  # replace with a suitable font file
FONT_SIZE = 24
font = ImageFont.truetype(FONT_PATH, FONT_SIZE)

# Query the screen resolution
screen_width, screen_height = pyautogui.size()


In [2]:
def mouse_callback(event, x, y, flags, param):
    """Store the latest pointer position in the module-level ``mouse_x`` / ``mouse_y``.

    Only mouse-move events update the stored position; every other event is
    ignored. ``flags`` and ``param`` are accepted because the OpenCV callback
    signature requires them, but they are not used.
    """
    global mouse_x, mouse_y
    if event != cv2.EVENT_MOUSEMOVE:
        return
    mouse_x = x
    mouse_y = y

def analyze_roi_with_openai(image: np.ndarray) -> str:
    """Ask the OpenAI chat-completions endpoint (model ``gpt-4o``) to list the objects in an ROI.

    Args:
        image: The region of interest as an RGB array; it is converted to BGR
            before JPEG encoding because OpenCV's encoder expects BGR.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string of
        the form ``"API Error: <message>"`` when anything fails. This function
        never raises; all exceptions are folded into the returned string.
    """
    max_attempts = 5
    request_timeout = 60  # seconds; without a timeout a stalled connection blocks the UI forever

    try:
        time.sleep(1)  # crude throttle between consecutive analyses

        # Encode in memory instead of round-tripping through a temp file on disk;
        # imencode reports failure through its first return value.
        encoded_ok, jpeg_buffer = cv2.imencode(".jpg", cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
        if not encoded_ok:
            raise RuntimeError("failed to JPEG-encode the ROI")
        image_b64 = base64.b64encode(jpeg_buffer.tobytes()).decode("utf-8")

        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}"
        }

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "请仅列出图片中出现的主要物体名称，不要包含描述或修饰词，例如：'桌子, 椅子, 书, 笔'。"},
                        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + image_b64}}
                    ]
                }
            ],
            "max_tokens": 1000
        }

        for attempt in range(max_attempts):
            response = requests.post(
                OPENAI_API_ENDPOINT, headers=headers, json=payload, timeout=request_timeout
            )
            if response.status_code != 429:
                response.raise_for_status()
                break
            # Rate limited: back off exponentially, but do not sleep after the last attempt.
            if attempt < max_attempts - 1:
                wait_time = 2 ** attempt
                print(f"API 限制，等待 {wait_time} 秒后重试...")
                time.sleep(wait_time)
        else:
            # Every attempt returned 429; say so explicitly instead of failing
            # later on a missing "choices" key in the error body.
            raise RuntimeError(f"rate limited (HTTP 429) after {max_attempts} attempts")

        result = response.json()["choices"][0]["message"]["content"]
        print(result)
        return result.strip()

    except Exception as e:
        return f"API Error: {str(e)}"
            

def draw_text_bubble(image, text, x, y, alpha):
    """Draw a semi-transparent text bubble near a point and return the new image.

    The text is rendered with Pillow using the module-level ``font`` so that
    non-ASCII (e.g. Chinese) strings display correctly.

    Args:
        image: 3-channel uint8 array. The channel swap applied on the way in is
            undone on the way out, so the returned array has the same channel
            order as the input.
        text: String to display inside the bubble.
        x, y: Anchor point (typically the mouse position) in image pixels.
        alpha: Bubble opacity in [0, 1]; values outside the range are clamped.

    Returns:
        A new array of the same size as ``image`` with the bubble composited on top.
    """
    pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Measure the rendered text. The bbox origin is generally not (0, 0)
    # (fonts carry an ascent/bearing offset), so keep it for positioning below.
    left, top, right, bottom = ImageDraw.Draw(pil_img).textbbox((0, 0), text, font=font)
    text_w, text_h = right - left, bottom - top

    inner_margin = 10          # space between the bubble border and the text
    padding = 10               # minimum gap between the bubble and the image border
    bubble_w, bubble_h = text_w + 2 * inner_margin, text_h + 2 * inner_margin

    # Place the bubble slightly away from the pointer, keep it inside the
    # right/bottom border, and never let it start at a negative coordinate
    # (which happens when the bubble is larger than the image).
    x_offset, y_offset = 15, 30
    x1 = max(0, min(x + x_offset, image.shape[1] - bubble_w - padding))
    y1 = max(0, min(y + y_offset, image.shape[0] - bubble_h - padding))
    x2, y2 = x1 + bubble_w, y1 + bubble_h

    # Clamp so that an out-of-range alpha cannot produce an invalid colour value.
    opacity = int(max(0.0, min(1.0, alpha)) * 255)

    # Draw on a fully transparent layer, then alpha-composite it onto the image.
    overlay = Image.new("RGBA", pil_img.size, (0, 0, 0, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    overlay_draw.rectangle(
        [x1, y1, x2, y2],
        fill=(50, 50, 50, opacity),
        outline=(255, 255, 255, opacity),
    )
    # Subtract the bbox origin so the glyphs sit exactly inside the margins.
    overlay_draw.text(
        (x1 + inner_margin - left, y1 + inner_margin - top),
        text,
        font=font,
        fill=(255, 255, 255, opacity),
    )

    composited = Image.alpha_composite(pil_img.convert("RGBA"), overlay).convert("RGB")

    return cv2.cvtColor(np.array(composited), cv2.COLOR_RGB2BGR)  # undo the initial channel swap
    
    




In [3]:
def auto_canny(image, sigma=0.33):
    """Run Canny edge detection with thresholds derived from the median intensity.

    The lower and upper thresholds are placed at ``(1 - sigma)`` and
    ``(1 + sigma)`` times the median of ``image``, truncated to integers and
    limited to the range 0..255.

    Args:
        image: Image array passed straight to ``cv2.Canny``.
        sigma: Relative width of the threshold band around the median.

    Returns:
        Whatever ``cv2.Canny`` returns for the computed thresholds.
    """
    median_intensity = np.median(image)
    low_threshold, high_threshold = (
        int(max(0, (1.0 - sigma) * median_intensity)),
        int(min(255, (1.0 + sigma) * median_intensity)),
    )
    return cv2.Canny(image, low_threshold, high_threshold)

In [4]:
def post_process(edges):
    """Close small gaps in an edge map and smooth it with total-variation denoising.

    Args:
        edges: Edge map with values in 0..255 (e.g. the uint8 output of Canny).

    Returns:
        A uint8 array of the same shape with values in 0..255.
    """
    # Morphological closing joins edge fragments separated by tiny gaps.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

    # Normalise to [0, 1] BEFORE denoising. skimage leaves float input
    # unscaled, so feeding 0..255 floats and multiplying the result by 255
    # afterwards overflows uint8 and wraps the values around.
    normalized = closed.astype(np.float64) / 255.0

    # Total-variation (TV) denoising, used here in place of anisotropic diffusion.
    denoised = denoise_tv_chambolle(normalized, weight=0.1)

    # Back to uint8; clip first so numerical overshoot cannot wrap around.
    return (np.clip(denoised, 0.0, 1.0) * 255).astype(np.uint8)

In [5]:

# Reset the module-level interaction state before the interactive viewer is defined
mouse_x = mouse_y = 0
last_result = ""
last_analysis_time = 0
bubble_alpha = 0.0
canny_threshold = 60   # default Canny threshold
canny_multiplier = 2   # default multiplier for the Canny upper bound

def detect_edge_interactive(image_path, rect_size=200, edge_color=(255, 255, 255), alpha=1):
    """Interactive edge viewer with on-demand GPT-4o analysis of the region under the mouse.

    A square region centred on the pointer is edge-detected every frame and the
    edges are overlaid on the displayed image. Controls:

    * mouse wheel - grow / shrink the region (20 px steps, minimum 50 px)
    * ``s``       - send the region to ``analyze_roi_with_openai`` and show the
                    (truncated) answer in a bubble next to the pointer for a few seconds
    * ``i`` / ``d`` - raise / lower the Canny lower threshold
    * ``g`` / ``f`` - raise / lower the multiplier giving the Canny upper threshold
    * ``q``       - quit

    Args:
        image_path: Path of the image to open.
        rect_size: Initial side length of the region, in pixels of the original image.
        edge_color: Colour written into the overlay where edges are found.
        alpha: Weight of the edge overlay when blended onto the image.

    Raises:
        ValueError: If the image cannot be read.
    """
    global mouse_x, mouse_y, last_result, last_analysis_time, bubble_alpha, canny_threshold, canny_multiplier

    window_name = "Smart Vision Analyzer"
    max_view_w, max_view_h = 1200, 800  # largest displayed size; bigger images are shrunk to fit
    result_display_time = 3             # seconds the result bubble stays fully visible
    fade_step = 0.1                     # per-frame change of the bubble opacity

    source_bgr = cv2.imread(image_path)
    if source_bgr is None:
        raise ValueError("无法读取图像文件")

    original_h, original_w = source_bgr.shape[:2]

    # Shrink (never enlarge) so the displayed image fits the viewing area.
    scale_factor = min(max_view_w / original_w, max_view_h / original_h, 1.0)
    new_w = max(1, int(original_w * scale_factor))
    new_h = max(1, int(original_h * scale_factor))

    # Keep the full-resolution pixels for ROI extraction. Cropping from them
    # avoids upsampling the shrunken image back to full size on every frame.
    full_rgb = cv2.cvtColor(source_bgr, cv2.COLOR_BGR2RGB)
    img = cv2.resize(full_rgb, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Factors converting displayed-image coordinates to original-image coordinates.
    mouse_scale_x = original_w / new_w
    mouse_scale_y = original_h / new_h

    # A non-positive size would yield an empty region.
    rect_size = max(1, int(rect_size))

    def on_mouse(event, x, y, flags, param):
        """Track the pointer position and resize the region on wheel events."""
        nonlocal rect_size
        global mouse_x, mouse_y

        if event == cv2.EVENT_MOUSEMOVE:
            mouse_x, mouse_y = x, y
        elif event == cv2.EVENT_MOUSEWHEEL:
            if flags > 0:
                rect_size = min(rect_size + 20, min(original_h, original_w))
            else:
                rect_size = max(rect_size - 20, 50)

    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.resizeWindow(window_name, new_w, new_h)
    cv2.setMouseCallback(window_name, on_mouse)

    try:
        while True:
            frame = img.copy()
            half_size = rect_size // 2

            # Clamp the region centre into the image. The pointer can be reported
            # outside it (or be stale from an earlier run on a larger image), and an
            # empty region previously made the loop spin without ever reaching
            # waitKey, freezing the window.
            center_x = min(max(int(mouse_x * mouse_scale_x), 0), original_w - 1)
            center_y = min(max(int(mouse_y * mouse_scale_y), 0), original_h - 1)
            x1, y1 = max(0, center_x - half_size), max(0, center_y - half_size)
            x2, y2 = min(original_w, x1 + rect_size), min(original_h, y1 + rect_size)

            roi = full_rgb[y1:y2, x1:x2]

            # Use the thresholds the user adjusts with the keyboard, so the values
            # shown in the HUD are the ones actually applied.
            gray_roi = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
            edges = cv2.Canny(gray_roi, canny_threshold, canny_threshold * canny_multiplier)
            edges = post_process(edges)

            overlay = np.zeros_like(img)

            # Region bounds expressed in displayed-image coordinates.
            y1_view, y2_view = int(y1 / mouse_scale_y), min(int(y2 / mouse_scale_y), new_h)
            x1_view, x2_view = int(x1 / mouse_scale_x), min(int(x2 / mouse_scale_x), new_w)
            overlay_h, overlay_w = y2_view - y1_view, x2_view - x1_view

            # After heavy downscaling the region can collapse to zero pixels;
            # cv2.resize rejects an empty target size, so skip the overlay then.
            if overlay_h > 0 and overlay_w > 0:
                if edges.shape[:2] != (overlay_h, overlay_w):
                    edges = cv2.resize(edges, (overlay_w, overlay_h), interpolation=cv2.INTER_NEAREST)
                overlay[y1_view:y2_view, x1_view:x2_view][edges > 0] = edge_color

            frame = cv2.addWeighted(frame, 1, overlay, alpha, 0)

            # Fade the result bubble in while the result is fresh, out afterwards.
            if time.time() - last_analysis_time <= result_display_time:
                bubble_alpha = min(1.0, bubble_alpha + fade_step)
            else:
                bubble_alpha = max(0.0, bubble_alpha - fade_step)

            if bubble_alpha > 0:
                frame = draw_text_bubble(frame, last_result, mouse_x, mouse_y, bubble_alpha)

            # HUD with the current Canny settings and the key bindings.
            # The multiplier is formatted to hide float noise such as 2.3000000000000003.
            cv2.putText(frame, f"Threshold: {canny_threshold} Multiplier: {canny_multiplier:.1f} | Scroll to resize | 's' to analyze | 'q' to quit",
                        (20, new_h - 20), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2)

            cv2.imshow(window_name, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

            # Mask to the low byte: some platforms set extra high bits in the key code.
            key = cv2.waitKey(30) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('s'):
                # Draw the "analysing" notice onto the frame before the blocking API
                # call; re-showing the unchanged frame never displayed it.
                busy_frame = draw_text_bubble(frame, "分析中...", mouse_x, mouse_y, 1.0)
                cv2.imshow(window_name, cv2.cvtColor(busy_frame, cv2.COLOR_RGB2BGR))
                cv2.waitKey(1)

                last_result = analyze_roi_with_openai(roi)[:20]  # keep the bubble short
                last_analysis_time = time.time()
                bubble_alpha = 0.1
            elif key == ord('i'):
                canny_threshold = min(canny_threshold + 10, 255)
            elif key == ord('d'):
                canny_threshold = max(canny_threshold - 10, 10)
            elif key == ord('f'):
                canny_multiplier = max(canny_multiplier - 0.1, 1)
            elif key == ord('g'):
                canny_multiplier = min(canny_multiplier + 0.1, 5)
    finally:
        # Always release the window, even when an exception interrupts the loop.
        cv2.destroyAllWindows()


In [6]:
if __name__ == "__main__":
    # Demo entry point: replace the path below with your own image.
    image_path = "assets/image/Palace.jpg"
    detect_edge_interactive(image_path=image_path)

窗户, 花园, 树木, 砖墙
