<a href="https://colab.research.google.com/github/lightrainofmay/multimodal-search/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-cpu
!pip install flask-cors
!pip install pyngrok
import json
import requests
from bs4 import BeautifulSoup
import re
import time

# ✅ 网站主页
BASE_URL = "https://jino-lan.site/"
ALL_URLS = set()  # 存储所有文章 URL
media_entries = []  # 存储所有媒体资源
id_counter = 1  # 资源 ID 计数器

# ✅ **获取所有文章的 URL**
def get_all_article_links():
    global ALL_URLS
    next_page = BASE_URL  # 初始页面
    while next_page:
        print(f"🔍 Crawling: {next_page}")
        response = requests.get(next_page)
        soup = BeautifulSoup(response.text, 'html.parser')

        # 🔹 提取所有文章链接
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if "/?p=" in href and href.startswith(BASE_URL):  # 确保是文章链接
                ALL_URLS.add(href)

        # 🔹 查找 `下一页` 按钮
        next_btn = soup.find("a", string=re.compile("下一页|Next", re.IGNORECASE))
        next_page = next_btn["href"] if next_btn else None

        if next_page and not next_page.startswith("http"):
            next_page = BASE_URL + next_page  # 修正相对 URL

    print(f"✅ Found {len(ALL_URLS)} article URLs!")

# ✅ **爬取每个文章页面**
def scrape_media_from_page(url):
    global id_counter
    print(f"📄 Scraping: {url}")

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # 🔹 **查找 `.wp-block-media-text` 代码块**
    for media_block in soup.find_all("div", class_="wp-block-media-text"):
        text_content = ""

        # 🔹 **提取 `p` 和 `figcaption` 组合文本**
        text_elements = media_block.find_all(["figcaption"])
        text_lines = [el.get_text(strip=True) for el in text_elements if el.get_text(strip=True)]
        if text_lines:
            text_content = " ".join(text_lines)

        # 🔹 **找到 `audio` 并赋值 `text`**
        for figure in media_block.find_all("figure", class_="wp-block-audio"):
            audio_tag = figure.find("audio")
            if audio_tag and audio_tag.get("src"):
                audio_src = audio_tag["src"]

                print(f"[🎵 Audio] {audio_src} → {text_content}")
                media_entries.append({
                    "id": id_counter,
                    "type": "audio",
                    "file": audio_src,
                    "text": text_content,
                    "page": url
                })
                id_counter += 1

        # 🔹 **找到 `img` 并赋值 `text`**
        for img_tag in media_block.find_all("img"):
            img_src = img_tag.get("src") or img_tag.get("data-src")
            if img_src:
                print(f"[✅ Image] {img_src} → {text_content}")
                media_entries.append({
                    "id": id_counter,
                    "type": "image",
                    "file": img_src,
                    "text": text_content,
                    "page": url
                })
                id_counter += 1

    # 🔹 **查找 `a` 标签中的音频**
    for link in soup.find_all('a', href=True):
        href = link["href"]
        if re.search(r'\.(mp3|wav|ogg|m4a)$', href, re.IGNORECASE):
            desc = link.get_text(strip=True) or f"Audio {id_counter}"
            print(f"[🔗 Audio Link] {href} → {desc}")

            media_entries.append({
                "id": id_counter,
                "type": "audio",
                "file": href,
                "text": desc,
                "page": url
            })
            id_counter += 1

# ✅ **执行完整爬取**
def crawl_all_pages():
    get_all_article_links()  # 先获取所有文章链接
    for article_url in ALL_URLS:
        scrape_media_from_page(article_url)  # 逐个爬取多媒体资源
        time.sleep(1)  # 避免过快爬取，降低服务器负担

    # 🔹 **保存 JSON**
    with open("jino_all_media.json", "w", encoding="utf-8") as f:
        json.dump(media_entries, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Extracted {len(media_entries)} media entries across {len(ALL_URLS)} pages!")

# ✅ **运行爬取**
crawl_all_pages()


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
🔍 Crawling: https://jino-lan.site/
✅ Found 4 article URLs!
📄 Scraping: https://jino-lan.site/?p=806
[🎵 Audio] https://jino-lan.site/wp-content/uploads/2025/03/105.wav → 105.身体 a³³mʌ⁴⁴ body
[✅ Image] https://jino-lan.site/wp-content/uploads/2025/03/105.jpg → 105.身体 a³³mʌ⁴⁴ body
[🎵 Audio] https://jino-lan.site/wp-content/uploads/2025/03/106.wav → 106.四肢 li⁴⁴la⁵⁵ the four limbs
[✅ Image] https://jino-lan.site/wp-content/uploads/2025/03/106-3.jpg → 106.四肢 li⁴⁴la⁵⁵ the four limbs
[🎵 Audio] https://jino-lan.site/wp-content/uploads/2025/03/107-1.wav → 107.四肢2 a³³kji⁴⁴a³³la⁵⁵ the four limbs
[✅ Image] https://j

In [2]:
import json
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import re  # 引入正则表达式支持

# ✅ 加载更适合中文的语义搜索模型
model = SentenceTransformer("moka-ai/m3e-base")

# ✅ 读取 JSON 数据
json_file = "/content/jino_all_media.json"
with open(json_file, "r", encoding="utf-8") as f:
    media_entries = json.load(f)

# ✅ 转换为 Pandas DataFrame
df = pd.DataFrame(media_entries)

# ✅ 确保 "text" 列存在并非空值
if "text" not in df.columns:
    raise ValueError("JSON 数据缺少 'text' 字段")

df["text"] = df["text"].fillna("")  # 替换 NaN 为 空字符串

# ✅ **增强文本，让嵌入支持 IPA + 拼音**
df["enhanced_text"] = df["text"].apply(lambda x: f"{x} {x.lower()}")

# ✅ 计算所有文本的向量（使用增强文本）
df["embedding"] = df["enhanced_text"].apply(
    lambda x: model.encode(x, normalize_embeddings=True) if isinstance(x, str) and x.strip() else np.zeros(768)
)

# ✅ **创建 FAISS 索引**
embedding_dim = df["embedding"].iloc[0].shape[0]  # 获取嵌入维度
index = faiss.IndexFlatL2(embedding_dim)

# ✅ 添加所有嵌入到 FAISS 索引
embeddings_matrix = np.vstack(df["embedding"].values).astype(np.float32)
index.add(embeddings_matrix)

def hybrid_search(query, df, index, top_k=20):
    """
    1️⃣ **关键词搜索**：先获取所有包含 `query` 的文本，确保它们排在最前面。
    2️⃣ **语义搜索（FAISS）**：补充一些可能相关但不包含 `query` 的内容。
    3️⃣ **排序**：先展示关键词匹配的内容，再展示语义匹配的内容。
    4️⃣ **图片和音频的排序**：与文本搜索结果保持相同顺序。
    """

    print(f"\n🔍 **搜索关键词：{query}**")

    # 🔹 **关键词搜索**（增强支持国际音标）
    regex_pattern = rf"{query}"  # 直接匹配，不加 \b，防止 Unicode 问题
    keyword_results = df[df["text"].str.contains(regex_pattern, regex=True, na=False)].copy()
    keyword_results = keyword_results.drop(columns=["embedding"], errors="ignore")  # 去掉 `embedding`

    print("\n✅ **关键词搜索匹配到的内容：**")
    for text in keyword_results["text"].tolist():
        print(f"- {text}")

    # 🔹 **语义搜索（FAISS）**
    query_embedding = model.encode(query, normalize_embeddings=True).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    semantic_results = df.iloc[indices[0]].copy()
    semantic_results = semantic_results.drop(columns=["embedding"], errors="ignore")  # 去掉 `embedding`

    print("\n✅ **FAISS 语义搜索匹配到的内容：**")
    for text in semantic_results["text"].tolist():
        print(f"- {text}")

    # ✅ **合并搜索结果（关键词匹配优先）**
    combined_results = pd.concat([keyword_results, semantic_results]).drop_duplicates().reset_index(drop=True)

    print("\n✅ **最终排序后的搜索结果（关键词匹配在前，语义匹配在后）：**")
    for text in combined_results["text"].tolist():
        print(f"- {text}")

    # ✅ **按文本搜索结果的顺序整理图片和音频**
    text_results = combined_results["text"].tolist()

    # 🔹 **获取对应的文件路径**
    file_results = combined_results[["text", "file"]].dropna()

    # 🔹 **按 text_results 顺序匹配文件**
    sorted_files = []
    for text in text_results:
        matched_files = file_results[file_results["text"] == text]["file"].tolist()
        sorted_files.extend(matched_files)

    # ✅ **分类整理图片和音频**
    image_results = [img for img in sorted_files if img.endswith((".jpg", ".png", ".webp"))]
    audio_results = [audio for audio in sorted_files if audio.endswith((".wav", ".mp3", ".ogg"))]

    print("\n✅ **最终排序后的图像搜索结果（关键词搜索在前）：**")
    for r in image_results:
        print(f"- {r}")

    print("\n✅ **最终排序后的音频搜索结果（关键词搜索在前）：**")
    for r in audio_results:
        print(f"- {r}")

    return {
        "text": text_results,
        "image": image_results,
        "audio": audio_results
    }

# ✅ 运行搜索
query = "身体"
search_results = hybrid_search(query, df, index, top_k=5)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/26.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


🔍 **搜索关键词：身体**

✅ **关键词搜索匹配到的内容：**
- 105.身体 a³³mʌ⁴⁴ body
- 105.身体 a³³mʌ⁴⁴ body

✅ **FAISS 语义搜索匹配到的内容：**
- 105.身体 a³³mʌ⁴⁴ body
- 105.身体 a³³mʌ⁴⁴ body
- 106.四肢 li⁴⁴la⁵⁵ the four limbs
- 106.四肢 li⁴⁴la⁵⁵ the four limbs
- 107.四肢2 a³³kji⁴⁴a³³la⁵⁵ the four limbs

✅ **最终排序后的搜索结果（关键词匹配在前，语义匹配在后）：**
- 105.身体 a³³mʌ⁴⁴ body
- 105.身体 a³³mʌ⁴⁴ body
- 106.四肢 li⁴⁴la⁵⁵ the four limbs
- 106.四肢 li⁴⁴la⁵⁵ the four limbs
- 107.四肢2 a³³kji⁴⁴a³³la⁵⁵ the four limbs

✅ **最终排序后的图像搜索结果（关键词搜索在前）：**
- https://jino-lan.site/wp-content/uploads/2025/03/105.jpg
- https://jino-lan.site/wp-content/uploads/2025/03/105.jpg
- https://jino-lan.site/wp-content/uploads/2025/03/106-3.jpg
- https://jino-lan.site/wp-content/uploads/2025/03/106-3.jpg

✅ **最终排序后的音频搜索结果（关键词搜索在前）：**
- https://jino-lan.site/wp-content/uploads/2025/03/105.wav
- https://jino-lan.site/wp-content/uploads/2025/03/105.wav
- https://jino-lan.site/wp-content/uploads/2025/03/106.wav
- https://jino-lan.site/wp-content/uploads/2025/03/106.wav
- https://jino-lan.sit

In [3]:

import json
import numpy as np
import faiss
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
import pickle
from IPython.display import display, Image, Audio

# ✅ **配置文件路径**
json_file = "/content/jino_all_media.json"
index_file = "/content/faiss_index.bin"
embedding_file = "/content/text_embeddings.pkl"

# ✅ **加载适合中文的语义搜索模型**
model = SentenceTransformer("moka-ai/m3e-base")

# ✅ **读取 JSON 数据**
with open(json_file, "r", encoding="utf-8") as f:
    media_entries = json.load(f)

# ✅ **转换为 Pandas DataFrame**
df = pd.DataFrame(media_entries)

# ✅ **确保 "text" 列存在且不为空**
if "text" not in df.columns:
    raise ValueError("JSON 数据缺少 'text' 字段")

df["text"] = df["text"].fillna("")  # 替换 NaN 为 空字符串

# ✅ **索引和嵌入的加载或重新计算**
if os.path.exists(index_file) and os.path.exists(embedding_file):
    print("✅ 加载已有的 FAISS 索引...")
    index = faiss.read_index(index_file)

    with open(embedding_file, "rb") as f:
        text_embeddings = pickle.load(f)
else:
    print("⚠️ 未找到索引文件，正在重新计算嵌入并创建索引...")

    # **增强文本，支持国际音标**
    df["enhanced_text"] = df["text"].apply(lambda x: f"{x} {x.lower()}")

    # **计算文本嵌入向量**
    df["embedding"] = df["enhanced_text"].apply(
        lambda x: model.encode(x, normalize_embeddings=True) if isinstance(x, str) and x.strip() else np.zeros(768)
    )

    # **存储文本嵌入**
    text_embeddings = np.vstack(df["embedding"].values).astype(np.float32)

    # **创建 FAISS 索引**
    embedding_dim = text_embeddings.shape[1]  # 获取嵌入维度
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(text_embeddings)

    # **保存索引**
    faiss.write_index(index, index_file)

    # **保存文本嵌入**
    with open(embedding_file, "wb") as f:
        pickle.dump(text_embeddings, f)

    print("✅ FAISS 索引和文本嵌入已保存！")


def semantic_search(query, df, index, top_k=5):
    """
    ✅ **执行 FAISS 语义搜索**
    """
    print(f"\n🔍 **搜索关键词：{query}**")

    # 🔹 **计算查询向量**
    query_embedding = model.encode(query, normalize_embeddings=True).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)

    # ✅ **过滤掉超出范围的索引**
    valid_indices = [i for i in indices[0] if i < len(df)]
    if not valid_indices:
        print("\n⚠️ **未找到相关数据**")
        return []

    # 🔹 **获取搜索结果**
    semantic_results = df.iloc[valid_indices].copy()
    semantic_results = semantic_results.drop(columns=["embedding"], errors="ignore")  # 删除 embedding 列

    # ✅ **去重**
    semantic_results = semantic_results.drop_duplicates().reset_index(drop=True)

    return semantic_results["text"].tolist()


def process_results(df, text_results):
    """
    ✅ **匹配文本对应的图片、音频**
    """
    # 🔹 **创建文本 → 文件映射**
    file_results = df[["text", "file"]].dropna().drop_duplicates()
    text_to_files = file_results.groupby("text")["file"].apply(list).to_dict()

    # ✅ **初始化存储**
    text_file_map = {}

    for text in text_results:
        images = []
        audios = []
        if text in text_to_files:
            files = text_to_files[text]
            images = [f for f in files if f.endswith((".jpg", ".png", ".webp"))]
            audios = [f for f in files if f.endswith((".wav", ".mp3", ".ogg"))]

        # ✅ **仅当图片和音频均只有一个时，才保留该条目**
        if len(images) == 1 and len(audios) == 1:
            text_file_map[text] = {"image": images, "audio": audios}

    return text_file_map


def display_results(text_file_map):
    """
    ✅ **仅展示图片、音频均为 1 的数据**
    """
    print("\n✅ **最终搜索结果**")

    if not text_file_map:
        print("\n⚠️ **没有符合要求的匹配项（即每条数据必须恰好有 1 张图片和 1 个音频）**")
        return

    for text, media in text_file_map.items():
        print(f"\n📝 **{text}**")

        # 🔹 **展示图片**
        img_path = media["image"][0]
        display(Image(url=img_path, width=300, height=300))

        # 🔹 **展示音频**
        audio_path = media["audio"][0]
        display(Audio(url=audio_path, autoplay=False))


# ✅ **执行搜索**
query = "火"
semantic_texts = semantic_search(query, df, index, top_k=10)
final_results = process_results(df, semantic_texts)
display_results(final_results)


⚠️ 未找到索引文件，正在重新计算嵌入并创建索引...
✅ FAISS 索引和文本嵌入已保存！

🔍 **搜索关键词：火**

✅ **最终搜索结果**

📝 **22. mi⁴⁴ 火**



📝 **23. 烟 mi⁴⁴khju⁴⁴**



📝 **44. va⁵⁵lo³³ 猪水滚凼**



📝 **20. 露水 pɛ⁴⁴ji⁴⁴**



📝 **63. a³³tɤ⁵⁵ 垃圾**


In [4]:
import faiss
import numpy as np
import pickle

# 保存 FAISS 索引到文件
faiss.write_index(index, "faiss_index.bin")

# 保存 DataFrame（文本、文件信息）
df.to_pickle("media_data.pkl")

# 加载 FAISS 索引和数据
def load_faiss_index():
    index = faiss.read_index("faiss_index.bin")
    df = pd.read_pickle("media_data.pkl")
    return index, df


In [None]:
!pip install flask-cors
!pip install pyngrok
import openai
import json
import numpy as np
import faiss
import pandas as pd
import os
import pickle
import re
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from pyngrok import ngrok
from google.colab import userdata

# ✅ 初始化 Flask
app = Flask(__name__)
CORS(app)

# ✅ 设置 ngrok 认证
NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# ✅ OpenAI API Key
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY

# ✅ 配置文件路径
json_file = "/content/jino_all_media.json"
index_file = "/content/faiss_index.bin"
embedding_file = "/content/text_embeddings.pkl"

# ✅ **加载适合中文的语义搜索模型**
model = SentenceTransformer("moka-ai/m3e-base")

# ✅ **读取 JSON 数据**
with open(json_file, "r", encoding="utf-8") as f:
    media_entries = json.load(f)

# ✅ **转换为 Pandas DataFrame**
df = pd.DataFrame(media_entries)

# ✅ **确保 "text" 列存在且不为空**
df["text"] = df["text"].fillna("")  # 替换 NaN 为 空字符串

# ✅ **索引和嵌入的加载或重新计算**
if os.path.exists(index_file) and os.path.exists(embedding_file):
    print("✅ 加载已有的 FAISS 索引...")
    index = faiss.read_index(index_file)

    with open(embedding_file, "rb") as f:
        text_embeddings = pickle.load(f)
else:
    print("⚠️ 未找到索引文件，正在重新计算嵌入并创建索引...")

    # **增强文本，支持国际音标**
    df["enhanced_text"] = df["text"].apply(lambda x: f"{x} {x.lower()}")

    # **计算文本嵌入向量**
    df["embedding"] = df["enhanced_text"].apply(
        lambda x: model.encode(x, normalize_embeddings=True) if isinstance(x, str) and x.strip() else np.zeros(768)
    )

    # **存储文本嵌入**
    text_embeddings = np.vstack(df["embedding"].values).astype(np.float32)

    # **创建 FAISS 索引**
    embedding_dim = text_embeddings.shape[1]  # 获取嵌入维度
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(text_embeddings)

    # **保存索引**
    faiss.write_index(index, index_file)

    # **保存文本嵌入**
    with open(embedding_file, "wb") as f:
        pickle.dump(text_embeddings, f)

    print("✅ FAISS 索引和文本嵌入已保存！")



def extract_keywords(user_input):
    """
    ✅ **使用 GPT-4 提取用户输入中的关键词（避免返回 "基诺语"）**
    """
    client = openai.OpenAI(api_key=OPENAI_API_KEY)  # 传递 API Key

    prompt = f"""请从下面的中文问题中提取最核心的搜索关键词：

    1. 如果问题是 **"基诺语的X怎么说？"**，你应该只返回 **"X"**，不要返回 "基诺语"。
    2. 仅返回一个最相关的关键词（比如 **"山"、"水"、"火"**）。
    3. 不要返回句子或多余的解释，只返回 **关键词**。

    **问题**：{user_input}
    **关键词**："""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5
    )

    keyword = response.choices[0].message.content.strip()
    print(f"📝 提取的关键词：{keyword}")
    return keyword



def semantic_search(query, df, index, top_k=10):
    """
    ✅ **执行 FAISS 语义搜索**
    """
    print(f"\n🔍 **搜索关键词：{query}**")

    # 🔹 **计算查询向量**
    query_embedding = model.encode(query, normalize_embeddings=True).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)

    # ✅ **过滤掉超出范围的索引**
    valid_indices = [i for i in indices[0] if i < len(df)]
    if not valid_indices:
        print("\n⚠️ **未找到相关数据**")
        return []

    # 🔹 **获取搜索结果**
    semantic_results = df.iloc[valid_indices].copy()
    semantic_results = semantic_results.drop(columns=["embedding"], errors="ignore")  # 删除 embedding 列

    return semantic_results["text"].tolist()


def process_results(df, text_results):
    """
    ✅ **匹配文本对应的图片、音频，并合并相同 `text`**
    """
    file_results = df[["text", "file"]].dropna().drop_duplicates()
    text_to_files = file_results.groupby("text")["file"].apply(list).to_dict()

    text_file_map = {}

    for text in text_results:
        images = set()
        audios = set()

        if text in text_to_files:
            files = text_to_files[text]
            images.update(f for f in files if f.endswith((".jpg", ".png", ".webp")))
            audios.update(f for f in files if f.endswith((".wav", ".mp3", ".ogg")))

        if text in text_file_map:
            text_file_map[text]["images"].update(images)
            text_file_map[text]["audios"].update(audios)
        else:
            text_file_map[text] = {"images": images, "audios": audios}

    for key in text_file_map:
        text_file_map[key]["images"] = list(text_file_map[key]["images"])
        text_file_map[key]["audios"] = list(text_file_map[key]["audios"])

    return text_file_map


@app.route("/chat", methods=["POST"])
def chat():
    """
    ✅ **Chatbot 处理逻辑**
    """
    data = request.json
    user_query = data.get("message", "").strip()

    if not user_query:
        return jsonify({"error": "Message cannot be empty"}), 400

    # 🔹 **使用 GPT-4 提取关键词**
    keyword = extract_keywords(user_query)

    # 🔹 **执行语义搜索**
    search_results = semantic_search(keyword, df, index, top_k=5)

    # 🔹 **匹配多模态数据**
    multi_modal_results = process_results(df, search_results)

    if not multi_modal_results:
        return jsonify({"query": user_query, "response": "查询的词汇不存在", "search_results": {}})

    response_results = []

    for text_result, media in multi_modal_results.items():
        image_urls = list(set(media["images"])) if media["images"] else ["暂无图片"]
        audio_urls = list(set(media["audios"])) if media["audios"] else ["暂无音频"]

        response_results.append({
            "text": text_result,
            "images": image_urls,
            "audios": audio_urls
        })

    return jsonify({"query": user_query, "search_results": response_results})


# ✅ **启动 ngrok**
public_url = ngrok.connect(5000).public_url
print(f"🌍 **公网上访问链接:** {public_url}")

app.run(port=5000)


✅ 加载已有的 FAISS 索引...
🌍 **公网上访问链接:** https://e9dd-104-199-207-47.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 03:19:01] "POST /chat HTTP/1.1" 200 -


📝 提取的关键词：身体

🔍 **搜索关键词：身体**


INFO:werkzeug:127.0.0.1 - - [29/Mar/2025 03:19:36] "POST /chat HTTP/1.1" 200 -


📝 提取的关键词：头

🔍 **搜索关键词：头**


In [None]:
import json
import numpy as np
import faiss
import pandas as pd
import os
import pickle
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from pyngrok import ngrok
from langchain.chat_models import ChatOpenAI
from langchain.schema import StrOutputParser, AIMessage
from langchain.prompts import ChatPromptTemplate
from google.colab import userdata

# ✅ 初始化 Flask
app = Flask(__name__)
CORS(app)

# ✅ 设置 ngrok 认证
NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# ✅ OpenAI API Key
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY

# ✅ 配置文件路径
json_file = "/content/jino_all_media.json"
index_file = "/content/faiss_index.bin"
embedding_file = "/content/text_embeddings.pkl"

# ✅ 加载适合中文的语义搜索模型
model = SentenceTransformer("moka-ai/m3e-base")

# ✅ 读取 JSON 数据
with open(json_file, "r", encoding="utf-8") as f:
    media_entries = json.load(f)

df = pd.DataFrame(media_entries)
df["text"] = df["text"].fillna("")

# ✅ **索引和嵌入的加载**
if os.path.exists(index_file) and os.path.exists(embedding_file):
    print("✅ 加载已有的 FAISS 索引...")
    index = faiss.read_index(index_file)

    with open(embedding_file, "rb") as f:
        text_embeddings = pickle.load(f)
else:
    print("⚠️ 未找到索引文件，正在重新计算嵌入...")
    df["enhanced_text"] = df["text"].apply(lambda x: f"{x} {x.lower()}")
    df["embedding"] = df["enhanced_text"].apply(
        lambda x: model.encode(x, normalize_embeddings=True) if isinstance(x, str) and x.strip() else np.zeros(768)
    )
    text_embeddings = np.vstack(df["embedding"].values).astype(np.float32)
    embedding_dim = text_embeddings.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(text_embeddings)
    faiss.write_index(index, index_file)
    with open(embedding_file, "wb") as f:
        pickle.dump(text_embeddings, f)
    print("✅ FAISS 索引和文本嵌入已保存！")


def semantic_search(query, df, index, top_k=3):
    """
    ✅ **FAISS 语义搜索**
    """
    print(f"\n🔍 **搜索关键词：{query}**")
    query_embedding = model.encode(query, normalize_embeddings=True).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    valid_indices = [i for i in indices[0] if i < len(df)]
    if not valid_indices:
        print("\n⚠️ **未找到相关数据**")
        return []
    semantic_results = df.iloc[valid_indices].copy()
    semantic_results = semantic_results.drop(columns=["embedding"], errors="ignore").drop_duplicates().reset_index(drop=True)
    return semantic_results["text"].tolist()


def process_results(df, text_results, max_items=3):
    """
    ✅ **匹配文本对应的图片、音频**
    """
    file_results = df[["text", "file"]].dropna().drop_duplicates()
    text_to_files = file_results.groupby("text")["file"].apply(list).to_dict()
    text_file_map = {}

    for text in text_results:
        images = []
        audios = []
        if text in text_to_files:
            files = text_to_files[text]
            images = [f for f in files if f.endswith((".jpg", ".png", ".webp"))][:max_items]
            audios = [f for f in files if f.endswith((".wav", ".mp3", ".ogg"))][:max_items]

        if images or audios:
            text_file_map[text] = {"images": images, "audios": audios}

    return text_file_map


@app.route("/chat", methods=["POST"])
def chat():
    """
    ✅ **Chatbot 处理逻辑**
    """
    data = request.json
    user_query = data.get("message", "").strip()

    if not user_query:
        return jsonify({"error": "Message cannot be empty"}), 400

    # 🔹 **执行语义搜索**
    search_results = semantic_search(user_query, df, index, top_k=10)
    multi_modal_results = process_results(df, search_results, max_items=3)

    text_result = list(multi_modal_results.keys())[0] if multi_modal_results else "查询的词汇不存在"
    image_urls = multi_modal_results[text_result]["images"] if text_result in multi_modal_results else []
    audio_urls = multi_modal_results[text_result]["audios"] if text_result in multi_modal_results else []

    print(f"\n📌 最终匹配的文本: {text_result}")
    print(f"📌 选取的图片: {image_urls}")
    print(f"📌 选取的音频: {audio_urls}")

    # ✅ **GPT-4o LLM 设置**
    gpt4o = ChatOpenAI(model="gpt-4o", temperature=0.0, api_key=OPENAI_API_KEY)
    parser = StrOutputParser()

    # ✅ **使用 Markdown 格式返回图片和音频**
    image_markdown = "\n".join([f"![相关图片]({url})" for url in image_urls]) if image_urls else "暂无相关图片"
    audio_markdown = "\n".join([f'<audio controls><source src="{url}" type="audio/mpeg"></audio>' for url in audio_urls]) if audio_urls else "暂无相关音频"

    # ✅ **修正的 Prompt**
    prompt = ChatPromptTemplate.from_template(
        """
        你是一个基于基诺族语言文化知识库的智能助手。请基于以下信息回答用户的问题：

        **查询内容:** {query}

        **相关文本:**
        {text_result}

        **相关图片:**
        {image_markdown}

        **相关音频:**
        {audio_markdown}

        **重要要求:**
        - 只能基于提供的 `文本、图片、音频` 回答问题，不能编造内容。
        - 只使用最相关的内容，不要返回无关信息。
        """
    )

    formatted_prompt = prompt.format(
        query=user_query,
        text_result=text_result,
        image_markdown=image_markdown,
        audio_markdown=audio_markdown
    )

    response = gpt4o.invoke(formatted_prompt)

    return jsonify({
        "query": user_query,
        "response": response.content,
        "search_results": {
            "text": text_result,
            "images": image_urls,
            "audios": audio_urls
        }
    })


# ✅ **启动 ngrok**
public_url = ngrok.connect(5000).public_url
print(f"🌍 **公网上访问链接:** {public_url}")

app.run(port=5000)
