In [4]:
!pip install streamlit PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
import streamlit as st
import chromadb
from chromadb.utils import embedding_functions
from openai import OpenAI
import os
from dotenv import load_dotenv
import PyPDF2
import uuid

# Load environment variables via python-dotenv (the OpenAI client below reads
# OPENAI_API_KEY from the environment).
load_dotenv()

# Chunking defaults used by SimplePDFProcessor. Both are measured in
# characters, because chunks are produced by slicing the extracted text.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

### 簡單的模型選擇器

In [6]:
class SimpleModelSelector:
    """Sidebar helper for choosing the LLM and the embedding model."""

    def __init__(self):
        # Available LLM back-ends: internal key -> label shown in the sidebar.
        self.llm_models = dict(openai="GPT-4", ollama="Llama3")

        # Available embedding back-ends: display name, vector dimensions and
        # the model identifier (None for Chroma's built-in default).
        openai_embedding = {
            "name": "OpenAI Embeddings",
            "dimensions": 1536,
            "model_name": "text-embedding-3-small",
        }
        chroma_embedding = {
            "name": "Chroma Default",
            "dimensions": 384,
            "model_name": None,
        }
        nomic_embedding = {
            "name": "Nomic Embed Text",
            "dimensions": 768,
            "model_name": "nomic-embed-text",
        }
        self.embedding_models = {
            "openai": openai_embedding,
            "chroma": chroma_embedding,
            "nomic": nomic_embedding,
        }

    def select_models(self):
        """Render two sidebar radio groups and return the selected keys.

        Returns:
            A ``(llm_key, embedding_key)`` tuple, where each element is a key
            of ``self.llm_models`` / ``self.embedding_models`` respectively.
        """
        sidebar = st.sidebar
        sidebar.title("📚 模型選擇")

        def llm_label(key):
            # Show the human-readable LLM label instead of the internal key.
            return self.llm_models[key]

        def embedding_label(key):
            # Show the embedding model's display name instead of its key.
            return self.embedding_models[key]["name"]

        chosen_llm = sidebar.radio(
            "選擇 LLM 模型:",
            options=[*self.llm_models],
            format_func=llm_label,
        )
        chosen_embedding = sidebar.radio(
            "選擇 Embedding 模型:",
            options=[*self.embedding_models],
            format_func=embedding_label,
        )

        return chosen_llm, chosen_embedding

### 處理 PDF 檔案並切割文字

In [7]:
class SimplePDFProcessor:
    """Read a PDF and split its text into overlapping, sentence-aligned chunks.

    Args:
        chunk_size: Number of characters the chunking cursor advances per
            chunk. Must be positive.
        chunk_overlap: Number of characters before the cursor that are
            repeated at the start of every chunk after the first.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the chunking loop could
            never advance).
    """

    def __init__(self, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of characters")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def read_pdf(self, pdf_file):
        """Extract the text of every page, each followed by a newline.

        Args:
            pdf_file: Path or binary file-like object accepted by
                ``PyPDF2.PdfReader``.

        Returns:
            The concatenated page texts as a single string.
        """
        reader = PyPDF2.PdfReader(pdf_file)
        # Guard against a page yielding no text (e.g. image-only pages) so the
        # concatenation never fails on a non-string value.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    def create_chunks(self, text, pdf_file):
        """Split ``text`` into chunk dictionaries.

        Every chunk except the last is cut after the last "." found in its
        window, provided that cut lies beyond the text already covered by the
        previous chunk. Otherwise the full window is used, which guarantees
        that the loop always moves forward.

        Args:
            text: The text to split. Empty or ``None`` yields no chunks.
            pdf_file: Object whose ``name`` attribute is stored as the
                ``source`` of every chunk.

        Returns:
            A list of dicts with keys ``id`` (random UUID string), ``text``
            and ``metadata`` (``{"source": pdf_file.name}``).
        """
        chunks = []
        if not text:
            return chunks

        text_length = len(text)
        position = 0  # first character not yet covered by a finished chunk

        while position < text_length:
            end = position + self.chunk_size

            # Reach back into the previous chunk for overlap, but never before
            # the start of the text: a negative slice start would wrap around
            # to the end of the string.
            if position > 0:
                window_start = max(0, position - self.chunk_overlap)
            else:
                window_start = 0

            chunk = text[window_start:end]

            if end < text_length:
                last_period = chunk.rfind(".")
                cut = window_start + last_period + 1
                # Only honour a sentence boundary that makes progress. A period
                # inside the overlap region would move ``end`` back to (or
                # before) ``position`` and the loop would never terminate.
                if last_period != -1 and cut > position:
                    chunk = chunk[: last_period + 1]
                    end = cut

            chunks.append({
                "id": str(uuid.uuid4()),
                "text": chunk,
                "metadata": {"source": pdf_file.name},
            })

            position = end

        return chunks


### 簡單版 RAG 系統

In [8]:
class SimpleRAGSystem:
    """Minimal retrieval-augmented generation pipeline backed by ChromaDB.

    Args:
        embedding_model: Key selecting the embedding function: ``"openai"``,
            ``"nomic"`` (served through a local OpenAI-compatible endpoint),
            or anything else for Chroma's default embedding function.
        llm_model: ``"openai"`` to use the OpenAI API; any other value uses
            the local OpenAI-compatible endpoint at ``localhost:11434``.
    """

    def __init__(self, embedding_model="openai", llm_model="openai"):
        self.embedding_model = embedding_model
        self.llm_model = llm_model

        # Persistent vector store kept in ./chroma_db.
        self.db = chromadb.PersistentClient(path="./chroma_db")

        # The embedding function must exist before the collection is opened.
        self.setup_embedding_function()

        # Chat client: the OpenAI API, or the local OpenAI-compatible server.
        if llm_model == "openai":
            self.llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        else:
            self.llm = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

        # One collection per embedding model (see setup_collection).
        self.collection = self.setup_collection()

    def setup_embedding_function(self):
        """Set ``self.embedding_fn`` according to ``self.embedding_model``.

        Raises:
            Exception: Any error raised while building the embedding function
                is shown in the Streamlit UI and then re-raised.
        """
        try:
            if self.embedding_model == "openai":
                self.embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
                    api_key=os.getenv("OPENAI_API_KEY"),
                    model_name="text-embedding-3-small",
                )
            elif self.embedding_model == "nomic":
                self.embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
                    api_key="ollama",
                    api_base="http://localhost:11434/v1",
                    model_name="nomic-embed-text",
                )
            else:
                self.embedding_fn = embedding_functions.DefaultEmbeddingFunction()
        except Exception as e:
            st.error(f"設定嵌入函式錯誤: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def setup_collection(self):
        """Open the collection for the current embedding model, creating it if needed.

        The collection is named ``documents_<embedding_model>`` so vectors
        produced by different embedding models are never mixed.

        Returns:
            The ChromaDB collection object.

        Raises:
            Exception: If the collection can neither be opened nor created;
                the error is shown in the Streamlit UI and re-raised.
        """
        collection_name = f"documents_{self.embedding_model}"
        try:
            try:
                collection = self.db.get_collection(
                    name=collection_name, embedding_function=self.embedding_fn
                )
            except Exception:
                # The exception type raised for a missing collection is not
                # pinned down here, so catch Exception broadly. Unlike a bare
                # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
                collection = self.db.create_collection(
                    name=collection_name,
                    embedding_function=self.embedding_fn,
                    metadata={"model": self.embedding_model},
                )
                st.success(f"新建 collection 成功 ({self.embedding_model})")
            else:
                st.info(f"已使用現有的 collection ({self.embedding_model})")
            return collection
        except Exception as e:
            st.error(f"建立 collection 出錯: {str(e)}")
            raise

    def add_documents(self, chunks):
        """Store text chunks in the collection.

        Args:
            chunks: Iterable of dicts with ``id``, ``text`` and ``metadata``
                keys (the shape produced by ``SimplePDFProcessor.create_chunks``).

        Returns:
            ``True`` on success; ``False`` if an error occurred (the error is
            shown in the Streamlit UI).
        """
        try:
            # Explicit None check: do not rely on the truthiness of the
            # collection object.
            if self.collection is None:
                self.collection = self.setup_collection()

            self.collection.add(
                ids=[chunk["id"] for chunk in chunks],
                documents=[chunk["text"] for chunk in chunks],
                metadatas=[chunk["metadata"] for chunk in chunks],
            )
            return True
        except Exception as e:
            st.error(f"新增文件時出錯: {str(e)}")
            return False

    def query_documents(self, query, n_results=3):
        """Retrieve the stored chunks most relevant to ``query``.

        Args:
            query: The user's question.
            n_results: Number of chunks to request from the collection.

        Returns:
            The raw ChromaDB query result, or ``None`` if an error occurred
            (the error is shown in the Streamlit UI).
        """
        try:
            if self.collection is None:
                raise ValueError("找不到可用的 collection")

            results = self.collection.query(query_texts=[query], n_results=n_results)
            return results
        except Exception as e:
            st.error(f"查詢文件時出錯: {str(e)}")
            return None

    def generate_response(self, query, context):
        """Ask the LLM to answer ``query`` using only ``context``.

        Args:
            query: The user's question.
            context: Either a string, or a list/tuple of retrieved passages.
                Passages are joined with blank lines so the model sees plain
                text rather than a Python list representation.

        Returns:
            The model's answer text, or ``None`` if an error occurred (the
            error is shown in the Streamlit UI).
        """
        try:
            if isinstance(context, (list, tuple)):
                context = "\n\n".join(str(passage) for passage in context)

            prompt = f"""
            根據以下內容回答問題。
            如果內容中沒有答案，請直接說不知道。

            內容: {context}

            問題: {query}

            答案:
            """

            response = self.llm.chat.completions.create(
                model="gpt-4o-mini" if self.llm_model == "openai" else "llama3.2",
                messages=[
                    {"role": "system", "content": "你是一位樂於助人的助理。"},
                    {"role": "user", "content": prompt},
                ],
            )

            return response.choices[0].message.content
        except Exception as e:
            st.error(f"產生回答時出錯: {str(e)}")
            return None

    def get_embedding_info(self):
        """Describe the embedding model currently in use.

        Returns:
            A dict with ``name`` and ``dimensions`` (taken from
            ``SimpleModelSelector.embedding_models``) and ``model`` (the
            embedding model key).
        """
        model_selector = SimpleModelSelector()
        model_info = model_selector.embedding_models[self.embedding_model]
        return {
            "name": model_info["name"],
            "dimensions": model_info["dimensions"],
            "model": self.embedding_model,
        }

In [9]:
def main():
    """Streamlit entry point: model selection, PDF ingestion and Q&A.

    Streamlit re-runs this function on every widget interaction, so anything
    that must survive a re-run (processed file names, the selected models and
    the RAG system instance) is kept in ``st.session_state``.
    """
    st.title("🤖 簡單版 RAG 系統")

    # Initialise session state on the first run.
    if "processed_files" not in st.session_state:
        st.session_state.processed_files = set()
    if "current_embedding_model" not in st.session_state:
        st.session_state.current_embedding_model = None
    if "current_llm_model" not in st.session_state:
        st.session_state.current_llm_model = None
    if "rag_system" not in st.session_state:
        st.session_state.rag_system = None

    # Model selection (sidebar).
    model_selector = SimpleModelSelector()
    llm_model, embedding_model = model_selector.select_models()

    # A different embedding model means a different collection: forget which
    # files were processed and rebuild the RAG system.
    previous_embedding_model = st.session_state.current_embedding_model
    if embedding_model != previous_embedding_model:
        st.session_state.processed_files.clear()
        st.session_state.current_embedding_model = embedding_model
        st.session_state.rag_system = None
        # On the very first run nothing has "changed", so stay quiet.
        if previous_embedding_model is not None:
            st.warning("嵌入模型已變更，請重新上傳文件")

    # The cached RAG system holds the LLM client it was created with, so it
    # must be rebuilt when the LLM choice changes. Otherwise the new selection
    # would be silently ignored. Processed files stay valid because the
    # collection depends only on the embedding model.
    if llm_model != st.session_state.current_llm_model:
        st.session_state.current_llm_model = llm_model
        st.session_state.rag_system = None

    try:
        if st.session_state.rag_system is None:
            st.session_state.rag_system = SimpleRAGSystem(embedding_model, llm_model)

        embedding_info = st.session_state.rag_system.get_embedding_info()
        st.sidebar.info(
            f"目前使用模型:\n"
            f"- 名稱: {embedding_info['name']}\n"
            f"- 維度: {embedding_info['dimensions']}"
        )
    except Exception as e:
        st.error(f"初始化 RAG 系統時出錯: {str(e)}")
        return

    # PDF upload.
    pdf_file = st.file_uploader("上傳 PDF 文件", type="pdf")

    # Only ingest files that have not been processed in this session.
    if pdf_file and pdf_file.name not in st.session_state.processed_files:
        processor = SimplePDFProcessor()
        with st.spinner("處理 PDF 中..."):
            try:
                text = processor.read_pdf(pdf_file)
                chunks = processor.create_chunks(text, pdf_file)
                if st.session_state.rag_system.add_documents(chunks):
                    st.session_state.processed_files.add(pdf_file.name)
                    st.success(f"成功處理 {pdf_file.name}")
            except Exception as e:
                st.error(f"處理 PDF 時出錯: {str(e)}")

    # Question-answering UI, available once at least one file was ingested.
    if st.session_state.processed_files:
        st.markdown("---")
        st.subheader("🔍 問問題")
        query = st.text_input("輸入問題:")

        if query:
            with st.spinner("生成回答中..."):
                results = st.session_state.rag_system.query_documents(query)
                if results and results["documents"]:
                    # results["documents"][0] holds the passages retrieved for
                    # the single query text that was sent.
                    response = st.session_state.rag_system.generate_response(
                        query, results["documents"][0]
                    )

                    if response:
                        st.markdown("### 📝 回答:")
                        st.write(response)

                        with st.expander("查看來源段落"):
                            for idx, doc in enumerate(results["documents"][0], 1):
                                st.markdown(f"**段落 {idx}:**")
                                st.info(doc)
    else:
        st.info("👆 請先上傳 PDF 文件")

# Script entry point. The app must be launched with `streamlit run`; when this
# cell is executed directly in the notebook kernel, Streamlit reports that
# session state does not function (see the cell output below).
if __name__ == "__main__":
    main()

2025-04-17 21:54:20.585 
  command:

    streamlit run c:\Users\33313\.conda\envs\openai\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-04-17 21:54:20.587 Session state does not function when running a script without `streamlit run`
