In [38]:
import os
import time

from google.auth import load_credentials_from_file
import pandas as pd
# Path to the GCP service-account key file. A GOOGLE_APPLICATION_CREDENTIALS
# value that is already exported takes precedence, so the notebook can run on
# another machine without editing this cell; otherwise replace the fallback
# below with your own GCP project IAM credential file.
CREDENTIALS_PATH = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/Users/Apple/secrets/genai-441923-47c3e249f8b8.json"
)
credentials, project_id = load_credentials_from_file(CREDENTIALS_PATH)
# Export the same path so both the explicit credentials above and the
# environment variable always point at one and the same file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIALS_PATH

In [39]:
import os
from typing import List, Union
import fitz
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
import hashlib

# Text-generation model; used by the RAG chain and by the first evaluator cell.
llm = VertexAI(
    model_name="gemini-1.0-pro",
    max_output_tokens=8192,
    temperature=0.3,
    max_workers=2,
)

# Embedding model; used to build and to load the Chroma vector store.
embedding_model = VertexAIEmbeddings(model_name="textembedding-gecko")

In [25]:

# Generate text with the model as a quick smoke test.
prompt = "请简要介绍一下人工智能的发展历史。"
# Use the Runnable `invoke` API: calling the LLM object directly
# (`llm(prompt)`) is deprecated in LangChain.
response = llm.invoke(prompt)
print(response)


## 人工智能发展历史简述

人工智能 (AI) 的发展历史可以追溯到 20 世纪 50 年代，经历了多个阶段，取得了显著的进步。

**1. 萌芽阶段 (1950s-1960s):**

* 1950 年，图灵测试提出，为人工智能研究奠定了基础。
* 1956 年，达特茅斯会议上首次提出“人工智能”的概念。
* 1960s，专家系统开始出现，用于解决特定领域的问题。

**2. 发展阶段 (1970s-1980s):**

* 1970s，人工智能研究陷入瓶颈，专家系统发展受限。
* 1980s，机器学习开始兴起，为人工智能带来了新的突破。

**3. 复兴阶段 (1990s-2010s):**

* 1990s，深度学习技术取得突破，推动人工智能快速发展。
* 2000s，人工智能应用开始普及，例如图像识别、语音识别等。
* 2010s，人工智能在各个领域取得重大进展，例如自动驾驶、医疗诊断等。

**4. 突破阶段 (2020s-至今):**

* 2020s，人工智能模型规模不断扩大，性能大幅提升。
* 人工智能应用范围不断扩展，例如生成式 AI、元宇宙等。
* 人工智能伦理问题受到关注，引发广泛讨论。

**总结:** 人工智能发展至今，取得了巨大的进步，并对各个领域产生了深刻的影响。未来，人工智能将继续发展，并为人类社会带来更多机遇和挑战。

**参考资料:**

* https://en.wikipedia.org/wiki/History_of_artificial_intelligence
* https://www.britannica.com/technology/artificial-intelligence/History-of-artificial-intelligence
* https://www.ibm.com/topics/artificial-intelligence


In [45]:
# Inspect the configured LLM object.
# NOTE(review): the saved repr reports a different model_name than the setup
# cell assigns, so this output looks stale — re-run on a fresh kernel to confirm.
llm

VertexAI(client=<vertexai.generative_models.GenerativeModel object at 0x2b8e89ed0>, project='genai-441923', model_name='gemini-1.5-flash', client_options=ClientOptions: {'api_endpoint': 'us-central1-aiplatform.googleapis.com', 'client_cert_source': None, 'client_encrypted_cert_source': None, 'quota_project_id': None, 'credentials_file': None, 'scopes': None, 'api_key': None, 'api_audience': None, 'universe_domain': None}, default_metadata=(), client_preview=<vertexai.generative_models.GenerativeModel object at 0x2b8e89d80>, temperature=0.3, max_output_tokens=8192, model_family=<GoogleModelFamily.GEMINI_ADVANCED: '2'>)

In [40]:
def get_directory_hash(directory_path: str, chunk_size: int = 1 << 20) -> str:
    """Return an MD5 hex digest over the contents of the PDFs in a directory.

    Files whose name ends in ``.pdf`` (case-insensitive) are fed into a single
    hash in sorted filename order, so the digest changes whenever the bytes of
    any PDF change. Only file contents are hashed, not file names.

    Args:
        directory_path: Directory to scan (not recursive).
        chunk_size: Number of bytes read per step. Files are streamed so a
            large PDF is never held in memory in full; the digest does not
            depend on this value.

    Returns:
        The hexadecimal MD5 digest. For a directory without PDFs this is the
        digest of empty input.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    hasher = hashlib.md5()
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory_path, filename)
        # A directory that happens to be named "*.pdf" cannot be opened for
        # reading; skip it instead of crashing.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                hasher.update(chunk)
    return hasher.hexdigest()

def extract_text_from_pdf(pdf_path: str) -> List[Document]:
    """Extract the text of a PDF as one ``Document`` per non-empty page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of documents whose metadata holds the source path, the 1-based
        page number and the total page count. Pages whose text is empty or
        whitespace-only are skipped. If the file cannot be processed, the
        error is printed and the pages collected so far (possibly none) are
        returned; no exception is propagated.
    """
    documents = []
    try:
        # The context manager closes the document even when a page raises,
        # which a trailing close() call would not.
        with fitz.open(pdf_path) as pdf_document:
            total_pages = len(pdf_document)
            for page_num in range(total_pages):
                page = pdf_document[page_num]
                text = page.get_text("text", flags=fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE)
                if text.strip():
                    metadata = {
                        "source": pdf_path,
                        "page": page_num + 1,
                        "total_pages": total_pages
                    }
                    documents.append(Document(page_content=text, metadata=metadata))
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF must not abort the load
        # of the whole directory.
        print(f"Error processing {pdf_path}: {str(e)}")
    return documents

def load_pdfs_from_directory(directory_path: str) -> List[Document]:
    """Load the pages of every PDF found directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive) for files whose name
            ends in ``.pdf`` (case-insensitive).

    Returns:
        The page documents of all PDFs, concatenated in sorted filename order.
        The list is empty when the directory holds no readable PDF.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sort so the document
    # order is reproducible and matches the order used by get_directory_hash.
    for filename in sorted(os.listdir(directory_path)):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            documents.extend(extract_text_from_pdf(file_path))
    return documents

class RAGPipeline:
    """Retrieval-augmented generation pipeline over the PDFs in a directory.

    The PDFs in ``data_dir`` are split into chunks, embedded with the
    module-level ``embedding_model`` and stored in a Chroma collection that is
    persisted under ``persist_dir``. A hash of the PDF contents is written
    next to the collection so the embeddings are rebuilt only when the PDFs
    change. Answers are produced by the module-level ``llm``.
    """

    # Name of the file, inside persist_dir, that stores the data-directory hash.
    HASH_FILENAME = "directory_hash.txt"

    def __init__(self, data_dir: str = "./data", persist_dir: str = "./chroma_db"):
        """Store the paths and build the pipeline right away.

        Args:
            data_dir: Directory holding the source PDFs.
            persist_dir: Directory in which the vector store is persisted.
        """
        self.data_dir = data_dir
        self.persist_dir = persist_dir
        self.vectorstore = None
        self.rag_chain = None
        # Retrieval log: one entry (a list of chunk texts) per retrieval.
        self.retrieve_docs = []
        self.initialize_pipeline()

    def initialize_pipeline(self):
        """Load or rebuild the vector store, then assemble the RAG chain."""
        # Create persist_dir if it doesn't exist
        os.makedirs(self.persist_dir, exist_ok=True)

        # Check if we need to update embeddings
        should_update = self._should_update_embeddings()
        print(f'Should update embeddings: {should_update}')
        if should_update:
            self._create_new_embeddings()
        else:
            # The stored hash matches the PDFs: reuse the persisted vector store.
            self.vectorstore = Chroma(
                persist_directory=self.persist_dir,
                embedding_function=embedding_model
            )

        retriever = self.vectorstore.as_retriever(
            search_kwargs={"k": 4}
        )
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            # Log the retrieved chunk texts, then join them into the context
            # string that is inserted into the prompt.
            doc_contents = [doc.page_content for doc in docs]
            self.retrieve_docs.append(doc_contents)
            return "\n\n".join(doc_contents)

        self.rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

    def _should_update_embeddings(self) -> bool:
        """Return True when the PDFs differ from what was last embedded.

        That is the case when ``persist_dir`` or the stored hash file is
        missing, or when the stored hash differs from the current hash of
        ``data_dir``.
        """
        if not os.path.exists(self.persist_dir):
            return True

        hash_file = os.path.join(self.persist_dir, self.HASH_FILENAME)
        if not os.path.exists(hash_file):
            return True

        with open(hash_file, 'r') as f:
            stored_hash = f.read().strip()

        return get_directory_hash(self.data_dir) != stored_hash

    def _create_new_embeddings(self):
        """Rebuild the vector store from the PDFs and record their hash.

        Raises:
            ValueError: If no document could be loaded from ``data_dir``.
        """
        # Hash before loading: if a PDF changes while embeddings are being
        # computed, the stored hash is then stale and the next run rebuilds.
        current_hash = get_directory_hash(self.data_dir)

        docs = load_pdfs_from_directory(self.data_dir)
        if not docs:
            raise ValueError(f"No documents were loaded from {self.data_dir}")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )
        splits = text_splitter.split_documents(docs)

        # Drop whatever collection is already persisted. Without this, the new
        # chunks are added on top of the old ones, leaving duplicated and
        # outdated chunks in the store after the PDFs change.
        Chroma(
            persist_directory=self.persist_dir,
            embedding_function=embedding_model
        ).delete_collection()

        # Create new vectorstore with persistence enabled
        self.vectorstore = Chroma.from_documents(
            documents=splits,
            embedding=embedding_model,
            persist_directory=self.persist_dir
        )

        # Save directory hash
        with open(os.path.join(self.persist_dir, self.HASH_FILENAME), 'w') as f:
            f.write(current_hash)

    def retrieve(self, query: str, k: int = 4) -> List["Document"]:
        """Return the ``k`` most similar chunks for a query and log their texts."""
        docs = self.vectorstore.similarity_search(query, k=k)
        # Record the retrieved chunk texts in the retrieval log.
        self.retrieve_docs.append([doc.page_content for doc in docs])
        return docs

    def query(self, question: str) -> str:
        """Answer a question with the RAG chain.

        The chain performs its own retrieval, which adds one entry to the
        retrieval log.
        """
        return self.rag_chain.invoke(question)

    def get_retrieve_history(self) -> List[List[str]]:
        """Return the retrieval log: one list of chunk texts per retrieval."""
        return self.retrieve_docs

    def clear_retrieve_history(self):
        """Empty the retrieval log."""
        self.retrieve_docs = []

    def retrieve_and_query(self, query: str) -> tuple:
        """Answer a question and return the chunks that were used for it.

        Args:
            query: The question to answer.

        Returns:
            A pair ``(answer, contexts)``. ``answer`` is a 1-tuple holding the
            answer string; that shape is kept on purpose because existing
            callers read the text as ``answer[0]``. ``contexts`` is the list
            of chunk texts the chain retrieved for this question, or an empty
            list if nothing was logged.

        Side effects:
            The whole retrieval log is cleared afterwards.
        """
        answer = self.query(query)
        # The chain has just logged its own retrieval, so the newest entry
        # belongs to this question. Reading the oldest entry would return
        # stale contexts whenever the log was not empty beforehand, and a
        # separate retrieve() call would only repeat the same search.
        history = self.get_retrieve_history()
        contexts = history[-1] if history else []
        self.clear_retrieve_history()
        return (answer,), contexts
# Example usage:
# rag = RAGPipeline(data_dir="./data", persist_dir="./chroma_db")
# 
# # To retrieve documents:
# relevant_docs = rag.retrieve("your query here")
# 
# # To get an answer:
# answer = rag.query("your question here")
#
# # To get retrieval history:
# history = rag.get_retrieve_history()
#
# # To clear retrieval history:
# rag.clear_retrieve_history()

In [41]:
# Build the pipeline. The constructor initializes it immediately and prints
# whether the embeddings have to be rebuilt.
rag = RAGPipeline(data_dir="./data", persist_dir="./chroma_db")


Should update embeddings: False


In [None]:
# retrieve_and_query returns the answer wrapped in a 1-tuple together with the
# retrieved contexts; unpack the tuple so `ans` is the answer string itself.
(ans,), retrieve_history = rag.retrieve_and_query("How does the number of datasets and templates affect the performance of instruction tuning in the FLAN model?")

In [31]:
# Display the answer obtained for the sample question.
ans

"The provided context does not directly address the impact of the number of datasets and templates on instruction tuning performance in the FLAN model. However, it does mention that using more datasets per cluster improved performance by almost 10% on average. Conversely, using more templates per dataset had a negligible effect on performance, suggesting that the model's performance is more sensitive to the number of datasets than the number of templates. \n"

In [32]:
# Display the retrieved context returned alongside the answer.
retrieve_history

[['better understand the output format. In addition, for all task clusters, standard deviation among\ntemplates is lower for few-shot FLAN, indicating reduced sensitivity to prompt engineering.\nNLI\nRead. Comp. Closed-Book QA Commonsense\nCoreference\nTranslation\nZero-shot FLAN\nFew-shot FLAN\nPerformance\n20\n40\n60\n80\n54.7 59.3\n59.6 60.0\n53.7\nStruct to text\n57.2\n31.0 33.0\n80.0 80.8\n63.8 67.4\n39.2\n49.4\nTask Cluster:\n# datasets:\n7\n5\n3\n4\n2\n3\n4\nFigure 9:\nAdding few-shot exemplars to FLAN is a complementary method for improving the\nperformance of instruction-tuned models. The orange bars indicate standard deviation among\ntemplates, averaged at the dataset level for each task cluster.\n4.5\nINSTRUCTION TUNING FACILITATES PROMPT TUNING\n32 training \nexamples\nFull training \nset\n100\n0\n50\n75\n25\nPerformance after \nprompt tuning\nInstruction-tuned model\nUntuned model\n63.8\n78.1\n79.1\n87.4\nFigure 10:\nInstruction-tuned\nmodels respond better to continuous i

In [29]:
# Load the evaluation set from JSON into a DataFrame.
test_dataset = pd.read_json('./data/eval_dataset_1.json')

In [30]:
# Rename the columns positionally to the field names used by the cells below.
# NOTE(review): assumes the JSON has exactly four columns in this order — confirm.
test_dataset.columns = ['user_input', 'reference', 'response', 'retrieved_contexts']

In [43]:
import time

In [50]:
# Run every evaluation question through the pipeline and collect the results.
responses = []
retrieved_contexts = []
for question in test_dataset['user_input']:
    res, retrieve_data = rag.retrieve_and_query(question)
    # The answer comes back wrapped in a 1-tuple; keep only the string.
    responses.append(res[0])
    retrieved_contexts.append(retrieve_data)
    # Pause between requests (presumably to stay within API rate limits).
    time.sleep(1)

# Assign whole columns at once. Writing a list into a single cell with .loc is
# treated by pandas as a multi-value assignment and is fragile; building the
# columns from lists also avoids chained, label-based row lookups.
test_dataset['response'] = responses
test_dataset['retrieved_contexts'] = retrieved_contexts

In [62]:
from ragas import EvaluationDataset
# Build the ragas evaluation dataset from the first four rows only.
eval_dataset = EvaluationDataset.from_pandas(test_dataset[:4])

In [63]:
# Show the evaluation dataset as a DataFrame to check its fields.
eval_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,reference
0,How does instruction tuning affect the zero-sh...,"[models finetuned on dataset name only, we rep...",Instruction tuning significantly improves zero...,For larger models on the order of 100B paramet...
1,What is the Zero-shot-CoT method and how does ...,[Large Language Models are Zero-Shot Reasoners...,"I'm sorry, but the provided context does not c...",Zero-shot-CoT is a zero-shot template-based pr...
2,How does prompt tuning affect model performanc...,[prompt tuning\nInstruction-tuned model\nUntun...,Prompt tuning can improve the performance of N...,Prompt tuning improves model performance in im...
3,What is the purpose of instruction tuning in l...,[LM\n(C) Instruction tuning (FLAN)\nInstructio...,Instruction tuning aims to improve the ability...,The purpose of instruction tuning in language ...


In [64]:
from ragas.run_config import RunConfig

# Run the evaluation with a single worker and a 60-second timeout.

my_run_config = RunConfig(max_workers=1, timeout=60)

In [65]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Wrap the Vertex AI models as ragas evaluators.
# NOTE(review): a later cell rebinds both names to OpenAI-backed wrappers, so
# these two objects are not the ones used by the evaluation below.
evaluator_llm = LangchainLLMWrapper(llm, run_config=my_run_config)
evaluator_embeddings = LangchainEmbeddingsWrapper(embedding_model, run_config=my_run_config)

In [66]:
import os
# Never store the key in the notebook. Keep a key that is already exported and
# prompt for one only when it is missing, instead of overwriting it with "".
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [67]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
# Use OpenAI models as the ragas evaluators; this rebinds evaluator_llm and
# evaluator_embeddings. No run_config is passed to the wrappers here.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [68]:
# Inspect the evaluator wrapper, including the run configuration it carries.
evaluator_llm

LangchainLLMWrapper(run_config=RunConfig(timeout=180, max_retries=10, max_wait=60, max_workers=16, exception_types=<class 'openai.RateLimitError'>, log_tenacity=False, seed=42), multiple_completion_supported=False)

In [69]:
# Metrics to compute: the three LLM-based ones share the evaluator LLM, the
# similarity metric uses the evaluator embeddings.
metrics = [
    LLMContextRecall(llm=evaluator_llm),
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
]

# Score the evaluation dataset with the run configuration defined earlier.
results = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    run_config=my_run_config,
)

Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

In [70]:
# Display the aggregate score of each metric.
results

{'context_recall': 0.5833, 'factual_correctness': 0.2850, 'faithfulness': 0.2750, 'semantic_similarity': 0.9050}

In [71]:
# Convert the results to a DataFrame to inspect the per-sample scores.
df = results.to_pandas()
df.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_recall,factual_correctness,faithfulness,semantic_similarity
0,How does instruction tuning affect the zero-sh...,"[models finetuned on dataset name only, we rep...",Instruction tuning significantly improves zero...,For larger models on the order of 100B paramet...,0.0,0.0,0.5,0.887301
1,What is the Zero-shot-CoT method and how does ...,[Large Language Models are Zero-Shot Reasoners...,"I'm sorry, but the provided context does not c...",Zero-shot-CoT is a zero-shot template-based pr...,1.0,0.22,0.0,0.848312
2,How does prompt tuning affect model performanc...,[prompt tuning\nInstruction-tuned model\nUntun...,Prompt tuning can improve the performance of N...,Prompt tuning improves model performance in im...,0.333333,0.0,0.0,0.917471
3,What is the purpose of instruction tuning in l...,[LM\n(C) Instruction tuning (FLAN)\nInstructio...,Instruction tuning aims to improve the ability...,The purpose of instruction tuning in language ...,1.0,0.92,0.6,0.96688
