# 构建完整的RAG应用

## step1 搜集文献资料 pdf
## 分类保存在文件夹中，构建文件夹树形结构

In [3]:
import os

def extract_file_dirs(directory):
    """Recursively collect the paths of every ``.pdf`` file under *directory*.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of paths (root joined with file name) in ``os.walk`` order.
        The extension match is case-sensitive; a missing directory yields
        an empty list.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".pdf")
    ]

# Folder holding the source PDFs; extract_file_dirs walks it recursively.
root_dir = "./papers"
files = extract_file_dirs(root_dir)
# Bare expression so the notebook cell displays the collected paths.
files

['./papers\\Enantioselective Iridium-Catalyzed Allylic Substitution with 2-Methylpyridines.pdf',
 './papers\\Iridium-Catalyzed Asymmetric Allylic Amination Reaction with N-Aryl Phosphoramidite Ligands.pdf']

## 将文件由pdf转换为txt，保存在当前目录下的 txt_papers 文件夹中

In [17]:
from pdfminer.high_level import extract_text

def write_to_file(text, file_name):
    """Write *text* to *file_name* as UTF-8, replacing any existing content.

    Args:
        text: String to store (in this notebook, text extracted from a PDF).
        file_name: Destination path; its parent folder must already exist.
    """
    with open(file_name, mode='w', encoding='utf-8') as out:
        out.write(text)

# Extract the text of the first PDF only; `docs` holds one string per
# extracted document.
docs = [extract_text(files[0])]
docs

# Write every extracted document to txt_papers/text_<index>.txt.
# The output folder is created once, before the loop; exist_ok=True replaces
# the per-iteration os.path.exists + os.mkdir check and its check-then-create
# race.
os.makedirs('txt_papers', exist_ok=True)
for i, doc_text in enumerate(docs):
    file_name = f'txt_papers/text_{i}.txt'
    write_to_file(doc_text, file_name)

## step2 切分文档

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1500 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Split each document and flatten the per-document results into one list.
recursive_chunks = [
    piece
    for document_text in docs
    for piece in text_splitter.split_text(document_text)
]
# Bare tuple so the notebook cell shows the first five chunks and the total.
recursive_chunks[:5], len(recursive_chunks)

(['Communications\n\nAngewandte\n\nChemie\n\nAsymmetric Catalysis\n\nInternational Edition: DOI: 10.1002/anie.201700433\nGerman Edition:\nDOI: 10.1002/ange.201700433\n\nEnantioselective Iridium-Catalyzed Allylic Substitution with\n2-Methylpyridines\nXi-Jia Liu and Shu-Li You*\n\nAbstract: An enantioselective iridium-catalyzed allylic sub-\nstitution with a set of highly unstabilized nucleophiles gen-\nerated in situ from 2-methylpyridines is described. Enantioen-\nriched 2-substituted pyridines, which are frequently encoun-\ntered in natural products and pharmaceuticals, could be easily\nconstructed by this simple method in good yields and excellent\nenantioselectivity. The synthetic utility of the pyridine products\nis demonstrated through the synthesis of a key intermediate of\na reported Na+/H+ exchanger inhibitor and the total synthesis\nof ((cid:2))-lycopladine A.\n\nPyridines are among the most prevalent heterocyclic',
  'Pyridines are among the most prevalent heterocyclic\n\nstr

## step3 将分块存入向量数据库中

##### 嵌入模型选择
##### 向量数据库选择
##### 持久化到本地

In [31]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorize the chunks.
# NOTE(review): the model name suggests a Chinese-language variant while the
# sample chunks shown above are English — confirm this is the intended model.
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-small-zh-v1.5")

# Persist under the current working directory, in a folder named after kb_name.
kb_name = "vector_mytry_1"
persist_directory = os.path.join(os.getcwd(), kb_name)

# Build the Chroma store from the text chunks using the embedding model above.
vectordb = Chroma.from_texts(
    texts=recursive_chunks,
    embedding=embedding,
    persist_directory=persist_directory,
)
# Write the store to persist_directory.
vectordb.persist()

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  warn_deprecated(


## step4 检索chain构建

### 词表扩充 sentencepiece

In [None]:
# setup QnA
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
import sentencepiece as spm

# Corpus file(s) the SentencePiece model is trained on.
input_files = [
    "./data/corpus.txt",
]

# Train the tokenizer; output files are named after model_prefix.
spm.SentencePieceTrainer.train(
    input=input_files,
    model_prefix='tokenizer_spm_model',
    vocab_size=5000,
    character_coverage=0.9995, # 0.9995 is used for character-rich languages such as Chinese and Japanese
    model_type='bpe', # one of: unigram, bpe, word, char
    max_sentence_length=2048, # limit is in bytes; one Chinese character takes 3 bytes in UTF-8
)

# tokenizer_spm_model.model is the model file
# tokenizer_spm_model.vocab is the vocabulary file

#### 合并llm和领域词表

In [None]:
# Select the pure-Python protobuf implementation. The variable is set before
# the protobuf-backed imports below so that it is in effect when they load.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

# NOTE(review): despite the `_dir` suffix this points at a .model file —
# confirm the installed transformers version accepts a file path here.
llama2_tokenizer_dir = "llama2_tokenizer/tokenizer.model"
llama2_tokenizer = LlamaTokenizer.from_pretrained(llama2_tokenizer_dir)

# Load the domain tokenizer produced by the SentencePiece training cell.
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model_file = "tokenizer_spm_model.model"
chinese_sp_model.Load(chinese_sp_model_file)

# Parse both tokenizers into editable ModelProto objects.
llama2_spm = sp_pb2_model.ModelProto()
llama2_spm.ParseFromString(llama2_tokenizer.sp_model.serialized_model_proto())

chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# Append every domain piece that the LLaMA-2 vocabulary does not already
# contain; new pieces get a score of 0.
llama_spm_tokens_set = {p.piece for p in llama2_spm.pieces}
for p in chinese_spm.pieces:
    piece = p.piece
    if piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = 0
        llama2_spm.pieces.append(new_p)
# A bare f-string in the middle of a cell is discarded, so print it explicitly.
print(f"New model pieces: {len(llama2_spm.pieces)}")

## save
# Serialize the merged SentencePiece model, then wrap it in a LlamaTokenizer
# and save that in HuggingFace format. Both outputs go to the same folder.
output_sp_dir = 'llama2_chinese'
os.makedirs(output_sp_dir, exist_ok=True)
merged_model_path = output_sp_dir + '/chinese_llama2.model'
with open(merged_model_path, 'wb') as f:
    f.write(llama2_spm.SerializeToString())
tokenizer = LlamaTokenizer(vocab_file=merged_model_path)
output_hf_dir = 'llama2_chinese'
os.makedirs(output_hf_dir, exist_ok=True)
tokenizer.save_pretrained(output_hf_dir)
# A bare f-string in the middle of a cell is discarded, so print it explicitly.
print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

# Resulting layout (as recorded by the original author):
# /llama2_chinese
# /chinese_llama2.model /special_tokens_map.json /tokenizer_config.json /tokenizer.model
# /llama2_tokenizer/tokenizer.model

# test
# Compare the original LLaMA-2 tokenizer with the merged one on a sample text.
# Everything is printed explicitly: a notebook cell only displays its final
# expression, so bare expressions and f-strings before it would be lost.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama2_tokenizer_dir)
chinese_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)
print(tokenizer.all_special_ids)
print(tokenizer.all_special_tokens)
print(tokenizer.special_tokens_map)
text = "白日依山尽，黄河入海流。欲穷千里目，更上一层楼"
print(f"test text: {text}")
print(f"tokenized by llama2 tokenizer: {llama_tokenizer.tokenize(text)}")
print(f"tokenized by chinese llama tokenizer: {chinese_llama_tokenizer.tokenize(text)}")

### query rewrite  构建重写器来重写搜索查询

In [None]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model with temperature 0, used by the chains built in this cell.
llm = ChatOpenAI(temperature=0)

# Prompt asking the model for a better web-search query for question {x};
# the model is told to end its query with '**'.
rewrite_template = """
    Provide a better search query for \
    web search engine to answer the given question, end \
    the queries with '**'. Question: \
    {x} Answer:
"""
rewrite_prompt = ChatPromptTemplate.from_template(rewrite_template)

def _parse(text):
    return text.strip("**")

# Rewrite chain: prompt -> chat model -> plain string -> cleaned query.
rewriter = rewrite_prompt | llm | StrOutputParser() | _parse

# Answer prompt: the model must answer {question} using only {context}.
base_template = """
    Answer the users question based only on the following context:
    <context>
    {context}
    </context>
    Question: {question}
"""
# ChatPromptTemplate is built from messages, not from `template=` /
# `input_variables=` (that is the PromptTemplate signature). from_template
# infers the input variables from the string, as the rewrite prompt does.
base_prompt = ChatPromptTemplate.from_template(base_template)

# Web-search backend used by the retrieval step.
search = DuckDuckGoSearchAPIWrapper()

def retriever(query):
    """Run *query* through the module-level ``search`` wrapper and return its result."""
    return search.run(query)

# Rewrite-retrieve-read chain. The chain input feeds both branches:
#   "context":  input is wrapped as {"x": ...}, rewritten, then passed to retriever
#   "question": input is passed through unchanged
# The resulting dict fills base_prompt, which is sent to llm and parsed to a string.
rewrite_retrieve_read_chain = (
    {
        "context": { "x": RunnablePassthrough() } | rewriter | retriever,
        "question": RunnablePassthrough(),
    }
    | base_prompt
    | llm
    | StrOutputParser()
)

### Contextual Compression

In [None]:
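# Retrieved chunks are long and contain irrelevant information; split them further and use embedding-based filters (no LLM call) to drop redundant and irrelevant passages.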
from langchain.retrievers.document_compressors import DocumentCompressorPipeline, EmbeddingsFilter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers import ContextualCompressionRetriever

# Compression pipeline applied to retrieved documents, in this order:
#   1. splitter         - re-split with chunk_size=1000 and no overlap
#   2. redundant_filter - embedding-based redundancy filter
#   3. relevant_filter  - embedding-based filter with similarity_threshold=0.75
# Both filters use the same embedding model as the vector store.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding)
relevant_filter = EmbeddingsFilter(embeddings=embedding,similarity_threshold=0.75)
pipeline_compressor = DocumentCompressorPipeline(transformers=[splitter, redundant_filter, relevant_filter])
# Retriever that fetches from the vector store and then runs the pipeline above.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor,
    base_retriever=vectordb.as_retriever()
)

def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents separated by blank lines; an empty string when *docs*
        is empty.
    """
    separator = "\n\n"
    return separator.join(item.page_content for item in docs)

# RAG chain over the local vector store. The chain input feeds both branches:
#   "context":  input is wrapped as {"x": ...}, rewritten, retrieved through
#               compression_retriever, then flattened to text by format_docs
#   "question": input is passed through unchanged
rag_chain = (
    {
        "context": { "x": RunnablePassthrough() } | rewriter | compression_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | base_prompt
    | llm
    | StrOutputParser()
)

# Run the chain once with a sample question.
rag_chain.invoke("What did the author do growing up?")


### prepare multi query retriever

In [None]:
from langchain.retrievers import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI

# NOTE(review): this value of `question` is not read anywhere in the visible
# code — looks like leftover sample input; confirm before removing.
question = "What is the capital of France?"
# Rebinds the module-level llm (temperature 0).
llm = ChatOpenAI(temperature=0)
# Multi-query retriever built on the vector-store retriever, driven by llm.
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectordb.as_retriever(), llm=llm)

In [None]:
# System message: answer from the supplied {context}, and say so when the
# answer is unknown rather than inventing one.
qa_system_prompt = """
    Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
    {context}
"""

# Two-message chat prompt: the system message above, then the user's {question}.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        ("human", "{question}"),
    ]
)

def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents separated by blank lines; an empty string when *docs*
        is empty.
    """
    separator = "\n\n"
    return separator.join(item.page_content for item in docs)

# Rebinds rag_chain to a multi-query variant. The chain input feeds both branches:
#   "context":  input goes to retriever_from_llm, and the documents it returns
#               are flattened to text by format_docs
#   "question": input is passed through unchanged
rag_chain = (
    {
        "context": retriever_from_llm | format_docs,
        "question": RunnablePassthrough(),
    }
    | qa_prompt
    | llm
    | StrOutputParser()
)

# Run the chain once with a sample question.
rag_chain.invoke("What did the author do growing up?")

## step5 Ragas效果评估比较

In [None]:

# NOTE(review): `documents` is empty here, so no chunks are produced and the
# question-generation loop further down has nothing to iterate over.
# Presumably it should hold loaded documents exposing .page_content — confirm.
documents = []

# Rebinds the module-level text_splitter with evaluation settings:
# chunk_size=1000, no overlap, length measured with len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False
)

# Re-split the raw text of every document into new chunk documents.
text_splitter_chunks = text_splitter.create_documents([d.page_content for d in documents])

# Parallel lists filled by the generation loop: one entry per evaluated chunk.
questions = []
ground_truths_semantic = []
contexts = []
answers = []

# Prompt for question_chain, which produces the evaluation *questions*.
# It must therefore ask the model to write a question. The previous text
# asked it to "answer the question" although no question is supplied.
question_template = """\
    Given the following context, write one specific question that can be answered using only the context. Return only the question.
    Context: {context}
"""
question_prompt = ChatPromptTemplate.from_template(question_template)

# Prompt for ground_truth_chain: answer {question} strictly from {context}.
ground_truth_template = """\
    Given the following context, answer the question using only the context. If the answer is not contained within the context, say "I don't know."
    Context: {context}
    Question: {question}
"""
ground_truth_prompt = ChatPromptTemplate.from_template(ground_truth_template)

# Both chains: prompt -> chat model -> plain string.
question_chain = question_prompt | llm | StrOutputParser()
ground_truth_chain = ground_truth_prompt | llm | StrOutputParser()

# For chunks 10..19: generate a question, record the chunk as its context,
# produce a reference answer from that chunk, and collect the RAG chain's
# answer to the same question.
for chunk in text_splitter_chunks[10:20]:
    chunk_text = chunk.page_content
    generated_question = question_chain.invoke({"context": chunk_text})
    questions.append(generated_question)
    contexts.append(chunk_text)
    ground_truths_semantic.append(
        ground_truth_chain.invoke({"context": chunk_text, "question": generated_question})
    )
    # The answer under evaluation comes from the RAG chain. text_splitter_chunks
    # is a plain list and has no .invoke method.
    answers.append(rag_chain.invoke(generated_question))

#  将生成的内容格式化为HuggingFace数据集格式
from datasets import load_dataset, Dataset
# One record per generated question. Ragas reads the retrieved passages from
# a "contexts" column holding a list of strings, so the single context string
# is wrapped in a list under that key.
qagc_list = []

for question, answer, context, ground_truth in zip(questions, answers, contexts, ground_truths_semantic):
    qagc_list.append({"question": question, "answer": answer, "contexts": [context], "ground_truth": ground_truth})

eval_dataset = Dataset.from_list(qagc_list)
# Resulting dataset columns: ['question', 'answer', 'contexts', 'ground_truth'],
# with one row per generated question.

# Compute the Ragas metrics on eval_dataset.
# The metric is spelled `answer_relevancy`; the previous `answer_relavancy`
# raised ImportError.
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from ragas import evaluate

result = evaluate(
    eval_dataset,
    metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
    llm=llm,
    embeddings=embedding,
    raise_exceptions=False
)

# Example result:
# {'context_precision': 1.0000, 'faithfulness': 0.8857, 'answer_relevancy': 0.9172, 'context_recall': 1.0000}
result_df = result.to_pandas()
# Bare expression so the notebook cell displays the per-row scores.
result_df





