# 文本切分 (Text Splitting)
# RAG分块策略之语义分块

### Fixed size chunking
### Recursive Chunking
### Document Specific Chunking
### Semantic Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Baseline recursive splitter; it produces the "naive" chunks that the
# semantic chunks are compared against later in the notebook.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
)

In [None]:
# Raw input strings for the splitter demo (a single placeholder string).
texts = ['ArithmeticErrordddddddd']

In [None]:
# Split every input text and flatten the resulting pieces into one list.
splits = [piece for text in texts for piece in text_splitter.split_text(text)]
# Cell output: a preview of the first five pieces plus the total count.
splits[:5], len(splits)

In [None]:
# Source documents consumed by the chunking cells below; empty in this notebook.
# NOTE(review): presumably meant to be filled by a document loader — confirm.
documents: list = []

In [None]:
# Chunk the documents with the recursive splitter and print chunks 10-14,
# each followed by an empty line, for a quick visual check.
naive_chunks = text_splitter.split_documents(documents)
for naive_chunk in naive_chunks[10:15]:
    print(naive_chunk.page_content, end="\n\n")

In [None]:
from langchain.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [None]:
# Embedding model shared by the semantic chunker, the vector store and the
# Ragas evaluation.
# NOTE(review): 'ddddd' is a placeholder string; the consumers need an
# embeddings object (e.g. the OpenAIEmbeddings imported in this cell group) —
# replace before running.
# NOTE(review): in current LangChain releases SemanticChunker appears to live in
# langchain_experimental.text_splitter — confirm the import used by this notebook.
embed_model = 'ddddd'

# The threshold type must be spelled "percentile"; an unknown name is rejected
# by the chunker.
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Re-chunk the raw text of every source document at semantic breakpoints.
semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])

In [None]:
from langchain_community.vectorstores import Chroma

# Embed the semantic chunks with embed_model and store them in a Chroma
# collection that is persisted under ./semantic_chunks.
semantic_chunk_vectorstore = Chroma.from_documents(
    documents=semantic_chunks,
    embedding=embed_model,
    persist_directory="./semantic_chunks",
)

# Retriever over that store, configured with k=3 search results.
semantic_chunk_retriever = semantic_chunk_vectorstore.as_retriever(search_kwargs={"k": 3})
# Smoke-test query; the notebook displays whatever the retriever returns.
semantic_chunk_retriever.invoke("Describe the Feature-based Approach with BERT")

In [None]:
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Prompt text for the RAG chain. The placeholder names must match the keys the
# chain feeds into the prompt ("context" and "question"); any other name leaves
# a required variable unfilled and formatting fails.
rag_template = """
use the following context to answer the user's question, please response in chinese:
    Context:
    {context}
    
    -----
    User's question:
    {question}
"""

# Compile the template string into the chat prompt used by the RAG chain.
rag_prompt = ChatPromptTemplate.from_template(rag_template)

In [None]:
# NOTE(review): ChatGroq and userdata are bound to empty-string placeholders
# here, so the call below cannot succeed as written (a str is not callable and
# has no .get method). Presumably ChatGroq should come from the Groq LangChain
# integration and userdata from google.colab — confirm and restore the imports.
ChatGroq = ''
userdata = ''
# Chat model shared by the RAG chain and the question / ground-truth chains.
chat_model = ChatGroq(temperature=0,
                      model_name="mixtral-8x7b-32768",
                      api_key=userdata.get("GROQ_API_KEY"))

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# RAG pipeline over the semantic chunks: the incoming question is sent to the
# retriever (as "context") and passed through unchanged (as "question"), then
# the filled prompt goes to the chat model and the reply is parsed to a string.
semantic_rag_chain = (
    {"context": semantic_chunk_retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | chat_model
    | StrOutputParser()
)
# Example query; the notebook displays the generated answer.
semantic_rag_chain.invoke("Describe the Feature-based Approach with BERT?")

### 语义块的Ragas评估比较

In [None]:
# Separate splitter used only to cut the documents into chunks from which the
# synthetic evaluation questions are generated (chunk_size=1000, no overlap,
# length measured with len, separators treated as plain strings).
synthetic_data_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False
)

# Chunk the raw text of every source document for question generation.
synthetic_data_chunks = synthetic_data_splitter.create_documents([d.page_content for d in documents])


In [None]:
# Parallel accumulators for the evaluation set: entry i of each list belongs
# to the same sampled chunk.
questions, ground_truths_semantic, contexts, answers = [], [], [], []

# Prompt used to GENERATE a question from a chunk. It must ask the model to
# write a question; it only receives "context", so it cannot ask the model to
# answer one.
question_template = """\
    You are a teacher preparing a test. Please create a question that can be answered by referencing the following context.
    Context: {context}
"""
# Chat prompt built from the question template; it is filled with a chunk's
# text under the "context" key.
question_prompt = ChatPromptTemplate.from_template(question_template)

# Template asking the model to answer a question strictly from the supplied
# context, replying "I don't know." when the context does not contain the answer.
ground_truth_template = """\
    Given the following context, answer the question using only the context. If the answer is not contained within the context, say "I don't know."
    Context: {context}
    Question: {question}
"""
# Chat prompt used to produce the reference ("ground truth") answers.
ground_truth_prompt = ChatPromptTemplate.from_template(ground_truth_template)


In [None]:
# Chains that turn a chunk into a synthetic question and into a reference answer.
question_chain = question_prompt | chat_model | StrOutputParser()
ground_truth_chain = ground_truth_prompt | chat_model | StrOutputParser()

# Build one evaluation row per chunk from a fixed sample (chunks 10-19).
for chunk in synthetic_data_chunks[10:20]:
    chunk_text = chunk.page_content
    question = question_chain.invoke({"context": chunk_text})
    questions.append(question)
    # Ragas expects each row's contexts to be a list of strings, so wrap the
    # single chunk in a list (same shape the naive-chunker loop produces).
    contexts.append([chunk_text])
    ground_truths_semantic.append(
        ground_truth_chain.invoke({"context": chunk_text, "question": question})
    )
    answers.append(semantic_rag_chain.invoke(question))

In [None]:
# Format the generated content as a HuggingFace Dataset.
from datasets import load_dataset, Dataset

# One row per generated question. Ragas reads the passages from a column named
# "contexts" (plural) holding a list of strings, so use that exact key and wrap
# a bare string in a list if the accumulator stored one.
qagc_list = [
    {
        "question": question,
        "answer": answer,
        "contexts": context if isinstance(context, list) else [context],
        "ground_truth": ground_truth,
    }
    for question, answer, context, ground_truth in zip(
        questions, answers, contexts, ground_truths_semantic
    )
]

eval_dataset = Dataset.from_list(qagc_list)
eval_dataset

# Expected display (one row per sampled chunk, i.e. at most 10):
# Dataset({
#     features: ['question', 'answer', 'contexts', 'ground_truth'],
#     num_rows: 10
# })

In [None]:
# Run the Ragas metrics over the evaluation dataset built from the semantic chunks.

# The metric is exported as `answer_relevancy`; the misspelled name cannot be imported.
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision
from ragas import evaluate

# Score with the notebook's own chat model and embeddings; raise_exceptions=False
# is passed so a failing row does not abort the whole run.
result = evaluate(
    eval_dataset,
    metrics=[answer_relevancy, faithfulness, context_recall, context_precision],
    llm=chat_model,
    embeddings=embed_model,
    raise_exceptions=False
)

In [None]:
import os
from google.colab import userdata
import openai

# Read the OpenAI key from Colab's userdata and export it through the process
# environment.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
# Hand the same key to the openai module directly.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
from ragas import evaluate
# Imported locally so this cell does not depend on an earlier cell having run.
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

# evaluate() takes metric objects, not name strings; use the four metrics whose
# scores appear in the recorded output below.
result = evaluate(
    eval_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ]
)
result
# {'context_precision': 1.0000, 'faithfulness': 0.8857, 'answer_relevancy': 0.9172, 'context_recall': 1.0000}
# Per-row scores as a DataFrame for inspection.
result_df = result.to_pandas()
result_df

### Naive Chunker的Ragas评估比较

In [None]:
import tqdm

# naive_rag_chain is not defined anywhere in the visible notebook, so build it
# here: same prompt, chat model and k as the semantic chain, but retrieving
# from the recursively split (naive) chunks kept in their own Chroma directory.
naive_chunk_vectorstore = Chroma.from_documents(
    documents=naive_chunks,
    embedding=embed_model,
    persist_directory="./naive_chunks",
)
naive_chunk_retriever = naive_chunk_vectorstore.as_retriever(search_kwargs={"k": 3})
naive_rag_chain = (
    {"context": naive_chunk_retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | chat_model
    | StrOutputParser()
)

# Start from empty accumulators so rows from the semantic run are not mixed in.
questions = []
ground_truths_semantic = []
contexts = []
answers = []

# Same fixed sample of chunks (10-19) as the semantic evaluation.
for chunk in tqdm.tqdm(synthetic_data_chunks[10:20]):
    chunk_text = chunk.page_content
    question = question_chain.invoke({"context": chunk_text})
    questions.append(question)
    # Stored as a list of strings, the shape Ragas expects per row.
    contexts.append([chunk_text])
    # The prompt receives the plain chunk text, not the list wrapper.
    ground_truths_semantic.append(
        ground_truth_chain.invoke({"question": question, "context": chunk_text})
    )
    answers.append(naive_rag_chain.invoke(question))

In [None]:
# One row per generated question for the naive-chunker run. Ragas reads the
# passages from a column named "contexts" (plural), so use that exact key.
qagc_list = [
    {
        "question": question,
        "answer": answer,
        "contexts": context if isinstance(context, list) else [context],
        "ground_truth": ground_truth,
    }
    for question, answer, context, ground_truth in zip(
        questions, answers, contexts, ground_truths_semantic
    )
]

naive_eval_dataset = Dataset.from_list(qagc_list)
naive_eval_dataset

In [None]:
# Imported locally, with the correct `answer_relevancy` spelling, so this cell
# does not depend on an earlier metrics import having succeeded.
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

# Score the naive-chunker dataset with the same four metrics as the semantic run.
naive_result = evaluate(
    naive_eval_dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy
    ]
)
naive_result

# {'context_precision': 1.0000, 'faithfulness': 0.9500, 'answer_relevancy': 0.9182, 'context_recall': 1.0000}