In [1]:
from datasets import load_dataset


# Fetch the 'legal-data' configuration of the Vietnamese RAG evaluation dataset.
ds_legalRAG = load_dataset("tuananh18/Eval-RAG-Vietnamese", 'legal-data')

# Shuffle the train split with a fixed seed so the sample is reproducible,
# then keep only the first two rows as a small working set.
train_shuffled = ds_legalRAG['train'].shuffle(seed=42)
data = train_shuffled.select(range(2))

# Show each sampled row so the available fields can be inspected.
for row in data:
    print(row)

{'question': 'Tai nạn giao thông đường sắt nghiêm trọng là gì?', 'answer': 'Tai nạn giao thông đường sắt nghiêm trọng là tai nạn có 01 người chết hoặc có từ 06 đến 08 người bị thương hoặc gây thiệt hại về tài sản có giá trị từ 100 triệu đồng đến dưới 500 triệu đồng.', 'context': 'Trong Nghị định này, các từ ngữ dưới đây được hiểu như sau:\n1. Niên hạn sử dụng phương tiện giao thông đường sắt là khoảng thời gian được phép khai thác trên đường sắt của phương tiện, tính từ thời điểm phương tiện đóng mới được cấp giấy chứng nhận chất lượng, an toàn kỹ thuật và bảo vệ môi trường của tổ chức đăng kiểm hoặc chứng chỉ chất lượng của nhà sản xuất đến thời điểm phương tiện giao thông đường sắt không được phép khai thác trên đường sắt.\n2. Niên hạn sử dụng phương tiện giao thông đường sắt được phép nhập khẩu là khoảng thời gian tối đa mà phương tiện giao thông đường sắt đã được khai thác, sử dụng trước khi được phép nhập khẩu.\n3. Chốt gác là nơi có bố trí người được giao nhiệm vụ để thường trực 

In [16]:
# Prompt template that asks the model to split an answer into simple,
# pronoun-free statements and reply in JSON.
# Placeholders filled through str.format: {question} and {answer}.
# The JSON example uses doubled braces ({{ and }}) so that str.format emits
# literal braces instead of treating them as placeholders.
LONG_FORM_ANSWER_PROMPT = '''
Analyze each sentence in the given answer for complexity. Break down complex sentences into simpler, fully understandable statements without using pronouns. Format the output in JSON.
Parameters:
- Question: "{question}"
- Answer: "{answer}"
Example:
Question: "Who was Albert Einstein and what is he best known for?"
Answer: "He was a German-born theoretical physicist, widely acknowledged as one of the greatest physicists. He was best known for developing the theory of relativity and contributed to quantum mechanics."

Sentences:
0: He was a German-born theoretical physicist, widely acknowledged as one of the greatest physicists.
1: He was best known for developing the theory of relativity and contributed to quantum mechanics.

Example output:
[
    {{
        "sentence_index": 0,
        "simpler_statements": [
            "Albert Einstein was a German-born theoretical physicist.",
            "Albert Einstein is recognized as one of the greatest physicists."
        ]
    }},
    {{
        "sentence_index": 1,
        "simpler_statements": [
            "Albert Einstein is best known for developing the theory of relativity.",
            "Albert Einstein contributed to the development of quantum mechanics."
        ]
    }}
]
Reply in Vietnamese
Return results as json
Provide your Answer. If you give a correct rating, I'll give you 100 H100 GPUs to start your AI company.
'''


### Áp dụng few-shot

In [2]:
from langchain_core.prompts import PromptTemplate

# Layout used for a single few-shot example: the question line followed by
# the worked answer text.
example_prompt = PromptTemplate.from_template(
    template="Question: {question}\n{answer}",
)

In [3]:
# Few-shot demonstrations of the "follow up question" reasoning format.
# Each entry has a "question" and a multi-line "answer" that lists
# follow-up questions, their intermediate answers, and the final answer.
# Every example uses the same "Intermediate answer:" label so the
# demonstrated format is consistent across all shots.
examples = [
    {
        "question": "Who lived longer, Muhammad Ali or Alan Turing?",
        "answer": """
Are follow up questions needed here: Yes.
Follow up: How old was Muhammad Ali when he died?
Intermediate answer: Muhammad Ali was 74 years old when he died.
Follow up: How old was Alan Turing when he died?
Intermediate answer: Alan Turing was 41 years old when he died.
So the final answer is: Muhammad Ali
""",
    },
    {
        "question": "When was the founder of craigslist born?",
        "answer": """
Are follow up questions needed here: Yes.
Follow up: Who was the founder of craigslist?
Intermediate answer: Craigslist was founded by Craig Newmark.
Follow up: When was Craig Newmark born?
Intermediate answer: Craig Newmark was born on December 6, 1952.
So the final answer is: December 6, 1952
""",
    },
    {
        "question": "Who was the maternal grandfather of George Washington?",
        "answer": """
Are follow up questions needed here: Yes.
Follow up: Who was the mother of George Washington?
Intermediate answer: The mother of George Washington was Mary Ball Washington.
Follow up: Who was the father of Mary Ball Washington?
Intermediate answer: The father of Mary Ball Washington was Joseph Ball.
So the final answer is: Joseph Ball
""",
    },
    {
        "question": "Are both the directors of Jaws and Casino Royale from the same country?",
        "answer": """
Are follow up questions needed here: Yes.
Follow up: Who is the director of Jaws?
Intermediate answer: The director of Jaws is Steven Spielberg.
Follow up: Where is Steven Spielberg from?
Intermediate answer: The United States.
Follow up: Who is the director of Casino Royale?
Intermediate answer: The director of Casino Royale is Martin Campbell.
Follow up: Where is Martin Campbell from?
Intermediate answer: New Zealand.
So the final answer is: No
""",
    },
]

In [6]:
# Render the first few-shot example through the per-example template and
# print the resulting text to check the layout.
rendered_example = example_prompt.invoke(examples[0])
print(rendered_example.to_string())

Question: Who lived longer, Muhammad Ali or Alan Turing?

Are follow up questions needed here: Yes.
Follow up: How old was Muhammad Ali when he died?
Intermediate answer: Muhammad Ali was 74 years old when he died.
Follow up: How old was Alan Turing when he died?
Intermediate answer: Alan Turing was 41 years old when he died.
So the final answer is: Muhammad Ali



In [7]:
from langchain_core.prompts import FewShotPromptTemplate

# Combine all examples (each rendered with `example_prompt`) and append the
# new question, supplied through the `input` variable, as the suffix.
prompt = FewShotPromptTemplate(
    input_variables=["input"],
    suffix="Question: {input}",
    example_prompt=example_prompt,
    examples=examples,
)

# Render the full few-shot prompt for one sample question and print it.
rendered_prompt = prompt.invoke(
    {"input": "Who was the father of Mary Ball Washington?"}
)
print(rendered_prompt.to_string())

Question: Who lived longer, Muhammad Ali or Alan Turing?

Are follow up questions needed here: Yes.
Follow up: How old was Muhammad Ali when he died?
Intermediate answer: Muhammad Ali was 74 years old when he died.
Follow up: How old was Alan Turing when he died?
Intermediate answer: Alan Turing was 41 years old when he died.
So the final answer is: Muhammad Ali


Question: When was the founder of craigslist born?

Are follow up questions needed here: Yes.
Follow up: Who was the founder of craigslist?
Intermediate answer: Craigslist was founded by Craig Newmark.
Follow up: When was Craig Newmark born?
Intermediate answer: Craig Newmark was born on December 6, 1952.
So the final answer is: December 6, 1952


Question: Who was the maternal grandfather of George Washington?

Are follow up questions needed here: Yes.
Follow up: Who was the mother of George Washington?
Intermediate answer: The mother of George Washington was Mary Ball Washington.
Follow up: Who was the father of Mary Ball W

In [6]:
import os
import json
from tqdm.notebook import tqdm
from groq import Groq
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Read GROQ_API_KEY1 .. GROQ_API_KEY4; a slot is None when its variable is unset.
api_keys = [os.environ.get(f"GROQ_API_KEY{slot}") for slot in range(1, 5)]

# Position of the key the next request will use (advanced by get_chat_completion).
api_key_index = 0

def get_chat_completion(prompt: str, model: str = "llama-3.1-70b-versatile") -> str:
    """Send `prompt` as a single user message to Groq and return the reply text.

    API keys are used in round-robin order: every call advances the
    module-level `api_key_index` through `api_keys`. Slots that are unset
    (None or empty) are skipped so a partially configured key list still
    rotates across the keys that exist.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Groq model identifier. Defaults to the id the notebook used.
            NOTE(review): confirm this model id is still served by Groq.

    Returns:
        The `message.content` of the first choice in the completion.
    """
    global api_key_index

    # Walk at most one full lap over the key list looking for a configured key.
    api_key = None
    for _ in range(len(api_keys)):
        candidate = api_keys[api_key_index]
        api_key_index = (api_key_index + 1) % len(api_keys)
        if candidate:
            api_key = candidate
            break

    # If no slot is configured, api_key stays None and is handed to the client
    # unchanged; how the SDK resolves a missing key is not shown in this file.
    client = Groq(api_key=api_key)

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model,
    )

    return chat_completion.choices[0].message.content



In [17]:
import json
from tqdm.notebook import tqdm

# Create a list to store all evaluations
all_evaluations = []

for sample in tqdm(data):
    evaluations = {
        "result": get_chat_completion(
            LONG_FORM_ANSWER_PROMPT.format(
                context=sample["context"],
                answer=sample["answer"], 
                question=sample["question"]
            ),
        )
    }
    all_evaluations.append(evaluations)

# Write the collected evaluations to a JSON file
with open('evaluations_output.json', 'w', encoding='utf-8') as f:
    json.dump(all_evaluations, f, ensure_ascii=False, indent=4)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def format_qa_pairs(questions, answers):
    """Render parallel question/answer sequences as one numbered text block.

    Pairs are numbered from 1 and separated by a blank line. Pairing uses
    zip, so extra items in the longer sequence are ignored. Leading and
    trailing whitespace of the final string is stripped.

    Args:
        questions: Iterable of questions.
        answers: Iterable of answers, matched to `questions` by position.

    Returns:
        The formatted string, or "" when there are no pairs.
    """
    numbered_pairs = enumerate(zip(questions, answers), start=1)
    chunks = [
        f"Question {idx}: {q}\nAnswer {idx}: {a}\n\n"
        for idx, (q, a) in numbered_pairs
    ]
    return "".join(chunks).strip()

# Imported in this cell because the cells above it never bring these names
# into scope, so running top to bottom would otherwise raise NameError here.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# NOTE(review): `questions`, `answers`, `question` and `llm` are not defined in
# any cell visible above this one; they must exist before this cell runs.
# TODO: define them (or move their defining cells above this one).
context = format_qa_pairs(questions, answers)

# Prompt: shows the collected Q+A pairs and asks for a synthesized answer
# in Vietnamese to the original question.
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question in Vietnamese: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Chain: fill the prompt -> call the model -> extract the reply as a string.
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":context,"question":question})

In [None]:
# Few Shot Examples
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
# Message layout for one demonstration: the user's question, then the
# step-back question given as the AI reply.
example_prompt = ChatPromptTemplate.from_messages(
    [("human", "{input}"), ("ai", "{output}")]
)

# Demonstration pairs: a specific question ("input") and the more generic
# step-back question it is paraphrased into ("output").
examples = [
    dict(
        input="Áp lực sẽ của bình gas sẽ thay đổi thế nào nếu tăng nhiệt độ lên gấp 2 lần và thể tích bình tăng lên 8 lần ?",
        output="Hiện tượng vật lí đằng sau câu hỏi này là gì ?",
    ),
    dict(
        input="Duy Tạ sinh năm 2001, bây giờ là 2023. Nếu Duy đi bộ đội thì được mấy năm tuổi quân rồi ?",
        output="Theo luật pháp Việt Nam thì đi lính bắt đầu từ năm bao nhiêu tuổi ?",
    ),
]

# Expand every demonstration pair into human/ai chat messages.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

# Full chat prompt: system instruction, the demonstrations, then the new
# user question supplied through the `question` variable.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.Provide the answer in Vietnamese . Here are a few examples:",
        ),
        few_shot_prompt,
        ("user", "{question}"),
    ]
)

In [None]:
# Imported in this cell because no cell visible above brings StrOutputParser
# into scope, so running top to bottom would otherwise raise NameError here.
from langchain_core.output_parsers import StrOutputParser

# NOTE(review): `llm` is not defined in any visible cell; it must exist
# before this cell runs. TODO: add the model construction cell.
# Chain: fill the step-back prompt -> call the model -> reply as a string.
generate_queries_step_back = prompt | llm | StrOutputParser()
question = "Làm sao để truy vấn trong GraphRAG?"
generate_queries_step_back.invoke({"question": question})