In [None]:
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from glob import glob
import json
import pandas as pd
from datasets import Dataset
import random

In [126]:
DEFAULT_MODEL = "qwen25-7b"
MODEL_PATH = "/workspace/home/NLP_CORE/HUB_LLM/Qwen2-7B-Instruct"
SERVER_HOST = "http://0.0.0.0:9148"
TEMPERATURE = 0.0
TOP_P = 0.9
MAX_TOKENS = 1024
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH)
MARKDOWN_PATH="/raid/hoang/viettel-ai-race/data/processed/output_markdown_1/*.md"
QUESTION_PATH = "/raid/hoang/viettel-ai-race/data/processed/question.csv"

def get_completion_vllm(
    input_prompt,
    system_prompt=None,
    model=DEFAULT_MODEL,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    max_tokens=MAX_TOKENS,
    server_host=SERVER_HOST,
):
    client = OpenAI(
        api_key="EMPTY",
        base_url=f"{server_host}/v1",
    )
    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": input_prompt})
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            seed=0,
            top_p=top_p,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        return None

def multi_thread_task_dict(task_dictionary, num_workers=1, show_progress=True):
    final_results = {}
    futures = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for id_, task in task_dictionary.items():
            futures.append(
                executor.submit(
                    lambda id_=id_, task=task: {"id": id_, "task_result": task()}
                )
            )

        if show_progress:
            with tqdm(total=len(futures)) as pbar:
                for future in as_completed(futures):
                    result = future.result()
                    final_results[result["id"]] = result["task_result"]
                    pbar.update(1)
        else:
            for future in as_completed(futures):
                result = future.result()
                final_results[result["id"]] = result["task_result"]

    return final_results

def parse_json(json_text):
    first = json_text.find("{")
    last = json_text.rfind("}")
    return json.loads(json_text[first:last+1])

### Index phase

In [27]:
SUMMARIZE_CHUNK_PROMPT = """
Provide a short, one sentence summary in Vietnamese for this chunk of text. The summary should cover all aspects of the text chunk. Return only the Vietnamse summary without other information.
### Text chunk
{{chunk}}
""".strip()

SUMARIZE_WHOLE_DOCUMENT = """
You will be given a list of summarization for each paragraph in one larger document. Provide a short, one sentence summary in Vietnamese for the whole document based on the summarization from all smaller chunks.
The summary should cover all aspects gathered from text chunks. Return only the Vietnamse summary without other information.

### Summarization of chunks
{{chunk_summarziations}}
""".strip()

def get_summary(document, num_summaries=10, chunk_size=1024, chunk_overlap=256, tokenizer=TOKENIZER):
    def preprocess_document(document):
        return "\n".join(document.split("\n\n"))
        
    document_dict = {}
    text_splitter = RecursiveCharacterTextSplitter(
        length_function=lambda x: len(tokenizer.tokenize(x)),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n##"]
    )
    document = preprocess_document(document)
    document_dict['document_content'] = document
    document_dict['chunks'] = []
    chunks = text_splitter.split_text(document)
    for chunk in chunks:
        prompt = SUMMARIZE_CHUNK_PROMPT.replace("{{chunk}}", chunk)
        summary = get_completion_vllm(prompt)
        document_dict['chunks'].append({
            "chunk_content": chunk,
            "chunk_summary": summary
        })
    chunk_summarizations = "\n\n".join([f"{i}, {chunk}" for i, chunk in enumerate(chunks)])
    prompt = SUMARIZE_WHOLE_DOCUMENT.replace("{{chunk_summarziations}}", chunk_summarizations)
    document_dict['document_summary'] = get_completion_vllm(prompt)
    return document_dict

file_paths = glob(MARKDOWN_PATH)
data = []
for file_path in file_paths:
    with open(file_path) as f:
        data.append({
            "file_path": file_path,
            "document_content": f.read()
        })

def task_summary(data_point):
    summary = get_summary(data_point['document_content'])
    data_point['document_summary'] = summary['document_summary']
    data_point['chunks'] = summary['chunks']
    return data_point

task_dict = {
    i: lambda point=point: task_summary(point)
    for i, point in enumerate(data)
}
task_results = multi_thread_task_dict(task_dict, num_workers=10)
with open("/raid/hoang/viettel-ai-race/data/processed/data_and_summary.json", "w") as f:
    json.dump(list(task_results.values()), f, ensure_ascii=False)

### Retrieve and Answer Phase

In [109]:
questions = Dataset.from_pandas(pd.read_csv(QUESTION_PATH))

In [110]:
AUGMENT_QUESTION_PROMPT = """
You will be given a question that need to search for information before answer. Your job is to generate {{k}} different Vietnamese queries to find information for the question.
The queries should be diverse and different from each other.
Return only the augmented queries without other information. Each query in one line.

### Question: {{question}}
""".strip()

def get_augmented_queries(question, num_queries=5):
    question = f"""{question['Question']}\n{question['A']}\n{question['B']}\n{question['C']}\n{question['D']}"""
    prompt = AUGMENT_QUESTION_PROMPT.replace("{{question}}", question).replace("{{k}}", str(num_queries))
    output = get_completion_vllm(prompt)
    return [x.strip() for x in output.split("\n") if x]

In [87]:
with open("/raid/hoang/viettel-ai-race/data/processed/data_and_summary.json") as f:
    data_and_summary = json.load(f)

def split_summary_and_convert_to_index(chunks, max_chunk_tokens=1024):
    summary_chunks = []
    total_tokens = 0
    for i, chunk in enumerate(chunks):
        num_tokens = len(TOKENIZER.tokenize(chunk))
        if (total_tokens + num_tokens > max_chunk_tokens) or len(summary_chunks) == 0:
            summary_chunks.append([(i, chunk)])
            total_tokens = num_tokens
        else:
            summary_chunks[-1].append((i, chunk))
            total_tokens += num_tokens
    return summary_chunks
    
chunks = [x["document_summary"] for x in data_and_summary]
chunks = split_summary_and_convert_to_index(chunks)

In [122]:
DOCUMENT_RETRIEVAL_PROMPT = """
You will be given a list of document summary and a list of queries. Try to return the top up to {{top_documents}} documents that best match with the queries.
The document summary will be given along with the index of that document (e.g., "index, summary_content"). Return only the index of the best match documents in the json format without other information: {"document_indices": [1,3,4,...]}. If you can not find relevant documents for the queries, simply return {"document_indices": null}.
### Document Summaries
{{document_summaries}}

### Queries
{{queries}}
""".strip()
def get_relevant_document_indices(augmented_queries, summary_chunk, top_documents=3):
    document_summaries= "\n".join([f"{i}, {content}" for i, content in summary_chunk])
    augmented_queries = "\n".join(augmented_queries)
    prompt = DOCUMENT_RETRIEVAL_PROMPT.replace("{{document_summaries}}", document_summaries).replace("{{queries}}", queries).replace("{{top_documents}}", str(top_documents))
    output = get_completion_vllm(prompt)
    output = parse_json(output)
    return output["document_indices"]

In [None]:
QUESTION_ANSWERING_PROMPT = """
Please answer the multiple choice question grounding on the given context.
The question is a multiple choice question with 4 options (A, B, C, D).
Return the answer in the following json format: {"answer": "A" or "B" or "C" or "D"} or {"answer": None} if you can not find the answer in the context.

### Question
{{question}}

### Context
{{context}}
""".strip()

def answer_question_with_context(question, context):
    input_question = question['Question'] + "\n" +  [question['A'], question['B'], question['C'], question['D']]
    prompt = QUESTION_ANSWERING_PROMPT.replace("{{question}}", input_question).replace("{{context}}", context)
    output = get_completion_vllm(prompt)
    return parse_json(output)["answer"]

In [None]:
def get_final_answer(question, data_and_summary):
    augmented_queries = get_augmented_queries(question)
    document_summaries = [x["document_summary"] for x in data_and_summary]
    summaries_combined = split_summary_and_convert_to_index(document_summaries)
    document_indices = []
    for sumamry_chunk in summaries_combined:
        selected_indices = get_relevant_document_indices(augmented_queries, sumamry_chunk)
        if selected_indices:
            document_indices.extend(selected_indices)    
    selected_documents = [data_and_summary[i] for i in document_indices]
    for document in selected_documents:
        for chunk in document['chunks']:
            answer = answer_question_with_context(question, chunk['chunk_content'])
            if answer:
                return answer
    return random.choice(["A", "B", "C", "D"])
            

get_final_answer(questions[0], data_and_summary)

{'chunk_content': '<!-- image -->\n## VIETTEL AI RACE CHỈ TIÊU KỸ THUẬT MÁ NHIỆT KHỔ GIẤY A8 ĐỂ\n## T MÁY IN A8 ĐỂ BÀN\nTD291\nLần ban hành: 1\n## NỘI DUNG BỘ CHỈ TIÊU KỸ THUẬT IN NHIỆT KHỔ GIẤY A8 ĐỂ BÀN\n## 1.  MỤC ĐÍCH VÀ PHẠM VI SỬ DỤNG\n## 1.1  Lịch sử ban hành\n- -  Lần thứ 01.\n## 1.2  Mục đích tài liệu\n- -  Ban hành bộ chỉ tiêu kỹ thuật (CTKT) làm sở cứ cho việc đầu tư, mua sắm, KCS thiết bị Máy in nhiệt trong Công ty thành viên abc\n## 1.3  Phạm vi áp dụng\n- -  Áp dụng trong Công ty thành viên abc\n## 1.4  Tài liệu liên quan\n- -  TCVN 9088:2011: Công nghệ thông tin – Thiết bị văn phòng – Thông tin tối thiểu cần có trong bảng thông số kỹ thuật – Máy in.\n## 2.  NỘI DUNG CHI TIẾT\n- -  Phương pháp đánh giá: Sản phẩm đáp ứng về mặt kỹ thuật khi đạt 100% yêu cầu theo chỉ tiêu kỹ thuật.\n- -  Sửa đổi (Modification): Các tiêu chí đánh giá sẽ có 02 lựa chọn:\n+ + Không được sửa đổi (M): Đây là các chỉ tiêu cơ bản và bắt buộc để đáp ứng yêu cầu về kỹ thuật và chất lượng thiết bị, d