## Install and Environment setting

In [None]:
%%capture
!pip install pandas==2.2.3 jupyter==1.1.1 langchain==0.3.23 langchain-community==0.3.21 rich==14.0.0 openai==1.71.0 faiss-gpu==1.7.2 numpy<2
!pip install -U langchain langchain-community
!pip install rouge-score
!pip install datasets
!pip install huggingface_hub
!pip install sentence-transformers
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install semantic-text-splitter
!pip install -U langchain-experimental
!pip install langchain openai

In [None]:
import warnings
# Install a blanket "ignore" filter so library warnings do not clutter the cell outputs.
# This applies to every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")

## Parameter setting

In [None]:
import logging
import json
from rich.console import Console
from rich.logging import RichHandler

# Shared Rich console: writes to stderr and is created with recording enabled.
console = Console(record=True, stderr=True)

# Send the standard `logging` output through Rich (markup and rich tracebacks enabled).
log_handler = RichHandler(console=console, markup=True, rich_tracebacks=True)
logging.basicConfig(
    handlers=[log_handler],
    format="%(message)s",
    datefmt="[%X]",
)

# Notebook-wide logger; emits everything from DEBUG level upwards.
log = logging.getLogger("rich")
log.setLevel(logging.DEBUG)

# ---- Tunable parameters ----
DEBUG: bool = False
DATASET_PATH: str = "public_dataset.json"  # public dataset JSON, opened relative to the working directory

# NOTE(review): the ChatOpenAI constructors visible in this notebook pass literal
# temperature / max_tokens values and do not read the two constants below.
MODEL_TEMPERATURE: float = 0.3
MODEL_MAX_TOKENS: int = 128

## Model for answer and chain


*   Use meta-llama/llama-3.2-3b-instruct as Generator
*   The model is accessed through the OpenRouter API, which requires account credit.



In [None]:
from langchain_core.language_models.llms import BaseLLM
from langchain_core.output_parsers import StrOutputParser
from langchain.llms import HuggingFaceHub
from langchain.chat_models import ChatOpenAI

import os  # imported here so this cell can read the credential on its own

# Generator model, served through OpenRouter's OpenAI-compatible endpoint.
# The API key is read from the OPEN_ROUTER_API_KEY environment variable instead of
# being written into the notebook, so the secret is never saved or shared with the file.
# A missing variable raises KeyError naming the variable.
llm = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPEN_ROUTER_API_KEY"],
    # Alternative generators tried by the author:
    #model="meta-llama/llama-3.1-8b-instruct",
    #model="mistralai/mistral-7b-instruct-v0.2",     # 60-80:14
    model="meta-llama/llama-3.2-3b-instruct",
    temperature=0,
    max_tokens=1024,
)

# Smoke test: one round trip to confirm the endpoint and key work before the long runs.
response = llm.invoke("How are you?")
console.print(response)

  llm = ChatOpenAI(


## Prompt for checking the answer


*   Use meta-llama/llama-3.1-8b-instruct as judge
*   Temperature setting = 0.6



In [None]:
# System instructions for the judge model: the grading rubric, applied step by step.
PROMPT_JUDGEMENT: str = """Assume you are a human expert in grading predictions given by a model. You are given a document, a question and a model prediction. Judge if the prediction matches the ground truth answer by following these steps:
1: Take it as granted that the Ground Truth is always correct.
2: If the Prediction indicates it is not sure about the answer, "score" should be "0"; otherwise, go the next step.
3: If the Prediction exactly matches the Ground Truth, "score" is 1.
4: If the Prediction does not exactly match the Ground Truth, go through the following steps.
5: If the Ground Truth is a number, "score" is 1 if and only if the Prediction gives a number that almost exactly matches the ground truth.
6: If the Prediction is self-contradictory, "score" must be 0.
7: If the prediction is not answering the question, "score" must be 0.
8: If the prediction is a concise and correct summary of the ground truth, "score" is 1.
9: If ground truth contains a set of items, prediction must contain exactly same items for the score to be 1.
10: Otherwise, "score" is 0.
Keep the answer concise. Don't provide irrelevant information.
score 5 times in a row
"""

# Per-sample payload; the four {placeholders} are left for the prompt template to fill.
PROMPT_JUDGE_CONTENT = """document: {document}
question: {question}
Ground Truth: {answer}
Prediction: {prediction}
"""

# Full judge prompt: system rubric, human payload, then a primed assistant turn so the
# model's continuation starts right after "The score is ".
CHAT_JUDGE_TEMPLATE = "\n".join(
    [
        "system: " + PROMPT_JUDGEMENT,
        "human: " + PROMPT_JUDGE_CONTENT,
        "assistant: The score is ",
    ]
)

import os  # imported here so this cell can read the credential on its own

# Judge model (larger than the generator), served through OpenRouter.
# The API key is read from the OPEN_ROUTER_API_KEY environment variable instead of
# being written into the notebook; a missing variable raises KeyError naming it.
llm_judge = ChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPEN_ROUTER_API_KEY"],
    model="meta-llama/llama-3.1-8b-instruct",
    temperature=0.6,
    max_tokens=128,
)

## Reranker module


*   Let the model (llama-3.1-8b-instruct) rerank the retrieved chunks



In [None]:
def prompt_rerank(llm, question, docs, top_n=5):
    """Filter retrieved chunks by letting an LLM vote on their relevance.

    Every document is scored three times with a binary (0/1) relevance prompt and the
    votes are summed, giving each document a total between 0 and 3.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` attribute.
        question: The question the chunks should help answer.
        docs: Iterable of objects exposing ``page_content``.
        top_n: Accepted for interface compatibility. It is NOT applied: the number of
            returned documents is decided by the vote thresholds below.

    Returns:
        Documents with a vote total of at least 2, ordered by descending total (ties keep
        their input order). If none reach 2, documents with a total of at least 1 are
        returned instead; the result may be empty.
    """
    import re

    n_votes = 3
    scored_docs = []
    for doc in docs:
        # The prompt does not change between votes, so build it once per document.
        prompt = f"""You are a document analysis expert responsible for scoring the relevance between a paragraph and a question.
Please strictly follow the rules below:
- Based on whether the paragraph can directly support answering the question, assign an integer score (0 or 1).
- 0 means completely irrelevant; 1 means partly or fullly supports the answer.
- Strictly output only one integer (no period, unit, or explanation), directly as the score.
Question:{question}

Paragraph：
{doc.page_content}

Please output a single integer score directly, for example: 1
Your Answer: """
        scoresum = 0
        for _ in range(n_votes):
            response = llm.invoke(prompt)
            # Models often wrap the digit in whitespace, punctuation or extra words
            # ("1\n", " 1", "Score: 1"). Take the first standalone 0/1 and count an
            # unparsable reply as 0 instead of raising ValueError/IndexError.
            found = re.search(r"\b[01]\b", str(response.content))
            scoresum += int(found.group()) if found else 0
        scored_docs.append((scoresum, doc))
        print(scoresum, end=" ")

    # Highest vote totals first; list.sort is stable, so ties keep retrieval order.
    scored_docs.sort(key=lambda pair: pair[0], reverse=True)
    top_docs = [doc for total, doc in scored_docs if total >= 2]
    if len(top_docs) == 0:
        # Nothing won a majority: fall back to anything that received at least one vote.
        top_docs = [doc for total, doc in scored_docs if total >= 1]

    return top_docs

#
from typing import List
from langchain.schema import Document
from langchain.schema import BaseRetriever

class StaticRetriever(BaseRetriever):
    """Retriever that ignores the query and always returns a fixed list of documents.

    Lets an already-selected document list (for example a reranked one) be passed to
    APIs that expect a retriever object.
    """

    def __init__(self, docs: List[Document]) -> None:
        # Initialise the base retriever first, then keep the documents on the instance.
        super().__init__()
        self._docs = docs

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return the stored documents unchanged; ``query`` is not consulted."""
        return self._docs

def reanswer(llm, context, question, answer):
    """Ask an LLM grader whether ``answer`` actually responds to ``question``.

    Args:
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content`` attribute.
        context: Accepted for interface compatibility; it is not included in the prompt.
        question: The question that was asked.
        answer: The candidate answer to grade.

    Returns:
        1 if the grader's reply contains a standalone ``1`` before any standalone ``0``,
        otherwise 0. Empty or unparsable replies yield 0.
    """
    import re

    prompt = f"""
You are an automatic grader. Please score the given answer strictly according to the following rules:
- Did the answer respond to the question (1 for Yes, 0 for No)
- Completeness of the answer

Question: {question}

Answer: {answer}

Give a score 0 or 1. Output only the number without any explanation., for example: 1
Your Answer: """
    response = llm.invoke(prompt)
    reply = str(response.content)
    print(reply[:3], end="")
    # Tolerate replies such as "1.", " 1" or "Score: 1", and treat an empty or
    # unparsable reply as a failing grade instead of raising ValueError/IndexError.
    found = re.search(r"\b[01]\b", reply)
    if found is not None and found.group() == "1":
        return 1
    return 0

## Embedding Model and textsplitter


*   Embedding : intfloat/e5-large-v2
*   Text_splitter : Spacy



In [None]:
from langchain.docstore.document import Document
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import SpacyTextSplitter
from semantic_text_splitter import TextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

## Reranker ##

# Load the public dataset. The encoding is given explicitly so decoding does not depend
# on the platform's default locale (the submission file is written as UTF-8 as well).
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Author's experiment notes: 0~20 top3: 0.25 top5: 0.22 #top10: 0.1964
# NOTE(review): no later cell visible in this notebook reads RETRIEVE_TOP_K.
RETRIEVE_TOP_K: int = 10

##############   Model / parameter tuning   #############
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-large-v2"  #### 0~20: 0.25  40~60: 0.231  60~80: 0.22 SOTA
)

##############   Chunk splitting   #############
text_splitter = SpacyTextSplitter(
    chunk_size=512,  # number of characters
    chunk_overlap=256,
    length_function=len,
    add_start_index=True,
)

  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

## Prompt setting for Generator

*   Chain-of-thought(CoT)
*   Instruction-based



In [None]:
# ---------------- Generator prompt ----------------
# Instruction-style prompt asking for inference -> self-check -> final answer.
# {context} and {input} are left as placeholders for the prompt template to fill.
CHAT_TEMPLATE_RAG = """You are answering a question based on the provided context below. Strictly follow these rules:

- Base your answer solely on the provided context.
- Start with brief 1.inference, keep your explanation based on provided content.
- After the *Step1-Inference:, *Step2-Check whether the inference is correct based on provided content
- At the end, output "*Final Answer:"
- Using a confident, concise sentence based directly on the context.
- Direct using words or phrase in context whenever possible.


<context>
{context}
</context>

Question: {input}
Answer:"""

## Main Loop
### Parameter:
*   start : index of the first sample to evaluate
*   leng : number of samples to evaluate
*   initial_k : **maximum number** of chunks for dynamic k
*   min_k : **minimum number** of chunks for dynamic k
*   Uncomment the marked code to enable the alternative modules





In [None]:
# Evaluate the RAG pipeline on dataset[start : start + leng].
# For every sample: embed the paper, retrieve a dynamic number of chunks, generate an
# answer, score the retrieved chunks against the gold evidence with ROUGE-L, and stream
# the judge model's verdict. The mean evidence score is printed at the end.
from rouge_score import rouge_scorer  # imported once; the original re-ran this every iteration

# ---------------- Evaluation parameters ----------------
score = 0                # running sum of the per-sample evidence ROUGE-L scores
start = 0                # index of the first sample to evaluate
leng = 100               # number of samples to evaluate
retry = 0                # not read anywhere in this cell
initial_k = 20           # maximum number of chunks considered by dynamic top-k
min_k = 4                # minimum number of chunks handed to the generator
score_threshold = 0.985  # keep chunks scoring at least this fraction of the best hit

# ---------------- Loop-invariant objects ----------------
# None of these depend on the current sample, so they are built once, not per iteration.
retrieval_qa_prompt = PromptTemplate.from_template(template=CHAT_TEMPLATE_RAG)
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_prompt)
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
chat_prompt = PromptTemplate.from_template(template=CHAT_JUDGE_TEMPLATE)
llm_judge_chain = chat_prompt | llm_judge | StrOutputParser()

# Clamp the range so a dataset shorter than start + leng does not raise IndexError.
stop = min(start + leng, len(dataset))

for i in range(start, stop):
    sample = dataset[i]
    demo_title = sample["title"]
    demo_full_text = sample["full_text"]
    demo_question = sample["question"]
    demo_answer = sample["answer"]
    demo_evidence = sample["evidence"]

    ############################### Embedding ###############################
    # Split the paper on triple newlines; the final piece is discarded.
    documents = demo_full_text.split("\n\n\n")[:-1]
    docs = [Document(page_content=doc) for doc in documents]
    docs_splits = text_splitter.split_documents(docs)
    vector_store = InMemoryVectorStore.from_documents(docs_splits, embeddings)

    # Alternative vector store:
    #vector_store = FAISS.from_documents(docs_splits, embeddings)
    ##### Alternative splitter (semantic-text-splitter) #####
    # documents = demo_full_text.split("\n\n\n")[:-1]
    # docs = [Document(page_content=doc) for doc in documents]
    # text_splitter = TextSplitter(512)
    # docs_splits = []
    # for doc in docs:
    #   for chunk_text in text_splitter.chunks(doc.page_content):
    #     docs_splits.append(Document(page_content=chunk_text))  # metadata is optional
    #
    # vector_store = InMemoryVectorStore.from_documents(docs_splits, embeddings)

    ############################### Retrieval ###############################
    ####### Dynamic top-k ######
    # Count the hits whose score is within `score_threshold` of the first hit, then
    # enforce the lower bound `min_k`.
    # NOTE(review): assumes hits come back best-first with higher = more similar -- confirm.
    docs_with_score = vector_store.similarity_search_with_score(demo_question, k=initial_k)
    max_score = docs_with_score[0][1]
    dynamic_k = sum(1 for _, sim in docs_with_score if sim / max_score >= score_threshold)
    dynamic_k = max(dynamic_k, min_k)

    rag_qa_chain = create_retrieval_chain(
        retriever=vector_store.as_retriever(search_kwargs={"k": dynamic_k}, search_type="similarity"),
        combine_docs_chain=combine_docs_chain,
    )
    #############################
    # ###### Alternative: prompt-based reranker applied on top of the retriever ######
    # retrieved_docs = [doc for doc, _ in vector_store.similarity_search_with_score(demo_question, k=15)]
    #
    # # prompt-based rerank
    # reranked_docs = prompt_rerank(llm, demo_question, retrieved_docs)
    # compressed_retriever = StaticRetriever(reranked_docs)
    #
    # rag_qa_chain = create_retrieval_chain(
    #   retriever=compressed_retriever,
    #   combine_docs_chain=combine_docs_chain
    # )
    ########################################################

    ############################### Predicting ###############################
    response_new = rag_qa_chain.invoke({"input": f"{demo_question}"})
    # Optional: keep only the text after the final-answer marker.
    #response_new['answer'] = response_new['answer'].split("Final Answer:")[-1].strip()
    #print(f"Q: {demo_question}")
    #print(f"GroundTruth: {demo_answer}\n")
    print(f"{response_new['answer']}")

    ############################### Evidence score ###############################
    retrieved_list = [retrieved_chunk.page_content for retrieved_chunk in response_new["context"]]

    # score_multi scores the chunk against every evidence string (best f-measure is used).
    fmeasure_scores = [
        scorer.score_multi(targets=demo_evidence, prediction=chunk)["rougeL"].fmeasure
        for chunk in retrieved_list
    ]

    # Guard against an empty retrieval result instead of dividing by zero.
    final_evidence_score = sum(fmeasure_scores) / len(fmeasure_scores) if fmeasure_scores else 0.0
    #print("================")
    print(i + 1, end="")  # 1-based position of the sample just processed
    print(f"{final_evidence_score = :.4f}")
    print(f"dynamic k = {len(fmeasure_scores)}")
    score += final_evidence_score

    ############### Judge model's verdict ####################
    query = {
        "document": f"Paper title: {demo_title}\n" + str(demo_evidence),
        "question": demo_question,
        "answer": demo_answer,
        "prediction": response_new["answer"],
    }
    # The template ends with "The score is ", so echo that prefix before streaming the rest.
    _response = ["The score is "]
    print("The score is ", end="")
    for chunk in llm_judge_chain.stream(query):
        _response.append(chunk)
        print(chunk, end="")

    print("\n---------------------------------------------------------------------")

# Mean evidence score over the samples actually evaluated (equals score/leng whenever
# the dataset holds at least start + leng samples).
evaluated = stop - start
print(score / evaluated if evaluated > 0 else 0.0)

**Step 1: Inference**
The authors use visualizations to show how the LSTM and HMM components of the hybrid algorithm complement each other in terms of features learned in the data.

**Step 2: Check and Reflect**
The inference is correct, as the text explicitly states "We use visualizations to show how the LSTM and HMM components of the hybrid algorithm complement each other in terms of features learned in the data."

**Final Answer**
The authors use visualizations to show the complementary nature of the features learned by LSTMs and HMMs.
96final_evidence_score = 0.2652
dynamic k = 4
The score is 0. The prediction is not a direct match to the ground truth, and it does not exactly match the ground truth. The prediction is not self-contradictory, but it does not answer the question directly.
---------------------------------------------------------------------
**Step 1: Inference**
The in-house data employed for the task are labeled datasets of 1,100 users with gender tags, including 550

## Output for Test Dataset

In [None]:
from tqdm import tqdm
from langchain.docstore.document import Document
from langchain_text_splitters import SpacyTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# Produce the submission file for the private (test) dataset: for every sample, retrieve
# a dynamic number of chunks, generate an answer, and record answer + retrieved evidence.
# The encoding is explicit so decoding does not depend on the platform default.
with open("private_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Author's experiment notes: fluctuates a lot  gte #0~20 top3: 0.25 top5: 0.22 #top10: 0.1964
#                            60~80: top1:0.139 top3:0.12 top5:0.14 top10:0.1446
# NOTE(review): nothing below reads RETRIEVE_TOP_K.
RETRIEVE_TOP_K: int = 5

##############   Model / parameter tuning   #############
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-large-v2"  #### 0~20: 0.201  40~60: 0.1878 60~80: 0.1962 SOTA
)

##############   Chunk splitting   #############
text_splitter = SpacyTextSplitter(
    chunk_size=512,  # number of characters
    chunk_overlap=256,
    length_function=len,
    add_start_index=True,
)

##############   Prompt   #############
CHAT_TEMPLATE_RAG = (
"""You are answering a question based on the provided context below. Strictly follow these rules:

- Base your answer solely on the provided context.
- Start with brief 1.inference, keep your explanation based on provided content.
- After the Step1:inference, Step2:check and reflect whether the inference is correct
- At the end, output "Final Answer:"
- Using a confident, concise sentence based directly on the context.
- Direct using words or phrase in context whenever possible.


<context>
{context}
</context>

Question: {input}
Answer:"""
)

##############   Dynamic top-k settings   #############
initial_k = 10           # maximum number of chunks considered by dynamic top-k
min_k = 4                # minimum number of chunks handed to the generator
score_threshold = 0.985  # keep chunks scoring at least this fraction of the best hit

# Loop-invariant objects: they do not depend on the sample, so build them once.
retrieval_qa_prompt = PromptTemplate.from_template(template=CHAT_TEMPLATE_RAG)
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_prompt)

submission_data = []
for sample in tqdm(dataset, desc="Running RAG evaluation"):
    demo_title = sample["title"]
    demo_full_text = sample["full_text"]
    demo_question = sample["question"]

    ############################### Embedding ###############################
    # Split the paper on triple newlines; the final piece is discarded.
    documents = demo_full_text.split("\n\n\n")[:-1]
    docs = [Document(page_content=doc) for doc in documents]

    docs_splits = text_splitter.split_documents(docs)
    vector_store = InMemoryVectorStore.from_documents(docs_splits, embeddings)

    ############################### Predicting ###############################
    ####### Dynamic top-k ######
    # Count the hits whose score is within `score_threshold` of the first hit, then
    # enforce the lower bound `min_k`.
    # NOTE(review): assumes hits come back best-first with higher = more similar -- confirm.
    docs_with_score = vector_store.similarity_search_with_score(demo_question, k=initial_k)
    max_score = docs_with_score[0][1]
    dynamic_k = sum(1 for _, sim in docs_with_score if sim / max_score >= score_threshold)
    dynamic_k = max(dynamic_k, min_k)

    rag_qa_chain = create_retrieval_chain(
        retriever=vector_store.as_retriever(search_kwargs={"k": dynamic_k}, search_type="similarity"),
        combine_docs_chain=combine_docs_chain,
    )

    response_new = rag_qa_chain.invoke({"input": f"{demo_question}"})

    retrieved_list = [retrieved_chunk.page_content for retrieved_chunk in response_new["context"]]

    submission_data.append(
        {
            "title": demo_title,
            "answer": response_new["answer"],
            "evidence": retrieved_list,
        }
    )

# Write the results out as a JSON file (UTF-8, non-ASCII characters kept as-is).
with open("111511236.json", "w", encoding="utf-8") as f:
    json.dump(submission_data, f, indent=2, ensure_ascii=False)

Running RAG evaluation: 100%|██████████| 100/100 [06:12<00:00,  3.72s/it]


In [None]:
import zipfile
# Extract the submission archive with the system `unzip` tool (shell escape).
# NOTE(review): the `zipfile` import above is not used by this command.
!unzip hw3_111511236.zip

Archive:  hw3_111511236.zip
   creating: hw3_111511236/
  inflating: hw3_111511236/111511236.json  
  inflating: hw3_111511236/111511236.pdf  
  inflating: hw3_111511236/111511236.py  
