## 環境建置
以下的 `%%bash` 是 Jupyter 的 cell magic，會以 Bash（[GNU Bash，一種 Unix shell 與指令語言](https://stackoverflow.com/questions/61910926/what-does-the-bash-command-do-in-python)）執行整個儲存格，因此可以在 notebook 中直接執行 Linux 指令；也可以改用 Python 的 `os` 套件來執行。請注意：`%%bash` 是在子行程中執行，儲存格內 `export` 的環境變數不會保留到後續的 Python 儲存格。

```
%%bash
# Install the Python packages and system tools used by the rest of this notebook.
# NOTE(review): colpali-engine is installed from PyPI and then from source below,
# but a later line re-pins colpali-engine==0.3.1, so the last install wins —
# confirm which version is intended and that it provides ColQwen2_5.
pip install colpali-engine # from PyPi
pip install git+https://github.com/illuin-tech/colpali # from source
pip install ipykernel
pip install torchvision
# poppler-utils is installed alongside pdf2image (presumably its rendering backend).
sudo apt-get update && sudo apt-get install poppler-utils -y
pip install colpali-engine==0.3.1 pdf2image pypdf pyvespa vespacli requests numpy tqdm
# NOTE(review): this export happens inside the %%bash shell only; it presumably does
# not reach the Python kernel that loads the model later — confirm, or set HF_HOME
# from Python instead.
export HF_HOME=/home/jovyan/datasets/soc-20250518223404/
echo $HF_HOME
```

## 引入相依套件(Import dependencies)

In [1]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from io import BytesIO
from transformers.utils.import_utils import is_flash_attn_2_available
# import colpali_engine
from colpali_engine.models import ColQwen2_5
from colpali_engine.models import ColQwen2_5_Processor

# 引入(Import)VLM-ColQwen2.5

In [2]:
# Checkpoint id and local cache directory are defined once so that the model and
# its processor are always loaded from the same checkpoint and cache location.
model_name = "vidore/colqwen2.5-v0.2"
model_cache_dir = "/home/jovyan/soc-20250527135343/"

# Load the ColQwen2.5 model in bfloat16 on the first CUDA device and switch it to
# evaluation mode; FlashAttention-2 is used only when it is available.
model = ColQwen2_5.from_pretrained(
    model_name,
    cache_dir=model_cache_dir,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",  # or "mps" if on Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()

# Processor that turns images / query strings into model inputs.
processor = ColQwen2_5_Processor.from_pretrained(model_name, cache_dir=model_cache_dir)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


# 檔案整理的Helper Function
- `download_pdf`：輸入網址，回傳下載PDF
- `get_pdf_images`：轉換下載的PDF，存成圖檔

In [3]:
import requests
from pdf2image import convert_from_path
from pypdf import PdfReader

# 下載PDF
def download_pdf(url, timeout=30):
    """Download a PDF and return its content as an in-memory buffer.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        BytesIO wrapping the downloaded bytes.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download PDF: Status code {response.status_code}")
    return BytesIO(response.content)

# PDF_to_images
def get_pdf_images(pdf_url):
    """Download a PDF and return its pages as images together with their text.

    The PDF is written to a private temporary directory (pdf2image needs a file
    path) which is removed again before returning, so nothing is left behind in
    the working directory and parallel runs cannot overwrite each other's file.

    Args:
        pdf_url: Address of the PDF to download.

    Returns:
        Tuple ``(images, page_texts)``: the rendered page images and the text
        extracted from each page, in page order.

    Raises:
        AssertionError: If the number of rendered images differs from the number
            of pages text was extracted from.
    """
    import os
    import tempfile

    # Download the PDF
    pdf_file = download_pdf(pdf_url)
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = os.path.join(tmp_dir, "temp.pdf")
        with open(temp_file, "wb") as f:
            f.write(pdf_file.read())
        reader = PdfReader(temp_file)
        page_texts = [page.extract_text() for page in reader.pages]
        images = convert_from_path(temp_file)
    assert len(images) == len(page_texts), (
        f"Rendered {len(images)} images but extracted text from {len(page_texts)} pages"
    )
    return (images, page_texts)

import os 
from pdf2image import convert_from_path
from pypdf import PdfReader

path = "./Guidelines"
def get_cpic_pdf_images_texts(path):
    """Load every PDF in a folder as page images plus extracted page text.

    Files are processed in sorted name order so the result is reproducible
    (``os.listdir`` gives no ordering guarantee). The page-count check runs for
    every PDF, and an empty folder simply yields an empty list.

    Args:
        path: Directory containing the guideline PDFs (files ending in ".pdf").

    Returns:
        List with one dict per PDF, with keys "path", "name", "images" (rendered
        pages) and "texts" (text extracted from each page).

    Raises:
        AssertionError: If a PDF yields a different number of images and texts.
    """
    CPIC_pdf = []
    for file in sorted(os.listdir(path)):
        if not file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(path, file)
        reader = PdfReader(pdf_path)
        page_texts = [page.extract_text() for page in reader.pages]
        images = convert_from_path(pdf_path)
        assert len(images) == len(page_texts), (
            f"{file}: rendered {len(images)} images but extracted {len(page_texts)} page texts"
        )
        CPIC_pdf.append(
            {"path": pdf_path, "name": file, "images": images, "texts": page_texts}
        )
    return CPIC_pdf

from IPython.display import display


def resize_image(image, max_height=800):
    """Shrink an image to at most ``max_height`` pixels tall, keeping its aspect ratio.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a ``resize`` method.
        max_height: Maximum allowed height in pixels.

    Returns:
        The result of ``image.resize`` when the image is taller than
        ``max_height``; otherwise the original image object unchanged.
    """
    orig_width, orig_height = image.size
    if not height_exceeds(orig_height, max_height):
        return image
    scale = max_height / orig_height
    target_size = (int(orig_width * scale), int(orig_height * scale))
    return image.resize(target_size)


def height_exceeds(height, limit):
    """Return True when ``height`` is strictly greater than ``limit``."""
    return height > limit

# display(resize_image(CPIC_pdf[42]["images"][0]))

import base64


def get_base64_image(image):
    """Encode an image as JPEG and return the bytes as a base64 text string.

    Args:
        image: Object with a ``save(fp, format=...)`` method.

    Returns:
        Base64 representation (str) of the JPEG bytes written by ``image.save``.
    """
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    encoded_bytes = base64.b64encode(jpeg_buffer.getvalue())
    return encoded_bytes.decode("utf-8")

## 使用CPIC Guidelines的PDF檔，轉換圖片作為資料集

In [4]:
CPIC_pdf = get_cpic_pdf_images_texts(path)

## 產生嵌入向量(Generate Embedding)

In [5]:
len(CPIC_pdf)

28

In [19]:
# Run every page image of each PDF through the model (two pages per batch) and
# store the resulting per-page embeddings, moved to the CPU, under "embeddings".
for pdf in CPIC_pdf:
    dataloader = DataLoader(
        pdf["images"],
        batch_size=2,
        shuffle=False,
        collate_fn=lambda x: processor.process_images(x),
    )

    page_embeddings = []
    with torch.no_grad():  # inference only, no gradients needed
        for batch_doc in tqdm(dataloader):
            batch_doc = {key: tensor.to(model.device) for key, tensor in batch_doc.items()}
            embeddings_doc = model(**batch_doc)
            # Split the batch output into one tensor per page.
            page_embeddings += torch.unbind(embeddings_doc.to("cpu"))
    pdf["embeddings"] = page_embeddings

100%|██████████| 4/4 [00:03<00:00,  1.30it/s]
100%|██████████| 3/3 [00:01<00:00,  1.58it/s]
100%|██████████| 3/3 [00:01<00:00,  1.63it/s]
100%|██████████| 5/5 [00:02<00:00,  1.86it/s]
100%|██████████| 3/3 [00:01<00:00,  1.56it/s]
100%|██████████| 4/4 [00:02<00:00,  1.61it/s]
100%|██████████| 4/4 [00:02<00:00,  1.84it/s]
100%|██████████| 6/6 [00:03<00:00,  1.80it/s]
100%|██████████| 3/3 [00:01<00:00,  1.63it/s]
100%|██████████| 4/4 [00:02<00:00,  1.90it/s]
100%|██████████| 5/5 [00:02<00:00,  1.82it/s]
100%|██████████| 3/3 [00:01<00:00,  2.00it/s]
100%|██████████| 5/5 [00:03<00:00,  1.64it/s]
 75%|███████▌  | 6/8 [00:03<00:01,  1.65it/s]

In [20]:
CPIC_pdf[len(CPIC_pdf)-1].keys()

dict_keys(['path', 'name', 'images', 'texts', 'embeddings'])

In [22]:
import hashlib

import numpy as np

# Build one Vespa document per PDF page: metadata, a downsized base64 JPEG of the
# page, the extracted text and the binarised patch embeddings.
vespa_feed = []
for pdf in CPIC_pdf:
    # Deliberately not named `path`: that global holds the guidelines folder used
    # by the loading cell, and overwriting it breaks a re-run of that cell.
    pdf_path = pdf["path"]
    name = pdf["name"]
    for page_number, (page_text, embedding, image) in enumerate(
        zip(pdf["texts"], pdf["embeddings"], pdf["images"])
    ):
        base_64_image = get_base64_image(resize_image(image, 640))
        embedding_dict = dict()
        for idx, patch_embedding in enumerate(embedding):
            # Binarise each patch vector (positive -> 1, else 0), pack 8 bits per
            # byte and hex-encode the bytes.
            # NOTE(review): assumes 128-dim patch vectors so that packing yields the
            # 16 int8 cells declared as v[16] in the schema — confirm.
            binary_vector = (
                np.packbits(np.where(patch_embedding > 0, 1, 0))
                .astype(np.int8)
                .tobytes()
                .hex()
            )
            embedding_dict[idx] = binary_vector
        # Built-in hash() of a str is salted per interpreter process, so it yields
        # different ids after every kernel restart (re-feeding would duplicate
        # pages) and returns an int although the schema declares `id` as string.
        # A SHA-256 hex digest is stable across runs and is a string.
        page_id = hashlib.sha256(f"{name} {page_number}".encode("utf-8")).hexdigest()
        page = {
            "id": page_id,
            "name": name,
            "path": pdf_path,
            "page_number": page_number,
            "image": base_64_image,
            "text": page_text,
            "embedding": embedding_dict,
        }
        vespa_feed.append(page)

In [None]:
vespa_feed[0]

In [24]:
from vespa.package import Schema, Document, Field, FieldSet, HNSW

# Vespa schema for one PDF page: identifiers, page image, extracted text and the
# binarised patch embeddings.
colpali_schema = Schema(
    name="pdf_page",
    document=Document(
        fields=[
            Field(
                name="id", type="string", indexing=["summary", "index"], match=["word"]
            ),
            # `name` is scored with bm25(name) in the rank profile, so it needs
            # BM25 enabled on its index just like `text`.
            Field(
                name="name",
                type="string",
                indexing=["summary", "index"],
                match=["text"],
                index="enable-bm25",
            ),
            Field(
                name="path",
                type="string",
                indexing=["summary", "index"],
                match=["text"],
                index="enable-bm25",
            ),
            Field(name="page_number", type="int", indexing=["summary", "attribute"]),
            Field(name="image", type="raw", indexing=["summary"]),
            Field(
                name="text",
                type="string",
                indexing=["index"],
                match=["text"],
                index="enable-bm25",
            ),
            # One 16-cell int8 vector per patch (mapped dimension `patch`).
            Field(
                name="embedding",
                type="tensor<int8>(patch{}, v[16])",
                indexing=[
                    "attribute",
                    "index",
                ],  # adds HNSW index for candidate retrieval.
                ann=HNSW(
                    distance_metric="hamming",
                    max_links_per_node=8,
                    neighbors_to_explore_at_insert=100,
                ),
            ),
        ]
    ),
    # Free-text queries (userInput) search the file name and the page text.
    fieldsets=[FieldSet(name="default", fields=["name", "text"])],
)

In [25]:
from vespa.package import ApplicationPackage

vespa_app_name = "cpicguidelinevrag"
vespa_application_package = ApplicationPackage(
    name=vespa_app_name, schema=[colpali_schema]
)

In [26]:
from vespa.package import RankProfile, Function, FirstPhaseRanking, SecondPhaseRanking

# Two-phase rank profile: BM25 over name/text selects candidates, then the top
# 100 are re-ranked with the `max_sim` expression over the patch embeddings.
colpali_profile = RankProfile(
    name="default",
    # Query tensor: one 128-cell vector per query token.
    inputs=[("query(qt)", "tensor(querytoken{}, v[128])")],
    functions=[
        # max_sim: for every query token take the maximum, over all patches, of
        # the dot product with the unpacked bit embedding, then sum over tokens.
        Function(
            name="max_sim",
            expression="""
                sum(
                    reduce(
                        sum(
                            query(qt) * unpack_bits(attribute(embedding)) , v
                        ),
                        max, patch
                    ),
                    querytoken
                )
            """,
        ),
        # Text relevance: sum of the BM25 scores of the `name` and `text` fields.
        Function(name="bm25_score", expression="bm25(name) + bm25(text)"),
    ],
    first_phase=FirstPhaseRanking(expression="bm25_score"),
    second_phase=SecondPhaseRanking(expression="max_sim", rerank_count=100),
)
# Register the profile on the schema defined earlier.
colpali_schema.add_rank_profile(colpali_profile)

In [27]:
from vespa.deployment import VespaCloud
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process before talking to Vespa Cloud.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Replace with your tenant name from the Vespa Cloud Console
tenant_name = "hsinnosukejp"

# Optional API-key authentication, currently disabled (interactive login is used instead).
# key = os.getenv("VESPA_TEAM_API_KEY", None)
# if key is not None:
#     key = key.replace(r"\n", "\n")  # To parse key correctly
# print(key)

# Handle for the Vespa Cloud application, bound to the tenant, the application
# name and the application package built in the earlier cells.
vespa_cloud = VespaCloud(
    tenant=tenant_name,
    application=vespa_app_name,
    # key_content=key,  # Key is only used for CI/CD testing of this notebook. Can be removed if logging in interactively
    application_package=vespa_application_package,
)

Setting application...
Running: vespa config set application hsinnosukejp.cpicguidelinevrag.default
Setting target cloud...
Running: vespa config set target cloud

No api-key found for control plane access. Using access token.
Checking for access token in auth.json...
Access token expired. Please re-authenticate.
Your Device Confirmation code is: PJKP-XHKD
Automatically open confirmation page in your default browser? [Y/n] 
Opened link in your browser: https://login.console.vespa-cloud.com/activate?user_code=PJKP-XHKD
Waiting for login to complete in browser ... done;1m⣻[0;22m
[32mSuccess:[0m Logged in
 auth.json created at /home/jovyan/.vespa/auth.json
Successfully obtained access token for control plane access.


In [21]:
dir(vespa_cloud)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_application_root_has_tests',
 '_build_no',
 '_check_vespacli_available',
 '_follow_deployment',
 '_generate_cert_vespacli',
 '_get_auth_headers',
 '_get_deployment_status',
 '_get_last_deployable',
 '_handle_response',
 '_load_certificate_pair',
 '_print_log_entry',
 '_read_private_key',
 '_request',
 '_request_with_access_token',
 '_request_with_api_key',
 '_set_application',
 '_set_target_cloud',
 '_start_deployment',
 '_start_prod_deployment',
 '_submitted_timestamp',
 '_to_application_zip',
 '_try_get_access_token',
 '_vespa_au

In [28]:
from vespa.application import Vespa

app: Vespa = vespa_cloud.deploy()

Deployment started in run 2 of dev-aws-us-east-1c for hsinnosukejp.cpicguidelinevrag. This may take a few minutes the first time.
INFO    [08:49:51]  Deploying platform version 8.530.11 and application dev build 2 for dev-aws-us-east-1c of default ...
INFO    [08:49:51]  Using CA signed certificate version 1
INFO    [08:49:51]  Using 1 nodes in container cluster 'cpicguidelinevrag_container'
INFO    [08:49:54]  Session 33 for tenant 'hsinnosukejp' prepared, but activation failed: 1/2 application hosts and 2/2 admin hosts for hsinnosukejp.cpicguidelinevrag have completed provisioning and bootstrapping, still waiting for h119506.dev.us-east-1c.aws.vespa-cloud.net
INFO    [08:50:00]  Deploying platform version 8.530.11 and application dev build 2 for dev-aws-us-east-1c of default ...
INFO    [08:50:00]  1/2 application hosts and 2/2 admin hosts for hsinnosukejp.cpicguidelinevrag have completed provisioning and bootstrapping, still waiting for h119506.dev.us-east-1c.aws.vespa-cloud.net
INF

In [29]:
print("Number of PDF pages:", len(vespa_feed))

Number of PDF pages: 293


In [40]:
from vespa.io import VespaResponse

async with app.asyncio(connections=1, timeout=180) as session:
    for page in tqdm(vespa_feed):
        response: VespaResponse = await session.feed_data_point(
            data_id=page["id"], fields=page, schema="pdf_page"
        )
        if not response.is_successful():
            print(response.json())

  0%|          | 0/293 [00:00<?, ?it/s]

100%|██████████| 293/293 [01:42<00:00,  2.86it/s]


In [7]:
question_num = [
    "Q3",
    "Q4",
    "Q7",
    "Q8",
    "Q11",
    "Q12",
    "Q15",
    "Q16",
    "Q19",
    "Q20"
]
# Raw Questions
queries = [
    #Q3, Q4, Q7, Q8, Q11, Q12, Q15, Q16, Q19, Q20
    "A 55-year-old male patient of Taiwanese descent with a history of gout (requiring urate-lowering therapy) undergoes kidney transplantation and is initiated on azathioprine for immunosuppression. Pre-emptive genotyping reveals he carries both the HLA-B*5801 allele (increasing risk for allopurinol-induced SCARs) and is heterozygous for a NUDT15 variant (e.g., c.415C>T, conferring intermediate thiopurine methyltransferase activity). Analyze the complex management challenges: a) the significantly increased risk of potentially fatal azathioprine-induced myelosuppression due to the NUDT15 status, requiring dose reduction, and b) the contraindication of using first-line allopurinol for his gout due to the HLA-B*5801 status, necessitating alternative gout management strategies (like febuxostat, considering its own warnings and potential interaction with azathioprine metabolism). Discuss the sequential decision-making and monitoring required.",
    "Consider a patient presenting with acute coronary syndrome (ACS) requiring percutaneous coronary intervention (PCI) and subsequent dual antiplatelet therapy (DAPT) plus high-intensity statin treatment. Genotyping indicates the patient is a CYP2C19 intermediate metabolizer (e.g., *1/*2 genotype, potentially reducing clopidogrel activation) and also carries the SLCO1B1 c.521T>C variant (associated with increased risk of simvastatin-induced myopathy). Critically evaluate the choices for both the P2Y12 inhibitor component of DAPT (clopidogrel vs. ticagrelor/prasugrel) and the high-intensity statin (simvastatin vs. atorvastatin/rosuvastatin), explicitly considering how the combination of these two genetic variants influences the overall risk-benefit assessment for preventing both ischemic events (stent thrombosis) and adverse drug reactions (myopathy).",
    "A patient diagnosed with major depressive disorder and comorbid neuropathic pain requires pharmacotherapy. They are identified as a CYP2D6 poor metabolizer and a CYP2C19 rapid metabolizer. Analyze the implications of this combined genotype profile when considering treatment options such as: a) initiating venlafaxine (SNRI primarily metabolized by CYP2D6, with CYP2C19 involvement), versus b) initiating escitalopram (SSRI primarily metabolized by CYP2C19) potentially augmented with nortriptyline (TCA primarily metabolized by CYP2D6). Discuss the predicted impact on drug exposure, efficacy, and tolerability for both scenarios, and outline a rational therapeutic strategy incorporating pharmacogenetic insights and potential therapeutic drug monitoring (TDM).",
    "An epileptic patient of Southeast Asian ancestry, currently stabilized on phenytoin but experiencing breakthrough seizures, is being considered for add-on therapy with carbamazepine. Genetic testing reveals the patient carries the HLA-B*1502 allele and possesses a CYP2C9*1/*3 genotype (intermediate metabolizer). Evaluate the compounded risks: a) the extremely high risk of SJS/TEN associated with HLA-B*1502 upon introducing carbamazepine (and the known, albeit lower, risk with phenytoin), and b) the altered phenytoin metabolism due to the CYP2C9 variant, potentially leading to toxicity or requiring dose adjustments, which could be further complicated by drug interactions if carbamazepine were added. Discuss the absolute contraindication for carbamazepine and the careful management and monitoring required for continuing phenytoin, considering alternative add-on agents without the HLA-B*1502 association.",
    "A patient undergoing moderately emetogenic chemotherapy requires antiemetics and is prescribed ondansetron (metabolized partly by CYP2D6). Concurrently, they require analgesia for breakthrough pain and are given codeine (a prodrug requiring activation by CYP2D6). If this patient is subsequently found to be a CYP2D6 ultrarapid metabolizer (e.g., due to gene duplication), analyze the potential clinical consequences arising from this genotype impacting both medications simultaneously: a) the heightened risk of opioid toxicity (e.g., respiratory depression) even with standard codeine doses due to rapid conversion to morphine, and b) the potential for altered ondansetron efficacy or clearance (though clinical significance is less established than for codeine/TCAs). Discuss appropriate management adjustments for both pain and nausea in this specific genetic context.",
    "A 58-year-old patient with an ECOG performance status of 1 presents with rectal bleeding and a 5kg weight loss, subsequently diagnosed with metastatic colorectal cancer (KRAS wild-type) suitable for first-line chemotherapy with the FOLFIRI regimen. Baseline total bilirubin is 1.1 mg/dL; renal function is normal. Just prior to initiating FOLFIRI, he develops significant oropharyngeal candidiasis requiring treatment and is prescribed itraconazole 200 mg daily. Pre-treatment genotyping reveals he is homozygous for the UGT1A1*28 allele (genotype 7/7). Please justify your proposed starting dose modification for irinotecan, outline an intensified monitoring strategy, and provide recommendations regarding the use of antiviral agents. ",
    "A 38-year-old patient presents with severe, lancinating trigeminal neuralgia unresponsive to NSAIDs, significantly impacting his work as an interpreter. Carbamazepine is considered. HLA-A*31:01 testing is ordered, acknowledging standard practice and test availability. While awaiting the results (which may take several days), carbamazepine is cautiously initiated at 100mg BID due to symptom severity. Three days later, he develops acute bacterial sinusitis requiring antibiotic therapy, and clarithromycin 500mg BID is prescribed for 10 days. Outline your specific actions regarding the carbamazepine dose immediately upon starting clarithromycin. Discuss how this necessary short-term management, driven by the drug interaction, intersects with and potentially complicates the longer-term therapeutic plan which will be influenced by the eventual HLA-A*31:01 result.",
    "A 28-year-old patient of mixed European descent with a baseline eGFR of 95 mL/min/1.73m² is undergoing consolidation therapy for primary mediastinal large B-cell lymphoma. The treatment regimen includes high-dose methotrexate (HD-MTX) administered at 5 g/m². He is also routinely receiving omeprazole 40 mg daily for gastric protection. Pharmacogenetic testing indicates that the patient carries the SLCO1B1 c.521T>C variant (heterozygous, e.g., *1/*15 genotype). Following HD-MTX infusion, the plasma methotrexate concentration at 72 hours is 5 μM, accompanied by a 1.5-fold worsening of renal function. What factors can contribute to delayed methotrexate elimination, and what are the underlying mechanisms? What are the current recommended management strategies for delayed methotrexate elimination? Which methotrexate-related toxicities should be monitored, and until when should monitoring continue to ensure that methotrexate toxicity risk has significantly decreased?",
    "A 45-year-old patient with a history of coronary stent placement one year prior (currently on low-dose aspirin and clopidogrel 75mg daily) presents with moderate-severe epigastric pain. Endoscopy confirms multiple duodenal ulcers and Helicobacter pylori gastritis. He previously failed bismuth quadruple therapy. Standard 14-day clarithromycin-based triple therapy (including amoxicillin 1g BID and a PPI) is indicated. Pharmacogenetic testing reveals the patient is a CYP2C19 ultrarapid metabolizer (*17/*17 genotype). Propose an integrated management plan: Specify your chosen PPI regimen, detail how you will manage the clopidogrel therapy during these 14 days, and state your plan for confirming eradication.",
    "A 7-year-old child treated for high-risk neuroblastoma received multi-agent chemotherapy including a cumulative cisplatin dose of 400 mg/m² and weekly vincristine infusions. Baseline audiology was normal. Post-treatment evaluation detects significant bilateral sensorineural hearing loss (Grade 2-3) and concurrent Grade 2 peripheral neuropathy (sensory > motor). Retrospective pharmacogenetic testing shows the child is heterozygous for TPMT*3A, a reduced function allele. Discuss the management of the child's susceptibility to chemotherapy-induced neurotoxicities (both auditory and peripheral). Recommendations for subsequent pharmacologic treatment of high-risk neuroblastoma."
    ]

In [37]:
# Embed each query on its own (batch size 1) and collect the per-query
# embeddings, moved to the CPU, in `qs` in the same order as `queries`.
dataloader = DataLoader(
    queries,
    batch_size=1,
    shuffle=False,
    collate_fn=lambda x: processor.process_queries(x),
)
qs = []
with torch.no_grad():  # inference only, no gradients needed
    for batch_query in dataloader:
        batch_query = {key: tensor.to(model.device) for key, tensor in batch_query.items()}
        embeddings_query = model(**batch_query)
        # Split the batch output into one tensor per query.
        qs += torch.unbind(embeddings_query.to("cpu"))

In [20]:
from pdf2image import convert_from_path
from PIL import Image
import os

def open_pdf_page(pdf_path, page_number):
    """Render a single page of a PDF as an image.

    Only the requested page is rasterised (via ``first_page`` / ``last_page``)
    instead of converting the whole document and discarding the other pages.

    Args:
        pdf_path (str): Path of the PDF file.
        page_number (int): Page to render, 0-indexed.

    Returns:
        The rendered page image, or None when the page number is out of range or
        rendering fails (the error is printed rather than raised).
    """
    try:
        if page_number < 0:
            images = []
        else:
            # pdf2image page numbers are 1-based.
            images = convert_from_path(
                pdf_path, first_page=page_number + 1, last_page=page_number + 1
            )

        if not images:
            # Slow path, only reached on an invalid page number: render the whole
            # document just to report how many pages it has.
            total_pages = len(convert_from_path(pdf_path))
            raise ValueError(f"页码 {page_number} 超出范围。PDF共有 {total_pages} 页")

        return images[0]

    except Exception as e:
        print(f"处理PDF时出错: {str(e)}")
        return None

In [None]:
open_pdf_page("./Guidelines/Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2D6, CYP2C19, CYP2B6, SLC6A4, and HTR2A Genotypes and Serotonin Reuptake Inhibitor Antidepressants (April 2023).pdf", 12)

In [7]:
from IPython.display import display, HTML

question_num = [
    "Q3",
    "Q4",
    "Q7",
    "Q8",
    "Q11",
    "Q12",
    "Q15",
    "Q16",
    "Q19",
    "Q20"
]
# 創建保存圖片的目錄
output_dir = "Hard_question_CPIC_retrieval_directly"
os.makedirs(output_dir, exist_ok=True)

def display_query_results_Qnumber(query, response, q_number, hits=3):
    """Show the top hits of a query, save each hit's page as a PNG and log a summary.

    For each of the first ``hits`` results the matching PDF page is rendered with
    ``open_pdf_page`` and saved into ``output_dir``. A text summary is displayed
    in the notebook and appended to "Hard_question_CPIC_retrieval_directly.txt".

    Args:
        query: Query text that produced ``response``.
        response: Query response exposing ``json`` (dict) and ``hits`` (list of
            hit dicts with "fields" and "relevance").
        q_number: Question label used as prefix of the saved image file names.
        hits: Maximum number of results to report.
    """
    import html

    query_time = response.json.get("timing", {}).get("searchtime", -1)
    query_time = round(query_time, 2)
    count = response.json.get("root", {}).get("fields", {}).get("totalCount", 0)
    html_content = f"Query text: '{query}', query time {query_time}s, count={count}, top results:"

    for i, hit in enumerate(response.hits[:hits]):
        fields = hit["fields"]
        title = fields["name"]
        path = fields["path"]
        page = fields["page_number"]
        score = hit["relevance"]
        image_filename = f"{q_number}_{i}_{title}_{page+1}.png"

        # open_pdf_page returns None when rendering fails; skip the export for
        # that hit instead of crashing on `None.save`.
        show_image = open_pdf_page(path, page)
        if show_image is None:
            print(f"Skipping image export for {image_filename}: page could not be rendered")
        else:
            show_image.save(os.path.join(output_dir, image_filename))

        html_content += f"PDF Result {i + 1}\n"
        html_content += f"File: {image_filename}\n"
        html_content += f'Title: {title}, page {page+1} with score {score:.2f}\n'

    # The summary is plain text with newlines: escape it and wrap it in <pre> so
    # the line breaks survive and special characters in the query are shown verbatim.
    display(HTML(f'<pre style="white-space: pre-wrap">{html.escape(html_content)}</pre>'))
    output_file = "Hard_question_CPIC_retrieval_directly.txt"
    with open(output_file, "a", encoding="utf-8") as f:  # "a" mode appends to the log
        f.write(html_content)
        f.write("\n")  # extra newline between queries

In [39]:
from vespa.io import VespaQueryResponse


# Run every raw question against Vespa and report/save its top 10 hits.
# Top-level `async with` / `await` relies on the notebook's async support.
async with app.asyncio(connections=1, timeout=120) as session:
    for idx, query in enumerate(queries):
        # Query tensor for `query(qt)`: query-token index -> list of floats.
        query_embedding = {k: v.tolist() for k, v in enumerate(qs[idx])}
        response: VespaQueryResponse = await session.query(
            yql="select name,path,image,page_number from pdf_page where userInput(@userQuery)",
            ranking="default",
            userQuery=query,
            timeout=120,
            hits=10,
            body={"input.query(qt)": query_embedding, "presentation.timing": True},
        )
        # Stop immediately if a query fails rather than reporting partial results.
        assert response.is_successful()
        # question_num is index-aligned with queries.
        q_number = question_num[idx]
        display_query_results_Qnumber(query, response, q_number, hits=10)

# Query Rewriting部分

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA
import os
import dotenv

dotenv.load_dotenv()


client = ChatNVIDIA(
  model="meta/llama-3.3-70b-instruct",
  api_key=os.getenv("NVIDIA_API"), 
  temperature=0,
  top_p=0.95,
  max_tokens=4096,
)

cpic_guideline_name_list = os.listdir("./Guidelines")
queries_to_rewrite = queries 
queries_q_dissect = []


import ast


def _parse_query_list(raw_text):
    """Parse the model reply, expected to contain a Python list literal.

    Everything from the first "[" to the last "]" is evaluated with
    ``ast.literal_eval`` so surrounding code fences or stray text are tolerated.

    Returns:
        List of strings, or None when no list literal could be parsed.
    """
    start, end = raw_text.find("["), raw_text.rfind("]")
    if start == -1 or end < start:
        return None
    try:
        parsed = ast.literal_eval(raw_text[start : end + 1])
    except (ValueError, SyntaxError):
        return None
    if not isinstance(parsed, list):
        return None
    return [str(item) for item in parsed]


# Ask the LLM to turn each raw question into structured search strings and
# collect the parsed list for every question in `queries_q_dissect`.
for idx, query in enumerate(queries_to_rewrite):
  prompt = f"""

    You are an expert information retrieval assistant specializing in pharmacogenomics.
    Your sole task is to deconstruct a user's question about CPIC guidelines and convert it into one or more structured search query strings formatted for a database.
    Your final output **must be only a Python list** containing these formatted strings. Do not include any other text, explanation, or conversational filler.

    **CPIC Guideline List:**
    `{cpic_guideline_name_list}`
    **User Question:**

    `{query}`

    Now, apply this logic to the user's question below. To do this, you will:

    1.  **Analyze the User's Question:** Deconstruct the query to understand its components.
    2.  **Identify Entities:**
        * Extract the primary **drug name(s)** (product or generic name). If no specific drug is mentioned, use "N/A".
        * Extract the primary **gene name(s)**. If no specific gene is mentioned, use "N/A".
    3.  **Construct Content to Search:**
        Reasoning:
        * Analyze the gene and drug in the question.
        * List out the direct interaction between the gene and drug.
        * Don't list out the indirect interaction between the gene and drug.
        * Don't consider the interaction between the gene and drug if the gene is not related to the drug, or the drug is not related to the gene.
        * Don't take the drug and gene that's not mentioned in the question into consideration. Avoid overthinking.
        * Consider possible cpic guidelines and content that is a direct quote from the guideline.
        * **Crucially, the extracted "Content to Search" must inherently contain both the identified drug name and gene name.**
        * The total length of "Content to Search" should not exceed 150 words.
        * The "Content to Search" should also specify the reason for choosing this guideline and what specific information to search for within it. Be as detailed as possible within the word constraints.
        * **If `Content to Search` must also be "N/A", then `CPIC Guideline Name` must also be "N/A".**
    4. * **Identify the specific CPIC Guideline Name** from `{cpic_guideline_name_list}` that is **DIRECTLY RELATED** to **both** the identified drug and gene.
            * **Strict Matching Rule:** A guideline is "directly related" only if:
                * The drug name (or its generic equivalent) is explicitly part of the guideline's name, OR
                * The guideline specifically covers the drug in its primary scope (as implied by its name, generic name, or product name), AND
                * The gene name is also relevant to that same guideline.
            * **Crucially:**
                * Do **NOT** consider any indirect interactions between a drug in the question and a drug *within* a guideline if the primary scope of the guideline (as indicated by its name, generic name, or product name) does not include the questioned drug.
                * Do **NOT** consider any indirect interactions between a gene in the question and a gene *within* a guideline if the primary scope of the guideline (as indicated by its name, generic name, or product name) does not explicitly include the questioned gene in relation to the questioned drug.
            * If, after applying these strict rules, no directly related CPIC Guideline Name is found, use "N/A".
    5.  **Format the Output:**
        * Generate a separate formatted string for each distinct query implied by the question.
        * Use the exact format: `"Drug Name: [Extracted Drug], Gene Name: [Extracted Gene], CPIC Guideline Name: [Identified CPIC Guideline Name], Content to Search: [Extracted Guideline Text or N/A]"`
        * Place all generated strings into a single Python list.
        * **Final Rule:** If, after all steps, all fields (Drug Name, Gene Name, CPIC Guideline Name, Content to Search) for a potential query are "N/A" or no valid query could be formed, then **return an empty Python list `[]`**.

    Now, learn the required format from these examples.

    Positive Example:
    ---
    **Example 1:**
    "Drug Name: Venlafaxine, Gene Name: CYP2D6, CPIC Guideline Name: Antidepressants (Selective Serotonin Reuptake Inhibitors and Serotonin Norepinephrine Reuptake Inhibitors), Content to Search: Implications for drug exposure, efficacy, and tolerability in CYP2D6 PM and dosing recommendations."

    Negative Example (Crucial for "N/A" handling and empty list output):
    ---
    **Example 2 (Incorrect Logic & Output):**
    User Question: "What are the CPIC guidelines for Irinotecan and UGT1A1?"
    CPIC Guideline List: `["Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for UGT1A1 and Atazanavir Prescribing (September 2015) .pdf", ...other guidelines...]`

    An **INCORRECT** output would be:
    `["Drug Name: Irinotecan, Gene Name: UGT1A1, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for UGT1A1 and Atazanavir Prescribing (September 2015) .pdf, Content to Search: UGT1A1*28"]`
    **Reasoning for Incorrectness:** Although UGT1A1 is mentioned in the guideline name, Irinotecan is *not* a drug specifically covered or named in the "UGT1A1 and Atazanavir" guideline. According to the rules, we must ensure the drug is directly related to the guideline's generic name or explicitly present in the guideline name. Since Irinotecan does not belong to Atazanavir's generic group nor is it named, this guideline is not directly related to Irinotecan.

    **The CORRECT output for the above User Question would be an empty Python list:**
    `[]`
    **Reasoning for Correctness:** Because no CPIC Guideline in the provided list directly covers the combination of "Irinotecan" and "UGT1A1" according to the strict matching rules, no valid search query can be formed. Therefore, an empty list must be returned.
    
    **Example 3 (Empty List Output):**
    User Question: "A 7-year-old child treated for high-risk neuroblastoma received multi-agent chemotherapy including a cumulative cisplatin dose of 400 mg/m² and weekly vincristine infusions. Baseline audiology was normal. Post-treatment evaluation detects significant bilateral sensorineural hearing loss (Grade 2-3) and concurrent Grade 2 peripheral neuropathy (sensory > motor). Retrospective pharmacogenetic testing shows the child is heterozygous for TPMT*3A, a reduced function allele. Discuss the management of the child's susceptibility to chemotherapy-induced neurotoxicities (both auditory and peripheral). Recommendations for subsequent pharmacologic treatment of high-risk neuroblastoma."
    CPIC Guideline List: `["Q20_0_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for MT-RNR1 and Aminoglycosides (May 2021) .pdf", "Q20_1_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for MT-RNR1 and Aminoglycosides (May 2021) .pdf", "Q20_2_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for Dihydropyrimidine Dehydrogenase Genotype and Fluoropyrimidine Dosing (December 2013) .pdf", "Q20_3_Clinical Pharmacogenetics Implementation Consortium Guidelines for thiopurine dosing based on TPMT and NUDT15 genotypes- 2018 Update (November 2018) .pdf", "Q20_4_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for Dihydropyrimidine Dehydrogenase Genotype and Fluoropyrimidine Dosing (December 2013) .pdf", "Q20_5_Clinical Pharmacogenetics Implementation Consortium Guidelines for Thiopurine Methyltransferase Genotype and Thiopurine Dosing .pdf", "Q20_6_Clinical Pharmacogenetics Implementation Consortium Guidelines for thiopurine dosing based on TPMT and NUDT15 genotypes- 2018 Update (November 2018) .pdf", "Q20_7_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2D6 and Tamoxifen Therapy (January 2018) .pdf", "Q20_8_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2D6 Genotype and Use of Ondansetron and Tropisetron (December 2016) .pdf", "Q20_9_Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for Dihydropyrimidine Dehydrogenase Genotype and Fluoropyrimidine Dosing- 2017 Update (October 2017) .pdf"]`
    
    An **INCORRECT** output would be:
    ```python
    [
        "Drug Name: Cisplatin, Gene Name: TPMT, CPIC Guideline Name: N/A, Content to Search: N/A",
        "Drug Name: Vincristine, Gene Name: TPMT, CPIC Guideline Name: N/A, Content to Search: N/A"
    ]
    ```
    **Reasoning for Incorrectness:** According to the "Final Rule" in step 5, if all fields are "N/A" or no valid query could be formed (which is the case here as TPMT is mentioned but no guideline directly links TPMT to Cisplatin or Vincristine's neurotoxicity), then an empty list `[]` must be returned.

    **The CORRECT output for the above User Question would be an empty Python list:**
    `[]`
    **Reasoning for Correctness:** Because no CPIC Guideline in the provided list directly covers the combination of "Cisplatin/Vincristine" and "TPMT" according to the strict matching rules, no valid search query can be formed. Therefore, an empty list must be returned.
    ---

    **CPIC Guideline List:**
    `{cpic_guideline_name_list}`
    **User Question:**
    `{query}`

    **Output:**
    """
  response = client.invoke([{"role":"user","content":prompt}])
  # print(f"response: {response}")
  print(f"query_rewrite_{question_num[idx]} = {response.content}")
  # list(<str>) would split the reply into single characters; parse the list
  # literal the model was asked to return instead.
  rewritten_queries = _parse_query_list(response.content)
  if rewritten_queries is None:
    print(f"WARNING: could not parse a list from the reply for {question_num[idx]}; storing []")
    rewritten_queries = []
  queries_q_dissect.append(rewritten_queries)

print(queries_q_dissect)


In [46]:
from IPython.display import display, HTML

# Labels of the questions processed in this notebook, in processing order.
question_num = [
    "q3", "q4", "q7", "q8", "q11",
    "q12", "q15", "q16", "q19", "q20",
]

# Create the directory used for saving images (no error if it already exists).
output_dir = "Hard_question_CPIC_retrieval_directly"
os.makedirs(output_dir, exist_ok=True)

def display_query_results(query, response, hits=3):
    """Show a text summary of the top hits of one Vespa query as HTML.

    Args:
        query: Query text echoed in the header line.
        response: Query response exposing ``.json`` (a dict that may contain
            ``timing.searchtime`` and ``root.fields.totalCount``) and ``.hits``
            (a list of dicts with ``fields`` and ``relevance`` entries).
        hits: Maximum number of hits to list (default 3).

    Raises:
        KeyError: If a listed hit lacks ``name``, ``page_number`` or
            ``relevance``.
    """
    from html import escape  # stdlib; imported locally so the block is self-contained

    query_time = round(response.json.get("timing", {}).get("searchtime", -1), 2)
    count = response.json.get("root", {}).get("fields", {}).get("totalCount", 0)

    # Query text and titles are escaped because they are interpolated into HTML
    # and may contain markup characters (e.g. "c.521T>C").
    lines = [
        f"Query text: '{escape(str(query))}', query time {query_time}s, "
        f"count={count}, top results:"
    ]

    for i, hit in enumerate(response.hits[:hits]):
        fields = hit["fields"]
        title = escape(str(fields["name"]))
        page = fields["page_number"]
        score = hit["relevance"]
        # page_number is shown 1-based, matching the original "page+1" output.
        lines.append(f"PDF Result {i + 1}")
        lines.append(f"File: {i}_{title}_{page + 1}.png")
        lines.append(f"Title: {title}, page {page + 1} with score {score:.2f}")

    # A bare "\n" is collapsed by HTML rendering, so lines are separated with
    # <br>; this also separates the header from the first result.
    display(HTML("<br>\n".join(lines)))

In [None]:
# Rewritten retrieval queries, one list per question (Q3 ... Q20). Each entry is
# a single string combining "Drug Name", "Gene Name", "CPIC Guideline Name"
# (a PDF file name) and "Content to Search".
# NOTE(review): presumably pasted from the `query_rewrite_<q> = ...` lines
# printed by the query-rewrite cell — confirm.
# NOTE(review): the retrieval loop that follows iterates `queries_q_dissect`,
# not these variables — confirm whether they are still needed.
query_rewrite_Q3 = [
    "Drug Name: Azathioprine, Gene Name: NUDT15, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium Guidelines for thiopurine dosing based on TPMT and NUDT15 genotypes- 2018 Update (November 2018) .pdf, Content to Search: NUDT15 variant and azathioprine dose reduction",
    "Drug Name: Allopurinol, Gene Name: HLA-B, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for Human Leukocyte Antigen-B (HLA-B) Genotype and Allopurinol Dosing (February 2013) .pdf, Content to Search: HLA-B*5801 allele and allopurinol contraindication"
]
query_rewrite_Q4 = [
    "Drug Name: Clopidogrel, Gene Name: CYP2C19, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium Guideline for CYP2C19 Genotype and Clopidogrel Therapy- 2022 update (January 2022) .pdf, Content to Search: Implications of CYP2C19 intermediate metabolizer status on clopidogrel activation and dosing recommendations",
    "Drug Name: Simvastatin, Gene Name: SLCO1B1, CPIC Guideline Name: The Clinical Pharmacogenetics Implementation Consortium (CPIC) guideline for SLCO1B1, ABCG2, and CYP2C9 and statin-associated musculoskeletal symptoms (January 2022) .pdf, Content to Search: Risk of simvastatin-induced myopathy in patients with SLCO1B1 c.521T>C variant and alternative statin options"
]
# NOTE(review): the Venlafaxine entry below points at the tricyclic
# antidepressant guideline file — verify this pairing is intended.
query_rewrite_Q7 = [
    "Drug Name: Venlafaxine, Gene Name: CYP2D6, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium Guideline (CPIC®) for CYP2D6 and CYP2C19 Genotypes and Dosing of Tricyclic Antidepressants- 2016 Update (December 2016) .pdf, Content to Search: Implications for drug exposure, efficacy, and tolerability in CYP2D6 poor metabolizers and CYP2C19 rapid metabolizers",
    "Drug Name: Escitalopram, Gene Name: CYP2C19, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2D6, CYP2C19, CYP2B6, SLC6A4, and HTR2A Genotypes and Serotonin Reuptake Inhibitor Antidepressants (April 2023).pdf, Content to Search: Predicted impact on drug exposure, efficacy, and tolerability in CYP2C19 rapid metabolizers",
    "Drug Name: Nortriptyline, Gene Name: CYP2D6, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium Guideline (CPIC®) for CYP2D6 and CYP2C19 Genotypes and Dosing of Tricyclic Antidepressants- 2016 Update (December 2016) .pdf, Content to Search: Dosing recommendations for CYP2D6 poor metabolizers"
]
query_rewrite_Q8 = [
    "Drug Name: Phenytoin, Gene Name: CYP2C9, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for CYP2C9 and HLA-B Genotype and Phenytoin Dosing (August 2020) .pdf, Content to Search: Implications of CYP2C9*1/*3 genotype on phenytoin metabolism and dosing recommendations",
    "Drug Name: Carbamazepine, Gene Name: HLA-B, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for HLA Genotype and Use of Carbamazepine and Oxcarbazepine- 2017 Update (December 2017) .pdf, Content to Search: HLA-B*1502 allele association with carbamazepine-induced SJS/TEN and absolute contraindication"
]
query_rewrite_Q11 = [
    "Drug Name: Ondansetron, Gene Name: CYP2D6, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2D6 Genotype and Use of Ondansetron and Tropisetron (December 2016) .pdf, Content to Search: Implications of CYP2D6 ultrarapid metabolizer genotype on ondansetron efficacy and clearance",
    "Drug Name: Codeine, Gene Name: CYP2D6, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) guideline for CYP2D6, OPRM1, and COMT genotype and select opioid therapy (December 2020) .pdf, Content to Search: Heightened risk of opioid toxicity in CYP2D6 ultrarapid metabolizers due to rapid conversion of codeine to morphine"
]
# Q12 has no rewritten queries (empty list).
query_rewrite_Q12 = []
query_rewrite_Q15 = ["Drug Name: Carbamazepine, Gene Name: HLA-A, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for HLA Genotype and Use of Carbamazepine and Oxcarbazepine- 2017 Update (December 2017) .pdf, Content to Search: HLA-A*31:01 allele and carbamazepine dosing recommendations"]
# NOTE(review): the Methotrexate entry below points at the statin guideline
# file — verify this pairing is intended.
query_rewrite_Q16 = ["Drug Name: Methotrexate, Gene Name: SLCO1B1, CPIC Guideline Name: The Clinical Pharmacogenetics Implementation Consortium (CPIC) guideline for SLCO1B1, ABCG2, and CYP2C9 and statin-associated musculoskeletal symptoms (January 2022) .pdf, Content to Search: SLCO1B1 variant and methotrexate elimination"]
query_rewrite_Q19 = [
    "Drug Name: Clopidogrel, Gene Name: CYP2C19, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium Guideline for CYP2C19 Genotype and Clopidogrel Therapy- 2022 update (January 2022) .pdf, Content to Search: Recommendations for CYP2C19 ultrarapid metabolizers and clopidogrel dosing adjustments",
    "Drug Name: PPI, Gene Name: CYP2C19, CPIC Guideline Name: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Proton Pump Inhibitor Dosing (August 2020) .pdf, Content to Search: Guidance on selecting appropriate PPIs for CYP2C19 ultrarapid metabolizers"
]
# Q20 has no rewritten queries (empty list).
query_rewrite_Q20 = []

In [None]:
from vespa.io import VespaQueryResponse
for queries in queries_q_dissect:

    dataloader = DataLoader(
        queries,
        batch_size=1,
        shuffle=False,
        collate_fn=lambda x: processor.process_queries(x),
    )

    qs = []
    for batch_query in dataloader:
        with torch.no_grad():
            batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
            embeddings_query = model(**batch_query)
            qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))

    async with app.asyncio(connections=1, timeout=120) as session:
        for idx, query in enumerate(queries):
            query_embedding = {k: v.tolist() for k, v in enumerate(qs[idx])}
            response: VespaQueryResponse = await session.query(
                yql="select name,path,image,page_number from pdf_page where userInput(@userQuery)",
                ranking="default",
                userQuery=query,
                timeout=120,
                hits=10,
                body={"input.query(qt)": query_embedding, "presentation.timing": True},
            )
            assert response.is_successful()
            display_query_results(query, response)