In [1]:
import pandas as pd

def create_prompts_from_template_and_csv(template, csv_file_path):
    """
    Build one customized prompt per CSV row from a ``str.format`` template.

    The CSV must provide the columns ``Drug``, ``Gene`` and ``Guidelines``
    (header names are matched after stripping surrounding whitespace).
    Line breaks inside the Drug and Gene cells are replaced by ", " so that a
    multi-line cell becomes a single comma-separated list, and ".pdf" is
    appended to the guideline name.

    Args:
        template (str): Prompt template containing the ``{csv_drug_name}``,
            ``{csv_gene_name}`` and ``{csv_guideline_name}`` placeholders.
            Literal braces in the template must be doubled (``{{`` / ``}}``).
        csv_file_path (str): Path to the CSV file.

    Returns:
        list: One formatted prompt string per CSV row, in row order. An empty
        list is returned (after printing an error message) when the file
        cannot be found or a required column is missing.
    """
    required_columns = ('Drug', 'Gene', 'Guidelines')

    def clean_cell(value):
        # pandas reads an empty CSV cell as NaN (a float), which has no
        # .strip(); treat it as empty text instead of crashing.
        if pd.isna(value):
            return ''
        return str(value).strip()

    def join_lines(text):
        # Normalize Windows / old-Mac line endings first so no stray '\r'
        # survives, then drop blank lines to avoid ", , " in the result.
        lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
        return ', '.join(part.strip() for part in lines if part.strip())

    try:
        df = pd.read_csv(csv_file_path)
    except FileNotFoundError:
        print(f"錯誤：找不到檔案 '{csv_file_path}'")
        return []

    # Clean the header names so ' Drug ' still matches 'Drug'.
    df = df.rename(columns=lambda name: str(name).strip())

    # Check the schema up front: the column lookups below are where a missing
    # column would otherwise surface as an uncaught KeyError.
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        missing_text = ', '.join(f"'{name}'" for name in missing)
        print(f"錯誤：CSV 檔案中缺少必要的欄位: {missing_text}。請確保欄位名稱正確。")
        return []

    all_prompts = []
    for drug, gene, guideline in zip(df['Drug'], df['Gene'], df['Guidelines']):
        all_prompts.append(
            template.format(
                csv_drug_name=join_lines(clean_cell(drug)),
                csv_gene_name=join_lines(clean_cell(gene)),
                csv_guideline_name=f"{clean_cell(guideline)}.pdf",
            )
        )

    return all_prompts


In [None]:

# --- Main execution section ---

# Prompt template passed to create_prompts_from_template_and_csv, which fills it
# with str.format(): doubled braces ({{ and }}) render as literal { and } in the
# JSON examples, while {csv_drug_name}, {csv_gene_name} and {csv_guideline_name}
# at the end are the placeholders substituted for each CSV row.
final_prompt_template = """
## CPIC QA Dataset Generator
You are a pharmacogenomics expert creating structured QA datasets from CPIC guidelines. Follow this protocol:

#### 1. Question Generation Requirements
- **Exclusively generate drug-gene interaction questions**
- **Question structure**:  
  "How does [Gene] affect [Drug] therapy?"  
  Rephrase the question to focus on the drug-gene interaction, or information .
- **Key requirements**:  
  - Questions MUST require CPIC guideline consultation  

#### 2. Answer Format Specification
{{
"Drug Name": "[Exact drug name from guideline]",
"Gene Name": "[Standard gene nomenclature]",
"CPIC Guideline Name": "[Full guideline title with publication date].pdf",
"Content to Search": "[Text specifying the missing clinical information(parameters, guidance, dosage recommendation, etc.)requiring guideline search]"
}}
- **Content to Search rules**:  
  - Specify the missing clinical information(parameters, guidance, dosage recommendation, etc.)requiring guideline search and contributing to answer the question


#### 3. Processing Workflow
**Step 1: Process drug-gene pairs**  
- For each combination in input lists:  
  (Drug, Gene) → Generate QA pair

**Step 2: Generate QA pairs per combination**  

**Step 3: Format output**  
{{
"[Exact Guideline Title].pdf": [
{{
"question": "How does CYP2C19 affect dexlansoprazole therapy?",
"answer": {{
"Drug Name": "Dexlansoprazole",
"Gene Name": "CYP2C19",
"CPIC Guideline Name": "Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Proton Pump Inhibitor Dosing (August 2020).pdf",
"Content to Search": "Dosing adjustments for dexlansoprazole in CYP2C19 poor metabolizers"
}}
}}
]
}}

**Begin processing immediately upon receiving inputs.**
Drugs: {csv_drug_name}
Genes: {csv_gene_name}
CPIC guideline: {csv_guideline_name}
"""

# Path to the CSV file holding one guideline (with its drugs and genes) per row.
csv_file_path = "Guideline_Drug_Gene pair - Sheet1.csv"

# Generate one prompt per CSV row.
generated_prompts = create_prompts_from_template_and_csv(final_prompt_template, csv_file_path)

# Verify and print the result.
if generated_prompts:
    print(f"成功根據最終模板生成了 {len(generated_prompts)} 個 Prompt 版本。\n")

    # Show a prompt that contains several drugs to check that line breaks were
    # replaced. The 5th prompt is preferred (the original note says it is the
    # proton pump inhibitor guideline); fall back to the last prompt when the
    # CSV has fewer than 5 rows instead of raising IndexError.
    example_index = min(4, len(generated_prompts) - 1)
    print(f"--- Prompt 範例 (第 {example_index + 1} 個版本)，檢視換行符號處理 ---")
    print(generated_prompts[example_index])

成功根據最終模板生成了 28 個 Prompt 版本。

--- Prompt 範例 (第 5 個版本)，檢視換行符號處理 ---

## CPIC QA Dataset Generator
You are a pharmacogenomics expert creating structured QA datasets from CPIC guidelines. Follow this protocol:

#### 1. Question Generation Requirements
- **Exclusively generate drug-gene interaction questions**
- **Question structure**:  
  "How does [Gene] affect [Drug] therapy?"  
- **Key requirements**:  
  - Questions MUST require CPIC guideline consultation  

#### 2. Answer Format Specification
{
"Drug Name": "[Exact drug name from guideline]",
"Gene Name": "[Standard gene nomenclature]",
"CPIC Guideline Name": "[Full guideline title with publication date].pdf",
"Content to Search": "[Text specifying the missing clinical information(parameters, guidance, dosage recommendation, etc.)requiring guideline search]"
}
- **Content to Search rules**:  
  - Specify the missing clinical information(parameters, guidance, dosage recommendation, etc.)requiring guideline search and contributing to a

In [9]:
# Save every prompt to a text file, each preceded by a numbered header line
# and followed by a separator row.
# The encoding is set explicitly: the header contains CJK characters, and the
# platform default encoding is not guaranteed to be able to encode them.
with open("generated_prompts.txt", "w", encoding="utf-8") as f:
    for i, prompt in enumerate(generated_prompts, start=1):
        f.write(f"--- Prompt 版本 {i} ---\n")
        f.write(prompt + "\n")
        f.write("="*40 + "\n")

In [40]:
import json
# Load dataset.txt: a JSON list of dictionaries whose values are lists
# (one list of questions per guideline key).
# JSON text is UTF-8 by specification, so do not rely on the locale default.
with open("dataset.txt", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Totals: every dictionary key counts as one guideline, every element of a
# value list counts as one question.
guideline_num = sum(len(data) for data in dataset)
q_num = sum(len(value) for data in dataset for value in data.values())
print(f"總共有 {guideline_num} 個指引。")
print(f"總共有 {q_num} 個問題。")
    

總共有 28 個指引。
總共有 387 個問題。


In [6]:
# Prompt template that asks for six question types per drug-gene pair.
# It is filled with str.format() by create_prompts_from_template_and_csv:
# doubled braces ({{ and }}) render as literal { and }, while {csv_drug_name},
# {csv_gene_name} and {csv_guideline_name} are substituted for each CSV row.
creative_prompt_template = """
## CPIC QA Dataset Generator
You are a pharmacogenomics expert creating structured QA datasets from CPIC guidelines. Follow this protocol:

#### 1. Question Generation Requirements
- **Exclusively generate drug-gene interaction questions**
- **Question Structure**:
  Design questions to extract specific clinical decision-making information. Below are the required question types with purposes, content targets, and examples:

  1. **Specific Dosing Guidance**  
     - *Purpose*: Extract precise drug dosage values and adjustment protocols  
     - *Content to Search*: Dosage numbers, adjustment protocols, clinical recommendations  
     - *Example*: "What is the recommended efavirenz dose for a CYP2B6 poor metabolizer?"  

  2. **Clinical Risk Stratification**  
     - *Purpose*: Obtain quantified adverse reaction risk data  
     - *Content to Search*: Risk multipliers, incidence rates, toxicity thresholds  
     - *Example*: "What is the increased risk of CNS adverse effects for a CYP2B6 poor metabolizer?"  

  3. **Pharmacokinetics/Pharmacodynamics Data**  
     - *Purpose*: Gather therapeutic drug monitoring values  
     - *Content to Search*: Plasma concentration ranges, therapeutic windows  
     - *Example*: "What is the suggested therapeutic range for plasma efavirenz concentrations?"  

  4. **Genotype-Phenotype Correspondence**  
     - *Purpose*: Clarify genotype to metabolic phenotype mapping  
     - *Content to Search*: Diplotype definitions, phenotype categories  
     - *Example*: "What diplotypes define a CYP2B6 intermediate metabolizer?"  

  5. **Gene/Drug Basic Information**  
     - *Purpose*: Provide foundational mechanistic definitions  
     - *Content to Search*: Gene functions, drug mechanisms, enzyme roles  
     - *Example*: "What is the function of the CYP2B6*6 allele?"  

  6. **Special Population Dosing**  
     - *Purpose*: Extract dosing for special populations  
     - *Content to Search*: Pediatric/pregnancy dosing, comorbidity adjustments  
     - *Example*: "What is the dosing recommendation for efavirenz in children weighing <40kg with CYP2B6 poor metabolizer phenotype?"  

#### 2. Answer Format Specification
{{
"Drug Name": "[Exact drug name from guideline]",
"Gene Name": "[Standard gene nomenclature]",
"CPIC Guideline Name": "[Full guideline title with publication date].pdf",
"Content to Search": "[Text specifying missing clinical parameters/guidance requiring guideline search]"
}}

#### 3. Processing Workflow
**Step 1: Process drug-gene pairs**  
- For each combination in input lists:  
  (Drug, Gene) → Generate 1 QA pair per question type  

**Step 2: Generate QA pairs**  
- Each guideline with multiple drugs and genes will create #drug times #gene QA pairs
- Each drug-gene combination will generate 6 QA pairs (one for each question type)  
- Ensure phenotype specificity (poor/intermediate/ultrarapid metabolizers)  

**Step 3: Format output**  
{{
"[Exact Guideline Title].pdf": [
{{
"question": "How should omeprazole dosing be adjusted for CYP2C19 poor metabolizers?",
"answer": {{
"Drug Name": "Omeprazole",
"Gene Name": "CYP2C19",
"CPIC Guideline Name": "Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Proton Pump Inhibitor Dosing (August 2020).pdf",
"Content to Search": "Dose adjustment protocol for omeprazole in CYP2C19 poor metabolizers"
}}
}},
{{
"question": "What is the toxicity risk for CYP2C19 poor metabolizers taking omeprazole?",
"answer": {{
"Drug Name": "Omeprazole",
"Gene Name": "CYP2C19",
"CPIC Guideline Name": "Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Proton Pump Inhibitor Dosing (August 2020).pdf",
"Content to Search": "Quantified risk data for adverse effects in CYP2C19 PMs using omeprazole"
}}
}}
// Additional QA pairs for other question types
]
}}

**Begin processing immediately upon receiving inputs.**
Drugs: {csv_drug_name}
Genes: {csv_gene_name}
CPIC guideline: {csv_guideline_name}
"""

# Path to the CSV file holding one guideline (with its drugs and genes) per row.
csv_file_path = "Guideline_Drug_Gene pair - Sheet1.csv"

# Generate one prompt per CSV row from the creative template.
generated_prompts = create_prompts_from_template_and_csv(creative_prompt_template, csv_file_path)

# Save every prompt to a text file, each preceded by a numbered header line
# and followed by a separator row. This overwrites any existing
# generated_prompts.txt.
# The encoding is set explicitly: the header contains CJK characters, and the
# platform default encoding is not guaranteed to be able to encode them.
with open("generated_prompts.txt", "w", encoding="utf-8") as f:
    for i, prompt in enumerate(generated_prompts, start=1):
        f.write(f"--- Prompt 版本 {i} ---\n")
        f.write(prompt + "\n")
        f.write("="*40 + "\n")

In [10]:
import json
# Load creative_dataset.txt: a JSON list of dictionaries whose values are
# lists (one list of questions per guideline key).
# To inspect the check file instead, open "creative_dataset_check.txt" here.
# JSON text is UTF-8 by specification, so do not rely on the locale default.
with open("creative_dataset.txt", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Report the question count of each guideline while accumulating the totals.
guideline_num = 0
q_num = 0
for data in dataset:
    for key, value in data.items():
        guideline_num += 1
        print(f"for Guideline: {key}, we'll have {len(value)} questions.")
        q_num += len(value)
print(f"總共有 {guideline_num} 個指引。")
print(f"總共有 {q_num} 個問題。")
    

for Guideline: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for Ivacaftor Therapy in the Context of CFTR Genotype (March 2014).pdf, we'll have 6 questions.
for Guideline: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2B6 and Efavirenz-containing Antiretroviral Therapy (April 2019).pdf, we'll have 6 questions.
for Guideline: Clinical Pharmacogenetics Implementation Consortium Guideline for CYP2B6 Genotype and Methadone Therapy (July 2024).pdf, we'll have 6 questions.
for Guideline: Clinical Pharmacogenetics Implementation Consortium Guideline for CYP2C19 Genotype and Clopidogrel Therapy: 2022 update (January 2022).pdf, we'll have 6 questions.
for Guideline: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Proton Pump Inhibitor Dosing (August 2020).pdf, we'll have 36 questions.
for Guideline: Clinical Pharmacogenetics Implementation Consortium (CPIC) Guideline for CYP2C19 and Voriconazole Therapy (

In [None]:
# Load negative_sample_v1.txt, parsed as JSON.
# JSON text is UTF-8 by specification, so do not rely on the locale default.
with open("negative_sample_v1.txt", "r", encoding="utf-8") as f:
    negative_sample = json.load(f)

# Display the first entry as a quick sanity check.
negative_sample[0]

In [9]:
# Tally the dataset: each dictionary key is one guideline and each element of
# its value list is one question. Requires `dataset` to be loaded beforehand.
guideline_num = sum(len(entry) for entry in dataset)
q_num = sum(len(questions) for entry in dataset for questions in entry.values())
print(f"總共有 {guideline_num} 個指引。")
print(f"總共有 {q_num} 個問題。")

NameError: name 'dataset' is not defined

In [8]:
import ast

path = "negative_sample.txt"

# The file holds a Python literal (the cell output shows a list of dicts with
# 'question' and 'answer' keys), so it is parsed with ast.literal_eval, which
# only evaluates literals and never executes code.
# Read with an explicit encoding instead of the locale-dependent default.
with open(path, "r", encoding="utf-8") as f:
    negative_sample = ast.literal_eval(f.read())

# Display the first entry as a quick sanity check.
negative_sample[0]

{'question': 'What is the CPIC-recommended dose for metformin in a patient with the CYP2D6 *4/*4 genotype?',
 'answer': 'No CPIC guideline information available.'}

In [13]:
# Show the keys of the first dataset entry; requires `dataset` to be loaded.
dataset[0].keys()

dict_keys(['Clinical Pharmacogenetics Implementation Consortium (CPIC) Guidelines for Ivacaftor Therapy in the Context of CFTR Genotype (March 2014).pdf'])