In [1]:
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory: two levels above the current working directory, stored as a
# plain string because later cells interpolate it into f-string paths.
root = str(Path().absolute().parent.parent)

In [3]:
# Load the "lavita/ChatDoctor-HealthCareMagic-100k" dataset. The recorded output
# of the next cell shows a single `train` split with the features
# `instruction`, `input`, and `output`.
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

In [4]:
# Show the split layout first, then the first record of the train split.
print(f"dataset structure: {dataset}")
first_train_record = dataset['train'][0]
print(f"1st sample of train set: {first_train_record}")

dataset structure: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})
1st sample of train set: {'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.", 'input': 'I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!', 'output': 'Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional ver

Check that the required `input` and `output` columns are present in the train split.

In [5]:
# Fail fast when a column read by the merge step (`input` / `output`) is absent,
# instead of only printing a notice and letting a later cell fail with a less
# specific KeyError.
required_columns = ("input", "output")
missing_columns = [
    column
    for column in required_columns
    if column not in dataset['train'].column_names
]
if missing_columns:
    raise ValueError(
        f"not found input or output column in dataset: missing {missing_columns}"
    )

Define the data-mapping function that merges each question and answer into one text.

In [6]:
def merge_columns(example):
    """Combine one record's question and answer into a single dialogue string.

    Args:
        example: Mapping that provides the keys ``'input'`` and ``'output'``.

    Returns:
        A dict with one key, ``"merged_text"``, whose value has the form
        ``"Patient: <input> Doctor: <output>"``.
    """
    patient_text = example['input']
    doctor_text = example['output']
    return {"merged_text": f"Patient: {patient_text} Doctor: {doctor_text}"}

In [7]:
# Apply merge_columns to every record; the result gains a `merged_text` feature
# next to the existing ones (see the recorded output below).
processed_dataset = dataset.map(merge_columns)
print(f"dataset structure: {processed_dataset}")
first_processed_record = processed_dataset['train'][0]
print(f"1st sample of train set: {first_processed_record}")

dataset structure: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'merged_text'],
        num_rows: 112165
    })
})
1st sample of train set: {'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.", 'input': 'I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!', 'output': 'Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal

In [8]:
# Save the first `sample_size` merged texts to a one-column CSV ("context").
sample_size = 1000
input_dir = Path(root) / "data" / "input"
# Create the target folder up front: to_csv does not create missing directories.
input_dir.mkdir(parents=True, exist_ok=True)
# Slice the rows before selecting the column so that only `sample_size` rows
# are read, rather than the whole `merged_text` column.
sample_contexts = processed_dataset['train'][:sample_size]["merged_text"]
pd.DataFrame({"context": sample_contexts}).to_csv(
    input_dir / f"context-{sample_size}.csv", index=False
)

Run script 1. 

Extract all information from each context and divide it into privacy_info and known_info.

In [14]:
import subprocess
import sys

# Run scripts/step1_extract_info.py on the sampled contexts CSV.
# - sys.executable launches the script with the same interpreter as this
#   kernel, rather than whichever "python" happens to be first on PATH.
# - check=True raises CalledProcessError if the script exits with a non-zero
#   status, so a failed run cannot go unnoticed.
# - stdout is captured (as bytes) and is available as `result.stdout`.
result = subprocess.run(
    [
        sys.executable,
        f"{root}/scripts/step1_extract_info.py",
        "-m", "doubao-1-5-lite",
        "-i", f"{root}/data/input/context-{sample_size}.csv",
        "-o", f"{root}/data/output/",
        "-t", "Person",
    ],
    stdout=subprocess.PIPE,
    check=True,
)

Processing contexts: 100%|██████████| 1000/1000 [37:45<00:00,  2.27s/it]


Run script 2.

Generate attack prompts for RAG.

In [17]:
# Run scripts/step2_generate_attack_prompt.py, reading from and writing to the
# data/output/ directory.
# - sys.executable launches the script with the same interpreter as this
#   kernel, rather than whichever "python" happens to be first on PATH.
# - check=True raises CalledProcessError if the script exits with a non-zero
#   status, so a failed run cannot go unnoticed.
# - stdout is captured (as bytes) and is available as `result.stdout`.
result = subprocess.run(
    [
        sys.executable,
        f"{root}/scripts/step2_generate_attack_prompt.py",
        "-m", "doubao-1-5-lite",
        "-i", f"{root}/data/output/",
        "-o", f"{root}/data/output/",
    ],
    stdout=subprocess.PIPE,
    check=True,
)

Generating attack prompts: 100%|██████████| 1000/1000 [29:49<00:00,  1.79s/it]
