### PromptEHR合成数据

In [1]:
from promptehr import load_synthetic_data

# Load the synthetic patient dataset shipped with promptehr. No arguments are
# passed, so the library's default sample size applies (1000 patients in the
# recorded run).
data = load_synthetic_data()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Vocabulary lookup tables for the medical codes
voc = data['voc']

def decode_visit(visit, voc):
    """Translate one visit's integer code ids into their code strings.

    Args:
        visit: Exactly three sequences of ids, unpacked in the order
            (diagnoses, procedures, medications).
        voc: Mapping with 'diag', 'prod' and 'med' entries; each entry
            exposes an ``idx2word`` lookup indexed by code id.

    Returns:
        dict with the keys "diagnoses", "procedures" and "medications", each
        holding the decoded codes in the same order as the input ids.
    """
    diag_ids, prod_ids, med_ids = visit
    decoded = {}
    for field, vocab_key, code_ids in (
        ("diagnoses", 'diag', diag_ids),
        ("procedures", 'prod', prod_ids),
        ("medications", 'med', med_ids),
    ):
        # The vocabulary is looked up per id, so an empty id list never
        # touches `voc` at all.
        decoded[field] = [voc[vocab_key].idx2word[code_id] for code_id in code_ids]
    return decoded

In [7]:
# Inspect the raw, integer-encoded first visit of patient 0.
patient = 0
visit = data['visit'][patient][0]  # first visit
visit

[[0, 1, 2, 3, 5, 7, 41, 313, 1],
 [0, 1, 82],
 [2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 51, 19, 26]]

In [23]:
# Count the patients in the loaded sample (one entry of data['visit'] per patient).
patients = data['visit']
len(patients)

1000

In [31]:
# Average number of visits per patient (2.516 in the recorded run).
# The total is computed with a generator expression so that no loop variable
# is left behind in the notebook namespace: a module-level
# `for patient in ...` loop would rebind the global `patient` (an integer
# index elsewhere in this notebook) to a list of visits.
sum_length = sum(len(patient_visits) for patient_visits in data['visit'])
print("平均每个患者就诊次数：" + str(sum_length / len(data['visit'])))

平均每个患者就诊次数：2.516


In [None]:
# Decode a patient: every visit yields three kinds of codes
def decode_patient(patient_id, data):
    """Decode every visit of one patient.

    Args:
        patient_id: Index of the patient inside ``data['visit']``.
        data: Dataset mapping that provides ``'visit'`` (visits per patient)
            and ``'voc'`` (the vocabularies handed to ``decode_visit``).

    Returns:
        list with one ``decode_visit`` result per visit, in visit order.
    """
    return [
        decode_visit(raw_visit, data['voc'])
        for raw_visit in data['visit'][patient_id]
    ]

# Decode all visits of patient 0 and print them with 1-based visit numbers.
decoded_patient0 = decode_patient(0, data)
for visit_number, decoded_visit in enumerate(decoded_patient0, start=1):
    print(f"Visit {visit_number}: {decoded_visit}")

Visit 1: {'diagnoses': ['4239', '5119', '78551', '4589', '7220', '2724', 'E8788', 'V4501', '5119'], 'procedures': ['3731', '8872', '3491'], 'medications': ['A02B', 'B05C', 'A12A', 'A12C', 'C01C', 'A07A', 'A10A', 'N01A', 'C07A', 'C03C', 'A12B', 'N07A', 'A03F', 'A02A', 'C10A']}
Visit 2: {'diagnoses': ['4239', '5119', 'V1259', '53081'], 'procedures': ['8872', '3961'], 'medications': ['N02B', 'A02B', 'A06A', 'A12C', 'C01C', 'A04A', 'C07A', 'C03C', 'A12B', 'C02D', 'R01A', 'C01E', 'B01A', 'N05C']}


In [4]:
from promptehr import SequencePatient
from promptehr import load_synthetic_data
from promptehr import PromptEHR

# Create the model and load its pretrained weights.
# NOTE(review): the saved output of this cell ends with
# "AttributeError: DataTokenizer has no attribute total_vocab_size". The output
# does not show which call raised it -- presumably a library version mismatch;
# confirm before relying on this cell.
model = PromptEHR()
model.from_pretrained()

# Load 1,000 of the synthetic records (the original note says 10,000 exist in total).
demo = load_synthetic_data(n_sample=1000)

# Assemble the standard SequencePatient input used to train or test PromptEHR models.
patient_inputs = {
    'v': demo['visit'],
    'y': demo['y'],
    'x': demo['feature'],
}
sequence_metadata = {
    'visit': {'mode': 'dense'},
    'label': {'mode': 'tensor'},
    'voc': demo['voc'],
    'max_visit': 20,
}
seqdata = SequencePatient(data=patient_inputs, metadata=sequence_metadata)
# Optional: fit on this data with
# model.fit(seqdata)

# Generate synthetic records.
# n: the target total number of samples to generate
# n_per_sample: how many fake samples are generated from each input sample
# The output has the same format as `SequencePatient`.
fake_data = model.predict(seqdata, n=1000, n_per_sample=10)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'DataTokenizer'.


Download pretrained PromptEHR model, save to ./simulation/pretrained_promptEHR.
Load pretrained PromptEHR model from ./simulation/pretrained_promptEHR


AttributeError: DataTokenizer has no attribute total_vocab_size

### LangGraph环境配置

In [None]:
# 安装相关的包
pip install --quiet -U langchain_openai langchain_core langchain_community tavily-python langgraph langgraph-prebuilt langchain_ollama

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Load environment variables from a .env file
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
import os, getpass

def _set_env(var: str):
    """Make sure environment variable ``var`` holds a non-empty value.

    When the variable is unset or empty, its value is read without echo via
    ``getpass.getpass`` (prompt ``"<var>: "``) and stored in ``os.environ``.
    An existing non-empty value is left untouched.
    """
    if os.environ.get(var):
        return
    os.environ[var] = getpass.getpass(f"{var}: ")

# Prompt for the OpenAI API key only when it is not already set in the environment.
_set_env("OPENAI_API_KEY")

In [None]:
# List the models that have been downloaded in Ollama
!ollama list

NAME             ID              SIZE      MODIFIED     
qwen3:14b        7d7da67570e2    9.3 GB    3 months ago    
llama3:latest    365c0bd3c000    4.7 GB    5 months ago    


In [21]:
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage

# Create chat clients for several different models; all share the same low temperature.
low_temperature = {"temperature": 0.1}
gpt4o = ChatOpenAI(model="gpt-4o", **low_temperature)
llama = ChatOllama(model="llama3", **low_temperature)
qwen = ChatOllama(model="qwen3:14b", **low_temperature)

# Smoke test: send one greeting to GPT-4o and print the text of the reply.
msg = HumanMessage(content="Hello, how are you?")
response = gpt4o.invoke([msg])
print(response.content)

Hello! I'm just a computer program, so I don't have feelings, but I'm here and ready to help you. How can I assist you today?


In [27]:
# System prompt for translating ICD-9-CM diagnosis codes, with three example mappings.
diagnoses_translator = """
    You are a ICD-9 diagnosis code translator.
    You will be given a list of ICD-9-CM diagnosis codes, and you need to translate them into a list of natural language descriptions.
    Here are some examples of the mapping between ICD-9-CM and natural language descriptions:
    "4239" -> "423.9" -> "Pericardial disease"
    "E8788" -> "E878.8" -> "Other specified surgical operations and procedures causing abnormal patient reaction or later complication without misadventure at time of operation"
    "V4501" -> "V45.01" -> "Cardiac pacemaker in situ"
    Now, start the translation with high accuracy and efficiency.
"""

# Ask GPT-4o to translate three sample diagnosis codes and print the reply text.
diag_messages = [
    SystemMessage(content=diagnoses_translator),
    HumanMessage(content="5119, V1259, 78551"),
]
diag_result = gpt4o.invoke(diag_messages)
print(diag_result.content)


1. "5119" -> "511.9" -> "Unspecified pleural effusion"
2. "V1259" -> "V12.59" -> "Personal history of other diseases of the circulatory system"
3. "78551" -> "785.51" -> "Cardiogenic shock"


In [29]:
# System prompt for translating ICD-9-CM procedure codes, with two example mappings.
procedures_translator = """
    You are a ICD-9-CM procedure code translator.
    You will be given a list of ICD-9-CM procedure codes, and you need to translate them into a list of natural language descriptions.
    Here are some examples of the mapping between ICD-9-CM procedure codes and natural language descriptions:
    "3731" -> "37.31" -> "Pericardiectomy"
    "3491" -> "34.91" -> "Thoracentesis"
    Now, start the translation with high accuracy and efficiency.
"""

# Ask GPT-4o to translate two sample procedure codes and print the reply text.
proc_messages = [
    SystemMessage(content=procedures_translator),
    HumanMessage(content="8872, 3961"),
]
proc_result = gpt4o.invoke(proc_messages)
print(proc_result.content)

- "88.72" -> "Diagnostic ultrasound of heart"
- "39.61" -> "Extracorporeal circulation auxiliary to open heart surgery"


In [30]:
# System prompt for translating medication (ATC) codes, with two example mappings.
medications_translator = """
    You are a medication code (ATC code) translator.
    You will be given a list of medication codes, and you need to translate them into a list of natural language descriptions.
    Here are some examples of the mapping between medication codes and natural language descriptions:
    "N02B" -> "Other analgesics and antipyretics"
    "A02B" -> "Drugs for peptic ulcer and gastro-oesophageal reflux disease (gord)"
    Now, start the translation with high accuracy and efficiency.
"""

# Ask GPT-4o to translate four sample medication codes and print the reply text.
med_messages = [
    SystemMessage(content=medications_translator),
    HumanMessage(content="A06A, C07A, R01A, N05C"),
]
med_result = gpt4o.invoke(med_messages)
print(med_result.content)

- "A06A" -> "Drugs for constipation"
- "C07A" -> "Beta blocking agents"
- "R01A" -> "Decongestants and other nasal preparations for topical use"
- "N05C" -> "Hypnotics and sedatives"
