In [11]:
from pyhealth.datasets import MIMIC3Dataset

# Load MIMIC-III in dev mode (the recorded run below reports 1000 patients),
# restricted to the diagnosis, procedure and prescription tables.
mimic3_data = MIMIC3Dataset(
    root="../data/",
    tables=["DIAGNOSES_ICD", "PROCEDURES_ICD", "PRESCRIPTIONS"],
    # Translate NDC drug codes to ATC level 3; the sample statistics further
    # down show the resulting drug codes in the form 'B01A'.
    code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 3}})},
    dev=True,
)

# stat() prints the summary AND returns it as a string. The trailing semicolon
# stops the notebook from echoing that returned string a second time.
mimic3_data.stat();



Statistics of base dataset (dev=True):
	- Dataset: MIMIC3Dataset
	- Number of patients: 1000
	- Number of visits: 1295
	- Number of visits per patient: 1.2950
	- Number of events per visit in DIAGNOSES_ICD: 9.3544
	- Number of events per visit in PROCEDURES_ICD: 4.3351
	- Number of events per visit in PRESCRIPTIONS: 59.2556



'\nStatistics of base dataset (dev=True):\n\t- Dataset: MIMIC3Dataset\n\t- Number of patients: 1000\n\t- Number of visits: 1295\n\t- Number of visits per patient: 1.2950\n\t- Number of events per visit in DIAGNOSES_ICD: 9.3544\n\t- Number of events per visit in PROCEDURES_ICD: 4.3351\n\t- Number of events per visit in PRESCRIPTIONS: 59.2556\n'

In [12]:
# Print a schematic of how the dataset nests patients -> visits -> events.
mimic3_data.info()


dataset.patients: patient_id -> <Patient>

<Patient>
    - visits: visit_id -> <Visit> 
    - other patient-level info
    
    <Visit>
        - event_list_dict: table_name -> List[Event]
        - other visit-level info
    
        <Event>
            - code: str
            - other event-level info



In [13]:
from pyhealth.tasks import drug_recommendation_mimic3_fn

# Turn the base dataset into drug-recommendation samples.
# NOTE(review): this rebinds `mimic3_data` from the base dataset to whatever
# set_task returns (the statistics below switch from "base dataset" to
# "sample dataset"), so re-running this cell alone probably needs the loading
# cell to be re-run first -- confirm. A separate name would be cleaner, but the
# later cells reference `mimic3_data`, so the name is kept.
mimic3_data = mimic3_data.set_task(task_fn=drug_recommendation_mimic3_fn)

# stat() prints the summary and also returns it as a string; the trailing
# semicolon suppresses the duplicate echo of that return value.
mimic3_data.stat();

Generating samples for drug_recommendation_mimic3_fn:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating samples for drug_recommendation_mimic3_fn: 100%|██████████| 1000/1000 [00:00<00:00, 8939.40it/s]


Statistics of sample dataset:
	- Dataset: MIMIC3Dataset
	- Task: drug_recommendation_mimic3_fn
	- Number of samples: 322
	- Number of patients: 115
	- Number of visits: 322
	- Number of visits per patient: 2.8000
	- conditions:
		- Number of conditions per sample: 44.5342
		- Number of unique conditions: 915
		- Distribution of conditions (Top-10): [('5856', 443), ('7100', 416), ('40301', 370), ('V5861', 316), ('V1251', 308), ('2875', 272), ('28521', 245), ('4280', 242), ('32723', 241), ('4019', 221)]
	- procedures:
		- Number of procedures per sample: 12.1118
		- Number of unique procedures: 303
		- Distribution of procedures (Top-10): [('3995', 490), ('3893', 299), ('9904', 284), ('9604', 135), ('966', 131), ('9671', 112), ('3895', 103), ('9390', 103), ('9907', 97), ('9672', 89)]
	- drugs:
		- Number of drugs per sample: 26.8882
		- Number of unique drugs: 165
		- Distribution of drugs (Top-10): [('B01A', 292), ('B05X', 287), ('A02B', 281), ('N02B', 279), ('A06A', 271), ('N02A', 251)

"Statistics of sample dataset:\n\t- Dataset: MIMIC3Dataset\n\t- Task: drug_recommendation_mimic3_fn\n\t- Number of samples: 322\n\t- Number of patients: 115\n\t- Number of visits: 322\n\t- Number of visits per patient: 2.8000\n\t- conditions:\n\t\t- Number of conditions per sample: 44.5342\n\t\t- Number of unique conditions: 915\n\t\t- Distribution of conditions (Top-10): [('5856', 443), ('7100', 416), ('40301', 370), ('V5861', 316), ('V1251', 308), ('2875', 272), ('28521', 245), ('4280', 242), ('32723', 241), ('4019', 221)]\n\t- procedures:\n\t\t- Number of procedures per sample: 12.1118\n\t\t- Number of unique procedures: 303\n\t\t- Distribution of procedures (Top-10): [('3995', 490), ('3893', 299), ('9904', 284), ('9604', 135), ('966', 131), ('9671', 112), ('3895', 103), ('9390', 103), ('9907', 97), ('9672', 89)]\n\t- drugs:\n\t\t- Number of drugs per sample: 26.8882\n\t\t- Number of unique drugs: 165\n\t\t- Distribution of drugs (Top-10): [('B01A', 292), ('B05X', 287), ('A02B', 281

In [14]:
from fractions import Fraction

import torch
from pyhealth.datasets import split_by_patient, get_dataloader
from pyhealth.models import Transformer

# One seed for the whole experiment. Seeding torch's global RNG here makes
# weight initialisation, dropout and the shuffled training loader repeatable
# under "Restart Kernel -> Run All"; previously only the patient split was
# seeded. (Bit-exact results on GPU may additionally need deterministic
# algorithm settings.)
SEED = 1203
torch.manual_seed(SEED)

# Train / validation / test proportions, split at patient level.
# Exact fractions are used instead of floats -- presumably so the ratios sum
# to exactly 1; keep them as Fractions unless that is confirmed unnecessary.
split_ratios = [Fraction(2, 3), Fraction(1, 6), Fraction(1, 6)]
train_data, eval_data, test_data = split_by_patient(mimic3_data, split_ratios, seed=SEED)

# Only the training loader is shuffled; validation and test keep a fixed order.
BATCH_SIZE = 64
train_dataloader = get_dataloader(train_data, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = get_dataloader(eval_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = get_dataloader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Multilabel model: predict the "drugs" label from the condition and procedure
# code features of each sample.
model = Transformer(
    dataset=mimic3_data,
    feature_keys=["conditions", "procedures"],
    label_key="drugs",
    mode="multilabel",
)


In [16]:
from pyhealth.trainer import Trainer

# Training options gathered in one place so they are easy to scan and tweak.
# "pr_auc_samples" is the validation metric the trainer tracks; the log below
# reports each new best score and ends with "Loaded best model".
train_options = dict(
    train_dataloader=train_dataloader,
    val_dataloader=eval_dataloader,
    epochs=10,
    monitor="pr_auc_samples",
)

trainer = Trainer(model=model)
trainer.train(**train_options)

Transformer(
  (embeddings): ModuleDict(
    (conditions): Embedding(917, 128, padding_idx=0)
    (procedures): Embedding(305, 128, padding_idx=0)
  )
  (linear_layers): ModuleDict()
  (transformer): ModuleDict(
    (conditions): TransformerLayer(
      (transformer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadedAttention(
            (linear_layers): ModuleList(
              (0-2): 3 x Linear(in_features=128, out_features=128, bias=False)
            )
            (output_linear): Linear(in_features=128, out_features=128, bias=False)
            (attention): Attention()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (feed_forward): PositionwiseFeedForward(
            (w_1): Linear(in_features=128, out_features=512, bias=True)
            (w_2): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.5, inplace=False)
            (activation): GELU(approximate='none')
          )
          (in

Epoch 0 / 10: 100%|██████████| 4/4 [00:00<00:00,  6.53it/s]

--- Train epoch-0, step-4 ---
loss: 0.5079



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 17.14it/s]


--- Eval epoch-0, step-4 ---
pr_auc_samples: 0.5551
loss: 0.4090
New best pr_auc_samples score (0.5551) at epoch-0, step-4



Epoch 1 / 10: 100%|██████████| 4/4 [00:00<00:00, 10.70it/s]

--- Train epoch-1, step-8 ---
loss: 0.4426



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 14.44it/s]

--- Eval epoch-1, step-8 ---





pr_auc_samples: 0.5836
loss: 0.3985
New best pr_auc_samples score (0.5836) at epoch-1, step-8



Epoch 2 / 10: 100%|██████████| 4/4 [00:00<00:00, 11.93it/s]

--- Train epoch-2, step-12 ---
loss: 0.3984



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 14.20it/s]

--- Eval epoch-2, step-12 ---





pr_auc_samples: 0.5889
loss: 0.3893
New best pr_auc_samples score (0.5889) at epoch-2, step-12



Epoch 3 / 10: 100%|██████████| 4/4 [00:00<00:00, 11.14it/s]

--- Train epoch-3, step-16 ---
loss: 0.3749



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 15.30it/s]

--- Eval epoch-3, step-16 ---





pr_auc_samples: 0.5848
loss: 0.3819



Epoch 4 / 10: 100%|██████████| 4/4 [00:00<00:00,  7.41it/s]

--- Train epoch-4, step-20 ---
loss: 0.3468



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 16.76it/s]

--- Eval epoch-4, step-20 ---





pr_auc_samples: 0.5797
loss: 0.3764



Epoch 5 / 10: 100%|██████████| 4/4 [00:00<00:00,  8.19it/s]

--- Train epoch-5, step-24 ---
loss: 0.3349



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 17.29it/s]

--- Eval epoch-5, step-24 ---
pr_auc_samples: 0.5847
loss: 0.3719




Epoch 6 / 10: 100%|██████████| 4/4 [00:00<00:00,  9.41it/s]

--- Train epoch-6, step-28 ---
loss: 0.3130



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 17.76it/s]

--- Eval epoch-6, step-28 ---
pr_auc_samples: 0.5910
loss: 0.3678
New best pr_auc_samples score (0.5910) at epoch-6, step-28




Epoch 7 / 10: 100%|██████████| 4/4 [00:00<00:00, 10.45it/s]

--- Train epoch-7, step-32 ---
loss: 0.3029



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 12.83it/s]


--- Eval epoch-7, step-32 ---
pr_auc_samples: 0.5962
loss: 0.3653
New best pr_auc_samples score (0.5962) at epoch-7, step-32



Epoch 8 / 10: 100%|██████████| 4/4 [00:00<00:00,  8.47it/s]

--- Train epoch-8, step-36 ---
loss: 0.2946



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 15.89it/s]

--- Eval epoch-8, step-36 ---
pr_auc_samples: 0.6004
loss: 0.3650
New best pr_auc_samples score (0.6004) at epoch-8, step-36




Epoch 9 / 10: 100%|██████████| 4/4 [00:00<00:00,  9.06it/s]

--- Train epoch-9, step-40 ---





loss: 0.2860


Evaluation: 100%|██████████| 1/1 [00:00<00:00, 14.29it/s]

--- Eval epoch-9, step-40 ---





pr_auc_samples: 0.6010
loss: 0.3654
New best pr_auc_samples score (0.6010) at epoch-9, step-40
Loaded best model


In [17]:
from pyhealth.metrics.multilabel import multilabel_metrics_fn

# Route 1: let the trainer score the held-out test set in a single call.
# The recorded output shows a dict holding pr_auc_samples and the loss.
score = trainer.evaluate(test_dataloader)
print(score)

# Route 2: fetch the raw inference outputs and compute the metric by hand.
# In the recorded run this yields the same pr_auc_samples value as route 1.
y_true, y_prob, loss = trainer.inference(test_dataloader)
multilabel_metrics_fn(y_true, y_prob, metrics=["pr_auc_samples"])

Evaluation: 100%|██████████| 1/1 [00:00<00:00, 24.28it/s]


{'pr_auc_samples': 0.6303895179645207, 'loss': 0.40910544991493225}


Evaluation: 100%|██████████| 1/1 [00:00<00:00, 17.23it/s]


{'pr_auc_samples': 0.6303895179645207}