In [8]:
import pandas as pd
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True
NVIDIA GeForce RTX 4080 SUPER


## pyhealth.Event

In [19]:
from pyhealth.data import Event
from datetime import datetime

- **[Arguments]**:
  - ``code`` – str, code of the event (e.g., “428.0” for heart failure)
  - ``table`` – str, str, name of the table where the event is recorded. E.g., “DIAGNOSES_ICD”.
  - ``vocabulary`` – str, vocabulary of the code (e.g., ‘ICD9CM’, ‘ICD10CM’, ‘NDC’).
  - ``visit_id`` – str, unique identifier of the visit.
  - ``patient_id`` – str, unique identifier of the patient.
  - ``timestamp`` – Optional[datetime], timestamp of the event. Defaults to None.
  - ``**attr`` – optional attributes of the event. Attributes to add to visit as key=value pairs.
- **[Attributes]**: to show additional attributes
    - ``attr_dict``: show the information stored in ****attr**

In [2]:
event1 = Event(
    code="428.0",
    table="DIAGNOSES_ICD",
    vocabulary="ICD9CM",
    visit_id="v001",
    patient_id="p001",
    timestamp=datetime.now(),
)

print(event1)

Event from patient p001 visit v001:
	- Code: 428.0
	- Table: DIAGNOSES_ICD
	- Vocabulary: ICD9CM
	- Timestamp: 2024-05-20 11:28:02.936396


In [9]:
event2 = Event(
    code="00069153041",
    table="PRESCRIPTIONS",
    vocabulary="NDC",
    visit_id="v001",
    patient_id="p001",
    timestamp=datetime.now(),
    # addtional value를 dict 형태로 추가할 수 있음
    active_on_discharge = True,
)

event2.attr_dict

{'active_on_discharge': True}

In [10]:
event3 = Event(
    code = "00069153041",
    table = "PRESCRIPTIONS",
    vocabulary = "NDC",
    visit_id = "130744",
    patient_id = "103",
    timestamp = datetime.fromisoformat("2019-08-12 00:00:00"),
    dosage = "250mg"
)
print(event3)

Event from patient 103 visit 130744:
	- Code: 00069153041
	- Table: PRESCRIPTIONS
	- Vocabulary: NDC
	- Timestamp: 2019-08-12 00:00:00
	- dosage: 250mg


## pyhealth.Visit

In [11]:
from pyhealth.data import Visit
from datetime import datetime, timedelta

- **[Arguments]**:
  - ``visit_id`` – str, unique identifier of the visit.
  - ``patient_id`` – str, unique identifier of the patient.
  - ``encounter_time`` – Optional[datetime], timestamp of visit’s encounter. Defaults to None.
  - ``discharge_time`` – Optional[datetime], timestamp of visit’s discharge. Defaults to None.
  - ``discharge_status`` - Optional, patient’s status upon discharge. E.g., “Alive”, “Dead”. Defaults to None.
  - ``**attr`` – optional attributes of the visit. Attributes to add to visit as key=value pairs.

- **[Atrributes]**
  - `available_tables`: Returns a list of available tables for the visit.
  - `num_events`: Returns the total number of events in the visit.
- **[Methods]**
  - `add_event()`: Adds an event to the visit.
  - `get_evet_list()`: Returns a list of events from a specific table.
  - `get_code_list()`: Returns a list of codes from a specific table.

In [12]:
visit1 = Visit(
    visit_id = "v001",
    patient_id = "p001", 
    encounter_time = datetime.now() - timedelta(days=2),
    discharge_time = datetime.now() - timedelta(days=1),
    discharge_status = "Alive"
)

# visit1에 앞서 저장했던 event1과 event2를 추가함
visit1.add_event(event1)
visit1.add_event(event2)

print(visit1)

Visit v001 from patient p001 with 2 events:
	- Encounter time: 2024-05-18 11:33:08.162798
	- Discharge time: 2024-05-19 11:33:08.162798
	- Discharge status: Alive
	- Available tables: ['DIAGNOSES_ICD', 'PRESCRIPTIONS']
	- Event from patient p001 visit v001:
		- Code: 428.0
		- Table: DIAGNOSES_ICD
		- Vocabulary: ICD9CM
		- Timestamp: 2024-05-20 11:28:02.936396
	- Event from patient p001 visit v001:
		- Code: 00069153041
		- Table: PRESCRIPTIONS
		- Vocabulary: NDC
		- Timestamp: 2024-05-20 11:32:13.433763
		- active_on_discharge: True


In [13]:
print(visit1.available_tables)

['DIAGNOSES_ICD', 'PRESCRIPTIONS']


In [14]:
print(visit1.num_events)

2


In [15]:
print(visit1.get_event_list('DIAGNOSES_ICD'))
print(visit1.get_code_list('DIAGNOSES_ICD'))

[Event with ICD9CM code 428.0 from table DIAGNOSES_ICD]
['428.0']


In [16]:
event3 = Event(
    code="585.9",
    table="DIAGNOSES_ICD",
    vocabulary="ICD9CM",
    visit_id="v002",
    patient_id="p001",
    timestamp=datetime.now(),
)

visit2 = Visit(
    visit_id="v002",
    patient_id="p001",
    encounter_time=datetime.now() - timedelta(days=1),
    discharge_time=datetime.now(),
    discharge_status='Dead',
)

# add events
visit2.add_event(event3)

# summary
print (visit2)

Visit v002 from patient p001 with 1 events:
	- Encounter time: 2024-05-19 11:33:18.332733
	- Discharge time: 2024-05-20 11:33:18.332733
	- Discharge status: Dead
	- Available tables: ['DIAGNOSES_ICD']
	- Event from patient p001 visit v002:
		- Code: 585.9
		- Table: DIAGNOSES_ICD
		- Vocabulary: ICD9CM
		- Timestamp: 2024-05-20 11:33:18.332733


## pyhealth.Patient

In [245]:
from pyhealth.data import Patient
from datetime import datetime, timedelta

- **[Arguments]**:
  - ``patient_id`` – str, unique identifier of the patient.
  - ``birth_datetime`` – Optional[datetime], timestamp of patient’s birth. Defaults to None.
  - ``death_datetime`` – Optional[datetime], timestamp of patient’s death. Defaults to None.
  - ``gender`` – Optional, gender of the patient. E.g., “M”, “F”. Defaults to None.
  - ``ethnicity`` – Optional, ethnicity of the patient. E.g., “White”, “Black or African American”, “American Indian or Alaska Native”, “Asian”, “Native Hawaiian or Other Pacific Islander”. Defaults to None.
  - ``**attr`` – optional attributes of the patient. Attributes to add to patient as key=value pairs.

- **[Methods]**
  - `add_visit()`: Adds a visit to the patient.
  - `add_event()`: Adds an event to the patient.
  - `get_visit_by_id()`: Returns a visit by visit id.
  - `get_visit_by_index()`: Returns a visit by its index.

- **[Attributes]**
  - `available_tables`: Returns a list of available tables for the patient.

In [20]:
patient = Patient(
    patient_id="p001",
    birth_datetime=datetime(2012, 9, 16, 0, 0),
    death_datetime=None,
    gender="F",
    ethnicity="White"
)

patient.add_visit(visit1)
patient.add_visit(visit2)

print(patient)

Patient p001 with 2 visits:
	- Birth datetime: 2012-09-16 00:00:00
	- Death datetime: None
	- Gender: F
	- Ethnicity: White
	- Visit v001 from patient p001 with 2 events:
		- Encounter time: 2024-05-18 11:33:08.162798
		- Discharge time: 2024-05-19 11:33:08.162798
		- Discharge status: Alive
		- Available tables: ['DIAGNOSES_ICD', 'PRESCRIPTIONS']
		- Event from patient p001 visit v001:
			- Code: 428.0
			- Table: DIAGNOSES_ICD
			- Vocabulary: ICD9CM
			- Timestamp: 2024-05-20 11:28:02.936396
		- Event from patient p001 visit v001:
			- Code: 00069153041
			- Table: PRESCRIPTIONS
			- Vocabulary: NDC
			- Timestamp: 2024-05-20 11:32:13.433763
			- active_on_discharge: True
	- Visit v002 from patient p001 with 1 events:
		- Encounter time: 2024-05-19 11:33:18.332733
		- Discharge time: 2024-05-20 11:33:18.332733
		- Discharge status: Dead
		- Available tables: ['DIAGNOSES_ICD']
		- Event from patient p001 visit v002:
			- Code: 585.9
			- Table: DIAGNOSES_ICD
			- Vocabulary: ICD9CM
	

In [21]:
print(patient.get_visit_by_index(0))

Visit v001 from patient p001 with 2 events:
	- Encounter time: 2024-05-18 11:33:08.162798
	- Discharge time: 2024-05-19 11:33:08.162798
	- Discharge status: Alive
	- Available tables: ['DIAGNOSES_ICD', 'PRESCRIPTIONS']
	- Event from patient p001 visit v001:
		- Code: 428.0
		- Table: DIAGNOSES_ICD
		- Vocabulary: ICD9CM
		- Timestamp: 2024-05-20 11:28:02.936396
	- Event from patient p001 visit v001:
		- Code: 00069153041
		- Table: PRESCRIPTIONS
		- Vocabulary: NDC
		- Timestamp: 2024-05-20 11:32:13.433763
		- active_on_discharge: True


In [22]:
print(patient.get_visit_by_index(1))

Visit v002 from patient p001 with 1 events:
	- Encounter time: 2024-05-19 11:33:18.332733
	- Discharge time: 2024-05-20 11:33:18.332733
	- Discharge status: Dead
	- Available tables: ['DIAGNOSES_ICD']
	- Event from patient p001 visit v002:
		- Code: 585.9
		- Table: DIAGNOSES_ICD
		- Vocabulary: ICD9CM
		- Timestamp: 2024-05-20 11:33:18.332733


## Mimic 3 dataset

In [23]:
from pyhealth.datasets import MIMIC3Dataset

mimic3_ds = MIMIC3Dataset(
    # Argument 1: It specifies the data folder root.
    root="https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/", 
    
    # Argument 2: The users need to input a list of raw table names (e.g., DIAGNOSES_ICD.csv, PROCEDURES_ICD.csv).
    tables=["DIAGNOSES_ICD", "PROCEDURES_ICD"],
    
    # Argument 3: This argument input a dictionary (key is the source code 
    # vocabulary and value is the target code vocabulary . 
    # Default is empty dict, which means the original code will be used.
    code_mapping={"ICD9CM": "CCSCM"},
)

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
finish basic patient information parsing : 11.5877206325531s
finish parsing DIAGNOSES_ICD : 11.218774318695068s
finish parsing PROCEDURES_ICD : 9.194064378738403s


Mapping codes: 100%|██████████| 49993/49993 [00:03<00:00, 13205.90it/s]


In [24]:
mimic3_ds.info()


dataset.patients: patient_id -> <Patient>

<Patient>
    - visits: visit_id -> <Visit> 
    - other patient-level info
    
    <Visit>
        - event_list_dict: table_name -> List[Event]
        - other visit-level info
    
        <Event>
            - code: str
            - other event-level info



In [25]:
# You can also print the statistics of the entire dataset.
mimic3_ds.stat()


Statistics of base dataset (dev=False):
	- Dataset: MIMIC3Dataset
	- Number of patients: 49993
	- Number of visits: 52769
	- Number of visits per patient: 1.0555
	- Number of events per visit in DIAGNOSES_ICD: 9.1038
	- Number of events per visit in PROCEDURES_ICD: 3.2186



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC3Dataset\n\t- Number of patients: 49993\n\t- Number of visits: 52769\n\t- Number of visits per patient: 1.0555\n\t- Number of events per visit in DIAGNOSES_ICD: 9.1038\n\t- Number of events per visit in PROCEDURES_ICD: 3.2186\n'

In [26]:
patient_dict = mimic3_ds.patients

In [27]:
print(list(patient_dict.keys())[:10])

['1', '10', '100', '1000', '10000', '10001', '10002', '10003', '10004', '10005']


In [28]:
def dict_search(dictionary, n):
    # dict를 tuple list 형태로 변환
    items_list = list(dictionary.items())
    if n < len(items_list):
        return items_list[n]
    else:
        return None
    
# 3번째 환자 조회
dict_search(patient_dict, 2)

('100', Patient 100 with 1 visits)

In [29]:
# patient_id 1005 환자 조회
patient = patient_dict["1005"]
patient.gender, patient.birth_datetime

('F', datetime.datetime(1961, 9, 7, 0, 0))

In [30]:
visit_dict = patient.visits
print(list(visit_dict.keys()))

['101058']


In [31]:
# patients_visit에서 101058 환자 조회
visit = visit_dict['101058']
visit.encounter_time, visit.available_tables

(datetime.datetime(1962, 3, 15, 0, 0), ['DIAGNOSES_ICD', 'PROCEDURES_ICD'])

In [32]:
visit.get_event_list("DIAGNOSES_ICD")

[Event with CCSCM code 98 from table DIAGNOSES_ICD,
 Event with CCSCM code 238 from table DIAGNOSES_ICD,
 Event with CCSCM code 239 from table DIAGNOSES_ICD,
 Event with CCSCM code 661 from table DIAGNOSES_ICD,
 Event with CCSCM code 52 from table DIAGNOSES_ICD,
 Event with CCSCM code 2621 from table DIAGNOSES_ICD,
 Event with CCSCM code 660 from table DIAGNOSES_ICD]

## Pyhealth.tasks

This module is used to define the healthcare AI task by task function. The task function specifics how to process each pateint's data (the structured dataset) into a set of samples for the downstream machine learning models.

- **[Arguments]**: 
  - `patient`: A [Patient](https://pyhealth.readthedocs.io/en/latest/api/data/pyhealth.data.Patient.html) object.

- **[Returns]**: 
    - `samples`: a list of samples, each sample is a dict with patient_id, visit_id, and other task-specific attributes as key

- **[Functionality]**: currently, we provide [the following tasks](https://pyhealth.readthedocs.io/en/latest/api/tasks.html) on the datasets:
  - drug_recommendation_mimic3_fn
  - readmission_prediction_mimic3_fn
  - mortality_prediction_mimic3_fn
  - length_of_stay_prediction_mimic3_fn
  - We provide similar functions for MIMIC-IV, eICU, and OMOP dataset.

In [33]:
def mortality_prediction_mimic3_fn(patient):
    """
    Mortality prediction aims at predicting whether the patient will decease in the
        next hospital visit based on the clinical information from current visit
        (e.g., conditions and procedures).

    """
    samples = []
    for visit_idx in range(len(patient) - 1):
        visit = patient[visit_idx]
        next_visit = patient[visit_idx + 1]

        # obtain the label
        if next_visit.discharge_status not in [0, 1]:
            mortality_label = 0
        else:
            mortality_label = int(next_visit.discharge_status)

        # step 1: obtain features
        conditions = visit.get_code_list(table="DIAGNOSES_ICD")
        procedures = visit.get_code_list(table="PROCEDURES_ICD")

        # step 2: exclusion criteria
        if len(conditions) + len(procedures) == 0: continue

        # step 3: assemble the sample
        # REMEMBER: the key here will be the "feature_keys" and "label_key" for initializing the downstream model
        samples.append(
            {
                "visit_id": visit.visit_id,
                "patient_id": patient.patient_id,
                "conditions": conditions, # feature key 1
                "procedures": procedures, # feature key 2
                "label": mortality_label, # label key
            }
        )
    
    return samples

In [35]:
dataset = MIMIC3Dataset(
    root="https://storage.googleapis.com/pyhealth/Synthetic_MIMIC-III/",
    tables=["DIAGNOSES_ICD", "PROCEDURES_ICD"],
    dev=True,
)

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
finish basic patient information parsing : 7.542724847793579s
finish parsing DIAGNOSES_ICD : 7.7668633460998535s
finish parsing PROCEDURES_ICD : 6.248070240020752s


Mapping codes: 100%|██████████| 1000/1000 [00:00<00:00, 167230.33it/s]


In [36]:
# use `.set_task()`, which returns a task dataset
task_ds = dataset.set_task(task_fn=mortality_prediction_mimic3_fn)

Generating samples for mortality_prediction_mimic3_fn: 100%|██████████| 1000/1000 [00:00<00:00, 826138.27it/s]


In [37]:
task_ds.stat()

Statistics of sample dataset:
	- Dataset: MIMIC3Dataset
	- Task: mortality_prediction_mimic3_fn
	- Number of samples: 53
	- Number of patients: 44
	- Number of visits: 53
	- Number of visits per patient: 1.2045
	- conditions:
		- Number of conditions per sample: 10.3962
		- Number of unique conditions: 335
		- Distribution of conditions (Top-10): [('4019', 14), ('4280', 14), ('25000', 12), ('2851', 9), ('53081', 9), ('41401', 9), ('311', 6), ('2762', 6), ('486', 6), ('2724', 6)]
	- procedures:
		- Number of procedures per sample: 2.8679
		- Number of unique procedures: 99
		- Distribution of procedures (Top-10): [('3893', 11), ('9904', 7), ('8872', 6), ('3995', 5), ('9604', 4), ('3722', 4), ('966', 3), ('9359', 3), ('4513', 2), ('4516', 2)]
	- label:
		- Number of label per sample: 1.0000
		- Number of unique label: 1
		- Distribution of label (Top-10): [(0, 53)]


"Statistics of sample dataset:\n\t- Dataset: MIMIC3Dataset\n\t- Task: mortality_prediction_mimic3_fn\n\t- Number of samples: 53\n\t- Number of patients: 44\n\t- Number of visits: 53\n\t- Number of visits per patient: 1.2045\n\t- conditions:\n\t\t- Number of conditions per sample: 10.3962\n\t\t- Number of unique conditions: 335\n\t\t- Distribution of conditions (Top-10): [('4019', 14), ('4280', 14), ('25000', 12), ('2851', 9), ('53081', 9), ('41401', 9), ('311', 6), ('2762', 6), ('486', 6), ('2724', 6)]\n\t- procedures:\n\t\t- Number of procedures per sample: 2.8679\n\t\t- Number of unique procedures: 99\n\t\t- Distribution of procedures (Top-10): [('3893', 11), ('9904', 7), ('8872', 6), ('3995', 5), ('9604', 4), ('3722', 4), ('966', 3), ('9359', 3), ('4513', 2), ('4516', 2)]\n\t- label:\n\t\t- Number of label per sample: 1.0000\n\t\t- Number of unique label: 1\n\t\t- Distribution of label (Top-10): [(0, 53)]"

In [38]:
task_ds.available_keys

['visit_id', 'patient_id', 'conditions', 'procedures', 'label']

In [39]:
task_ds.samples[0]

{'visit_id': '100144',
 'patient_id': '141',
 'conditions': ['27651',
  '311',
  '2768',
  '07070',
  '4928',
  '78001',
  '8730',
  '51889'],
 'procedures': ['9604', '370'],
 'label': 0}

In [None]:
# time_window = 재입원 예측을 위한 시간 단위(예시 코드에는 15일로 설정)
def readmission_prediction_mimic4_fn(patient: Patient, time_window=15):
    
    samples = []

    # 마지막 방문을 제외함
    for i in range(len(patient) - 1):
        visit: Visit = patient[i]
        next_visit: Visit = patient[i + 1]

        # 방문 간의 시간 차이(다음 방문 - 현재 방문)를 계산하여 재입원 레이블을 생성함
        time_diff = (next_visit.encounter_time - visit.encounter_time).days
        # time_diff가 time_window(ex.15)보다 작으면 1을 반환, else 0
        readmission_label = 1 if time_diff < time_window else 0
        
				# diagnoses_icd 테이블에서 해당 visit의 condition에 해당하는 코드를 가져옴
        conditions = visit.get_code_list(table="diagnoses_icd")
        # procedures_icd 테이블에서 해당 visit의 procedures에 해당하는 코드를 가져옴
        procedures = visit.get_code_list(table="procedures_icd")
        # prescriptions 테이블에서 해당 visit의 prescriptions에 해당하는 코드를 가져옴
        drugs = visit.get_code_list(table="prescriptions")
        
        # 코드 중 하나만 부재하더라도 해당 방문 데이터는 제외함
        if len(conditions) * len(procedures) * len(drugs) == 0:
            continue
        samples.append(
            {
                "visit_id": visit.visit_id,
                "patient_id": patient.patient_id,
                "conditions": [conditions],
                "procedures": [procedures],
                "drugs": [drugs],
                "label": readmission_label,
            }
        )
        
    return samples

## Mimic 4에 적용

In [190]:
# procedures_icd
procedures_icd = pd.read_csv("C:/Users/gangmin/dahs/data/mimic/hosp/procedures_icd.csv")
procedures_icd.to_parquet('C:/Users/gangmin/dahs/data/mimic/hosp/parquet/procedures_icd.parquet')
procedures_icd = pd.read_parquet('C:/Users/gangmin/dahs/data/mimic/hosp/parquet/procedures_icd.parquet')

In [47]:
procedures_icd[procedures_icd['subject_id']==10000032]

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,10000032,22595853,1,2180-05-07,5491,9
1,10000032,22841357,1,2180-06-27,5491,9
2,10000032,25742920,1,2180-08-06,5491,9


In [48]:
procedures_icd['icd_version'].value_counts()

9     446079
10    223107
Name: icd_version, dtype: int64

## Medcode

Currently, we support the following coding systems:

- Diagnosis codes:
    - ICD9CM
    - ICD10CM
    - CCSCM
- Procedure codes:
    - ICD9PROC
    - ICD10PROC
    - CCSPROC
- Medication codes:
    - NDC
    - RxNorm
    - ATC

In [116]:
icd_9 = procedures_icd[procedures_icd['icd_version']==9]
icd_10 = procedures_icd[procedures_icd['icd_version']==10]

In [191]:
from pyhealth.medcode import CrossMap, InnerMap

mapping_icd9 = CrossMap.load(source_vocabulary="ICD9PROC", target_vocabulary="CCSPROC")
mapping_icd10 = CrossMap.load(source_vocabulary="ICD10PROC", target_vocabulary="CCSPROC")

In [192]:
# ICD9PROC & ICD10PROC -> CCSPROC
def map_icd_to_ccscm(row):
    try :
        if row['icd_version'] == 9 :
            ccs_code = mapping_icd9.map(row['icd_code'])
        elif row['icd_version'] == 10 :
            ccs_code = mapping_icd10.map(row['icd_code'])
        else : 
            return 'Unknown'
        return ccs_code[0]
        
    except Exception as e:
        return 'Unknown'

In [108]:
# mapping의 메소드 확인
print(dir(mapping_icd9))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'load', 'map', 'mapping', 's_class', 's_vocab', 't_class', 't_vocab']


### Code Mapping

`class pyhealth.medcode.CodeMap`
- **[Args]**:
    - source: source code vocabulary to map from
    - target: target code vocabulary to map to

- **[Functionality]**:
- map(source_code): maps source_code to the target vocabulary

Currently, we support the following mapping:

- With in diagnosis codes:
    - ICD9CM <-> CCSCM
    - ICD10CM <-> CCSCM
- With in procedure codes:
    - ICD9PROC <-> CCSPROC
    - ICD10PROC <-> CCSPROC
- With in medication codes:
    - NDC <-> RxNorm
    - NDC <-> ATC
    - RxNorm <-> ATC
- Between diagnosis and medication codes:
    - ATC <-> ICD9CM

In [193]:
procedures_icd['ccsproc_code'] = procedures_icd.apply(map_icd_to_ccscm, axis=1)
procedures_icd

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,ccsproc_code
0,10000032,22595853,1,2180-05-07,5491,9,88
1,10000032,22841357,1,2180-06-27,5491,9,88
2,10000032,25742920,1,2180-08-06,5491,9,88
3,10000068,25022803,1,2160-03-03,8938,9,227
4,10000117,27988844,1,2183-09-19,0QS734Z,10,146
...,...,...,...,...,...,...,...
669181,19999840,21033226,5,2164-09-16,0331,9,4
669182,19999840,26071774,1,2164-07-25,8891,9,198
669183,19999840,26071774,2,2164-07-25,8841,9,188
669184,19999987,23865745,1,2145-11-07,8841,9,188


In [126]:
# 변환 후에 발생하는 결측치 전무함
procedures_icd['ccsproc_code'].isnull().sum()

0

In [194]:
ccsproc = InnerMap.load("CCSPROC")
ccsproc.lookup("88")

'Abdominal paracentesis"'

In [215]:
# ccscode detail 추가
ccsproc = InnerMap.load("CCSPROC")

def safe_lookup(code):
    try : 
        result = ccsproc.lookup(code)
        if result is None : 
            return None
        return result

    except Exception as e:
        return 'Unknown'
    
def remove_special_word(text):
    if pd.isna(text):
        return text
    return text.replace('"', '').replace("'", '')

procedures_icd['ccsproc_description'] = procedures_icd['ccsproc_code'].apply(safe_lookup)
procedures_icd['ccsproc_description'] = procedures_icd['ccsproc_description'].apply(remove_special_word)

In [216]:
print('ccsproc_code 미매칭 사례 : ', len(procedures_icd[procedures_icd['ccsproc_description']=='Unknown']))

ccsproc_code 미매칭 사례 :  39


In [217]:
procedures_icd.head(10)

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,ccsproc_code,ccsproc_description
0,10000032,22595853,1,2180-05-07,5491,9,88,Abdominal paracentesis
1,10000032,22841357,1,2180-06-27,5491,9,88,Abdominal paracentesis
2,10000032,25742920,1,2180-08-06,5491,9,88,Abdominal paracentesis
3,10000068,25022803,1,2160-03-03,8938,9,227,Other diagnostic procedures
4,10000117,27988844,1,2183-09-19,0QS734Z,10,146,Treatment; fracture or dislocation of hip and ...
5,10000280,25852320,1,2151-03-18,8938,9,227,Other diagnostic procedures
6,10000560,28979390,1,2189-10-16,5551,9,104,Nephrectomy; partial or complete
7,10000635,26134563,1,2136-06-19,3734,9,49,Other OR heart procedures
8,10000635,26134563,2,2136-06-19,3728,9,62,Other diagnostic cardiovascular procedures
9,10000635,26134563,3,2136-06-19,3727,9,62,Other diagnostic cardiovascular procedures


In [1]:
from pyhealth.datasets import MIMIC4Dataset
dataset = MIMIC4Dataset(
        root="C:/Users/gangmin/dahs/data/mimic/test",
        tables=["diagnoses_icd", "procedures_icd", "prescriptions", "labevents"],
        code_mapping={"NDC": ("ATC", {"target_kwargs": {"level": 5}})},
    )

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
finish basic patient information parsing : 28.27490210533142s
finish parsing diagnoses_icd : 40.918240547180176s
finish parsing procedures_icd : 21.990015745162964s
finish parsing prescriptions : 160.89457845687866s
finish parsing labevents : 734.1699728965759s


Mapping codes: 100%|██████████| 180733/180733 [04:25<00:00, 681.10it/s] 


In [247]:
dataset.info()


dataset.patients: patient_id -> <Patient>

<Patient>
    - visits: visit_id -> <Visit> 
    - other patient-level info
    
    <Visit>
        - event_list_dict: table_name -> List[Event]
        - other visit-level info
    
        <Event>
            - code: str
            - other event-level info



In [248]:
dataset.available_tables

['procedures_icd', 'prescriptions', 'labevents', 'diagnoses_icd']

In [249]:
dataset.stat()


Statistics of base dataset (dev=False):
	- Dataset: MIMIC4Dataset
	- Number of patients: 180733
	- Number of visits: 431231
	- Number of visits per patient: 2.3860
	- Number of events per visit in diagnoses_icd: 11.0296
	- Number of events per visit in procedures_icd: 1.5518
	- Number of events per visit in prescriptions: 54.2354
	- Number of events per visit in labevents: 140.5324



'\nStatistics of base dataset (dev=False):\n\t- Dataset: MIMIC4Dataset\n\t- Number of patients: 180733\n\t- Number of visits: 431231\n\t- Number of visits per patient: 2.3860\n\t- Number of events per visit in diagnoses_icd: 11.0296\n\t- Number of events per visit in procedures_icd: 1.5518\n\t- Number of events per visit in prescriptions: 54.2354\n\t- Number of events per visit in labevents: 140.5324\n'

In [280]:
patient_dict = dataset.patients
print(list(patient_dict.keys())[:30])

['10000032', '10000068', '10000084', '10000108', '10000117', '10000248', '10000280', '10000560', '10000635', '10000719', '10000764', '10000826', '10000883', '10000886', '10000904', '10000935', '10000980', '10001176', '10001186', '10001217', '10001319', '10001338', '10001401', '10001472', '10001492', '10001663', '10001667', '10001725', '10001843', '10001860']


### 환자 한명씩 조회

In [283]:
patient = patient_dict["10001186"]
patient.gender, patient.birth_datetime

('F', datetime.datetime(2142, 5, 21, 0, 0))

In [284]:
# get the visit list of this patient
visit_dict = patient.visits
print (list(visit_dict.keys()))

['21334040', '24016413', '24906418']


In [286]:
# get the first visit
visit = visit_dict['24906418']
visit.encounter_time, visit.available_tables

(datetime.datetime(2188, 9, 24, 15, 33),
 ['diagnoses_icd', 'procedures_icd', 'prescriptions', 'labevents'])

In [287]:
# get the diagnosis information in the visit
visit.get_event_list('diagnoses_icd')

[Event with ICD9CM code 99832 from table diagnoses_icd,
 Event with ICD9CM code 5559 from table diagnoses_icd,
 Event with ICD9CM code V1085 from table diagnoses_icd,
 Event with ICD9CM code E8782 from table diagnoses_icd,
 Event with ICD9CM code 27800 from table diagnoses_icd,
 Event with ICD9CM code 6989 from table diagnoses_icd]

In [2]:
def readmission_prediction_mimic4_fn(patient, time_window=15):
    """
    Readmission prediction aims at predicting whether the patient will be readmitted
        into hospital within time_window days based on the clinical information from
        current visit (e.g., conditions and procedures).
    """
    samples = []
    
    # we will drop the last visit
    for i in range(len(patient) - 1):
        visit = patient[i]
        next_visit = patient[i + 1]

        # obtain label: get time difference between current visit and next visit
        time_diff = (next_visit.encounter_time - visit.encounter_time).days
        readmission_label = 1 if time_diff < time_window else 0

        # step 1: obtain features
        conditions = visit.get_code_list(table="diagnoses_icd")
        procedures = visit.get_code_list(table="procedures_icd")
        drugs = visit.get_code_list(table="prescriptions")

        # step 2: exclusion criteria
        # exclude: visits without condition, procedure, and drug code
        if len(conditions) + len(procedures) + len(drugs) == 0: continue

        # step 3: assemble the sample
        samples.append(
            {
                "visit_id": visit.visit_id,
                "patient_id": patient.patient_id,
                "conditions": conditions,
                "procedures": procedures,
                "drugs": drugs,
                "label": readmission_label,
            }
        )

    return samples

### 전체 환자 조회

In [3]:
def process_all_patients(patient_dict, time_window=15):
    all_samples = []
    for patient_id, patient in patient_dict.items():
        patient_samples = readmission_prediction_mimic4_fn(patient, time_window)
        all_samples.extend(patient_samples)
    return all_samples

In [4]:
patient_dict = dataset.patients
all_readmission_samples = process_all_patients(patient_dict, time_window=15)

In [302]:
for sample in all_readmission_samples[:10]:
    print(sample)

{'visit_id': '22595853', 'patient_id': '10000032', 'conditions': ['5723', '78959', '5715', '07070', '496', '29680', '30981', 'V1582'], 'procedures': ['5491'], 'drugs': ['B01A', 'J07B', 'A12B', 'C03D', 'C03C', 'N02B', 'J05A', 'R03A', 'N07B', 'R03B'], 'label': 0}
{'visit_id': '22841357', 'patient_id': '10000032', 'conditions': ['07071', '78959', '2875', '2761', '496', '5715', 'V08', '3051'], 'procedures': ['5491'], 'drugs': ['J07B', 'B01A', 'C03C', 'A07A', 'A06A', 'A07X', 'N02B', 'J05A', 'R03A', 'J01E', 'R03B', 'B05A'], 'label': 0}
{'visit_id': '25742920', 'patient_id': '10000032', 'conditions': ['07054', '78959', 'V462', '5715', '2767', '2761', '496', 'V08', '3051', '78791'], 'procedures': ['5491'], 'drugs': ['N02A', 'A02A', 'A12A', 'R03A', 'B01A', 'A06A', 'R03B', 'J05A', 'J07B', 'A07A', 'V06D', 'A10A', 'V04C', 'B05A', 'N06A', 'N05C', 'C03C'], 'label': 1}
{'visit_id': '23052089', 'patient_id': '10000084', 'conditions': ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546'], 'procedures': [

In [7]:
readmission_df = pd.DataFrame(all_readmission_samples)
readmission_df

Unnamed: 0,visit_id,patient_id,conditions,procedures,drugs,label
0,22595853,10000032,"[5723, 78959, 5715, 07070, 496, 29680, 30981, ...",[5491],"[B01AB01, J07BB02, A12BA01, C03DA01, C03CA01, ...",0
1,22841357,10000032,"[07071, 78959, 2875, 2761, 496, 5715, V08, 3051]",[5491],"[J07BB02, B01AB01, C03CA01, A07AA11, A06AD11, ...",0
2,25742920,10000032,"[07054, 78959, V462, 5715, 2767, 2761, 496, V0...",[5491],"[N02AX02, A02AC01, A12AA04, R03AC02, B01AB01, ...",1
3,23052089,10000084,"[G3183, F0280, R441, R296, E785, Z8546]",[],"[B01AB01, A06AB06, A06AD15, N04BC05, N04BD02, ...",0
4,22927623,10000117,"[R1310, R0989, K31819, K219, K449, F419, I341,...",[],[B01AB01],0
...,...,...,...,...,...,...
250398,29324445,19999784,"[Z5111, C8331, D472, E876, I10, Z87891]",[3E04305],"[A06AB02, A06AG02, A06AD10, A06AD65, V06DC01, ...",1
250399,29355057,19999784,"[Z5111, C8599, D701, D472, M21372, Z87891, G83...",[3E04305],"[B01AB05, J07BB02, V06DC01, B05XA02, B01AB01, ...",0
250400,29889147,19999784,"[Z5111, C8599, B1910, D472, Z87891, F1290, E87...",[3E04305],"[V06DC01, B05XA02, B01AB01, N02AA05, N05BA01, ...",0
250401,25744818,19999828,"[T8141XA, E1110, K632, D682, L02211, T83728A, ...","[0J980ZZ, 0WPF0JZ, 05HY33Z]","[J07BB02, B05XA03, B05XA01, A10AB01, A10AD01, ...",1


In [309]:
readmission_df['label'].value_counts()

1    134544
0    115859
Name: label, dtype: int64