# Space

In [1]:
import os
import logging
import pandas as pd 
from IPython.display import display, HTML
# Locate the workspace root: cut the current working directory at the first
# occurrence of KEY and re-append KEY, so the notebook can be started from any
# sub-folder of the workspace.
# NOTE: if KEY does not occur in the cwd, str.split returns the whole cwd and the
# result is cwd + KEY, which most likely does not exist (os.chdir will then fail).
KEY = 'WorkSpace'
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
print(WORKSPACE_PATH)
# All relative paths used later in this notebook are resolved from the workspace root.
os.chdir(WORKSPACE_PATH)

import sys
# The workspace root must be on sys.path before `proj_space` can be imported.
sys.path.append(WORKSPACE_PATH)
from proj_space import PROJECT, TaskName, SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
# Make the code folder importable as well (the later `recfldtkn` imports presumably live there — verify).
sys.path.append(SPACE['CODE_FN'])

# Folder holding the recfldtkn configuration files (Cohort.yaml is looked up here below).
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')

# Module-level logger; INFO-level records are printed with level, time, file@line and logger name.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

/Users/floydluo/Library/CloudStorage/GoogleDrive-junjie.luo.jhu@gmail.com/Shared drives/CDHAI-MIMIC/2024-MIMIC-SPACE/_MIMIC-RFT-WorkSpace


# [Part 1]: develop fn_humanrec.py
 
We want to calculate how many records each patient has under different columns. 

## [Step 1]: Create Cohort Yaml

Motivation: To create a Cohort Yaml file to prepare for the next steps.

Instruction: Run the following code. 

In [2]:
# Build the path of Cohort.yaml inside the config folder and show it as a clickable link.
# `path` is the location as configured; `full_path` prefixes it with the workspace root
# and is used as the link target.
# NOTE: `path` and `full_path` are notebook globals; the next display cell reuses them.
path = os.path.join(recfldtkn_config_path, 'Cohort.yaml')
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

## [Step 2]: Update Cohort Yaml
Motivation: 
We want to store the Cohort information for future development. 

Aim:
To store the Cohort raw data path and also specify RawRootID and RootID

Input: 
CohortInfo, RawRootID and RootID

Output:
Updated Yaml file

Instruction: 

We only need to change the following parts of the Cohort Yaml file: 
1. CohortInfo: This is the folder where we save the raw data.
2. RawRootID: HumanID in Raw Data. For the WellDoc project, this is Patient; for the MedStar project, this is Pid_ms.
3. RootID: This is the HumanID in RFT Data. In the WellDoc project, this is PID.



**Template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  the folders where to save the data
CohortInfo:
  COHORT_NAME_XXXXXX: # <---- change this.
    cohort_label: 1
    FolderPath: $DATA_RAW$/COHORT_NAME_XXXXXX

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  where to get the raw Human data.
RawRootID: 'XXXXXXXXX' # <--- HumanID in Raw Data. 
RootID:  'XXXXXXXXX'        # <--- HumanID in RFT Data. 
```

In [3]:
# Show the Cohort.yaml link again; relies on `path` and `full_path` from the earlier link cell.
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

## [Step 3] Cohort: Pick OneCohort

Motivation: We want to choose one cohort and test our code in this one cohort.

Aim: Specify a cohort

Input: Cohort Yaml

Output: Cohort name and Cohort label of the cohort we want to choose.

Instruction: Change ```args_information = ['--cohort_label', '1'] ```

In [4]:
# Show the config folder that is passed to load_cohort_args in the cohort-selection cell below.
recfldtkn_config_path

'../pipeline/config_recfldtkn/'

In [5]:
################### in notebook ###################  
# Command-line style arguments; they are parsed with argparse in the next cell.
# The label is matched against the `cohort_label` entries under CohortInfo.
args_information = ['--cohort_label', '1'] # < ------- change here
###################################################

In [6]:
import argparse
from recfldtkn.configfn import load_cohort_args

# Load the cohort configuration from the config folder.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)


# Parser for the command-line style `args_information` list defined in the previous cell.
my_parser = argparse.ArgumentParser(description='Process Input.')

my_parser.add_argument('--cohort_name',
                    metavar='cohort_name',
                    default = None, 
                    type=str,
                    help='the cohort_name to process')

my_parser.add_argument('--cohort_label',
                    metavar='cohort_label',
                    default = None, 
                    type=str,
                    help='the label for cohort_name to process')

args = my_parser.parse_args(args_information)

# Resolve the cohort from whichever argument was supplied.
# --cohort_label keeps precedence (the original behaviour); --cohort_name is now
# honoured when no label is given instead of crashing on int(None).
cohort_info = cohort_args['CohortInfo']
if args.cohort_label is not None:
    cohort_label = int(args.cohort_label)
    matched = [v for v in cohort_info.values() if v['cohort_label'] == cohort_label]
    if not matched:
        available = [v['cohort_label'] for v in cohort_info.values()]
        raise ValueError(f'cohort_label {cohort_label} is not defined in CohortInfo; '
                         f'available labels: {available}')
    cohort_name = matched[0]['cohort_name']
    if args.cohort_name is not None and args.cohort_name != cohort_name:
        logger.warning(f'--cohort_name {args.cohort_name} is ignored: '
                       f'--cohort_label {cohort_label} selects {cohort_name}')
elif args.cohort_name is not None:
    matched = [v for v in cohort_info.values() if v['cohort_name'] == args.cohort_name]
    if not matched:
        available = [v['cohort_name'] for v in cohort_info.values()]
        raise ValueError(f'cohort_name {args.cohort_name} is not defined in CohortInfo; '
                         f'available names: {available}')
    cohort_name = args.cohort_name
    cohort_label = int(matched[0]['cohort_label'])
else:
    raise ValueError('Please provide --cohort_label or --cohort_name in args_information.')

print(f'=============== {cohort_name}: {cohort_label} ======================')
print(cohort_name, cohort_label)


mimiciv-2.2-hosp 1


## [Step 4] OneCohort_Args
Motivation: We want to check one cohort information 

Aim: Display one Cohort's information

Input: cohort_args

Output: NA

Instruction: Run following code.


In [7]:
# Look up the configuration entry of the chosen cohort (CohortInfo is indexed by cohort name here).
OneCohort_Args = cohort_args['CohortInfo'][cohort_name]
OneCohort_Args

{'cohort_label': 1,
 'cohort_name': 'mimiciv-2.2-hosp',
 'FolderPath': '../_Data/0-Data_Raw/mimiciv-2.2/hosp'}

In [8]:
# List every entry in the cohort's raw-data folder, unfiltered (os.listdir returns them in arbitrary order).
os.listdir(OneCohort_Args['FolderPath'])

['poe.csv.gz',
 'd_hcpcs.csv.gz',
 'poe_detail.csv.gz',
 'patients.csv.gz',
 'index.html',
 '.DS_Store',
 'diagnoses_icd.csv.gz',
 'emar_detail.csv.gz',
 'provider.csv.gz',
 'prescriptions.csv.gz',
 'drgcodes.csv.gz',
 'd_icd_diagnoses.csv.gz',
 'd_labitems.csv.gz',
 'transfers.csv.gz',
 'admissions.csv.gz',
 'labevents.csv.gz',
 'pharmacy.csv.gz',
 'procedures_icd.csv.gz',
 'hcpcsevents.csv.gz',
 'services.csv.gz',
 'd_icd_procedures.csv.gz',
 'omr.csv.gz',
 'emar.csv.gz',
 'microbiologyevents.csv.gz']

## [Step 5] selected_source_file_suffix_list
Motivation: We select file suffixes because data storage formats vary across projects. For instance, in the WellDoc project, our data is stored in CSV format. By specifying file suffixes, we can accurately identify or filter files with particular extensions.

Aim: Ensure that our processing or analysis workflows interact only with the relevant types of files

Input: selected_source_file_suffix_list = ['csv'] (for this MIMIC cohort the raw files are gzip-compressed, so the code below uses ['csv.gz'])

Output: selected_file_list

Instruction: Based on the project, choose selected_source_file_suffix_list and then run following code



In [9]:
#########################
selected_source_file_suffix_list = ['csv.gz']
#########################


# this step will be conducted within the pipeline
# Keep only the entries of the raw-data folder whose name ends with one of the chosen suffixes.
raw_folder = OneCohort_Args['FolderPath']
wanted_suffixes = tuple(selected_source_file_suffix_list)
selected_file_list = list(filter(lambda name: name.endswith(wanted_suffixes),
                                 os.listdir(raw_folder)))
print(len(selected_file_list))

22


In [10]:
# Show the files that passed the suffix filter.
selected_file_list

['poe.csv.gz',
 'd_hcpcs.csv.gz',
 'poe_detail.csv.gz',
 'patients.csv.gz',
 'diagnoses_icd.csv.gz',
 'emar_detail.csv.gz',
 'provider.csv.gz',
 'prescriptions.csv.gz',
 'drgcodes.csv.gz',
 'd_icd_diagnoses.csv.gz',
 'd_labitems.csv.gz',
 'transfers.csv.gz',
 'admissions.csv.gz',
 'labevents.csv.gz',
 'pharmacy.csv.gz',
 'procedures_icd.csv.gz',
 'hcpcsevents.csv.gz',
 'services.csv.gz',
 'd_icd_procedures.csv.gz',
 'omr.csv.gz',
 'emar.csv.gz',
 'microbiologyevents.csv.gz']

## [Step 6] Map Raw Table file to RawName

Motivation: Associating each file's data with its name as the column header can help in efficiently organizing and identifying data sourced from multiple files. 

Aim:  '05_12_2022_PatientCGMDeviceDetail.csv' to 'PatientCGMDeviceDetail'

Input: selected_file_list from last step 

Output: Raw Table name

Instruction: Run following code 



In [11]:
import inspect
#########################
def get_tablename_from_raw_filename(file_path):
    """Return the raw table name encoded in a raw-data file path.

    The table name is the base name of the file (the text after the last
    '/') cut at its first '.', e.g. 'hosp/poe.csv.gz' -> 'poe'.
    """
    base_name = file_path.rsplit('/', 1)[-1]
    tablename, _, _ = base_name.partition('.')
    return tablename

# Attach the function's own source text to the function object as `fn_string`.
# The function is later handed to convert_variables_to_pystirng (Step 9), which
# presumably reads this attribute to write the code into the pipeline file — verify.
get_tablename_from_raw_filename.fn_string = inspect.getsource(get_tablename_from_raw_filename)
#########################

In [12]:
# Quick sanity check: run the helper on the first selected file and print input and result.
file_path = selected_file_list[0]

tablename = get_tablename_from_raw_filename(file_path)
print(file_path)
print(tablename)

poe.csv.gz
poe


In [13]:
# Map every selected raw file to its table name, in the same order as selected_file_list.
raw_table_name_list = [get_tablename_from_raw_filename(file_name)
                       for file_name in selected_file_list]

raw_table_name_list

['poe',
 'd_hcpcs',
 'poe_detail',
 'patients',
 'diagnoses_icd',
 'emar_detail',
 'provider',
 'prescriptions',
 'drgcodes',
 'd_icd_diagnoses',
 'd_labitems',
 'transfers',
 'admissions',
 'labevents',
 'pharmacy',
 'procedures_icd',
 'hcpcsevents',
 'services',
 'd_icd_procedures',
 'omr',
 'emar',
 'microbiologyevents']

## [Step 7]: get_rawrootid_column

Motivation: We want to get the identifier column as ```rawrootid```. By identifying the correct identifier column name, subsequent data processing or analysis steps can use a consistent reference, regardless of the specific naming convention used in the raw data.

Aim: Get ```rawrootid```

Input: selected_file_list

Output: rawrootid for files in selected_file_list

Instruction: Change the following part according to different projects since rawrootid is different for different project group. 
```python
def get_rawrootid_from_raw_table_column(raw_table_columns):
    rawrootid = None
    if 'PatientID' in raw_table_columns: 
        rawrootid = 'PatientID' 
    if 'UserID' in raw_table_columns:
        rawrootid = 'UserID'
    if 'PatientId' in raw_table_columns: 
        rawrootid = 'PatientId' 
    return rawrootid
```

In [14]:
##################################################
def get_rawrootid_from_raw_table_column(raw_table_columns):
    """Return the name of the patient identifier column of a raw table.

    raw_table_columns is the collection of column names of one raw table.
    Returns 'subject_id' when that column is present, otherwise None
    (the table carries no patient identifier).

    This function is project specific: other projects use different
    identifier columns (e.g. 'PatientID', 'UserID', 'PatientId') and
    should adapt the membership test below accordingly.
    """
    if 'subject_id' in raw_table_columns:
        return 'subject_id'
    return None

# Attach the function's own source text to the function object as `fn_string`.
# The function is later handed to convert_variables_to_pystirng (Step 9), which
# presumably reads this attribute to write the code into the pipeline file — verify.
get_rawrootid_from_raw_table_column.fn_string = inspect.getsource(get_rawrootid_from_raw_table_column)
##################################################


In [15]:
# Peek at the header of every selected raw file and report which column
# identifies the patient in that table.
for file_path in selected_file_list:
    print('\n')  # Print a newline for clearer output separation.
    # Construct the full path to the file using the base folder path and the file name.
    full_path = os.path.join(OneCohort_Args['FolderPath'], file_path)

    if full_path.endswith(('.csv', '.csv.gz')):
        print(file_path, '<--- file_path')
        # Only a few rows are needed to obtain the column names. pandas infers gzip
        # compression from the '.gz' extension, so plain and gzipped CSV files share
        # this single branch.
        df = pd.read_csv(full_path, nrows=5)
        raw_table_columns = df.columns.tolist()
        print(raw_table_columns, '<--- raw_table_columns')
        # Identify the key identifier column from the column names list using a predefined function.
        rawrootid = get_rawrootid_from_raw_table_column(raw_table_columns)
        print(rawrootid, '<--- rawrootid')  # Print the identified key identifier column name.
    elif full_path.endswith('.p'):
        # Pickle files are recognised but deliberately not supported.
        raise ValueError(f'.p is not supported: {full_path}')
    else:
        # Any other file type is unsupported as well.
        raise ValueError(f'file type not supported: {full_path}')



poe.csv.gz <--- file_path
['poe_id', 'poe_seq', 'subject_id', 'hadm_id', 'ordertime', 'order_type', 'order_subtype', 'transaction_type', 'discontinue_of_poe_id', 'discontinued_by_poe_id', 'order_provider_id', 'order_status'] <--- raw_table_columns
subject_id <--- rawrootid


d_hcpcs.csv.gz <--- file_path
['code', 'category', 'long_description', 'short_description'] <--- raw_table_columns
None <--- rawrootid


poe_detail.csv.gz <--- file_path
['poe_id', 'poe_seq', 'subject_id', 'field_name', 'field_value'] <--- raw_table_columns
subject_id <--- rawrootid


patients.csv.gz <--- file_path
['subject_id', 'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod'] <--- raw_table_columns
subject_id <--- rawrootid


diagnoses_icd.csv.gz <--- file_path
['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'] <--- raw_table_columns
subject_id <--- rawrootid


emar_detail.csv.gz <--- file_path
['subject_id', 'emar_id', 'emar_seq', 'parent_field_ordinal', 'administration_type', 'p

## [Step 8]: Exclude Raw Table when we select the Patients.

Motivation: 
We only focus on patients who have at least one record. Also, there are some tables we don't want to consider, for example the Patient table. 

Aim: 
patient A: PatientTable 1, ATable 0, BTable 0, CTable 0. 

patient B: PatientTable 1, ATable 1, BTable 0, CTable 0

We don't want A because A's total record number except PatientTable is 0. 
We want B because B's total  record number except PatientTable is larger than 0.

So we put `PatientTable` into `excluded_raw_table_names`.

Instruction: 
1. Initially, we set the list as empty
2. With `excluded_raw_table_names` empty, we will get the df_Human (df_Human2RawRecNum) at the end of this notebook.
3. Then, we will check the df_Human, and see if there is any table that we want to exclude. If there is, we will add the table name to the `excluded_raw_table_names` list.

In [16]:
# How to fill this list:
#
# 1. Start with an empty list.
# 2. With excluded_raw_table_names empty, run the notebook to obtain
#    df_Human (df_Human2RawRecNum) at the end.
# 3. Inspect df_Human and decide whether any table should be excluded.
# 4. If so, add that table's name to excluded_raw_table_names and re-run.

#########################
excluded_raw_table_names = []
#########################

## [Step 9]: Save the above tools into the pipeline folder
Motivation: creating a more dynamic, efficient, and customizable workflow that can adapt to various data processing needs, automate repetitive tasks, and enhance the overall data analysis and handling process.

Instruction: Run the following code.


In [17]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Build the text (`pycode`) that is written to `pypath` below from the settings and
# helper functions defined in this notebook. `prefix` holds the import lines handed
# to the converter.
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [selected_source_file_suffix_list, excluded_raw_table_names]
fn_variables = [get_tablename_from_raw_filename, get_rawrootid_from_raw_table_column]
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                       fn_variables = fn_variables, 
                                       prefix = prefix)
pypath = cohort_args['pypath']

# Create the target folder if needed. exist_ok=True avoids the check-then-create race,
# and the guard skips the call when pypath has no directory part, because
# os.makedirs('') raises FileNotFoundError.
pydir = os.path.dirname(pypath)
if pydir: os.makedirs(pydir, exist_ok=True)

# print(pypath)

with open(pypath, 'w') as file: file.write(pycode)
# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)

display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

[INFO:2024-03-05 14:38:59,102:(config.py@58 datasets)]: PyTorch version 2.0.0 available.
[INFO:2024-03-05 14:38:59,103:(config.py@95 datasets)]: TensorFlow version 2.12.0 available.


# [Part 2] Process dfHumanRec with Pipeline


Given ../pipeline/fn_humanrec/humanrec.py  is ready. 

We have a pipeline tool: get_cohort_level_record_number_counts

This will call this humanrec.py internally, to process our records. 

In [18]:
# Location of the tools file that was written in Part 1, Step 9.
cohort_args['pypath']

'../pipeline/fn_humanrec/humanrec.py'

## [Step 1]: Call Pipeline to get df_Human
Motivation: To calculate rec_num

Aim: 

Input: ```cohort_name, cohort_label, cohort_args```

Output: ```df_Human```

Instruction: 
1. Here we have a pipeline function `get_cohort_level_record_number_counts` to get the df_Human.

2. It will load the tools from '../pipeline/fn_humanrec/humanrec.py' to do the process.

If you want to know the details of the pipeline_fn, you can go to the unpacking notebook to understand it more. 

In [19]:
from recfldtkn.pipeline_record import get_RawName_to_dfRawPath


# Display the chosen cohort's configuration again before handing it to the pipeline functions.
OneCohort_Args

{'cohort_label': 1,
 'cohort_name': 'mimiciv-2.2-hosp',
 'FolderPath': '../_Data/0-Data_Raw/mimiciv-2.2/hosp'}

In [20]:
# Configuration bundle passed to the pipeline functions below; it wraps cohort_args under 'base_config'.
rft_config = {
    'base_config': cohort_args, 
}

In [21]:
# Build and display the mapping from raw table name to raw file path (see the output below).
RawName_to_dfRawPath = get_RawName_to_dfRawPath(OneCohort_Args, rft_config)
RawName_to_dfRawPath

[INFO:2024-03-05 14:38:59,458:(pipeline_record.py@33 recfldtkn.pipeline_record)]: ../_Data/0-Data_Raw/mimiciv-2.2/hosp <-- FolderPath
[INFO:2024-03-05 14:38:59,459:(pipeline_record.py@34 recfldtkn.pipeline_record)]: 22 <--- fullfile_list


{'poe': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/poe.csv.gz',
 'd_hcpcs': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/d_hcpcs.csv.gz',
 'poe_detail': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/poe_detail.csv.gz',
 'patients': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/patients.csv.gz',
 'diagnoses_icd': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/diagnoses_icd.csv.gz',
 'emar_detail': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/emar_detail.csv.gz',
 'provider': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/provider.csv.gz',
 'prescriptions': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/prescriptions.csv.gz',
 'drgcodes': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/drgcodes.csv.gz',
 'd_icd_diagnoses': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/d_icd_diagnoses.csv.gz',
 'd_labitems': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/d_labitems.csv.gz',
 'transfers': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/transfers.csv.gz',
 'admissions': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/admissions.csv.gz',
 'labevents': '../_Data/0-Data_Raw/mimiciv-2.2/hosp/labevents

In [22]:
from recfldtkn.pipeline_record import get_CohortLevel_df_Human2RawRecNum

# Aliases for the arguments of the pipeline call below.
OneCohort_config = OneCohort_Args
# NOTE: despite its name, this mapping holds file paths here, not loaded DataFrames.
RawName_to_dfRaw = RawName_to_dfRawPath


# Compute the cohort-level table of per-patient record counts; in the displayed
# result each row is one patient and each raw table is one column.
# NOTE(review): the log timestamps suggest this call reads the raw files and can be slow.
df_Human = get_CohortLevel_df_Human2RawRecNum(OneCohort_config, 
                                                rft_config, 
                                                RawName_to_dfRaw)


df_Human

[INFO:2024-03-05 14:38:59,465:(pipeline_record.py@96 recfldtkn.pipeline_record)]: {'cohort_label': 1, 'cohort_name': 'mimiciv-2.2-hosp', 'FolderPath': '../_Data/0-Data_Raw/mimiciv-2.2/hosp'}
[INFO:2024-03-05 14:38:59,466:(pipeline_record.py@139 recfldtkn.pipeline_record)]: '../_Data/0-Data_Raw/mimiciv-2.2/hosp/poe.csv.gz' # use csv.gz file
[INFO:2024-03-05 14:38:59,470:(pipeline_record.py@60 recfldtkn.pipeline_record)]: id_column: subject_id
[INFO:2024-03-05 14:39:25,616:(pipeline_record.py@170 recfldtkn.pipeline_record)]: poe:'path-../_Data/0-Data_Raw/mimiciv-2.2/hosp/poe.csv.gz' # (179797, 3)
[INFO:2024-03-05 14:39:25,617:(pipeline_record.py@139 recfldtkn.pipeline_record)]: '../_Data/0-Data_Raw/mimiciv-2.2/hosp/d_hcpcs.csv.gz' # use csv.gz file
[INFO:2024-03-05 14:39:25,620:(pipeline_record.py@60 recfldtkn.pipeline_record)]: id_column: None
[INFO:2024-03-05 14:39:25,621:(pipeline_record.py@170 recfldtkn.pipeline_record)]: d_hcpcs:'path-../_Data/0-Data_Raw/mimiciv-2.2/hosp/d_hcpcs.csv

RawName,PID,subject_id,admissions,diagnoses_icd,drgcodes,emar,emar_detail,hcpcsevents,labevents,microbiologyevents,...,patients,pharmacy,poe,poe_detail,prescriptions,procedures_icd,services,transfers,TotalRecNum,CohortLabel
0,10000001,10000032,4,39,8,139,267,,623,36,...,1,79,265,50,81,3,4,15,1655,1
1,10000002,10000048,,,,,,,45,,...,1,,,,,,,1,47,1
2,10000003,10000068,1,1,,,,1,,,...,1,,6,1,,1,1,3,16,1
3,10000004,10000084,2,12,2,51,139,1,274,4,...,1,13,59,18,13,,2,6,600,1
4,10000005,10000102,,,,,,,,,...,1,,,,,,,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299707,10299708,19999828,2,41,3,955,1882,,1046,30,...,1,193,414,31,209,8,2,7,4836,1
299708,10299709,19999829,,,,,,,63,,...,1,,,,,,,1,65,1
299709,10299710,19999840,2,19,4,,,,560,24,...,1,108,288,6,137,7,3,8,1167,1
299710,10299711,19999914,,,,,,,,,...,1,,,,,,,1,2,1


## [Step 2]: Check and Update excluded_raw_table_names
Motivation: Inspect `df_Human` and decide which unnecessary columns (raw tables) should be added to `excluded_raw_table_names`.

In [None]:
# Inspect df_Human to decide which table columns, if any, belong in excluded_raw_table_names.
df_Human

RawName,PID,subject_id,admissions,diagnoses_icd,drgcodes,emar,emar_detail,hcpcsevents,labevents,microbiologyevents,...,patients,pharmacy,poe,poe_detail,prescriptions,procedures_icd,services,transfers,TotalRecNum,CohortLabel
0,10000001,10000032,4.0,39.0,8.0,139.0,267.0,,623.0,36.0,...,1.0,79.0,265.0,50.0,81.0,3.0,4.0,15.0,1655.0,1
1,10000002,10000048,,,,,,,45.0,,...,1.0,,,,,,,1.0,47.0,1
2,10000003,10000068,1.0,1.0,,,,1.0,,,...,1.0,,6.0,1.0,,1.0,1.0,3.0,16.0,1
3,10000004,10000084,2.0,12.0,2.0,51.0,139.0,1.0,274.0,4.0,...,1.0,13.0,59.0,18.0,13.0,,2.0,6.0,600.0,1
4,10000005,10000102,,,,,,,,,...,1.0,,,,,,,1.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299707,10299708,19999828,2.0,41.0,3.0,955.0,1882.0,,1046.0,30.0,...,1.0,193.0,414.0,31.0,209.0,8.0,2.0,7.0,4836.0,1
299708,10299709,19999829,,,,,,,63.0,,...,1.0,,,,,,,1.0,65.0,1
299709,10299710,19999840,2.0,19.0,4.0,,,,560.0,24.0,...,1.0,108.0,288.0,6.0,137.0,7.0,3.0,8.0,1167.0,1
299710,10299711,19999914,,,,,,,,,...,1.0,,,,,,,1.0,2.0,1


## [Step 3]: Save ds_Human as HFDS

In [None]:
import datasets
# Convert the pandas DataFrame into a `datasets.Dataset` and print its summary (features and row count).
ds_HumanRec = datasets.Dataset.from_pandas(df_Human)
print(ds_HumanRec)

Dataset({
    features: ['PID', 'subject_id', 'admissions', 'diagnoses_icd', 'drgcodes', 'emar', 'emar_detail', 'hcpcsevents', 'labevents', 'microbiologyevents', 'omr', 'patients', 'pharmacy', 'poe', 'poe_detail', 'prescriptions', 'procedures_icd', 'services', 'transfers', 'TotalRecNum', 'CohortLabel'],
    num_rows: 299712
})


In [None]:
# Record name used to build the output folder name in the next cell.
cohort_args['RecName']

'PRawRecNum'

In [None]:
# Root folder for the RFT data.
print(SPACE['DATA_RFT'])

# Cohort folders are named '<cohort_label>-<cohort_name>'.
full_cohort_name = f'{cohort_label}-{cohort_name}'
print(full_cohort_name)
# ------------------------------------------------------------------------- # 
# Target folder: <DATA_RFT>/<label>-<name>/<RecName>_data
# NOTE: this reuses the global name `path`, which held the Cohort.yaml path earlier
# in the notebook; re-running the link cells after this one would show this folder instead.
path = os.path.join(SPACE['DATA_RFT'], full_cohort_name, cohort_args['RecName'] + '_data')
print(path)
# Persist the dataset to disk at the folder built above.
ds_HumanRec.save_to_disk(path)
print(ds_HumanRec)

../_Data/1-Data_RFT
1-mimiciv-2.2-hosp
../_Data/1-Data_RFT/1-mimiciv-2.2-hosp/PRawRecNum_data


Saving the dataset (0/1 shards):   0%|          | 0/299712 [00:00<?, ? examples/s]

Dataset({
    features: ['PID', 'subject_id', 'admissions', 'diagnoses_icd', 'drgcodes', 'emar', 'emar_detail', 'hcpcsevents', 'labevents', 'microbiologyevents', 'omr', 'patients', 'pharmacy', 'poe', 'poe_detail', 'prescriptions', 'procedures_icd', 'services', 'transfers', 'TotalRecNum', 'CohortLabel'],
    num_rows: 299712
})


# [Part 3] Select Patients with PID

In [None]:
# from recfldtkn.loadtools import load_ds_rec_and_info
# ds_Human, _ = load_ds_rec_and_info(cohort_args['RecName'], cohort_args)
# df_Human = ds_Human.to_pandas()
# df_Human
# RootID = cohort_args['RootID']
# RawRootID = cohort_args['RawRootID']
# PID_list = [1013405, 1002538, 1022279, 1004432, 1016032, 1032308, 1031363, 1001133, 1007343, 1026067]
# print(PID_list)
# def get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args):
#     d = {}
#     cohort_config = cohort_args['CohortInfo'][cohort_name]
#     FolderPath = cohort_config['FolderPath']
#     chunk_size = 100000

#     file_list = sorted(os.listdir(FolderPath))
#     file_list = [i for i in file_list if 'csv' in i]
#     for file in file_list:
#         full_file = os.path.join(FolderPath, file)
#         li = [chunk[chunk[RawRootID] == RawRootID_sample] 
#               for chunk in pd.read_csv(full_file, chunksize=chunk_size, low_memory=False)]
#         result = pd.concat(li)
#         logger.info(f'{result.shape}: {file}')
#         if len(result) == 0: continue
#         d[file] = result
        
#     return d

# for PID_sample in PID_list:
#     print('\n======== PID_sample:', PID_sample, '========')
#     PIDInfo_dict = df_Human[df_Human[RootID] == PID_sample].iloc[0].to_dict()
#     RawRootID_sample = PIDInfo_dict[RawRootID]
#     d = get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args)
#     folder = os.path.join(SPACE['DATA_RAW'], 'patient_sample', str(PID_sample))
#     if os.path.exists(folder) == False: os.makedirs(folder)
#     for file, df in d.items():
#         df.to_csv(os.path.join(folder, file), index = False)
#         print(file, df.shape)