In [1]:
import os
import pickle
import pandas as pd
os.chdir('../../')
import numpy as np

In [11]:
# For each dataset, grab minimum datetime per patient (must be from non-NaN rows of that dataset's feature col).
# Then, for each patient, find the lowest minimum and make that their timedelta_0


def get_timedelta_0(path, target_col, time_col):
    """Return the earliest ``time_col`` value per ``subject_id`` in one dataset.

    Parameters
    ----------
    path : str
        Dataset location. A path ending in ``'gzip'`` is read as a
        gzip-compressed pickle; any other path is read as a gzip-compressed CSV
        with ``time_col`` parsed as dates.
    target_col : str or None
        When truthy, rows whose ``target_col`` is NaN are discarded before the
        minimum is taken. When falsy, every row is considered.
    time_col : str
        Name of the timestamp column to minimise.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id`` and ``time_col``, one row per subject that
        still has rows after the optional NaN filter.
    """
    stored_as_pickle = path.endswith('gzip')
    if stored_as_pickle:
        records = pd.read_pickle(path, compression='gzip')
    else:
        records = pd.read_csv(path, compression='gzip', header=0, index_col=None, parse_dates=[time_col])

    if target_col:
        records = records.dropna(subset=[target_col])

    earliest = records.groupby('subject_id')[time_col].min()
    return earliest.reset_index()

def get_cohort_timedeltas(paths:list, target_cols:list, time_cols:list, cohort_path:str="./data/cohort.gzip"):
    """Compute each cohort subject's reference timestamp (``timedelta_0``).

    For every ``(path, target_col, time_col)`` triple the per-subject minimum
    timestamp is obtained from ``get_timedelta_0`` and left-merged onto the
    cohort. ``timedelta_0`` is the row-wise minimum over all of those
    per-dataset minima, skipping missing values.

    Parameters
    ----------
    paths : list
        Dataset paths, forwarded one by one to ``get_timedelta_0``.
    target_cols : list
        Feature column per dataset (or ``None``), forwarded to
        ``get_timedelta_0``.
    time_cols : list
        Timestamp column per dataset. These names drive the final minimum, so
        any column names work and two datasets may share the same name.
    cohort_path : str, optional
        Gzip-compressed pickle holding the cohort; it must provide the
        ``subject_id`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``subject_id`` and ``timedelta_0``, one row per cohort row.
        ``timedelta_0`` is missing for subjects absent from every dataset.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    cohort = pd.read_pickle(cohort_path, compression='gzip')[['subject_id', 'label']].copy()

    earliest_cols = []
    for position, (path, target_col, time_col) in enumerate(zip(paths, target_cols, time_cols)):
        # Give each dataset's minimum a unique column name so that datasets
        # sharing a timestamp column name do not collide during the merge.
        earliest_col = f"_earliest_{position}"
        per_dataset = get_timedelta_0(path, target_col, time_col).rename(columns={time_col: earliest_col})
        cohort = cohort.merge(per_dataset, how='left', on='subject_id')
        earliest_cols.append(earliest_col)

    if earliest_cols:
        cohort['timedelta_0'] = cohort[earliest_cols].min(axis=1, skipna=True)
    else:
        # No datasets supplied: nobody has a reference timestamp.
        cohort['timedelta_0'] = pd.NaT
    return cohort[['subject_id', 'timedelta_0']]

def long_format_modules(cohort, paths, file_names, target_cols:list, time_cols:list):
    """Write each dataset in long format with a per-subject day offset.

    For every ``(path, file_name, target_col, time_col)`` quadruple the dataset
    is loaded, inner-joined to ``cohort`` on ``subject_id``, and given a
    ``timedelta`` column: the whole-day component of
    ``time_col - timedelta_0``. The selected columns are then written to
    ``file_name`` as a gzip-compressed CSV.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Must provide ``subject_id`` and ``timedelta_0``.
    paths : list
        Input datasets. Paths ending in ``'gzip'`` are read as gzip-compressed
        pickles, everything else as gzip-compressed CSV with ``time_col``
        parsed as dates.
    file_names : list
        Output path per dataset.
    target_cols : list
        Feature column to keep per dataset.
    time_cols : list
        Timestamp column per dataset.

    Returns
    -------
    None
        Results are written to disk only.
    """
    for src, dest, feature, stamp in zip(paths, file_names, target_cols, time_cols):
        if src.endswith('gzip'):
            records = pd.read_pickle(src, compression='gzip')
        else:
            records = pd.read_csv(src, compression='gzip', header=0, index_col=None, parse_dates=[stamp])

        anchors = cohort[['subject_id', 'timedelta_0']]
        joined = records.merge(anchors, how='inner', on='subject_id')

        elapsed = joined[stamp] - joined['timedelta_0']
        joined['timedelta'] = elapsed.dt.days

        # Datasets whose path mentions 'lab' additionally keep the item identifier.
        extra = ['itemid'] if 'lab' in src else []
        keep = ['subject_id', 'hadm_id', feature, 'timedelta'] + extra

        joined[keep].to_csv(dest, index=False, compression='gzip')



In [3]:
# One row per dataset that can supply a subject's earliest timestamp:
# (path, feature column whose NaN rows are ignored -- None keeps every row,
#  timestamp column). Keeping the three values side by side keeps them aligned.
timedelta_sources = [
    ('./data/adm_cohort.gzip', None, 'admittime'),
    ('./data/long_format/labs/preproc_labs_units_cleaned.gzip', 'valuenum', 'charttime'),
    ('./data/long_format/meds/preproc_med_nonproprietaryname.csv.gz', 'nonproprietaryname', 'starttime'),
    ('./data/long_format/proc/preproc_proc_icd10.csv.gz', 'icd_code', 'chartdate'),
]
paths, target_cols, time_cols = map(list, zip(*timedelta_sources))

cohort = get_cohort_timedeltas(paths, target_cols, time_cols)

In [5]:
cohort

Unnamed: 0,subject_id,timedelta_0
0,10018928,2125-01-03 12:25:00
1,10074117,2118-02-17 20:45:00
2,10076543,2187-03-14 10:00:00
3,10098428,2119-05-12 00:20:00
4,10127185,2141-10-31 10:00:00
...,...,...
144043,19997448,2120-11-22 12:55:00
144044,19997887,2113-08-22 10:17:00
144045,19998203,2132-12-15 02:35:00
144046,19998350,2127-05-09 19:40:00


In [6]:
cohort.loc[cohort.timedelta_0.isna()]

Unnamed: 0,subject_id,timedelta_0


In [12]:
# One row per module to export:
# (input path, output path, feature column, timestamp column).
module_specs = [
    (
        './data/long_format/diag/preproc_diag_icd10_roots.csv.gz',
        './data/long_format/diag/norm_diag_icd10_roots.csv.gz',
        'root',
        'admittime',
    ),
    (
        './data/long_format/labs/preproc_labs_units_cleaned.csv.gz',
        './data/long_format/labs/norm_labs_units_cleaned.csv.gz',
        'valuenum',
        'charttime',
    ),
    (
        './data/long_format/meds/preproc_med_nonproprietaryname.csv.gz',
        './data/long_format/meds/norm_med_nonproprietaryname.csv.gz',
        'nonproprietaryname',
        'starttime',
    ),
    (
        './data/long_format/proc/preproc_proc_icd10.csv.gz',
        './data/long_format/proc/norm_proc_icd10.csv.gz',
        'icd_code',
        'chartdate',
    ),
]
paths, file_names, target_cols, time_cols = map(list, zip(*module_specs))

long_format_modules(cohort, paths, file_names, target_cols, time_cols)

### Normalizing

In [5]:
# Save gzipped pickle files as csv.
# The directory tree also holds ``.csv.gz`` files (including the ones this cell
# writes), which are not pickles; skipping everything that does not end in
# ``.gzip`` keeps ``read_pickle`` from failing on them and makes the cell
# safe to re-run.
for dirpath, subdirs, files in os.walk("./data/long_format"):
    for x in files:
        if not x.endswith(".gzip"):
            continue
        path = os.path.join(dirpath, x)
        # Strip the 5-character ".gzip" suffix before appending ".csv.gz".
        pd.read_pickle(path, compression='gzip').to_csv(path[:-5] + ".csv.gz", compression='gzip', index=False)

In [20]:
# Re-zero every subject's timeline within each file: subtract the subject's
# smallest ``timedelta`` so their earliest row in that file becomes 0, then
# write the result next to the input with a "_norm" suffix.
for norm_path in file_names:
    events = pd.read_csv(norm_path, compression='gzip', header=0, index_col=None)
    first_offset = events.groupby('subject_id')['timedelta'].transform('min')
    events['timedelta'] = events['timedelta'] - first_offset

    # name[:-7] drops the 7-character ".csv.gz" extension.
    out_path = norm_path[:-7] + "_norm.csv.gz"
    events.to_csv(out_path, compression='gzip', index=False)

In [13]:
# Spot check: load one of the written diagnosis files back and display it.
test = pd.read_csv("./data/long_format/diag/norm_diag_icd10_roots.csv.gz", compression='gzip', header=0, index_col=None)
test


Unnamed: 0,subject_id,hadm_id,root,timedelta
0,15734973,20475282,D58,0
1,15734973,20012083,O44,1798
2,15734973,20012083,D58,1798
3,11442057,21518990,O61,22
4,11442057,21518990,O10,22
...,...,...,...,...
2800973,18104115,23751828,L02,2469
2800974,18104115,23751828,Z21,2469
2800975,18104115,23751828,Z23,2469
2800976,18104115,23751828,E78,2469


In [38]:
def test_size(path:str, cohort):
    subset_norm = set()
    subset_base = set()
    for dirpath, subdirs, files in os.walk(path):
        for x in files:
            f_path = os.path.join(dirpath, x)
            # print(f_path)
            if "norm_" in f_path:
                print("\tCurrent dataset: ", f_path)
                norm = pd.read_csv(f_path, compression='gzip', header=0, index_col=None)
                base = pd.read_csv(f_path.replace("norm_", "preproc_"), compression='gzip', header=0, index_col=None)

                # Test case 1; ensure shapes between original and normalized data haven't changed
                assert norm.shape == base.shape, f"Error; norm.shape should be {base.shape}, was {norm.shape}"
                print("\t\tShapes are identical")

                # Test case 2; each unique subject_id should have a timedelta row of 0 for their minimum
                # present recording
                subset_norm.update(norm.loc[norm.timedelta == 0].subject_id.unique())
                subset_base.update(base.subject_id.unique())
    return subset_norm, subset_base

def test_datasets(base_path, directories, cohort):
    """Run ``test_size`` on several sub-directories and report id coverage.

    Parameters
    ----------
    base_path : str
        Prefix that is string-concatenated with each entry of ``directories``.
    directories : iterable of str
        Sub-directory names to check.
    cohort : pandas.DataFrame
        Must provide ``subject_id``; its number of distinct ids is the baseline
        for the two printed counts.

    Returns
    -------
    None
        Prints how many cohort ids lack a zero-offset row and how many are
        absent from every checked dataset.
    """
    total_ids = cohort.subject_id.nunique()
    norm_ids, preproc_ids = set(), set()

    for module in directories:
        print("NOW TESTING: ", module.upper())
        zeroed, present = test_size(base_path + module, cohort)
        norm_ids.update(zeroed)
        preproc_ids.update(present)

    # assert len(norm_ids) == len(preproc_ids), f"Error; only {len(norm_ids)} unique ids in norms, should be {len(preproc_ids)}"
    # print("Identical number of unique IDs in normalized and preprocessed datasets")
    print(f"{total_ids - len(norm_ids)} IDs do not have a timedelta_0 in any of the 4 hosp datasets, but are present in patients and admissions")
    print(f"{total_ids - len(preproc_ids)} IDs are not present in any of the 4 hosp datasets, but are present in patients and admissions")

# Run the checks over the four module directories under ./data/long_format/.
test_datasets("./data/long_format/", ["diag", "meds", "labs", "proc"], cohort)

NOW TESTING:  DIAG
	Current dataset:  ./data/long_format/diag\norm_diag_icd10_roots.csv.gz
		Shapes are identical
NOW TESTING:  MEDS
	Current dataset:  ./data/long_format/meds\norm_med_nonproprietaryname.csv.gz
		Shapes are identical
NOW TESTING:  LABS
	Current dataset:  ./data/long_format/labs\norm_labs_units_cleaned.csv.gz
		Shapes are identical
NOW TESTING:  PROC
	Current dataset:  ./data/long_format/proc\norm_proc_icd10.csv.gz
		Shapes are identical
30 IDs do not have a timedelta_0 in any of the 4 hosp datasets, but are present in patients and admissions
18 IDs are not present in any of the 4 hosp datasets, but are present in patients and admissions


In [39]:
# Number of rows in the cohort frame.
cohort.shape[0]

144048

In [57]:
# Count the distinct subjects in the preprocessed diagnosis file.
# The path uses forward slashes only: the previous "diag\preproc" contained the
# invalid escape sequence "\p" and a separator that only resolves on Windows.
pd.read_csv('./data/long_format/diag/preproc_diag_icd10.csv.gz', compression='gzip', header=0, index_col=None).subject_id.nunique()

142558