# read subject csv

In [3]:
import os, sys, yaml
from tqdm import tqdm_notebook as tqdm

from utils.preprocessing import *
from utils.read_csv import *
from utils.episode_custom import *
# Load the pipeline configuration. The context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
# NOTE(review): FullLoader is kept to preserve the existing parsing behaviour;
# if config.yaml only holds plain scalars/lists/mappings, yaml.safe_load is
# the stricter choice.
with open("./config.yaml", "r") as config_file:
    cfg = yaml.load(config_file, Loader=yaml.FullLoader)

# Paths and table names read from the configuration.
output_path = cfg["output_path"]
output_subject_path = cfg["output_subject_path"]
itemids_file = cfg["itemids_file"]
event_tables = cfg["event_tables"]
resources_path = cfg["resources"]
# read all variables in events
variables = read_variables(resources_path)

# extract episodes from subjects

In [4]:
def episode_split(subject_dir, variables):
    """Split one subject's data into per-stay episode CSV files.

    Reads the subject's stays, diagnoses, procedures and events from the
    subject directory, then for the k-th row (1-based) of the stays table
    writes two files into that same directory:

    * ``episode{k}.csv`` -- the rows of the assembled episodic data whose
      index equals the stay's ICUSTAY_ID (index label ``Icustay``).
    * ``episode{k}_timeseries.csv`` -- the stay's events, indexed and sorted
      by the ``HOURS`` column, with the remaining columns sorted by name.

    Stays without any events are skipped; their episode number is not reused.

    Args:
        subject_dir: Subject identifier. Must be convertible with ``int()``;
            its string form is the directory name under
            ``output_subject_path``.
        variables: Forwarded to ``convert_events_to_timeseries`` as its
            ``variables`` keyword argument.

    Returns:
        None. Problems (invalid id, missing directory, no events) are
        reported on stderr and the function returns early.
    """
    # Validate the id before touching the filesystem. The message uses the
    # raw argument because no parsed id exists when the conversion fails.
    try:
        subject_id = int(subject_dir)
    except (TypeError, ValueError):
        sys.stderr.write('No dir for subject: {}\n'.format(subject_dir))
        return

    dn = os.path.join(output_subject_path, str(subject_dir))
    print(dn)
    if not os.path.isdir(dn):
        sys.stderr.write('No dir for subject: {}\n'.format(subject_id))
        return

    print("reading tables...")
    # read tables of this subject
    stays = read_stays(dn)
    diagnoses = read_diagnoses(dn, resources_path)
    procedures = read_procedures(dn, resources_path)
    # map itemids to variables in event
    events = read_events(dn, resources_path)

    print("reading static data...")
    episodic_data = assemble_episodic_data(stays, diagnoses, procedures)

    # cleaning and converting to time series
    if events.shape[0] == 0:
        # no valid events for this subject
        sys.stderr.write('No valid events for this subject: {}\n'.format(subject_id))
        return
    print("reading timeseries data...")
    timeseries = convert_events_to_timeseries(events, variables=variables)

    # extracting separate episodes
    print("extracting separate episodes")
    for i in range(stays.shape[0]):
        stay_id = stays.ICUSTAY_ID.iloc[i]
        intime = stays.INTIME.iloc[i]
        outtime = stays.OUTTIME.iloc[i]
        # File names are numbered by the stay's position in the stays table.
        episode_number = i + 1

        episode = get_events_for_stay(timeseries, stay_id, intime, outtime)
        if episode.shape[0] == 0:
            # no data for this episode
            continue

        # presumably HOURS is the time elapsed since intime -- confirm
        # against add_hours_elpased_to_events
        episode = add_hours_elpased_to_events(episode, intime)
        episode = episode.set_index('HOURS').sort_index(axis=0)

        # Static (per-stay) data: only the rows indexed by this stay id.
        static_rows = episodic_data.loc[episodic_data.index == stay_id]
        static_rows.to_csv(
            os.path.join(dn, 'episode{}.csv'.format(episode_number)),
            index_label='Icustay')

        # Sort columns by name; a column literally named "HOURS" would sort
        # first (HOURS is already the index at this point).
        columns_sorted = sorted(
            episode.columns, key=(lambda x: "" if x == "HOURS" else x))
        episode = episode[columns_sorted]
        episode.to_csv(
            os.path.join(dn, 'episode{}_timeseries.csv'.format(episode_number)),
            index_label='HOURS')
# Run the split for a single subject (id 46600) using the loaded variables.
episode_split(subject_dir=46600, variables=variables)

/data2/lzylzy/mimic/subjects/46600
reading tables...
reading static data...
reading timeseries data...
extracting separate episodes
