In [415]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.impute import KNNImputer
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine "assignment to a copy" mistakes; prefer explicit
# .copy() / .loc in the code below and consider removing this switch once that is done.
pd.options.mode.chained_assignment = None

In [416]:
# Load the source tables. All paths are relative to the notebook's working directory.
baseline_df = pd.read_csv("data/data_by_table/baseline.csv")
vitalsign_df = pd.read_csv("data/data_by_table/mimiciv_derived_vitalsign_new.csv")
labevents_df = pd.read_csv("data/data_by_table/mimiciv_hosp_labevents.csv")
cohort_subject_id_stay_id_df = pd.read_csv("data/data_by_table/cohort_subject_id_stay_id.csv")
ground_truth_df = pd.read_csv("data/data_by_table/ground_truth.csv")
ventilator_setting_df = pd.read_csv("data/paper_data/ven_setting.csv")
GCS_df = pd.read_csv("data/paper_data/GCS.csv")
anion_df = pd.read_csv("data/paper_data/Anion.csv")
urine_df = pd.read_csv("data/paper_data/urine_output.csv")

# label_df is the same table as ground_truth_df; take an independent copy
# instead of parsing ground_truth.csv a second time.
label_df = ground_truth_df.copy()

# Turn the '___' placeholder into a proper missing value. Reassigning (rather
# than inplace=True) keeps every statement a plain "new value from old value".
# NOTE(review): ground_truth_df, GCS_df, anion_df and urine_df are not cleaned
# here, exactly as before -- confirm they never contain the placeholder.
baseline_df = baseline_df.replace('___', pd.NA)
vitalsign_df = vitalsign_df.replace('___', pd.NA)
labevents_df = labevents_df.replace('___', pd.NA)
cohort_subject_id_stay_id_df = cohort_subject_id_stay_id_df.replace('___', pd.NA)
ventilator_setting_df = ventilator_setting_df.replace('___', pd.NA)

In [417]:
# Quick look at the label column of the ground-truth table.
print(label_df.loc[:, 'label'])

0       1
1       1
2       0
3       1
4       1
       ..
2835    1
2836    1
2837    1
2838    1
2839    1
Name: label, Length: 2840, dtype: int64


In [418]:
def generate_df_template(stay_id, endtime, hours=24):
    """Build the hourly time grid that ends at ``endtime`` for one stay.

    Parameters
    ----------
    stay_id : scalar
        Identifier written into every row of the ``stay_id`` column.
    endtime : str or datetime-like
        End of the window. It is floored to the whole hour before use.
    hours : int, default 24
        Number of hourly rows to generate, counting backwards from the floored
        end time (the end hour itself is the first row).

    Returns
    -------
    pandas.DataFrame
        ``hours`` rows with the columns ``charttime`` (newest hour first) and
        ``stay_id``.
    """
    # Lower-case 'h' is the non-deprecated spelling of the hourly alias.
    end_hour = pd.to_datetime(endtime).floor('h')
    hourly_grid = [end_hour - timedelta(hours=offset) for offset in range(hours)]
    template = pd.DataFrame(hourly_grid, columns=['charttime'])
    template['stay_id'] = stay_id
    return template

In [419]:
def generate_all_template(ground_truth_df):
    """Stack the per-stay hourly templates of every row in ``ground_truth_df``.

    Parameters
    ----------
    ground_truth_df : pandas.DataFrame
        Must provide the columns ``stay_id`` and ``endtime``; one template is
        generated per row via ``generate_df_template``.

    Returns
    -------
    pandas.DataFrame
        All templates concatenated in row order. The per-template index is kept
        (``ignore_index=False``), so index values repeat across stays. An empty
        input yields an empty DataFrame.
    """
    # Collect first and concatenate once: concatenating inside the loop copies
    # the growing frame on every iteration.
    per_stay_templates = [
        generate_df_template(row['stay_id'], row['endtime'])
        for _, row in ground_truth_df.iterrows()
    ]
    if not per_stay_templates:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(per_stay_templates, ignore_index=False)

In [420]:
def fill_na(df):
    """Resample every stay to an hourly grid and fill the gaps within the stay.

    Steps: parse ``charttime``, take the per-hour maximum of every column for
    each ``stay_id``, then forward-fill and afterwards backward-fill each value
    column inside its own stay. ``subject_id`` and ``hadm_id`` are removed from
    the result when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``stay_id`` and ``charttime``. The frame passed in is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``stay_id``, ``charttime`` and the remaining value columns, one
        row per stay and hour between the stay's first and last observation.
    """
    # assign() builds a new frame, so the caller's table is not modified.
    hourly_input = (
        df.assign(charttime=pd.to_datetime(df['charttime']))
        .drop(columns=['subject_id', 'hadm_id'], errors='ignore')
        .sort_values(by=['stay_id', 'charttime'])
        .set_index('charttime')
    )
    value_cols = [col for col in hourly_input.columns if col != 'stay_id']

    # Lower-case 'h' is the non-deprecated spelling of the hourly alias.
    resampled = hourly_input.groupby('stay_id').resample('h').max()
    # Depending on the pandas version the grouping column may or may not be
    # part of the aggregated columns; it is always present as an index level,
    # so drop the column copy if it exists.
    resampled = resampled.drop(columns='stay_id', errors='ignore')

    if value_cols:
        # Fill only inside a stay: forward first, then backward for leading gaps.
        resampled[value_cols] = resampled.groupby(level='stay_id')[value_cols].ffill()
        resampled[value_cols] = resampled.groupby(level='stay_id')[value_cols].bfill()

    return resampled.reset_index()

In [421]:
def fill_and_merge(data_all, df_target, ground_truth_df):
    """Align one time-series table with the hourly template rows of ``data_all``.

    Parameters
    ----------
    data_all : pandas.DataFrame
        Frame with ``stay_id`` and ``charttime`` columns; only its
        (stay_id, charttime) pairs survive the final inner merge.
    df_target : pandas.DataFrame
        Raw measurements with ``stay_id`` and ``charttime`` columns.
    ground_truth_df : pandas.DataFrame
        Passed to ``generate_all_template`` to build the hourly template.

    Returns
    -------
    pandas.DataFrame
        ``data_all`` inner-merged on ``stay_id`` / ``charttime`` with the
        hourly, gap-filled version of ``df_target``.
    """
    template = generate_all_template(ground_truth_df)
    filled_target = fill_na(df_target)

    # The outer merge adds the template hours that lie outside the observed
    # range of the table, so they become rows that can be filled.
    combined = pd.merge(filled_target, template, how='outer', on=['stay_id', 'charttime'])

    # The second pass (hourly resample + per-stay forward/backward fill) is
    # exactly what fill_na does, so reuse it instead of repeating the code.
    combined_filled = fill_na(combined)

    return pd.merge(data_all, combined_filled, how='inner', on=['stay_id', 'charttime'])
    

In [422]:
def check_missing_values(df):
    """Return 1 when ``df`` holds at least one missing value, otherwise 0."""
    any_missing = df.isna().to_numpy().any()
    return 1 if any_missing else 0
def output_id_missing(flag_data_df,data_df):
    """List the stay_ids of ``flag_data_df`` whose rows in ``data_df`` contain a missing value.

    Parameters
    ----------
    flag_data_df : pandas.DataFrame
        Provides the ``stay_id`` values to check; order and duplicates are kept.
    data_df : pandas.DataFrame
        Data table with a ``stay_id`` column. A stay counts as incomplete when
        any cell of any of its rows is missing.

    Returns
    -------
    list
        The matching stay_ids. Stays that do not occur in ``data_df`` are not
        reported.
    """
    # One pass over data_df instead of re-filtering the whole table per stay.
    rows_with_na = data_df.isna().any(axis=1)
    stay_ids_with_na = set(data_df.loc[rows_with_na, 'stay_id'].dropna())
    return [stay_id for stay_id in flag_data_df['stay_id'] if stay_id in stay_ids_with_na]

def output_id_not_missing(flag_data_df,data_df):
    """List the stay_ids of ``flag_data_df`` whose rows in ``data_df`` have no missing value.

    Parameters
    ----------
    flag_data_df : pandas.DataFrame
        Provides the ``stay_id`` values to check; order and duplicates are kept.
    data_df : pandas.DataFrame
        Data table with a ``stay_id`` column. A stay counts as complete when
        none of its rows contains a missing cell.

    Returns
    -------
    list
        The matching stay_ids. Stays that do not occur in ``data_df`` at all
        are reported as well, because they have no row with a missing value.
    """
    # One pass over data_df instead of re-filtering the whole table per stay.
    rows_with_na = data_df.isna().any(axis=1)
    stay_ids_with_na = set(data_df.loc[rows_with_na, 'stay_id'].dropna())
    return [stay_id for stay_id in flag_data_df['stay_id'] if stay_id not in stay_ids_with_na]

In [423]:
def add_label_id(df,stay_id, flag, r_v, dod, missing_value=-1000):
    """Return the rows of one stay with ``label``, ``Rev_h`` and ``dod_h`` columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``stay_id`` and ``charttime`` columns. It is not modified.
    stay_id : scalar
        Stay whose rows are selected.
    flag : scalar
        Value written to the ``label`` column.
    r_v : scalar
        Value for the ``Rev_h`` column; ``missing_value`` is used when it is
        missing (NaN, None or pd.NA).
    dod : scalar
        Value for the ``dod_h`` column; ``missing_value`` is used when it is
        missing (NaN, None or pd.NA).
    missing_value : scalar, default -1000
        Sentinel stored in place of a missing ``r_v`` / ``dod``.

    Returns
    -------
    pandas.DataFrame
        The selected rows sorted by ``stay_id`` and ``charttime``.
    """
    # Explicit copy: the new columns must be written to an independent frame,
    # not to a filtered slice of df.
    selected = df[df['stay_id'] == stay_id].copy()
    selected['label'] = flag
    # pd.isna also accepts None and pd.NA, on which np.isnan raises TypeError.
    selected['Rev_h'] = missing_value if pd.isna(r_v) else r_v
    selected['dod_h'] = missing_value if pd.isna(dod) else dod
    return selected.sort_values(by=['stay_id', 'charttime'])

In [424]:
def add_label(df, df_label):
    """Attach label information to ``df`` for every stay listed in ``df_label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with ``stay_id`` and ``charttime`` columns.
    df_label : pandas.DataFrame
        One row per stay with the columns ``stay_id``, ``label``,
        ``re_vent_time_diff`` and ``weaning_till_dod_hr``; each row is handed
        to ``add_label_id``.

    Returns
    -------
    pandas.DataFrame
        The labelled per-stay frames concatenated in ``df_label`` row order,
        keeping their original index values. An empty ``df_label`` yields an
        empty DataFrame.
    """
    # Collect first and concatenate once: concatenating inside the loop copies
    # the growing frame on every iteration.
    labelled_parts = [
        add_label_id(
            df,
            row['stay_id'],
            row['label'],
            row['re_vent_time_diff'],
            row['weaning_till_dod_hr'],
        )
        for _, row in df_label.iterrows()
    ]
    if not labelled_parts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(labelled_parts, ignore_index=False)

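### Align the time-series tables (ventilator_setting_df, labevents_df, vitalsign_df, GCS_df, anion_df) to the 24-hour template; baseline_df is imputed separately below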

In [425]:
# Build the hourly template once, then align every time-series table to it.
data_template = generate_all_template(ground_truth_df)


def _align_to_template(table_df, result_name):
    """Run fill_and_merge for one table and print its progress line."""
    aligned = fill_and_merge(data_template, table_df, ground_truth_df)
    print(f"{result_name} finish")
    return aligned


ventilator_setting_df_24 = _align_to_template(ventilator_setting_df, "ventilator_setting_df_24")
labevents_df_24 = _align_to_template(labevents_df, "labevents_df_24")
vitalsign_df_24 = _align_to_template(vitalsign_df, "vitalsign_df_24")
GCS_df_24 = _align_to_template(GCS_df, "GCS_df_24")
anion_df_24 = _align_to_template(anion_df, "anion_df_24")

  df_resampled = df.groupby('stay_id').resample('H').max()


In [None]:
# Impute missing height / weight values.
# NOTE: the imputed values are written into baseline_df itself, because the
# merge cell further down reads baseline_df (not baseline_df_p).
imputer = KNNImputer(n_neighbors=2)
body_cols = ['height_cm', 'weight_kg']
baseline_df[body_cols] = imputer.fit_transform(baseline_df[body_cols])

# Fallback for anything still missing: the mean of the same gender.
# The result is assigned back to the column; calling fillna(inplace=True) on a
# column selection is not guaranteed to update the frame.
for body_col in body_cols:
    gender_mean = baseline_df.groupby('gender')[body_col].transform('mean')
    baseline_df[body_col] = baseline_df[body_col].fillna(gender_mean)

# Identifier-free copy of the imputed baseline table.
baseline_df_p = baseline_df.drop(columns=['subject_id', 'hadm_id'])

In [None]:
# Join the hourly tables on (stay_id, charttime), one merge per table.
# labevents_df_24 is merged exactly once: merging it a second time adds no
# information and only duplicates every lab column under _x / _y suffixes.
# NOTE(review): consumers that relied on the suffixed column names need updating.
merged_df = pd.merge(labevents_df_24, vitalsign_df_24, on=['stay_id', 'charttime'], how='inner')
merged_df = pd.merge(merged_df, ventilator_setting_df_24, on=['stay_id', 'charttime'], how='inner')
merged_df = pd.merge(merged_df, GCS_df_24, on=['stay_id', 'charttime'], how='inner')
#merged_df = pd.merge(merged_df, anion_df_24, on=['stay_id', 'charttime'], how='inner')

# Static per-stay columns; baseline_df carries the imputed height / weight
# written by the imputation cell above.
merged_df = pd.merge(merged_df, baseline_df, on=['stay_id'], how='inner')
merged_df = merged_df.drop(columns=['subject_id', 'hadm_id', 'charttime_1', 'ventilator_type'])

final_data = add_label(merged_df, ground_truth_df)

In [None]:
# Keep only the label and its stay_id key, then attach the label to every
# row of merged_df that belongs to a labelled stay.
label_df_new = label_df[['label', 'stay_id']].copy()
merged_df = merged_df.merge(label_df_new, how='inner', on=['stay_id'])

In [None]:
# Write the labelled table to disk (the DataFrame index is written as the first column).
output_csv_path = "./data/data_by_table/pre_24h_data_v5.csv"
final_data.to_csv(output_csv_path)