## Libraries

In [1]:
import pandas as pd
import numpy as np
import os
# Change the working directory to the parent of the directory the kernel
# started in, so the relative 'datapre/...' paths used below resolve.
# The starting directory is recorded only once per kernel session: re-running
# this cell would otherwise climb one more level on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.path.abspath(os.getcwd())
current_dir = NOTEBOOK_DIR

parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
os.chdir(parent_dir)

## Load data

In [2]:
# Read the six input CSVs in one pass. Odd-numbered frames come from the eICU
# files and even-numbered frames from the MIMIC files; each consecutive pair
# shares the same window suffix (24_48, 12_36, 48_72).
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datapre/eicu/sinflag_eICU_24_48.csv',
        'datapre/mimic/sinflag_mimic_24_48.csv',
        'datapre/eicu/sinflag_eICU_12_36.csv',
        'datapre/mimic/sinflag_mimic_12_36.csv',
        'datapre/eicu/sinflag_eICU_48_72.csv',
        'datapre/mimic/sinflag_mimic_48_72.csv',
    )
)

## Difference of columns between databases

In [3]:
# Columns that appear in only one of df1 / df2.
# symmetric_difference reports extras on either side; the one-directional
# df2.columns.difference(df1.columns) would miss columns found only in df1.
columns_diff = df2.columns.symmetric_difference(df1.columns)
print(columns_diff)

Index(['aniongap_max', 'aniongap_min', 'milrinone'], dtype='object')


In [4]:
# Columns that appear in only one of df3 / df4.
# symmetric_difference reports extras on either side; the one-directional
# df4.columns.difference(df3.columns) would miss columns found only in df3.
columns_diff2 = df4.columns.symmetric_difference(df3.columns)
print(columns_diff2)

Index(['aniongap_max', 'aniongap_min', 'milrinone'], dtype='object')


In [5]:
# Columns that appear in only one of df5 / df6.
# symmetric_difference reports extras on either side; the one-directional
# df6.columns.difference(df5.columns) would miss columns found only in df5.
columns_diff3 = df6.columns.symmetric_difference(df5.columns)
print(columns_diff3)

Index(['aniongap_max', 'aniongap_min'], dtype='object')


In [6]:
# Merge the three per-pair differences into a single set of unique column names
all_diffs = {*columns_diff, *columns_diff2, *columns_diff3}
print(all_diffs)

{'aniongap_min', 'aniongap_max', 'milrinone'}


## Elimination of columns that differ between databases

In [7]:
# Drop every column named in all_diffs from each of the six frames.
# errors='ignore' lets a frame that lacks some of those columns pass through
# without raising.
df1, df2, df3, df4, df5, df6 = (
    frame.drop(columns=all_diffs, errors='ignore')
    for frame in (df1, df2, df3, df4, df5, df6)
)




In [8]:
# Verify that all DataFrames expose the same set of columns
dataframes = [df1, df2, df3, df4, df5, df6]
column_sets = [set(df.columns) for df in dataframes]

# The first DataFrame is the reference every other one is compared against
reference_columns = column_sets[0]
columns_match = all(column_set == reference_columns for column_set in column_sets)

if columns_match:
    print("All DataFrames have the same columns.")
    print("Number of columns:", len(reference_columns))
    # Sorted so the listing is stable between kernel sessions (set order is not)
    print("List of columns:", sorted(reference_columns))
else:
    print("The DataFrames do not have the same columns.")
    for i, col_set in enumerate(column_sets):
        print(f"DataFrame {i+1} - Number of columns: {len(col_set)}")
        print(f"List of columns: {sorted(col_set)}")

    # Report mismatches in both directions: columns a DataFrame has that the
    # first lacks, and columns of the first that the DataFrame lacks. Checking
    # only one direction would hide a DataFrame that is merely missing columns.
    unique_columns = [col_set - reference_columns for col_set in column_sets[1:]]
    missing_columns = [reference_columns - col_set for col_set in column_sets[1:]]
    for i, (extra, missing) in enumerate(zip(unique_columns, missing_columns), start=2):
        if extra:
            print(f"Differences in DataFrame {i} compared to the first: {sorted(extra)}")
        if missing:
            print(f"Columns of the first DataFrame missing from DataFrame {i}: {sorted(missing)}")



All DataFrames have the same columns.
Number of columns: 68
List of columns: ['resp_rate_max', 'potassium_min', 'norepinephrine', 'mbp_min', 'gcs_min', 'resp_rate_min', 'creatinine_min', 'dbp_max', 'sbp_min', 'hematocrit_max', 'sex', 'bicarbonate_min', 'chloride_max', 'Meropenem', 'mbp_max', 'gcs_eyes', 'bun_max', 'bicarbonate_max', 'glucose_max', 'vasopressin', 'los_icu', 'urineoutput', 'Vancomycin', 'calcium_max', 'heart_rate_mean', 'sodium_min', 'hemoglobin_min', 'spo2_mean', 'hematocrit_min', 'platelet_max', 'sbp_mean', 'sodium_max', 'wbc_min', 'heart_rate_min', 'temperature_max', 'epinephrine', 'phenylephrine', 'heart_rate_max', 'sbp_max', 'gcs_motor', 'race', 'resp_rate_mean', 'gcs_verbal', 'Metronidazole', 'mbp_mean', 'temperature_min', 'age', 'platelet_min', 'calcium_min', 'potassium_max', 'glucose_min', 'dobutamine', 'wbc_max', 'Quinolone', 'Macrolide', 'bun_min', 'dopamine', 'spo2_max', 'Cefalosporine', 'chloride_min', 'temperature_mean', 'Penicillin', 'sofa', 'spo2_min', 'db

In [None]:
# Write each frame back to disk without the index column. The output file
# names start with 'ssinflag' while the inputs start with 'sinflag', so the
# files read earlier are not overwritten.
# NOTE(review): confirm the doubled 's' prefix is the intended naming.
output_targets = (
    (df1, 'datapre/eicu/ssinflag_eICU_24_48.csv'),
    (df2, 'datapre/mimic/ssinflag_mimic_24_48.csv'),
    (df3, 'datapre/eicu/ssinflag_eICU_12_36.csv'),
    (df4, 'datapre/mimic/ssinflag_mimic_12_36.csv'),
    (df5, 'datapre/eicu/ssinflag_eICU_48_72.csv'),
    (df6, 'datapre/mimic/ssinflag_mimic_48_72.csv'),
)
for frame, destination in output_targets:
    frame.to_csv(destination, index=False)

In [10]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line to the output.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3786 entries, 0 to 3785
Data columns (total 68 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               3786 non-null   int64  
 1   sex               3786 non-null   int64  
 2   race              3786 non-null   int64  
 3   los_icu           3786 non-null   float64
 4   temperature_max   3429 non-null   float64
 5   temperature_min   3429 non-null   float64
 6   temperature_mean  3429 non-null   float64
 7   spo2_max          3770 non-null   float64
 8   spo2_min          3770 non-null   float64
 9   spo2_mean         3770 non-null   float64
 10  heart_rate_max    3772 non-null   float64
 11  heart_rate_min    3772 non-null   float64
 12  heart_rate_mean   3772 non-null   float64
 13  resp_rate_max     3741 non-null   float64
 14  resp_rate_min     3741 non-null   float64
 15  resp_rate_mean    3741 non-null   float64
 16  sbp_max           3401 non-null   float64
