In [4]:
import pandas as pd
import os

# Registry of loaded tables: table name -> DataFrame.
# Filled in by the loader below and read by the later profiling cells.
dfs = dict()

# Locations of the two MIMIC demo table folders, relative to this notebook.
# The trailing slash is kept as-is in case other cells concatenate file names.
icu_path = "../data/mimic_demo/icu/"
hosp_path = "../data/mimic_demo/hosp/"

def load_and_preview_csvs(directory, dataset_type, target=None):
    """Load every ``.csv`` file in ``directory`` and print a short preview of each.

    Each file is read with ``pd.read_csv(..., low_memory=False)`` and stored
    under its file name minus the final extension (``admissions.csv`` ->
    ``"admissions"``). A file that cannot be read is reported on stdout and
    skipped, so one bad file does not stop the remaining ones from loading.

    Args:
        directory: Folder to scan for CSV files (not searched recursively).
        dataset_type: Label shown, upper-cased, in the progress message.
        target: Dict that receives the loaded DataFrames. When omitted, the
            notebook-level ``dfs`` registry is used, as before.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            listed (for example, it does not exist).
    """
    if target is None:
        target = dfs

    # os.listdir returns entries in arbitrary order; sorting keeps the load
    # order (and the printed output) reproducible across machines and runs.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(directory, file)
        try:
            # Only the read is guarded, so that a failure while printing the
            # preview is not misreported as a loading error.
            df = pd.read_csv(file_path, low_memory=False)
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")
            continue

        # Strip only the trailing extension; str.replace would also remove a
        # ".csv" that appears in the middle of the file name.
        table_name = os.path.splitext(file)[0]
        target[table_name] = df  # Store DataFrame

        print(f"\n📂 {dataset_type.upper()} - {file} loaded successfully! Shape: {df.shape}")
        print(df.head(2))  # Display first 2 rows

# Run the loader once per source folder, announcing each group first.
# Underscore-prefixed loop names avoid clobbering notebook variables.
for _banner, _directory, _label in (
    ("\n📂 Loading HOSP datasets...\n", hosp_path, "hosp"),
    ("\n📂 Loading ICU datasets...\n", icu_path, "icu"),
):
    print(_banner)
    load_and_preview_csvs(_directory, _label)

print("\n✅ All datasets loaded!")



📂 Loading HOSP datasets...


📂 HOSP - admissions.csv loaded successfully! Shape: (275, 16)
   subject_id   hadm_id            admittime            dischtime deathtime  \
0    10004235  24181354  2196-02-24 14:38:00  2196-03-04 14:02:00       NaN   
1    10009628  25926192  2153-09-17 17:08:00  2153-09-25 13:20:00       NaN   

  admission_type admit_provider_id      admission_location  \
0         URGENT            P03YMR  TRANSFER FROM HOSPITAL   
1         URGENT            P41R5N  TRANSFER FROM HOSPITAL   

         discharge_location insurance language marital_status  \
0  SKILLED NURSING FACILITY  Medicaid  ENGLISH         SINGLE   
1          HOME HEALTH CARE  Medicaid        ?        MARRIED   

                             race            edregtime            edouttime  \
0              BLACK/CAPE VERDEAN  2196-02-24 12:15:00  2196-02-24 17:07:00   
1  HISPANIC/LATINO - PUERTO RICAN                  NaN                  NaN   

   hospital_expire_flag  
0                     0

In [5]:
def summarize_dataframe(name, df):
    """Print a quick profile of one DataFrame.

    The report has four parts, in this order: a header with the upper-cased
    table name and the frame's shape, the dtype of every column, the null
    count for each column that has at least one null, and the number of
    distinct values in each of the first five columns. A 50-character rule
    closes the report. Nothing is returned.

    Args:
        name: Table name shown in the header.
        df: DataFrame to profile.
    """
    header = f"\n📂 **{name.upper()}** - Shape: {df.shape}"
    print(header)

    # Column dtypes
    print("\n📝 Data Types:")
    print(df.dtypes)

    # Null counts, restricted to columns that actually contain nulls
    print("\n🔍 Missing Values:")
    null_counts = df.isna().sum()
    has_nulls = null_counts.gt(0)
    print(null_counts.loc[has_nulls])

    # Cardinality of the leading five columns only
    print("\n🔹 Unique Values (first 5 columns):")
    leading_columns = df.columns[:5]
    for column in leading_columns:
        distinct = df[column].nunique()
        print(f"  {column}: {distinct} unique values")

    print("=" * 50)

# Profile every table held in the registry, in insertion order.
# The loop names `name` and `df` are kept because they stay bound in the
# notebook namespace after the loop finishes.
for name in dfs:
    df = dfs[name]
    summarize_dataframe(name, df)

print("\n✅ Data profiling completed!")


📂 **ADMISSIONS** - Shape: (275, 16)

📝 Data Types:
subject_id               int64
hadm_id                  int64
admittime               object
dischtime               object
deathtime               object
admission_type          object
admit_provider_id       object
admission_location      object
discharge_location      object
insurance               object
language                object
marital_status          object
race                    object
edregtime               object
edouttime               object
hospital_expire_flag     int64
dtype: object

🔍 Missing Values:
deathtime             260
discharge_location     42
marital_status         12
edregtime              93
edouttime              93
dtype: int64

🔹 Unique Values (first 5 columns):
  subject_id: 100 unique values
  hadm_id: 275 unique values
  admittime: 275 unique values
  dischtime: 275 unique values
  deathtime: 15 unique values

📂 **DIAGNOSES_ICD** - Shape: (4506, 5)

📝 Data Types:
subject_id      int64
hadm_id   

In [None]:
# Print the column index of each table so the expected fields can be checked by eye.
# A missing table raises KeyError at the corresponding lookup.
for _table in ("icustays", "admissions", "patients"):
    print(dfs[_table].columns)


Patients who received vasopressors: 0
