# MIMIC-III ICU Mortality Analysis

## Introduction
This notebook performs an exploratory data analysis (EDA) on the MIMIC-III Clinical Database Demo to identify factors influencing ICU mortality and length of stay.

## Data Loading and Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set up paths (relative paths, resolved against the current working directory)
DATA_DIR = '../data'
RAW_DATA_DIR = '../../mimic-iii-clinical-database-demo-1.4'
IMAGES_DIR = '../report/images'

# Create the image output directory if needed. exist_ok=True makes this
# idempotent and avoids the race between a separate existence check and the
# creation call.
os.makedirs(IMAGES_DIR, exist_ok=True)

# Load processed data
df = pd.read_csv(os.path.join(DATA_DIR, 'processed_admissions.csv'))

# Display first few rows (rendered as the cell output in a notebook)
df.head()

## Exploratory Data Analysis

### 1. Age Distribution

In [None]:
# Histogram of the non-missing `age` values with a KDE curve overlaid,
# drawn on an explicitly created Axes.
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['age'].dropna(),
    bins=20,
    kde=True,
    color='skyblue',
    ax=age_ax,
)
age_ax.set_title('Age Distribution of Patients')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

### 2. ICU Length of Stay (LOS) Distribution

In [None]:
# Histogram (plus KDE curve) of `total_icu_los`, ignoring missing values.
los_values = df['total_icu_los'].dropna()
los_fig, los_ax = plt.subplots(figsize=(10, 6))
sns.histplot(los_values, bins=20, kde=True, color='green', ax=los_ax)
los_ax.set(
    title='ICU Stay Duration Distribution',
    xlabel='Length of Stay (Days)',
    ylabel='Frequency',
)
plt.show()

### 3. Top Diagnoses

In [None]:
# Ten most frequent values of the `diagnosis` column, most common first.
top_diag = df['diagnosis'].value_counts().head(10)
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 (it emits a
# FutureWarning and is announced for removal). Mapping `hue` to the same
# categories keeps one colour per bar; dodge=False keeps each bar at full
# width on its own row instead of splitting the row between hue levels.
diag_ax = sns.barplot(
    x=top_diag.values,
    y=top_diag.index,
    hue=top_diag.index,
    palette='viridis',
    dodge=False,
)
# Some seaborn versions draw a legend for `hue`; it would only repeat the
# y-axis labels, so remove it when present.
diag_legend = diag_ax.get_legend()
if diag_legend is not None:
    diag_legend.remove()
plt.title('Top 10 Most Common Admitting Diagnoses')
plt.xlabel('Count')
plt.ylabel('Diagnosis')
plt.show()

### 4. Correlation Analysis

In [None]:
# Restrict the frame to the three columns whose pairwise correlation is shown.
corr_columns = ['age', 'total_icu_los', 'hospital_expire_flag']
numeric_df = df[corr_columns]
corr_matrix = numeric_df.corr()

# Annotated heatmap; vmin/vmax pin the colour scale to the full [-1, 1] range
# so the colours are comparable regardless of the values actually observed.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Heatmap')
plt.show()

## Lab Values Analysis

In [None]:
# Load raw lab events and plot the value distributions of two lab tests
# side by side. If the raw file is missing, a message is printed instead.
lab_path = os.path.join(RAW_DATA_DIR, 'LABEVENTS.csv')
try:
    # Attempt the read directly rather than checking for the file first:
    # this avoids the gap between the check and the open.
    labs = pd.read_csv(lab_path)
except FileNotFoundError:
    print("LABEVENTS.csv not found.")
else:
    # Glucose (50931, 50809) and Creatinine (50912)
    glucose_df = labs[labs['itemid'].isin([50931, 50809])]
    creatinine_df = labs[labs['itemid'].isin([50912])]

    fig, ax = plt.subplots(1, 2, figsize=(15, 6))

    # One entry per panel: (rows, colour, title, x-axis label, visible x-range).
    lab_panels = [
        (glucose_df, 'orange', 'Glucose Level Distribution',
         'Glucose (mg/dL)', (0, 400)),
        (creatinine_df, 'purple', 'Creatinine Level Distribution',
         'Creatinine (mg/dL)', (0, 10)),
    ]
    for lab_ax, (lab_rows, lab_color, lab_title, lab_xlabel, lab_xlim) in zip(ax, lab_panels):
        # Rows without a numeric value are dropped before plotting.
        sns.histplot(lab_rows['valuenum'].dropna(), bins=30, kde=True,
                     color=lab_color, ax=lab_ax)
        lab_ax.set_title(lab_title)
        lab_ax.set_xlabel(lab_xlabel)
        # Only the view is clipped; bins are still computed over all values.
        lab_ax.set_xlim(*lab_xlim)

    plt.show()