# Initialize

## Importing libraries

In [1]:
# Standard library imports
import os
import sys
from pathlib import Path
import pandas as pd
from google.oauth2 import service_account
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
from scipy import stats, ndimage, interpolate
from sklearn.preprocessing import StandardScaler
import google.auth
warnings.filterwarnings('ignore')  # Suppresses all warnings

# Make the project root importable so the `src.*` imports below resolve.
# The root is derived from the current working directory, so this assumes the
# kernel was started two directory levels below the project root.
notebook_path = Path.cwd()
project_root = notebook_path.parent.parent
# Re-running this cell must not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Local application imports
from src.mimicdf import MIMICDF
from src.preprocessing.data_preprocessor import DataPreprocessor

# Open the connection to the MIMIC-IV ED dataset hosted on GCP; every table
# accessor used below goes through this `mimicdf` object.
mimicdf = MIMICDF.create_connection()

# Alternative data source: swap the line above for the one below to work
# against the demo database instead (presumably a small local subset — confirm).
# mimicdf = MIMICDF.create_demo()

Successfully connected to MIMIC-IV ED dataset


In [6]:
# Load the source tables through the MIMICDF accessors. Each call logs
# "Table loaded: ..." progress lines while it runs.
edstays = mimicdf.edstays()
patients = mimicdf.patients()
age = mimicdf.age()
# Merged ED frame: per its log output it combines edstays, demographics, age,
# time features and triage features, then prints its shape and info().
ed_data = mimicdf.ed_data()

Table loaded: edstays
Table loaded: patients
Table loaded: patients
Loading edstays...
Table loaded: edstays
Loading demographics...
Table loaded: edstays
Table loaded: age
Loading age data...
Table loaded: patients
Calculating ED visit age...
Merging time features...
Table loaded: edstays
Merging triage features...
Table loaded: triage
Cleaning up columns...

 Dataframe shape: (425087, 18) 

Dataframe info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425087 entries, 0 to 425086
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   subject_id         425087 non-null  Int64  
 1   stay_id            425087 non-null  Int64  
 2   gender             425087 non-null  object 
 3   arrival_transport  425087 non-null  object 
 4   disposition        425087 non-null  object 
 5   race               425087 non-null  object 
 6   age_at_ed          425087 non-null  Int64  
 7   dow                425087 non

In [9]:
# Report (rows, columns) for the merged ED frame and for the age table.
for frame in (ed_data, age):
    print(frame.shape)

(425087, 18)
(299712, 6)


In [11]:
# Distinct subject_ids on each side. Note that the "patients" side is read
# from the `age` table, not from `patients`.
ed_subjects = set(edstays['subject_id'])
patient_subjects = set(age['subject_id'])

# Set sizes and their overlaps
total_ed, total_patients = len(ed_subjects), len(patient_subjects)
common_subjects = len(ed_subjects & patient_subjects)
only_in_ed = len(ed_subjects - patient_subjects)
only_in_patients = len(patient_subjects - ed_subjects)

# Summary report
print(f"Total unique subjects in ED data: {total_ed:,}")
print(f"Total unique subjects in patients data: {total_patients:,}")
print(f"Subjects in both tables: {common_subjects:,}")
print(f"Subjects only in ED data: {only_in_ed:,}")
print(f"Subjects only in patients data: {only_in_patients:,}")
print(f"\nOverlap percentage: {(common_subjects/total_ed)*100:.1f}% of ED subjects are in patients table")

Total unique subjects in ED data: 205,504
Total unique subjects in patients data: 299,712
Subjects in both tables: 205,504
Subjects only in ED data: 0
Subjects only in patients data: 94,208

Overlap percentage: 100.0% of ED subjects are in patients table


In [13]:
# Number of missing values in each column of the merged ED frame
ed_data.isna().sum()

subject_id               0
stay_id                  0
gender                   0
arrival_transport        0
disposition              0
race                     0
age_at_ed                0
dow                      0
hour                     0
los_minutes              0
temperature          23415
heartrate            17090
resprate             20353
o2sat                20596
sbp                  18291
dbp                  19091
pain                 28116
acuity                6987
dtype: int64

## Data Preprocessing

In [None]:
# Build the analysis frame `df` with the project's DataPreprocessor.
# What prepare_data() does is defined in src/preprocessing and is not visible
# in this notebook; every cell below starts from its result.
data_preprocessor = DataPreprocessor(mimicdf)
df = data_preprocessor.prepare_data()
df.head()


In [None]:
# Fraction of missing values per column (mean of the boolean NA mask)
missingness_ratio = df.isna().mean()
print('Missingness ratio:')
missingness_ratio


In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Indicator frame: 1 where the original value is missing, 0 otherwise
missing_matrix = df.isna().astype(int)

# --- Part 1: do columns tend to be missing together? ---
missing_corr = missing_matrix.corr()

plt.figure(figsize=(12, 10))
heatmap_kwargs = dict(cmap='RdBu_r', center=0, annot=True, fmt='.2f', square=True)
sns.heatmap(missing_corr, **heatmap_kwargs)
plt.title('Correlation of Missing Values')
plt.tight_layout()
plt.show()

# --- Part 2: is missingness related to observed values? ---
# For every vital sign, compare the mean age / acuity of rows where the vital
# is missing against rows where it was recorded.
vital_signs = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']
observed_vars = ['age_at_ed', 'acuity']

results = []
for vital in vital_signs:
    is_missing = df[vital].isna()
    for var in observed_vars:
        missing_mean = df.loc[is_missing, var].mean()
        present_mean = df.loc[~is_missing, var].mean()
        results.append(dict(
            vital_sign=vital,
            observed_var=var,
            missing_mean=missing_mean,
            present_mean=present_mean,
            difference=missing_mean - present_mean,
        ))

# One row per (vital sign, observed variable) pair
comparison_df = pd.DataFrame(results)
print("\nComparison of means between missing and non-missing groups:")
print(comparison_df.round(2))

In [None]:
# Restrict the missingness analysis to the triage vitals (pain included)
vital_signs = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']

# Pairwise correlation between the 0/1 missingness indicators
vital_missing = df[vital_signs].isna().astype(int)
vital_corr = vital_missing.corr()

plt.figure(figsize=(10, 8))

# Heatmap without seaborn's own annotations; the numbers are drawn by hand below
heatmap = sns.heatmap(
    vital_corr,
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
)

# Write each coefficient at the centre of its cell (cell (i, j) spans
# x in [j, j+1] and y in [i, i+1] in heatmap coordinates)
for (i, j), value in np.ndenumerate(vital_corr.to_numpy()):
    heatmap.text(
        j + 0.5, i + 0.5, f'{value:.2f}',
        ha='center', va='center', color='black', fontsize=10,
    )

plt.title('Correlation of Missing Values in Vital Signs')
plt.tight_layout()
plt.show()

# Share of rows with a missing value, per vital sign
missingness_pct = vital_missing.mean() * 100
print("\nPercentage of missing values for each vital sign:")
for vital, pct in missingness_pct.items():
    print(f"{vital}: {pct:.1f}%")

# Same correlation matrix as plotted above, as text
print('Missingness correlation matrix of vital signs:')
print(vital_corr)


In [None]:
# Complete-case analysis: drop every row that has at least one missing value.
# `df` itself is left untouched; later cells read from `df_clean`.
df_clean = df.dropna()

# Sanity check: after dropna() the missingness ratio is 0 for every column
missingness_ratio_clean = df_clean.isna().mean()
print('Missingness ratio of cleaned dataframe:')
missingness_ratio_clean




In [None]:
# Raw distribution of ED length of stay
los_raw = df_clean['los_minutes']
plt.hist(los_raw, bins=100)
plt.title('los_minutes (raw)')
plt.show()

# Z-score standardisation on a separate Series (df_clean is not modified)
df_los = (los_raw - los_raw.mean()) / los_raw.std()
plt.hist(df_los, bins=100)
plt.title('los_minutes (standardized)')
plt.show()

# Log transformation. The log must be taken of the RAW values: the z-scored
# series is negative for every below-average stay, and np.log of those is NaN.
# Non-positive durations (if any) are excluded because log is undefined there.
df_los_log = np.log(los_raw[los_raw > 0])
plt.hist(df_los_log, bins=100)
plt.title('los_minutes (log transformed)')
plt.show()



In [None]:
# Two candidate transforms of length of stay, compared side by side.
# Option 1: plain log transform (simple and interpretable)
los_log = np.log(df_clean['los_minutes'])
# Option 2: Box-Cox, which also returns the lambda it fitted
los_boxcox, lambda_param = stats.boxcox(df_clean['los_minutes'])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (ax1, los_log, 'Log transformed los_minutes'),
    (ax2, los_boxcox, 'Box-Cox transformed los_minutes'),
)
for panel_ax, panel_values, panel_title in panels:
    panel_ax.hist(panel_values, bins=100)
    panel_ax.set_title(panel_title)

plt.show()

# Fitted Box-Cox exponent
print(f"Box-Cox lambda parameter: {lambda_param:.3f}")

In [None]:
# One histogram per numeric feature of the cleaned frame.
# Each entry is (column, x-axis label, figure title).
histogram_specs = [
    ('age_at_ed', 'Age', 'Histogram of Age at ED'),
    ('sbp', 'SBP', 'Histogram of SBP'),
    ('dbp', 'DBP', 'Histogram of DBP'),
    ('o2sat', 'O2Sat', 'Histogram of O2Sat'),
    ('resprate', 'Resprate', 'Histogram of Resprate'),
    ('temperature', 'Temperature', 'Histogram of Temperature'),
    ('pain', 'Pain', 'Histogram of Pain'),
]

for hist_column, hist_xlabel, hist_title in histogram_specs:
    plt.hist(df_clean[hist_column], bins=100)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Frequency')
    plt.show()



In [None]:
from sklearn.preprocessing import StandardScaler

# Step 1 - numeric features that go into the prepared frame
features = ['age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp', 'o2sat',
            'resprate', 'temperature', 'pain']
df_prep = df_clean[features].copy()

# Step 2 - Box-Cox transform of los_minutes (also yields the fitted lambda)
los_transformed, lambda_param = stats.boxcox(df_prep['los_minutes'])
df_prep['los_minutes'] = los_transformed
print(f"Box-Cox lambda parameter for los_minutes: {lambda_param:.3f}")

# Step 3 - z-score every feature except los_minutes
other_features = [name for name in features if name != 'los_minutes']
scaler = StandardScaler()
df_prep[other_features] = scaler.fit_transform(df_prep[other_features])

# Step 4 - grid of histograms, three per row
n_features = len(features)
n_cols = 3
n_rows = -(-n_features // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
axes = axes.ravel()  # 1-D view so panels can be walked in reading order

for panel, feature in zip(axes, features):
    panel.hist(df_prep[feature], bins=50)
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

# Drop the panels of the last row that received no feature
for unused_panel in axes[n_features:]:
    fig.delaxes(unused_panel)

plt.tight_layout()
plt.show()

# Summary of the transformed features
print("\nBasic statistics of transformed features:")
print(df_prep.describe().round(3))

# Preview of the prepared frame (displayed as the cell output)
df_prep.head()

In [None]:
# GPU environment check: this cell only runs where CuPy and cuML are installed.
import cupy as cp
# Whether CuPy can use CUDA, and the CUDA runtime version it reports
print("CUDA available:", cp.is_available())
print("CUDA version:", cp.cuda.runtime.runtimeGetVersion())

import cuml
print("cuML version:", cuml.__version__)

In [None]:

# Imported here so the cell runs on a fresh kernel: the notebook's GPU import
# cell sits further down, so `HDBSCAN` would be undefined at this point.
import cudf
from cuml.cluster import HDBSCAN

# NOTE(review): `df_prep_gpu` was never defined anywhere in the notebook. It is
# assumed to be the prepared feature frame `df_prep` copied to the GPU - confirm.
df_prep_gpu = cudf.DataFrame.from_pandas(df_prep)

# Setup HDBSCAN with much larger min_cluster_size
hdbscan_model = HDBSCAN(
    min_cluster_size=5000,  # Significantly increased
    min_samples=100,
    cluster_selection_method='eom'
)
hdbscan_model.fit(df_prep_gpu)



In [None]:
# Cut the fitted single-linkage tree at several heights and report the
# resulting flat clusterings.
for cut_distance in (0.05, 0.1, 0.2, 0.5, 1.0):
    labels = hdbscan_model.single_linkage_tree_.get_clusters(
        cut_distance=cut_distance,
        min_cluster_size=5000,  # same value the model was fitted with
    )
    sizes_by_label = pd.Series(labels).value_counts().sort_index()

    print(f"\nCut distance: {cut_distance}")
    print("Number of clusters:", len(np.unique(labels)))
    print("Cluster sizes:")
    print(sizes_by_label)

In [None]:
from hdbscan import HDBSCAN

# Feature engineering: numeric columns used for clustering
df_hdbscan = df_clean[['age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp', 'o2sat', 'resprate', 'temperature', 'pain', 'acuity']].copy()

# Cluster the rows with HDBSCAN.
# NOTE(review): the features are clustered unscaled here, so the columns with
# the largest numeric range will presumably dominate the distances - confirm
# this is intended (a later cell repeats the fit on standardized features).
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=50)
df_hdbscan['cluster'] = hdbscan_model.fit_predict(df_hdbscan)

# Number of distinct labels. The value has to be printed explicitly: a bare
# expression in the middle of a cell is evaluated and then discarded.
# Any noise label (-1) is counted as one of the labels.
print('Number of clusters:')
print(df_hdbscan['cluster'].nunique())

# Rows per label (displayed as the cell output)
print('Number of patients in each cluster:')
df_hdbscan['cluster'].value_counts()


In [None]:
# Derive alternative flat clusterings from the already-fitted model by cutting
# its single-linkage tree at several heights.
for cut_distance in (0.1, 0.2, 0.3, 0.4, 0.5):
    labels = hdbscan_model.single_linkage_tree_.get_clusters(
        cut_distance=cut_distance,
        min_cluster_size=100,
    )
    sizes_by_label = pd.Series(labels).value_counts().sort_index()

    print(f"\nCut distance: {cut_distance}")
    print("Number of clusters:", len(np.unique(labels)))
    print("Cluster sizes:")
    print(sizes_by_label)

    # Keep each labelling next to the features, one column per cut height
    label_column = f'cluster_dist_{cut_distance}'
    df_hdbscan[label_column] = labels

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns that feed the clustering
clustering_columns = ['age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp',
                      'o2sat', 'resprate', 'temperature', 'pain', 'acuity']
df_hdbscan = df_clean[clustering_columns].copy()

# Standardize the features before fitting
scaler = StandardScaler()
df_hdbscan_scaled = scaler.fit_transform(df_hdbscan)

# HDBSCAN with a much larger minimum cluster size than the first attempt
hdbscan_params = dict(
    min_cluster_size=5000,
    min_samples=100,
    cluster_selection_method='eom',
)
hdbscan_model = HDBSCAN(**hdbscan_params)
hdbscan_model.fit(df_hdbscan_scaled)

# Flat clusterings obtained by cutting the single-linkage tree at several heights
for cut_distance in (0.05, 0.1, 0.2, 0.5, 1.0):
    labels = hdbscan_model.single_linkage_tree_.get_clusters(
        cut_distance=cut_distance,
        min_cluster_size=5000,  # same value the model was fitted with
    )
    sizes_by_label = pd.Series(labels).value_counts().sort_index()

    print(f"\nCut distance: {cut_distance}")
    print("Number of clusters:", len(np.unique(labels)))
    print("Cluster sizes:")
    print(sizes_by_label)

In [None]:
# Flat clustering at the chosen cut height
cut_distance = 1.0
labels = hdbscan_model.single_linkage_tree_.get_clusters(
    cut_distance=cut_distance,
    min_cluster_size=5000
)

# Features plus their cluster label; df_hdbscan itself is left unchanged
df_with_clusters = df_hdbscan.assign(cluster=labels)

# mean / std / median of every feature within each cluster
stat_columns = ['age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp',
                'o2sat', 'resprate', 'temperature', 'pain', 'acuity']
cluster_stats = (
    df_with_clusters
    .groupby('cluster')
    .agg({column: ['mean', 'std', 'median'] for column in stat_columns})
    .round(2)
)

print("Cluster Statistics:")
print(cluster_stats)

# Rows per cluster label
cluster_sizes = df_with_clusters['cluster'].value_counts().sort_index()
print("\nCluster Sizes:")
print(cluster_sizes)

# The same statistics, one feature at a time (easier to read than the wide table)
grouped_by_cluster = df_with_clusters.groupby('cluster')
for feature in df_hdbscan.columns:
    print(f"\n{feature} statistics by cluster:")
    print(grouped_by_cluster[feature].agg(['mean', 'std', 'median']).round(2))

In [None]:
# Flat clustering at cut_distance = 0.5
cut_distance = 0.5
labels = hdbscan_model.single_linkage_tree_.get_clusters(
    cut_distance=cut_distance,
    min_cluster_size=5000
)

# Features plus their cluster label
df_with_clusters = df_hdbscan.assign(cluster=labels)

# Grid layout: one box plot per feature, two per row
features = df_hdbscan.columns
n_features = len(features)
n_cols = 2
n_rows = (n_features + n_cols - 1) // n_cols

plt.figure(figsize=(15, 5 * n_rows))

for i, feature in enumerate(features, 1):
    ax = plt.subplot(n_rows, n_cols, i)

    # Distribution of this feature within each cluster
    sns.boxplot(data=df_with_clusters, x='cluster', y=feature, ax=ax)

    ax.set_title(f'{feature} Distribution by Cluster')
    ax.set_xlabel('Cluster (-1 = outliers)')
    ax.set_ylabel(feature)

    # Light grid for readability
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Rows per cluster label, for reference
cluster_sizes = df_with_clusters['cluster'].value_counts().sort_index()
print("\nCluster Sizes:")
print(cluster_sizes)

In [6]:
# Import GPU libraries
from cuml.cluster import HDBSCAN
from cuml.preprocessing import StandardScaler
import cudf


In [None]:
# `nvidia-smi` is a shell command. Written bare in a code cell it is parsed as
# the Python expression `nvidia - smi` and raises NameError, so it is invoked
# through subprocess instead.
import shutil
import subprocess

if shutil.which('nvidia-smi') is None:
    print('nvidia-smi not found on PATH')
else:
    # check=False: a non-zero exit status should be shown, not raised
    smi_result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=False)
    print(smi_result.stdout or smi_result.stderr)

# Playground

In [None]:
# Load the medrecon table through the MIMICDF accessor and preview ten random
# rows (the sample is unseeded, so the preview differs between runs).
medrecon = mimicdf.medrecon()
medrecon.sample(10)


In [None]:
def aggregate_patient_medications(df, id_col='stay_id', code_col='etccode'):
    """
    Collect the distinct medication codes recorded for each group of rows.

    Parameters:
        df (pd.DataFrame): Medication rows; must contain `id_col` and `code_col`
        id_col (str): Column to group by. The default 'stay_id' yields one set
            per ED stay.
        code_col (str): Column holding the medication codes

    Returns:
        pd.Series: Indexed by `id_col`; each value is the set of non-null codes
        of that group (an empty set when every code in the group is missing).
    """
    # NOTE(review): the function name suggests per-patient aggregation, but the
    # default groups per stay. Pass id_col='subject_id' if one set per patient
    # across visits is what is wanted - confirm intent.
    return df.groupby(id_col)[code_col].agg(lambda codes: set(codes.dropna()))

# One set of medication codes per stay_id (the function groups by stay_id)
patient_meds = aggregate_patient_medications(medrecon)
patient_meds.head()

# Models

## t-SNE

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import TSNE

# Create a copy and drop all rows with missing values
df_encoded = df.copy()
print(f"Shape before dropping NA: {df_encoded.shape}")
df_encoded = df_encoded.dropna()
print(f"Shape after dropping NA: {df_encoded.shape}")

# Take a random sample of at most `sample_size` records to keep t-SNE tractable
sample_size = 15000
if len(df_encoded) > sample_size:
    df_encoded = df_encoded.sample(n=sample_size, random_state=42)
    print(f"Shape after sampling: {df_encoded.shape}")

# Encode categorical variables as integer codes
categorical_cols = ['gender', 'race', 'arrival_transport', 'disposition', 'dow']
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Standardize numerical features
numerical_cols = ['age_at_ed', 'hour', 'los_minutes', 'temperature', 'heartrate', 
                 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity']
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Apply t-SNE to the prepared feature columns only.
# NOTE(review): `df` presumably still carries identifier columns such as
# subject_id / stay_id; left in, their raw magnitudes would dominate the
# distances t-SNE works with. Confirm the feature list is complete.
feature_cols = categorical_cols + numerical_cols
tsne_model = TSNE(n_components=2, random_state=42)
df_tsne = tsne_model.fit_transform(df_encoded[feature_cols])
df_tsne = pd.DataFrame(df_tsne, columns=['tsne1', 'tsne2'])

# Visualize. No variable is mapped to colour here, so the title does not name one.
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', alpha=0.5)
plt.title(f't-SNE visualization (n={len(df_encoded)})')
plt.show()


In [None]:
def plot_tsne_with_contours(df_encoded, embedding, target_feature, 
                           grid_size=300, sigma=12, figsize=(10, 8)):
    """
    Create t-SNE visualization with contour overlays for a specified feature.

    The feature is z-scored, spread over a regular grid covering the embedding
    by nearest-neighbour interpolation, smoothed with a Gaussian filter and
    drawn as filled plus line contours on top of a gray scatter of the points.

    Parameters:
        df_encoded (pd.DataFrame): Preprocessed dataframe with encoded values;
            its rows must be in the same order as the rows of `embedding`
        embedding (np.array): t-SNE embedding coordinates, shape (n_rows, 2)
        target_feature (str): Column name of the feature to visualize
        grid_size (int): Resolution of the contour grid
        sigma (float): Gaussian smoothing parameter
        figsize (tuple): Figure size in inches

    Returns:
        tuple: (fig, ax) of the created matplotlib figure
    """
    # Imported locally: the notebook's import cell only brings in the scipy
    # submodules (`interpolate`, `ndimage`), not these two function names.
    from scipy.interpolate import griddata
    from scipy.ndimage import gaussian_filter

    # Accept anything array-like (e.g. a DataFrame of coordinates)
    embedding = np.asarray(embedding)
    x_coords, y_coords = embedding[:, 0], embedding[:, 1]

    # Get standardized values for target feature
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df_encoded[[target_feature]]).ravel()

    fig, ax = plt.subplots(figsize=figsize)

    # Base scatter plot of the embedded points
    ax.scatter(x_coords, y_coords, color='gray', alpha=0.2, s=2)

    # Regular grid spanning the bounding box of the embedding
    xi = np.linspace(x_coords.min(), x_coords.max(), grid_size)
    yi = np.linspace(y_coords.min(), y_coords.max(), grid_size)
    xi, yi = np.meshgrid(xi, yi)

    # Nearest-neighbour interpolation assigns a value to every grid node, so
    # the grid has no holes to fill
    zi = griddata((x_coords, y_coords),
                  standardized_values,
                  (xi, yi),
                  method='nearest')

    # Apply Gaussian smoothing
    smooth_values = gaussian_filter(zi, sigma=sigma)

    # Contour levels from -2 to +2 standard deviations
    levels = np.linspace(-2, 2, 20)

    contourf = ax.contourf(xi, yi, smooth_values,
                           levels=levels,
                           cmap='RdBu_r',  # Red-Blue reversed
                           alpha=0.5,
                           extend='both')

    contour = ax.contour(xi, yi, smooth_values,
                         levels=levels,
                         colors='black',
                         alpha=0.3,
                         linewidths=0.5,
                         extend='both')

    # Colorbar for the filled contours, with the contour lines marked on it
    cbar = fig.colorbar(contourf,
                        ax=ax,
                        label=f'{target_feature} (standardized)',
                        orientation='vertical',
                        fraction=0.046,
                        pad=0.04)
    cbar.add_lines(contour)

    ax.set_title(f't-SNE visualization with {target_feature} Contours (n={len(df_encoded)})')
    fig.tight_layout()

    return fig, ax

# Example usage:
# Reuse the embedding computed in the t-SNE cell above. That cell fits t-SNE
# with the same seed on the same frame, so refitting here would only repeat an
# expensive computation.
tsne_embedding = df_tsne.to_numpy()

# Then plot with different features
for contour_feature in ('age_at_ed', 'los_minutes'):
    plot_tsne_with_contours(df_encoded, tsne_embedding, contour_feature)
    plt.show()



In [None]:
# Contour overlays for the vital signs, one figure each
for contour_feature in ('heartrate', 'sbp', 'dbp', 'o2sat', 'resprate', 'temperature'):
    plot_tsne_with_contours(df_encoded, tsne_embedding, contour_feature)
    plt.show()


In [None]:
# Contour overlays for pain and acuity
for contour_feature in ('pain', 'acuity'):
    plot_tsne_with_contours(df_encoded, tsne_embedding, contour_feature)
    plt.show()



In [9]:
def plot_tsne_categorical_density(df_encoded, embedding, feature, target_value, 
                                grid_size=300, sigma=12, figsize=(10, 8),
                                source_df=None):
    """
    Create t-SNE visualization with density contours for a categorical feature's specific value.

    A 0/1 indicator (1 where the un-encoded feature equals `target_value`) is
    spread over a regular grid by nearest-neighbour interpolation and smoothed
    with a Gaussian filter, giving a local share of the target value between 0
    and 1 that is drawn as contours over a gray scatter of the points.

    Parameters:
        df_encoded (pd.DataFrame): Preprocessed dataframe with encoded values;
            its index is used to look up the matching rows of `source_df`
        embedding (np.array): t-SNE embedding coordinates, shape (n_rows, 2)
        feature (str): Column name of the categorical feature
        target_value: The specific value to calculate density for (original category)
        grid_size (int): Resolution of the contour grid
        sigma (float): Gaussian smoothing parameter
        figsize (tuple): Figure size in inches
        source_df (pd.DataFrame, optional): Frame holding the original,
            un-encoded categories. Defaults to the notebook-level `df`.

    Returns:
        tuple: (fig, ax) of the created matplotlib figure
    """
    # Imported locally: the notebook's import cell only brings in the scipy
    # submodules (`interpolate`, `ndimage`), not these two function names.
    from scipy.interpolate import griddata
    from scipy.ndimage import gaussian_filter

    if source_df is None:
        # Backward-compatible fallback to the global frame used originally
        source_df = df

    # Accept anything array-like (e.g. a DataFrame of coordinates)
    embedding = np.asarray(embedding)
    x_coords, y_coords = embedding[:, 0], embedding[:, 1]

    # Original category values, aligned to df_encoded's rows via its index
    original_values = source_df.loc[df_encoded.index, feature]

    # 1.0 where the row has the target value, 0.0 otherwise
    binary_indicator = (original_values == target_value).to_numpy(dtype=float)

    fig, ax = plt.subplots(figsize=figsize)

    # Base scatter plot of the embedded points
    ax.scatter(x_coords, y_coords, color='gray', alpha=0.2, s=2)

    # Regular grid spanning the bounding box of the embedding
    xi = np.linspace(x_coords.min(), x_coords.max(), grid_size)
    yi = np.linspace(y_coords.min(), y_coords.max(), grid_size)
    xi, yi = np.meshgrid(xi, yi)

    # Nearest-neighbour interpolation assigns a value to every grid node
    zi = griddata((x_coords, y_coords),
                  binary_indicator,
                  (xi, yi),
                  method='nearest')

    # Apply Gaussian smoothing
    smooth_density = gaussian_filter(zi, sigma=sigma)

    # Contour levels for density (0 to 1)
    levels = np.linspace(0, 1, 20)

    contourf = ax.contourf(xi, yi, smooth_density,
                           levels=levels,
                           cmap='RdBu_r',  # Red-Blue reversed
                           alpha=0.5,
                           extend='both')

    contour = ax.contour(xi, yi, smooth_density,
                         levels=levels,
                         colors='black',
                         alpha=0.3,
                         linewidths=0.5,
                         extend='both')

    # Colorbar for the filled contours, with the contour lines marked on it
    cbar = fig.colorbar(contourf,
                        ax=ax,
                        label=f'Density of {feature}={target_value}',
                        orientation='vertical',
                        fraction=0.046,
                        pad=0.04)
    cbar.add_lines(contour)

    ax.set_title(f't-SNE visualization: Density of {feature}={target_value} (n={len(df_encoded)})')
    fig.tight_layout()

    return fig, ax



In [None]:
# Example usage:
# One density map per disposition category and per arrival-transport category.
# Every figure is followed by plt.show(), including the last one, so the cell
# does not end by echoing the returned (fig, ax) tuple.
density_targets = [
    ('disposition', 'HOME'),
    ('disposition', 'ADMITTED'),
    ('disposition', 'OTHER'),
    ('arrival_transport', 'WALK IN'),
    ('arrival_transport', 'AMBULANCE'),
    ('arrival_transport', 'OTHER'),
]
for density_feature, density_value in density_targets:
    plot_tsne_categorical_density(df_encoded, tsne_embedding, density_feature, density_value)
    plt.show()

In [None]:
# Density maps for each gender and for four race categories
density_targets = [
    ('gender', 'F'),
    ('gender', 'M'),
    ('race', 'WHITE'),
    ('race', 'BLACK'),
    ('race', 'ASIAN'),
    ('race', 'HISPANIC'),
]
for density_feature, density_value in density_targets:
    plot_tsne_categorical_density(df_encoded, tsne_embedding, density_feature, density_value)
    plt.show()

In [None]:

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import TSNE
import umap.umap_ as umap
import matplotlib.pyplot as plt

# Create a copy and drop all rows with missing values
df_encoded = df.copy()
print(f"Shape before dropping NA: {df_encoded.shape}")
df_encoded = df_encoded.dropna()
print(f"Shape after dropping NA: {df_encoded.shape}")

# Take a random sample of at most `sample_size` records
sample_size = 5000
if len(df_encoded) > sample_size:
    df_encoded = df_encoded.sample(n=sample_size, random_state=42)
    print(f"Shape after sampling: {df_encoded.shape}")

# Encode categorical variables as integer codes
categorical_cols = ['gender', 'race', 'arrival_transport', 'disposition', 'dow']
for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Standardize numerical features
numerical_cols = ['age_at_ed', 'hour', 'los_minutes', 'temperature', 'heartrate', 
                 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity']
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Both embeddings are fitted on the prepared feature columns only.
# NOTE(review): `df` presumably still carries identifier columns such as
# subject_id / stay_id; left in, their raw magnitudes would dominate the
# distances both methods work with. Confirm the feature list is complete.
feature_cols = categorical_cols + numerical_cols
embedding_input = df_encoded[feature_cols]

# Apply t-SNE
tsne_model = TSNE(n_components=2, random_state=42)
df_tsne = pd.DataFrame(
    tsne_model.fit_transform(embedding_input), 
    columns=['tsne1', 'tsne2']
)

# Apply UMAP
umap_model = umap.UMAP(random_state=42)
df_umap = pd.DataFrame(
    umap_model.fit_transform(embedding_input),
    columns=['umap1', 'umap2']
)

# Create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# t-SNE plot
sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', alpha=0.5, ax=ax1)
ax1.set_title(f't-SNE visualization (n={len(df_encoded)})')

# UMAP plot
sns.scatterplot(data=df_umap, x='umap1', y='umap2', alpha=0.5, ax=ax2)
ax2.set_title(f'UMAP visualization (n={len(df_encoded)})')

plt.tight_layout()
plt.show()

In [17]:
def plot_umap_with_contours(df_encoded, embedding_df, target_feature, 
                           grid_size=300, sigma=12, figsize=(10, 8)):
    """
    Create UMAP visualization with contour overlays for a specified feature.

    The feature is z-scored, spread over a regular grid covering the embedding
    by nearest-neighbour interpolation, smoothed with a Gaussian filter and
    drawn as filled plus line contours on top of a gray scatter of the points.

    Parameters:
        df_encoded (pd.DataFrame): Preprocessed dataframe with encoded values;
            its rows must be in the same order as the rows of `embedding_df`
        embedding_df (pd.DataFrame): UMAP embedding coordinates DataFrame
            (first column = x, second column = y)
        target_feature (str): Column name of the feature to visualize
        grid_size (int): Resolution of the contour grid
        sigma (float): Gaussian smoothing parameter
        figsize (tuple): Figure size in inches

    Returns:
        tuple: (fig, ax) of the created matplotlib figure
    """
    # Imported locally: the notebook's import cell only brings in the scipy
    # submodules (`interpolate`, `ndimage`), not these two function names.
    from scipy.interpolate import griddata
    from scipy.ndimage import gaussian_filter

    # Convert embedding DataFrame to numpy array (plain arrays pass through)
    embedding = np.asarray(embedding_df)
    x_coords, y_coords = embedding[:, 0], embedding[:, 1]

    # Get standardized values for target feature
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df_encoded[[target_feature]]).ravel()

    fig, ax = plt.subplots(figsize=figsize)

    # Base scatter plot of the embedded points
    ax.scatter(x_coords, y_coords, color='gray', alpha=0.2, s=2)

    # Regular grid spanning the bounding box of the embedding
    xi = np.linspace(x_coords.min(), x_coords.max(), grid_size)
    yi = np.linspace(y_coords.min(), y_coords.max(), grid_size)
    xi, yi = np.meshgrid(xi, yi)

    # Nearest-neighbour interpolation assigns a value to every grid node, so
    # the grid has no holes to fill
    zi = griddata((x_coords, y_coords),
                  standardized_values,
                  (xi, yi),
                  method='nearest')

    # Apply Gaussian smoothing
    smooth_values = gaussian_filter(zi, sigma=sigma)

    # Contour levels from -2 to +2 standard deviations
    levels = np.linspace(-2, 2, 20)

    contourf = ax.contourf(xi, yi, smooth_values,
                           levels=levels,
                           cmap='RdBu_r',  # Red-Blue reversed
                           alpha=0.5,
                           extend='both')

    contour = ax.contour(xi, yi, smooth_values,
                         levels=levels,
                         colors='black',
                         alpha=0.3,
                         linewidths=0.5,
                         extend='both')

    # Colorbar for the filled contours, with the contour lines marked on it
    cbar = fig.colorbar(contourf,
                        ax=ax,
                        label=f'{target_feature} (standardized)',
                        orientation='vertical',
                        fraction=0.046,
                        pad=0.04)
    cbar.add_lines(contour)

    ax.set_title(f'UMAP visualization with {target_feature} Contours (n={len(df_encoded)})')
    fig.tight_layout()

    return fig, ax

In [None]:
# Contour overlays on the UMAP embedding, one figure per numeric feature
umap_contour_features = (
    'age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp',
    'o2sat', 'resprate', 'temperature', 'pain', 'acuity',
)
for contour_feature in umap_contour_features:
    plot_umap_with_contours(df_encoded, df_umap, contour_feature)
    plt.show()



In [None]:
# Create multiple UMAP visualizations with different parameters
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

# Parameter combinations: n_neighbors is held at 25, only min_dist varies.
# Panel titles are generated from these values so they cannot drift away from
# what was actually run.
params = [
    {'n_neighbors': 25, 'min_dist': 0.1},
    {'n_neighbors': 25, 'min_dist': 0.25},
    {'n_neighbors': 25, 'min_dist': 0.50},
    {'n_neighbors': 25, 'min_dist': 0.85},
]

# axes.ravel() walks the 2x2 grid row by row: (0,0), (0,1), (1,0), (1,1)
for panel, param in zip(axes.ravel(), params):
    # Create UMAP embedding
    # NOTE(review): this fits on every column of df_encoded; if the frame still
    # carries identifier columns they take part in the distances - confirm.
    umap_model = umap.UMAP(
        n_neighbors=param['n_neighbors'],
        min_dist=param['min_dist'],
        random_state=42
    )
    embedding = umap_model.fit_transform(df_encoded)

    # Plot
    panel.scatter(embedding[:, 0], embedding[:, 1], alpha=0.5, s=2)
    panel.set_title(f"n_neighbors={param['n_neighbors']}, min_dist={param['min_dist']}")

plt.tight_layout()
plt.show()