In [1]:
# Steps to preprocess data

# remove all features which underwent embedding
# remove features (admitdate)
# OHE for admission_type, admission_location, insurance, marital_status
# choose either readmitted_30 or readmitted_60 or days_to_next_admission
# currently, if next admission is not present, then days_to_next_admission is not an integer -- need to convert to -1

In [2]:
# Check if CUDA is installed
import torch
print(torch.cuda.is_available())

True


In [3]:
# Imports
import pickle
import torch
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load data
X_filePath = '../../../data/processedData/X_RNN.pkl'
with open(X_filePath, 'rb') as f:
    data = pickle.load(f)

In [5]:
for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        record.pop('diagnoses', None)
        record.pop('drugs', None)
        record.pop('procedures', None)
        record.pop('days_to_next_admission', None)
        record.pop('readmitted_60', None)
        record.pop('admitdate', None)



In [6]:
def explore_nested_dict(data, top_key):
    if top_key in data:
        nested_dict = data[top_key]
        print(f"Top-level Key: {top_key}")
        print(f"Keys in nested dictionary: {nested_dict.keys()}")
        
        # Inspect one of the nested keys in detail
        for nested_key in nested_dict:
            print(f"\nNested Key: {nested_key}")
            print(f"Type of value: {type(nested_dict[nested_key])}")
            print(f"Value sample: {nested_dict[nested_key]}")
    else:
        print(f"Key {top_key} not found in the data.")


top_key_to_inspect = 10000032

explore_nested_dict(data, top_key_to_inspect)

Top-level Key: 10000032
Keys in nested dictionary: dict_keys([22595853, 22841357, 29079034])

Nested Key: 22595853
Type of value: <class 'dict'>
Value sample: {'timespent': 18, 'admission_type': 'URGENT', 'admission_location': 'TRANSFER FROM HOSPITAL', 'insurance': 'Other', 'marital_status': 'WIDOWED', 'hospital_expire_flag': 0, 'readmitted_30': 0, 'combined_embedding': array([ 7.06353039e-02,  1.66799352e-01, -2.93788612e-01,  2.35119641e-01,
       -3.58550340e-01,  1.39132544e-01,  5.30576766e-01,  3.92042994e-01,
        5.16894698e-01, -5.82762897e-01, -3.87033671e-01,  4.64646667e-01,
       -1.72051430e-01, -2.54900634e-01, -2.81801522e-01,  1.05978683e-01,
       -1.37993753e-01,  2.99448967e-01,  2.27836743e-01, -4.71671760e-01,
       -1.65306076e-01,  1.79964855e-01, -2.02964187e-01, -3.18241209e-01,
       -1.21349744e-01,  9.52861235e-02,  5.16720951e-01,  5.11681795e-01,
       -2.99426087e-04,  1.92941874e-01,  1.77122504e-01,  5.75323701e-01,
        5.03390208e-02, -1.

In [7]:
print(data.keys().__len__())

43814


In [8]:
from collections import defaultdict
import numpy as np

# Step 1: Find all unique categories for each field
unique_categories = defaultdict(set)
fields_to_encode = ["admission_type", "admission_location", "insurance", "marital_status"]

for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        for field in fields_to_encode:
            if field in record:
                value = record[field]
                if value is None:
                    value = "UNKNOWN"  # Handle null values
                value = str(value)  # Ensure all values are strings
                unique_categories[field].add(value)

# Step 2: Create a mapping from category to index
category_to_index = {
    field: {category: i for i, category in enumerate(sorted(categories))}
    for field, categories in unique_categories.items()
}

# Step 3: Replace string values with one-hot encoded vectors
for subject_id, nested_dict in data.items():
    for hadm_id, record in nested_dict.items():
        for field in fields_to_encode:
            if field in record:
                value = record[field]
                if value is None:
                    value = "UNKNOWN"  # Handle null values
                value = str(value)  # Ensure value is a string
                # Get the index for the category
                index = category_to_index[field][value]
                # Create a one-hot encoded vector
                ohe_vector = np.zeros(len(category_to_index[field]), dtype=int)
                ohe_vector[index] = 1
                # Replace the original string value with the one-hot vector
                record[field] = ohe_vector

# Confirmation message
print("One-hot encoding completed successfully.")


One-hot encoding completed successfully.


In [9]:
explore_nested_dict(data, top_key_to_inspect)

Top-level Key: 10000032
Keys in nested dictionary: dict_keys([22595853, 22841357, 29079034])

Nested Key: 22595853
Type of value: <class 'dict'>
Value sample: {'timespent': 18, 'admission_type': array([0, 0, 0, 0, 0, 0, 0, 0, 1]), 'admission_location': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 'insurance': array([0, 0, 1]), 'marital_status': array([0, 0, 0, 1, 0]), 'hospital_expire_flag': 0, 'readmitted_30': 0, 'combined_embedding': array([ 7.06353039e-02,  1.66799352e-01, -2.93788612e-01,  2.35119641e-01,
       -3.58550340e-01,  1.39132544e-01,  5.30576766e-01,  3.92042994e-01,
        5.16894698e-01, -5.82762897e-01, -3.87033671e-01,  4.64646667e-01,
       -1.72051430e-01, -2.54900634e-01, -2.81801522e-01,  1.05978683e-01,
       -1.37993753e-01,  2.99448967e-01,  2.27836743e-01, -4.71671760e-01,
       -1.65306076e-01,  1.79964855e-01, -2.02964187e-01, -3.18241209e-01,
       -1.21349744e-01,  9.52861235e-02,  5.16720951e-01,  5.11681795e-01,
       -2.99426087e-04,  1.92941874e-0

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Data Preparation Function with Truncation
def prepare_data_for_sequential_rnn(data_dict, 
                                    embedding_size=768, 
                                    procedures_embedding_size=100, 
                                    max_admissions=100):
    """
    Prepare data for an RNN model with ClinicalBERT embeddings and sequence truncation.

    Args:
        data_dict (dict): Dataset of patient admissions.
        embedding_size (int): Size of the ClinicalBERT embedding (combined_embedding).
        procedures_embedding_size (int): Size of the procedures embedding.
        max_admissions (int): Maximum number of admissions per patient to process.

    Returns:
        Tuple: Padded sequences and labels.
    """
    sequences = []  # Holds sequences of admissions for each patient
    labels = []     # Holds readmitted_30 labels for each admission

    for subject_id, admissions in data_dict.items():
        patient_sequence = []
        patient_labels = []
        for hadm_id, features in admissions.items():
            # Handle missing or empty combined_embedding
            combined_embedding = (
                features.get('combined_embedding', np.zeros(embedding_size))
                if isinstance(features.get('combined_embedding'), np.ndarray) and 
                features['combined_embedding'].shape[0] == embedding_size
                else np.zeros(embedding_size)  # Fallback to zero vector if missing
            )

            # Handle missing or empty procedures_embedding
            procedures_embedding = (
                features.get('procedures_embedding', np.zeros(procedures_embedding_size))
                if isinstance(features.get('procedures_embedding'), np.ndarray) and 
                features['procedures_embedding'].shape[0] == procedures_embedding_size
                else np.zeros(procedures_embedding_size)  # Fallback to zero vector if missing
            )

            # Combine features into a single vector
            feature_vector = np.concatenate([
                [features['timespent']],  # Numerical feature
                features['admission_type'],  # Categorical (OHE)
                features['admission_location'],  # Categorical (OHE)
                features['insurance'],  # Categorical (OHE)
                features['marital_status'],  # Categorical (OHE)
                [features['hospital_expire_flag']],  # Binary
                combined_embedding,  # ClinicalBERT embedding
                procedures_embedding  # Procedures embedding
            ])
            patient_sequence.append(feature_vector)
            patient_labels.append(features['readmitted_30'])

        # Truncate sequences if they exceed max_admissions
        if len(patient_sequence) > max_admissions:
            patient_sequence = patient_sequence[:max_admissions]
            patient_labels = patient_labels[:max_admissions]

        sequences.append(patient_sequence)
        labels.append(patient_labels)

    # Get feature vector size dynamically
    feature_size = len(sequences[0][0]) if sequences and sequences[0] else 0

    # Pad sequences for consistent input dimensions
    padded_sequences = pad_sequences(
        sequences,
        maxlen=max_admissions,
        padding='post',
        dtype='float16',
        value=np.zeros(feature_size, dtype='float16')
    )
    padded_labels = pad_sequences(labels, maxlen=max_admissions, padding='post', value=0)  # Use 0 for padding labels

    return padded_sequences, padded_labels




In [12]:
max_admissions = 100  # Limit to 100 admissions per patient
embedding_size = 768  # ClinicalBERT embedding size
procedures_embedding_size = 100  # Procedures embedding size

# Prepare the data
sequences, labels = prepare_data_for_sequential_rnn(data, embedding_size, procedures_embedding_size, max_admissions)

print("Sequences shape:", sequences.shape)  # (patients, max_admissions, features_per_admission)
print("Labels shape:", labels.shape)        # (patients, max_admissions)

Sequences shape: (43814, 100, 898)
Labels shape: (43814, 100)


In [13]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)

y_train = np.expand_dims(y_train, axis=-1)
y_test = np.expand_dims(y_test, axis=-1)

In [14]:
import tensorflow as tf

# Check if TensorFlow is using the GPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
tf.debugging.set_log_device_placement(True)

# Test computation on GPU
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0]])
    b = tf.constant([[4.0], [5.0], [6.0]])
    c = tf.matmul(a, b)
    print(c)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth enabled for GPU.")
    except RuntimeError as e:
        print(e)


Num GPUs Available:  0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _MklMatMul in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor([[32.]], shape=(1, 1), dtype=float32)


In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, TimeDistributed

def build_rnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0.0, input_shape=input_shape),  # Mask padded inputs
        tf.keras.layers.LSTM(128, return_sequences=True),
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='sigmoid'))  # Predict at each time step
    ])
    model.compile(
        loss='binary_crossentropy', 
        optimizer='adam', 
        metrics=['accuracy']
    )
    return model

input_shape = sequences.shape[1:]  # (max_admissions, features_per_admission)
model = build_rnn_model(input_shape)
model.summary()

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op FloorMod in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Cast in device /job:localhost/replica:0/t

  super().__init__(**kwargs)


Executing op StatelessRandomNormalV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Qr in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MatrixDiagPartV3 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sign in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Transpose in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Reshape in device /job:localhost/replica:0/task:0/

In [22]:
history = model.fit(
        X_train,  # Full sequence data
        y_train,  # Full sequence labels (padded)
        validation_split=0.2,
        batch_size=8,
        epochs=10
    )

# save the model
model.save("rnn_model.h5")

MemoryError: Unable to allocate 11.7 GiB for an array with shape (35051, 100, 898) and data type float32