In [2]:
# =======================================================
# Notebook 1 (Corrected) - Early Sepsis Detection - Data Preprocessing
# Dataset: PhysioNet 2019 (via Kaggle)
# ======================================================
import pandas as pd
import numpy as np
import glob, os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# -----------------------------
# Paths (update if different)
# -----------------------------
pathA = "/kaggle/input/prediction-of-sepsis/training_setA/training/"
pathB = "/kaggle/input/prediction-of-sepsis/training_setB/training_setB/"
filesA = sorted(glob.glob(os.path.join(pathA, "*.psv")))
filesB = sorted(glob.glob(os.path.join(pathB, "*.psv")))
all_files = filesA + filesB

print(f"Total patients found: {len(all_files)}")
assert len(all_files) > 0, "No patient files found — check pathA/pathB"

# -----------------------------
# 2. Determine global columns to keep (sample subset)
# -----------------------------
print("Analyzing global feature coverage across a sample of patients...")

missing_ratios = []
sample_n = min(500, len(all_files))
for f in tqdm(all_files[:sample_n]):
    df = pd.read_csv(f, sep='|')
    missing_ratios.append(df.isnull().mean())

global_missing = pd.concat(missing_ratios, axis=1).mean(axis=1)
# keep columns that exist in at least 10% of sample patients (i.e. missing < 0.9)
keep_cols = global_missing[global_missing < 0.9].index.tolist()

# Ensure label column is present
if "SepsisLabel" not in keep_cols:
    keep_cols.append("SepsisLabel")

print(f"Keeping {len(keep_cols)} columns (including SepsisLabel).")

# -----------------------------
# 3. Build consistent arrays
# -----------------------------
X_raw, y_raw = [], []
max_timesteps = 48  # pad/truncate sequences to 48h

for f in tqdm(all_files):
    df = pd.read_csv(f, sep='|')

    # Keep only consistent columns and add missing ones
    for col in keep_cols:
        if col not in df.columns:
            df[col] = np.nan
    df = df[keep_cols]

    # ✅ FIX: Only forward-fill to avoid leaking future information.
    df = df.fillna(method='ffill')

    # After ffill there may still be NaNs at the beginning -> leave them for now
    label = int(df['SepsisLabel'].max()) if 'SepsisLabel' in df.columns else 0
    df = df.drop(columns=['SepsisLabel'])

    X_raw.append(df.values.astype(np.float32))
    y_raw.append(label)

print("✅ Loaded raw patient sequences (no backward-fill).")

# -----------------------------
# 4. Pad/truncate sequences (so we can split and compute training medians/scaler)
# -----------------------------
X_padded = pad_sequences(X_raw, maxlen=max_timesteps, dtype='float32', padding='post', truncating='post')
y = np.array(y_raw, dtype=np.int64)

print("Shapes before split: X_padded:", X_padded.shape, "y:", y.shape)

# -----------------------------
# 5. Train/Test split (do this before scaling/imputation using train stats)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train patients: {X_train.shape[0]}, Test patients: {X_test.shape[0]}")
print("Sepsis % in training set:", y_train.mean()*100)

# -----------------------------
# 6. Impute remaining NaNs using training medians (no future-leakage)
# -----------------------------
# Compute per-feature median across all (non-padded) values in X_train
n_train, T, F = X_train.shape
# mask padded rows (all-zero rows were originally not necessarily zero; but here empty entries are NaN)
# We'll treat zeros that were present from pad_sequences as 0; we need mask for actual values:
# Create mask where any non-zero or non-nan in row -> consider valid timesteps
valid_mask = ~np.isnan(X_train)  # True where not NaN
# To compute medians ignoring NaNs and ignoring padded timesteps, flatten and use nanmedian
train_flat = X_train.reshape(-1, F)
feat_meds = np.nanmedian(train_flat, axis=0)  # shape (F,)
# Replace remaining NaNs in both train and test with feature medians
def impute_with_median(X, medians):
    X2 = np.array(X, dtype=np.float32)
    inds = np.isnan(X2)
    if inds.any():
        X2[inds] = np.take(medians, np.where(inds)[2])  # set along feature axis
    return X2

X_train = impute_with_median(X_train, feat_meds)
X_test  = impute_with_median(X_test, feat_meds)
print("✅ Imputed remaining NaNs with training-feature medians.")

# -----------------------------
# 7. Scale: fit scaler on X_train only (flatten across time)
# -----------------------------
scaler = StandardScaler()
flat_train = X_train.reshape(-1, F)  # (n_train * T, F)
scaler.fit(flat_train)
# transform train & test
X_train = scaler.transform(flat_train).reshape(n_train, T, F)
X_test  = scaler.transform(X_test.reshape(-1, F)).reshape(X_test.shape[0], T, F)
print("✅ Scaled features using scaler fitted on X_train only.")

# -----------------------------
# 8. Save arrays for next notebook (ensure shapes correct)
# -----------------------------
print("Final shapes -> X_train:", X_train.shape, "y_train:", y_train.shape, "X_test:", X_test.shape, "y_test:", y_test.shape)
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

np.save("/kaggle/working/X_train.npy", X_train)
np.save("/kaggle/working/X_test.npy", X_test)
np.save("/kaggle/working/y_train.npy", y_train)
np.save("/kaggle/working/y_test.npy", y_test)
print("✅ Preprocessing complete and saved to /kaggle/working/")


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Total patients found: 40336
Analyzing global feature coverage across a sample of patients...


100%|██████████| 500/500 [00:05<00:00, 87.94it/s]


Keeping 20 columns (including SepsisLabel).


100%|██████████| 40336/40336 [07:40<00:00, 87.53it/s] 


✅ Loaded raw patient sequences (no backward-fill).
Shapes before split: X_padded: (40336, 48, 19) y: (40336,)
Train patients: 32268, Test patients: 8068
Sepsis % in training set: 7.270360728895501
✅ Imputed remaining NaNs with training-feature medians.
✅ Scaled features using scaler fitted on X_train only.
Final shapes -> X_train: (32268, 48, 19) y_train: (32268,) X_test: (8068, 48, 19) y_test: (8068,)
✅ Preprocessing complete and saved to /kaggle/working/


In [7]:
# ===============================================================
# Notebook 2 (Corrected) - Load preprocessed data & train (2H)
# ===============================================================
import os, numpy as np, tensorflow as tf, matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Masking, Conv1D, BatchNormalization, Activation, Dropout,
    Add, Bidirectional, LSTM, GRU, Dense, Multiply, Lambda,
    GlobalAveragePooling1D, Reshape, Concatenate
)
from tensorflow.keras.initializers import Constant
from tensorflow.keras import backend as K
from sklearn.metrics import (roc_auc_score, precision_recall_curve, auc,
                             classification_report, confusion_matrix, roc_curve)

# -------------------------
# 0. Load preprocessed arrays (saved by Notebook 1)
# -------------------------
print("Listing files in /kaggle/working/ ...")
for f in sorted(os.listdir("/kaggle/working"))[:20]:
    print(" -", f)
# change paths if you saved elsewhere
X_train = np.load("/kaggle/working/X_train.npy")
X_test  = np.load("/kaggle/working/X_test.npy")
y_train = np.load("/kaggle/working/y_train.npy")
y_test  = np.load("/kaggle/working/y_test.npy")

# -------------------------
# 1. Sanity checks & shapes
# -------------------------
print("Shapes after load:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test :", X_test.shape)
print("y_test :", y_test.shape)
assert X_train.ndim == 3 and X_test.ndim == 3
assert y_train.ndim == 1 and y_test.ndim == 1
assert X_train.shape[0] == y_train.shape[0], "X_train rows != y_train length"
assert X_test.shape[0] == y_test.shape[0],   "X_test rows != y_test length"

TIME_STEPS, N_FEATURES = X_train.shape[1], X_train.shape[2]
print(f"TIME_STEPS={TIME_STEPS}, N_FEATURES={N_FEATURES}")

# -------------------------
# Reproducibility
# -------------------------
SEED = 42
np.random.seed(SEED); tf.random.set_seed(SEED)

# -------------------------
# Bias initializer (dataset base rate)
# -------------------------
pos_rate = np.clip(y_train.mean(), 1e-6, 1 - 1e-6)
bias_init = np.log(pos_rate / (1 - pos_rate))
print("Bias init:", bias_init)

# -------------------------
# Define helper layers & blocks (TCN, SE, MHA wrapper)
# -------------------------
from tensorflow.keras.layers import MultiHeadAttention

def tcn_res_block(x, filters, kernel_size=3, dilation_rate=1, dropout_rate=0.15):
    conv = Conv1D(filters, kernel_size, padding='same', dilation_rate=dilation_rate)(x)
    conv = BatchNormalization()(conv)
    conv = Activation('relu')(conv)
    conv = Dropout(dropout_rate)(conv)
    conv = Conv1D(filters, kernel_size, padding='same', dilation_rate=dilation_rate)(conv)
    conv = BatchNormalization()(conv)
    if int(x.shape[-1]) != filters:
        res = Conv1D(filters, kernel_size=1, padding='same')(x)
    else:
        res = x
    out = Add()([res, conv])
    out = Activation('relu')(out)
    return out

def se_feature_gate(inputs, reduction=8):
    sq = GlobalAveragePooling1D()(inputs)
    se = Dense(int(inputs.shape[-1] // reduction), activation='relu')(sq)
    se = Dense(int(inputs.shape[-1]), activation='sigmoid')(se)
    se = Reshape((1, int(inputs.shape[-1])))(se)
    gated = Multiply()([inputs, se])
    return gated, se

# Multi-head temporal attention wrapper (Keras-safe)
def multihead_temporal_attention(seq, num_heads=4, key_dim=64, name="mha"):
    mha = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, name=f"{name}_mha")
    att_out = mha(seq, seq, return_attention_scores=False)  # (batch, time, hidden)
    context = Lambda(lambda z: tf.reduce_mean(z, axis=1), name=f"{name}_context_mean")(att_out)
    return context, att_out

# -------------------------
# Build model (TCN + BiRNN + Dual attention + SE)
# -------------------------
inputs = Input(shape=(TIME_STEPS, N_FEATURES), name="input_seq")
x = Masking(mask_value=0.0, name="masking")(inputs)

x = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu', name="conv_in")(x)
x = BatchNormalization()(x)
x = Dropout(0.15)(x)

x = tcn_res_block(x, filters=64, dilation_rate=1)
x = tcn_res_block(x, filters=96, dilation_rate=2)
x = tcn_res_block(x, filters=128, dilation_rate=4)
x = tcn_res_block(x, filters=128, dilation_rate=8)

x, se_weights = se_feature_gate(x, reduction=8)

lstm = Bidirectional(LSTM(128, return_sequences=True))(x)
lstm = Dropout(0.15)(lstm)
gru  = GRU(128, return_sequences=True)(lstm)
gru_proj = Dense(256, activation='linear')(gru)
r = Add()([lstm, gru_proj])
r = Dropout(0.15)(r)

temp_context, temp_att_seq = multihead_temporal_attention(r, num_heads=4, key_dim=64, name="multihead")

feat_summary = Lambda(lambda z: K.mean(z, axis=1), name="feat_summary")(inputs)
fscore = Dense(N_FEATURES, name="feat_score")(feat_summary)
fweights = Activation('softmax', name="feat_weights")(fscore)
feat_context = Multiply()([feat_summary, fweights])
feat_context = Dense(128, activation='relu', name="feat_proj")(feat_context)

combined = Concatenate()([temp_context, feat_context])

h = Dense(256, activation='relu')(combined)
h = Dropout(0.15)(h)
h = Dense(128, activation='relu')(h)
h = Dropout(0.15)(h)
out = Dense(1, activation='sigmoid', bias_initializer=Constant(bias_init))(h)

model = Model(inputs=inputs, outputs=out)
model.summary()

# -------------------------
# Loss / Optimizer / Compile (stable defaults)
# -------------------------
def focal_loss(alpha=0.75, gamma=0.3):
    def loss_fn(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        bce = tf.keras.backend.binary_crossentropy(y_true, y_pred)
        p_t = y_true*y_pred + (1-y_true)*(1-y_pred)
        alpha_factor = y_true*alpha + (1-y_true)*(1-alpha)
        modulating = tf.pow(1.0 - p_t, gamma)
        return tf.reduce_mean(alpha_factor * modulating * bce)
    return loss_fn

try:
    opt = tf.keras.optimizers.AdamW(learning_rate=1e-4, weight_decay=1e-5, clipnorm=1.0)
except Exception:
    opt = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)

model.compile(
    optimizer=opt,
    loss=focal_loss(alpha=0.75, gamma=0.3),
    metrics=[tf.keras.metrics.AUC(name="AUC"),
             tf.keras.metrics.Precision(name="Precision"),
             tf.keras.metrics.Recall(name="Recall")]
)

# -------------------------
# Callbacks (stable)
# -------------------------
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
callbacks_opt = [
    EarlyStopping(monitor="val_AUC", mode="max", patience=10, restore_best_weights=True, verbose=1),
    ModelCheckpoint("/kaggle/working/best_model_2H.h5", monitor="val_AUC", mode="max", save_best_only=True, verbose=1),
    ReduceLROnPlateau(monitor="val_AUC", mode="max", factor=0.5, patience=3, min_lr=1e-6, verbose=1)
]

# -------------------------
# Train (with pre-fit guard)
# -------------------------
print("Pre-fit shapes check:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test :", X_test.shape, "y_test :", y_test.shape)
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

class_weight = {0:1.0, 1:4.0}
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=60,
    batch_size=128,
    class_weight=class_weight,
    callbacks=callbacks_opt,
    verbose=2
)

# -------------------------
# Evaluate + save model (recommended .keras)
# -------------------------
y_pred_proba = model.predict(X_test, batch_size=128).ravel()
y_pred_proba = np.convolve(y_pred_proba, np.ones(3)/3, mode="same")

# threshold sweep
thresholds = np.linspace(0.05, 0.5, 10)
best_f1, best_t = 0, 0.5
for t in thresholds:
    pred_t = (y_pred_proba >= t).astype(int)
    tp = np.sum((pred_t==1) & (y_test==1))
    fp = np.sum((pred_t==1) & (y_test==0))
    fn = np.sum((pred_t==0) & (y_test==1))
    precision = tp/(tp+fp+1e-8)
    recall = tp/(tp+fn+1e-8)
    f1 = 2*precision*recall/(precision+recall+1e-8)
    if f1 > best_f1: best_f1, best_t = f1, t

print(f"Best F1 = {best_f1:.4f} at threshold = {best_t:.2f}")
y_pred = (y_pred_proba >= best_t).astype(int)

roc_auc = roc_auc_score(y_test, y_pred_proba)
prec, rec, _ = precision_recall_curve(y_test, y_pred_proba)
pr_auc = auc(rec, prec)
print(f"ROC AUC = {roc_auc:.4f}, PR AUC = {pr_auc:.4f}")
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))

# Save final model in modern Keras format
model.save("/kaggle/working/sepsis_detector_v2H.keras")
print("Saved model to /kaggle/working/sepsis_detector_v2H.keras")


Listing files in /kaggle/working/ ...
 - .virtual_documents
 - X_test.npy
 - X_train.npy
 - best_model_2H.h5
 - sepsis_detector_v2H.keras
 - y_test.npy
 - y_train.npy
Shapes after load:
X_train: (32268, 48, 19)
y_train: (32268,)
X_test : (8068, 48, 19)
y_test : (8068,)
TIME_STEPS=48, N_FEATURES=19
Bias init: -2.545882245560757


Pre-fit shapes check:
X_train: (32268, 48, 19) y_train: (32268,)
X_test : (8068, 48, 19) y_test : (8068,)
Epoch 1/60

Epoch 1: val_AUC improved from -inf to 0.77529, saving model to /kaggle/working/best_model_2H.h5




253/253 - 27s - 108ms/step - AUC: 0.6971 - Precision: 0.3450 - Recall: 0.1223 - loss: 0.1272 - val_AUC: 0.7753 - val_Precision: 0.6176 - val_Recall: 0.0358 - val_loss: 0.1082 - learning_rate: 1.0000e-04
Epoch 2/60

Epoch 2: val_AUC improved from 0.77529 to 0.81216, saving model to /kaggle/working/best_model_2H.h5




253/253 - 12s - 47ms/step - AUC: 0.7981 - Precision: 0.3888 - Recall: 0.2779 - loss: 0.1113 - val_AUC: 0.8122 - val_Precision: 0.6875 - val_Recall: 0.1502 - val_loss: 0.0955 - learning_rate: 1.0000e-04
Epoch 3/60

Epoch 3: val_AUC improved from 0.81216 to 0.81921, saving model to /kaggle/working/best_model_2H.h5




253/253 - 12s - 47ms/step - AUC: 0.8169 - Precision: 0.3927 - Recall: 0.3035 - loss: 0.1074 - val_AUC: 0.8192 - val_Precision: 0.6781 - val_Recall: 0.1689 - val_loss: 0.0947 - learning_rate: 1.0000e-04
Epoch 4/60

Epoch 4: val_AUC did not improve from 0.81921
253/253 - 12s - 47ms/step - AUC: 0.8266 - Precision: 0.4007 - Recall: 0.3261 - loss: 0.1053 - val_AUC: 0.8190 - val_Precision: 0.6347 - val_Recall: 0.1809 - val_loss: 0.0933 - learning_rate: 1.0000e-04
Epoch 5/60

Epoch 5: val_AUC did not improve from 0.81921
253/253 - 12s - 47ms/step - AUC: 0.8351 - Precision: 0.3964 - Recall: 0.3367 - loss: 0.1034 - val_AUC: 0.8156 - val_Precision: 0.6599 - val_Recall: 0.1655 - val_loss: 0.0992 - learning_rate: 1.0000e-04
Epoch 6/60

Epoch 6: val_AUC did not improve from 0.81921

Epoch 6: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
253/253 - 12s - 47ms/step - AUC: 0.8425 - Precision: 0.4089 - Recall: 0.3559 - loss: 0.1015 - val_AUC: 0.8137 - val_Precision: 0.6447 - val_Rec