<a href="https://colab.research.google.com/github/lama-a1/NewTech_Hackathon_Project/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np

# =========================
# 1) Load raw dataset
# =========================
df = pd.read_csv("Dataset.csv")

LABEL = "SepsisLabel"

# Keep only needed columns if they exist
base_cols = ["Hour", "HR", "O2Sat", "Resp", "Temp", "MAP", "Lactate", LABEL]
base_cols = [c for c in base_cols if c in df.columns]
df = df[base_cols].copy()

# =========================
# 2) Ensure Hour is numeric
# =========================
df["Hour"] = pd.to_numeric(df["Hour"], errors="coerce")

# =========================
# 3) Build temporary PatientID (when Hour resets to 0 -> new patient)
# =========================
# If Hour has NaN we treat as -1 so it won't be counted as reset-to-zero
df["PatientID_tmp"] = (df["Hour"].fillna(-1).eq(0)).cumsum()

# =========================
# 4) Sample patients (for hackathon speed)
# =========================
N_PATIENTS = 2000
patient_ids = df["PatientID_tmp"].drop_duplicates().head(N_PATIENTS)
df = df[df["PatientID_tmp"].isin(patient_ids)].copy()

# =========================
# 5) Sort within each patient by time
# =========================
df.sort_values(["PatientID_tmp", "Hour"], inplace=True)

# =========================
# 6) Missing values handling (FFill + BFill per patient)
# =========================
feat_cols = [c for c in df.columns if c not in [LABEL, "PatientID_tmp"]]
df[feat_cols] = df.groupby("PatientID_tmp")[feat_cols].transform(lambda x: x.ffill().bfill())

# =========================
# 7) Baseline flag (first 12 hours: 0..11)
# =========================
df["is_baseline"] = df["Hour"] <= 11

# ============================================================
# 8) NEW: Delta Features (change from previous hour per patient)
# ============================================================
# These capture "trend" which is crucial for sepsis early detection
vitals = [c for c in ["HR", "O2Sat", "Resp", "Temp", "MAP", "Lactate"] if c in df.columns]

for col in vitals:
    df[f"d_{col}"] = df.groupby("PatientID_tmp")[col].diff()  # (t - t-1)
    # Fill first diff in each patient with 0 (no previous hour)
    df[f"d_{col}"] = df[f"d_{col}"].fillna(0)
# =========================
# 9) Baseline Normalization (patient-centric deviation)
# =========================
baseline = df[df["is_baseline"]].copy()

baseline_means = (
    baseline.groupby("PatientID_tmp")[vitals]
    .mean()
    .add_prefix("base_mean_")
    .reset_index()
)
df = df.merge(baseline_means, on="PatientID_tmp", how="left")

baseline_stds = (
    baseline.groupby("PatientID_tmp")[vitals]
    .std()
    .replace(0, np.nan)
    .add_prefix("base_std_")
    .reset_index()
)
df = df.merge(baseline_stds, on="PatientID_tmp", how="left")

for col in vitals:
    df[f"{col}_dev"] = df[col] - df[f"base_mean_{col}"]

for col in vitals:
    std_col = f"base_std_{col}"
    z_col = f"{col}_z"
    df[z_col] = (df[col] - df[f"base_mean_{col}"]) / df[std_col]
    df[z_col] = df[z_col].replace([np.inf, -np.inf], np.nan).fillna(0)

# =========================
# 11) Save preprocessed dataset
# =========================
print(df.head())  # <-- fixed
df.to_csv("PreprocessedDataset.csv", index=False)
print("Saved PreprocessedDataset.csv", df.shape)
print("Patients:", df["PatientID_tmp"].nunique())
print("Columns:", len(df.columns))



   Hour    HR  O2Sat  Resp   Temp   MAP  Lactate  SepsisLabel  PatientID_tmp  \
0     0  65.0  100.0  16.5  35.78  72.0      1.9            0              1   
1     1  65.0  100.0  16.5  35.78  72.0      1.9            0              1   
2     2  78.0  100.0  16.5  35.78  42.5      1.9            0              1   
3     3  73.0  100.0  17.0  35.78  42.5      1.9            0              1   
4     4  70.0  100.0  14.0  35.78  74.0      1.9            0              1   

   is_baseline  ...  Resp_dev  Temp_dev    MAP_dev   Lactate_dev      HR_z  \
0         True  ...  1.458333 -0.050833  -1.541667  2.220446e-16 -0.757842   
1         True  ...  1.458333 -0.050833  -1.541667  2.220446e-16 -0.757842   
2         True  ...  1.458333 -0.050833 -31.041667  2.220446e-16  1.180246   
3         True  ...  1.958333 -0.050833 -31.041667  2.220446e-16  0.434828   
4         True  ... -1.041667 -0.050833   0.458333  2.220446e-16 -0.012424   

   O2Sat_z    Resp_z    Temp_z     MAP_z  Lactate_

In [17]:
# ============================================================
# GRU Multi-task Training (Multi-step Forecast 6h + Sepsis Risk 6h)
# For PreprocessedDataset.csv (already contains 40 preprocessed columns)
# ============================================================

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

# -----------------------------
# 0) SETTINGS
# -----------------------------
CSV_PATH = "/content/PreprocessedDataset.csv"   # عدلي المسار إذا لازم
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Windows
SEQ_LEN = 12          # input: last 12 hours
HORIZON = 6           # forecast: next 6 hours (t+1 .. t+6)
RISK_HORIZON = 6      # risk: sepsis within next 6 hours (same window)

# Training
BATCH_SIZE = 256
EPOCHS = 20
LR = 1e-3
WEIGHT_DECAY = 1e-4
PATIENCE = 4

# Task weights
LAMBDA_FORECAST = 1.0
LAMBDA_RISK = 1.0

THRESH = 0.5
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# -----------------------------
# 1) LOAD DATA
# -----------------------------
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()

PID_COL = "PatientID_tmp"
TIME_COL = "Hour"
LABEL_COL = "SepsisLabel"

VITALS = ["HR", "O2Sat", "Resp", "Temp", "MAP", "Lactate"]

required = [PID_COL, TIME_COL, LABEL_COL] + VITALS
missing_req = [c for c in required if c not in df.columns]
if missing_req:
    raise ValueError(f"Missing required columns: {missing_req}")

df[TIME_COL] = pd.to_numeric(df[TIME_COL], errors="coerce")
df = df.sort_values([PID_COL, TIME_COL]).reset_index(drop=True)

# -----------------------------
# 2) FEATURES (use preprocessed columns directly)
# Recommended: VITALS + deltas + deviations
# -----------------------------
FEATURES = (
    VITALS
    + [f"d_{c}" for c in VITALS]
    + [f"{c}_dev" for c in VITALS]
    # Optional later:
    # + [f"{c}_z" for c in VITALS]
)

missing_feat = [c for c in FEATURES if c not in df.columns]
if missing_feat:
    raise ValueError(f"Missing feature columns in preprocessed file: {missing_feat}")

TARGETS = VITALS  # we forecast these vitals for next 6 hours

# -----------------------------
# 3) PATIENT-LEVEL SPLIT
# -----------------------------
patient_ids = df[PID_COL].unique()
np.random.shuffle(patient_ids)

n = len(patient_ids)
train_ids = set(patient_ids[: int(0.70 * n)])
val_ids   = set(patient_ids[int(0.70 * n): int(0.85 * n)])
test_ids  = set(patient_ids[int(0.85 * n):])

train_df = df[df[PID_COL].isin(train_ids)].copy()
val_df   = df[df[PID_COL].isin(val_ids)].copy()
test_df  = df[df[PID_COL].isin(test_ids)].copy()

# -----------------------------
# 4) STANDARDIZE FEATURES + TARGETS using TRAIN only
# -----------------------------
feat_mean = train_df[FEATURES].mean()
feat_std  = train_df[FEATURES].std().replace(0, 1.0)

tgt_mean = train_df[TARGETS].mean()
tgt_std  = train_df[TARGETS].std().replace(0, 1.0)

def standardize(dfx: pd.DataFrame) -> pd.DataFrame:
    dfx = dfx.copy()
    dfx[FEATURES] = (dfx[FEATURES] - feat_mean) / feat_std
    dfx[TARGETS]  = (dfx[TARGETS]  - tgt_mean)  / tgt_std

    dfx[FEATURES] = dfx[FEATURES].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    dfx[TARGETS]  = dfx[TARGETS].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return dfx

train_df = standardize(train_df)
val_df   = standardize(val_df)
test_df  = standardize(test_df)

# -----------------------------
# 5) WINDOWED SAMPLES (Multi-step)
# X: last SEQ_LEN hours features
# y_forecast: vitals for next 6 hours => shape [HORIZON, 6]
# y_risk: max SepsisLabel in the SAME next 6 hours window
# -----------------------------
@dataclass
class Sample:
    x: np.ndarray               # [SEQ_LEN, F]
    y_forecast: np.ndarray      # [HORIZON, 6]
    y_risk: float               # 0/1

def make_samples(dfx: pd.DataFrame) -> List[Sample]:
    samples = []
    for pid, g in dfx.groupby(PID_COL):
        g = g.sort_values(TIME_COL).reset_index(drop=True)

        X = g[FEATURES].values.astype(np.float32)
        Y = g[TARGETS].values.astype(np.float32)
        L = g[LABEL_COL].values.astype(np.float32)

        T = len(g)

        # Need: SEQ_LEN history + HORIZON future available
        # We predict future steps t+1..t+HORIZON, so last valid t is T - HORIZON - 1
        last_t = T - HORIZON - 1
        for t in range(SEQ_LEN - 1, last_t + 1):
            x_win = X[t-(SEQ_LEN-1): t+1]             # [SEQ_LEN, F]

            # Multi-step targets: next HORIZON rows
            y_future = Y[t+1 : t+1+HORIZON]           # [HORIZON, 6]

            # Risk: sepsis occurs in same future window (t+1..t+HORIZON)
            y_risk = float(np.max(L[t+1 : t+1+HORIZON]))

            samples.append(Sample(x_win, y_future, y_risk))

    return samples

train_samples = make_samples(train_df)
val_samples   = make_samples(val_df)
test_samples  = make_samples(test_df)

print("Train/Val/Test samples:", len(train_samples), len(val_samples), len(test_samples))
print("Features:", len(FEATURES))

class SepsisDataset(Dataset):
    def __init__(self, samples: List[Sample]):
        self.samples = samples
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        s = self.samples[idx]
        x = torch.from_numpy(s.x)                            # [SEQ_LEN, F]
        y_f = torch.from_numpy(s.y_forecast)                 # [HORIZON, 6]
        y_r = torch.tensor([s.y_risk], dtype=torch.float32)  # [1]
        return x, y_f, y_r

train_loader = DataLoader(SepsisDataset(train_samples), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(SepsisDataset(val_samples),   batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(SepsisDataset(test_samples),  batch_size=BATCH_SIZE, shuffle=False)

# -----------------------------
# 6) CLASS IMBALANCE (pos_weight)
# -----------------------------
y_train_cls = np.array([s.y_risk for s in train_samples], dtype=np.float32)
pos = float((y_train_cls == 1).sum())
neg = float((y_train_cls == 0).sum())
pos_weight = torch.tensor([neg / max(pos, 1.0)], dtype=torch.float32).to(DEVICE)
print("Risk positives:", int(pos), "negatives:", int(neg), "pos_weight:", pos_weight.item())

# -----------------------------
# 7) MODEL: GRU Multi-task (Multi-step Forecast)
# Forecast head outputs: [B, HORIZON, 6]
# -----------------------------
class GRUMultiTaskMultiStep(nn.Module):
    def __init__(self, n_features: int, hidden: int = 128, num_layers: int = 1, dropout: float = 0.15):
        super().__init__()
        self.gru = nn.GRU(
            input_size=n_features,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.shared = nn.Sequential(
            nn.LayerNorm(hidden),
            nn.Dropout(dropout)
        )

        # Forecast head: produce HORIZON*len(VITALS) then reshape to [HORIZON, 6]
        self.forecast_head = nn.Sequential(
            nn.Linear(hidden, 128),
            nn.ReLU(),
            nn.Linear(128, HORIZON * len(TARGETS))
        )

        # Risk head
        self.risk_head = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # logits
        )

    def forward(self, x):
        out, _ = self.gru(x)          # [B, SEQ_LEN, H]
        last = out[:, -1, :]          # [B, H]
        z = self.shared(last)

        y_flat = self.forecast_head(z)                    # [B, HORIZON*6]
        y_fore = y_flat.view(-1, HORIZON, len(TARGETS))   # [B, HORIZON, 6]

        y_logit = self.risk_head(z)                       # [B, 1]
        return y_fore, y_logit

model = GRUMultiTaskMultiStep(n_features=len(FEATURES)).to(DEVICE)

# Losses
mse_loss = nn.MSELoss()
bce_loss = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# -----------------------------
# 8) METRICS
# -----------------------------
@torch.no_grad()
def eval_metrics(all_probs, all_true):
    all_probs = np.asarray(all_probs)
    all_true = np.asarray(all_true).astype(int)
    preds = (all_probs >= THRESH).astype(int)
    metrics = {
        "acc": accuracy_score(all_true, preds),
        "f1": f1_score(all_true, preds, zero_division=0),
        "precision": precision_score(all_true, preds, zero_division=0),
        "recall": recall_score(all_true, preds, zero_division=0),
    }
    metrics["auc"] = roc_auc_score(all_true, all_probs) if len(np.unique(all_true)) > 1 else None
    return metrics

def run_epoch(loader, train: bool):
    model.train(train)

    total_loss = 0.0
    total_mse = 0.0
    total_bce = 0.0

    all_probs = []
    all_true = []

    for x, y_f, y_r in loader:
        x = x.to(DEVICE)     # [B, SEQ_LEN, F]
        y_f = y_f.to(DEVICE) # [B, HORIZON, 6]
        y_r = y_r.to(DEVICE) # [B, 1]

        if train:
            optimizer.zero_grad()

        pred_f, pred_logit = model(x)  # pred_f: [B, HORIZON, 6]

        loss_f = mse_loss(pred_f, y_f)         # multi-step MSE
        loss_r = bce_loss(pred_logit, y_r)

        loss = LAMBDA_FORECAST * loss_f + LAMBDA_RISK * loss_r

        if train:
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        total_loss += loss.item() * x.size(0)
        total_mse  += loss_f.item() * x.size(0)
        total_bce  += loss_r.item() * x.size(0)

        probs = torch.sigmoid(pred_logit).detach().cpu().numpy().ravel()
        true  = y_r.detach().cpu().numpy().ravel()
        all_probs.append(probs)
        all_true.append(true)

    n = len(loader.dataset)
    all_probs = np.concatenate(all_probs)
    all_true  = np.concatenate(all_true)

    met = eval_metrics(all_probs, all_true)
    return {
        "loss": total_loss / n,
        "mse": total_mse / n,
        "bce": total_bce / n,
        **met
    }

# -----------------------------
# 9) TRAIN LOOP (early stopping)
# -----------------------------
best_val = float("inf")
best_state = None
bad = 0

for epoch in range(1, EPOCHS + 1):
    tr = run_epoch(train_loader, train=True)
    va = run_epoch(val_loader, train=False)

    print(
        f"Epoch {epoch:02d} | "
        f"Train loss={tr['loss']:.4f} mse={tr['mse']:.4f} bce={tr['bce']:.4f} "
        f"auc={tr['auc']} f1={tr['f1']:.3f} recall={tr['recall']:.3f} | "
        f"Val loss={va['loss']:.4f} mse={va['mse']:.4f} bce={va['bce']:.4f} "
        f"auc={va['auc']} f1={va['f1']:.3f} recall={va['recall']:.3f}"
    )

    if va["loss"] < best_val:
        best_val = va["loss"]
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= PATIENCE:
            print("Early stopping.")
            break

if best_state is not None:
    model.load_state_dict(best_state)

# -----------------------------
# 10) TEST EVALUATION
# -----------------------------
te = run_epoch(test_loader, train=False)
print("\nTEST RESULTS:")
for k, v in te.items():
    print(f"{k}: {v}")

# -----------------------------
# 11) INFERENCE helper: predict next 6 hours + risk
# Outputs are de-standardized back to original units
# -----------------------------
@torch.no_grad()
def predict_next_6h(pid: int, current_hour: int) -> Tuple[np.ndarray, float]:
    g_std = test_df[test_df[PID_COL] == pid].sort_values(TIME_COL).reset_index(drop=True)
    if g_std.empty:
        raise ValueError("Patient not found in test_df (try a different pid).")

    idx_list = g_std.index[g_std[TIME_COL] == current_hour].tolist()
    if not idx_list:
        raise ValueError("Hour not found for this patient in test set.")
    idx = idx_list[0]

    if idx < SEQ_LEN - 1:
        raise ValueError(f"Need at least {SEQ_LEN} hours history.")

    x_win = g_std.loc[idx-(SEQ_LEN-1): idx, FEATURES].values.astype(np.float32)
    x = torch.from_numpy(x_win).unsqueeze(0).to(DEVICE)  # [1, SEQ_LEN, F]

    pred_f_std, pred_logit = model(x)  # pred_f_std: [1, HORIZON, 6]
    risk = torch.sigmoid(pred_logit).item()

    pred_f_std = pred_f_std.squeeze(0).cpu().numpy()  # [HORIZON, 6]

    # De-standardize per vital
    mean = tgt_mean.values.reshape(1, -1)  # [1,6]
    std  = tgt_std.values.reshape(1, -1)   # [1,6]
    pred_f = pred_f_std * std + mean       # [HORIZON, 6] in original units

    return pred_f, risk

print("\nReady ✅ call: predict_next_6h(pid, hour)")


Train/Val/Test samples: 30269 7343 7294
Features: 18
Risk positives: 805 negatives: 29464 pos_weight: 36.60124206542969
Epoch 01 | Train loss=148.7602 mse=147.4075 bce=1.3527 auc=0.5172445203157701 f1=0.052 recall=0.468 | Val loss=2.8081 mse=1.4644 bce=1.3437 auc=0.6389148711743993 f1=0.048 recall=1.000
Epoch 02 | Train loss=2.6744 mse=1.3185 bce=1.3559 auc=0.5106166615792216 f1=0.050 recall=0.528 | Val loss=1.6078 mse=0.3067 bce=1.3011 auc=0.6418264383327905 f1=0.042 recall=0.056
Epoch 03 | Train loss=2.5387 mse=1.1889 bce=1.3499 auc=0.5050793430618774 f1=0.050 recall=0.447 | Val loss=1.6353 mse=0.3351 bce=1.3002 auc=0.6383451998697007 f1=0.041 recall=0.056
Epoch 04 | Train loss=2.4557 mse=1.1165 bce=1.3392 auc=0.5503171993868083 f1=0.058 recall=0.461 | Val loss=1.5501 mse=0.2559 bce=1.2942 auc=0.6496300432779561 f1=0.064 recall=0.806
Epoch 05 | Train loss=2.4214 mse=1.0866 bce=1.3348 auc=0.5641880269089303 f1=0.058 recall=0.550 | Val loss=1.5227 mse=0.2415 bce=1.2813 auc=0.6516520080

In [24]:
import torch
import numpy as np

artifact = {
    "model_state": model.state_dict(),
    "FEATURES": FEATURES,
    "VITALS": VITALS,
    "SEQ_LEN": SEQ_LEN,
    "HORIZON": HORIZON,
    "RISK_HORIZON": RISK_HORIZON,
    "feat_mean": feat_mean.values.astype(np.float32),
    "feat_std":  feat_std.values.astype(np.float32),
    "tgt_mean":  tgt_mean.values.astype(np.float32),
    "tgt_std":   tgt_std.values.astype(np.float32),
}
torch.save(artifact, "sepsis_gru_multistep_artifact.pt")
print("✅ Saved: sepsis_gru_multistep_artifact.pt")


✅ Saved: sepsis_gru_multistep_artifact.pt
