In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from sklearn.preprocessing import StandardScaler
import wandb



In [2]:
# Authenticate this kernel with Weights & Biases up front; the pipeline below
# starts and logs to a W&B run when it is created with use_wandb=True.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkayleeyvo[0m ([33mkayleeyvo-harvard-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import os

# Service-account key file for the Google Cloud clients created later in the
# notebook. The path is relative, so it is resolved against the kernel's
# current working directory.
# NOTE(review): the key file name is hard-coded here; consider supplying it from
# the environment instead of committing it with the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "dosewise-473716-9f4874e812d6.json"

In [4]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per input sequence.
    """

    def __init__(self, input_size=5, hidden_size=50, num_layers=2, output_size=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            # Dropout is applied between stacked layers only; with a single
            # layer a non-zero value has no effect and just emits a warning.
            dropout=0.2 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one prediction per sequence.

        Args:
            x: Tensor laid out as (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Omitting the initial state makes nn.LSTM start from zeros created on
        # x's own device and dtype. Building the zeros by hand pinned them to
        # CPU/float32 and broke the model after .to(device) or .double().
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])  # Use last time step

In [5]:
class FlexibleLSTMModel(nn.Module):
    """LSTM regressor that can be evaluated with one input feature blanked out.

    The same weights serve both modes: with ``use_phen=False`` the feature
    column at ``phen_index`` (PHEN_RATE in this notebook) is zeroed before the
    sequence is fed to the LSTM.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced per input sequence.
        phen_index: Position of the maskable feature on the last axis.
    """

    def __init__(
        self, input_size=5, hidden_size=50, num_layers=2, output_size=1, phen_index=4
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.phen_index = phen_index

        # Single LSTM that can handle both cases
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            # Dropout is applied between stacked layers only; with a single
            # layer a non-zero value has no effect and just emits a warning.
            dropout=0.2 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, use_phen=True):
        """Predict from the last time step of each sequence.

        Args:
            x: Tensor laid out as (batch, seq_len, input_size).
            use_phen: When false, the ``phen_index`` feature is replaced by
                zeros on a copy of ``x``; the caller's tensor is not modified.

        Returns:
            Tensor of shape (batch, output_size).
        """
        if not use_phen:
            # Work on a copy so the in-place masking never leaks to the caller.
            x = x.clone()
            x[:, :, self.phen_index] = 0

        # Omitting the initial state makes nn.LSTM start from zeros created on
        # x's own device and dtype. Building the zeros by hand pinned them to
        # CPU/float32 and broke the model after .to(device) or .double().
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

In [10]:
class MedicalPredictor:
    def __init__(self, use_wandb=True):
        self.model = FlexibleLSTMModel(
            input_size=5, hidden_size=50, num_layers=2, output_size=1
        )
        self.scaler = StandardScaler()
        self.sequence_length = 10

        # W&B setup
        self.use_wandb = use_wandb
        if use_wandb:
            import wandb

            self.wandb = wandb
            # Initialize W&B (you'll configure this in your notebook)

    def save_model(self, base_path="medical_model"):
        """Save model, scaler, and metadata locally with timestamp versioning"""
        import os
        from datetime import datetime

        # Create base directory
        os.makedirs(base_path, exist_ok=True)

        # Add timestamp for versioning
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_filename = f"model_{timestamp}.pth"
        local_path = f"{base_path}/{model_filename}"

        # Save model state with timestamp
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "model_config": {
                    "input_size": 5,
                    "hidden_size": 50,
                    "num_layers": 2,
                    "output_size": 1,
                },
                "scaler_mean": (
                    self.scaler.mean_ if hasattr(self.scaler, "mean_") else None
                ),
                "scaler_scale": (
                    self.scaler.scale_ if hasattr(self.scaler, "scale_") else None
                ),
                "sequence_length": self.sequence_length,
                "timestamp": timestamp,  # Include timestamp in model metadata
            },
            local_path,
        )

        print(f"Model saved to {local_path}")
        return local_path  # Return full path for bucket upload

    def save_to_bucket(self, bucket_name, local_path):
        """Upload model files to Google Cloud Storage with timestamp versioning"""
        import os

        client = storage.Client()
        bucket = client.bucket(bucket_name)

        # Extract filename from local_path
        filename = os.path.basename(local_path)

        # Upload model file with timestamped name
        model_blob = bucket.blob(f"models/{filename}")
        model_blob.upload_from_filename(local_path)

        print(f"Model uploaded to gs://{bucket_name}/models/{filename}")

    def load_data_from_bigquery(self):
        """Fetch medical time series data from BigQuery - sampled for speed"""
        client = bigquery.Client()

        query = """
        WITH top_patients AS (
        SELECT DISTINCT id
        FROM `dosewise-473716.dosewisedb.hemodyn_table`
        ORDER BY id
        LIMIT 5
        )
        SELECT 
            h.id, h.ART, h.ECG_II, h.PLETH, h.CO2, h.PHEN_RATE, h.time
        FROM `dosewise-473716.dosewisedb.hemodyn_table` h
        INNER JOIN top_patients t ON h.id = t.id
        ORDER BY h.id, h.time
        """

        results = client.query(query).result()
        df = pd.DataFrame([dict(row) for row in results])

        # Debug info
        print(f"Loaded {len(df)} rows from {df['id'].nunique()} patients")
        print(f"Patient IDs: {sorted(df['id'].unique())}")
        print(f"Time range: {df['time'].min()} to {df['time'].max()}")
        print(
            f"Features summary:\n{df[['ART', 'ECG_II', 'PLETH', 'CO2', 'PHEN_RATE']].describe()}"
        )

        return df

    def prepare_sequences(self, df):
        """Convert time series data to sequences with per-patient scaling"""
        features = ["ART", "ECG_II", "PLETH", "CO2", "PHEN_RATE"]

        all_sequences_X = []
        all_sequences_y = []

        for patient_id, patient_data in df.groupby("id"):
            patient_data = patient_data.sort_values("time")
            data = patient_data[features].dropna().values

            if len(data) > self.sequence_length:
                # Scale PER PATIENT (crucial!)
                scaler = StandardScaler()
                scaled_data = scaler.fit_transform(data)

                X_patient, y_patient = [], []
                for i in range(len(scaled_data) - self.sequence_length):
                    X_patient.append(scaled_data[i : (i + self.sequence_length)])
                    y_patient.append(scaled_data[i + self.sequence_length, 0])

                all_sequences_X.extend(X_patient)
                all_sequences_y.extend(y_patient)

        return np.array(all_sequences_X), np.array(all_sequences_y)

    def train_model(self, X, y, epochs=50):
        """Train the flexible LSTM model with both configurations"""
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

        X_tensor = torch.FloatTensor(X)
        y_tensor = torch.FloatTensor(y).view(-1, 1)

        # W&B: Log training start
        if self.use_wandb:
            self.wandb.log(
                {"dataset_size": len(X), "sequence_length": self.sequence_length}
            )

        for epoch in range(epochs):
            self.model.train()
            optimizer.zero_grad()

            # Get predictions with PHEN_RATE
            pred_with_phen = self.model(X_tensor, use_phen=True)
            loss_with_phen = criterion(pred_with_phen, y_tensor)

            # Get predictions without PHEN_RATE
            pred_without_phen = self.model(X_tensor, use_phen=False)
            loss_without_phen = criterion(pred_without_phen, y_tensor)

            # Total loss (both branches)
            total_loss = loss_with_phen + loss_without_phen

            total_loss.backward()
            optimizer.step()

            # W&B: Log metrics every epoch
            if self.use_wandb and epoch % 5 == 0:  # Log every 5 epochs to avoid spam
                self.wandb.log(
                    {
                        "epoch": epoch,
                        "loss_with_phen": loss_with_phen.item(),
                        "loss_without_phen": loss_without_phen.item(),
                        "total_loss": total_loss.item(),
                        "learning_rate": optimizer.param_groups[0]["lr"],
                    }
                )

            if epoch % 10 == 0:
                print(
                    f"Epoch [{epoch}/{epochs}], "
                    f"With PHEN: {loss_with_phen.item():.4f}, "
                    f"Without PHEN: {loss_without_phen.item():.4f}"
                )

    def predict_sequence(self, input_sequence, num_predictions=600):
        """Generate a sequence of predictions for the next 10 minutes"""
        self.model.eval()
        with torch.no_grad():
            # Start with the initial input sequence
            current_sequence = torch.FloatTensor(input_sequence).unsqueeze(
                0
            )  # shape: (1, seq_len, 5)
            predictions_with_phen = []
            predictions_without_phen = []
            phen_effects = []

            for i in range(num_predictions):
                # Get predictions for current sequence
                pred_with_phen = self.model(current_sequence, use_phen=True)
                pred_without_phen = self.model(current_sequence, use_phen=False)

                # Store predictions
                pred_with_phen_val = pred_with_phen.item()
                pred_without_phen_val = pred_without_phen.item()
                phen_effect_val = pred_with_phen_val - pred_without_phen_val

                predictions_with_phen.append(pred_with_phen_val)
                predictions_without_phen.append(pred_without_phen_val)
                phen_effects.append(phen_effect_val)

                # Update the sequence for next prediction (autoregressive)
                # Remove oldest time step, add new prediction
                if i < num_predictions - 1:
                    # Create new sequence by shifting window
                    new_sequence = current_sequence[
                        :, 1:, :
                    ].clone()  # Remove oldest time step

                    # Create new time step with predicted ART and current features
                    last_timestep = current_sequence[
                        :, -1:, :
                    ].clone()  # Get last timestep

                    # Update ART with prediction (with phen version)
                    last_timestep[:, :, 0] = pred_with_phen  # ART is index 0

                    # Append new timestep
                    current_sequence = torch.cat([new_sequence, last_timestep], dim=1)

                if (i + 1) % 100 == 0:
                    print(f"Generated {i + 1}/{num_predictions} predictions...")

            if self.use_wandb:
                self.wandb.log(
                    {
                        "predictions/with_phen_mean": np.mean(predictions_with_phen),
                        "predictions/with_phen_std": np.std(predictions_with_phen),
                        "predictions/without_phen_mean": np.mean(
                            predictions_without_phen
                        ),
                        "predictions/without_phen_std": np.std(
                            predictions_without_phen
                        ),
                        "predictions/phen_effect_mean": np.mean(phen_effects),
                        "predictions/phen_effect_std": np.std(phen_effects),
                        "num_predictions": num_predictions,
                    }
                )

                # Log sample predictions as table
                predictions_table = self.wandb.Table(
                    columns=["Timestep", "With_PHEN", "Without_PHEN", "Effect"]
                )
                for i in range(min(50, num_predictions)):  # First 50 predictions
                    predictions_table.add_data(
                        i + 1,
                        predictions_with_phen[i],
                        predictions_without_phen[i],
                        phen_effects[i],
                    )
                self.wandb.log({"sample_predictions": predictions_table})

            return {
                "predictions_with_phen": predictions_with_phen,
                "predictions_without_phen": predictions_without_phen,
                "phen_effects": phen_effects,
                "timestamps": list(range(1, num_predictions + 1)),
            }

    def run(self):
        """Main pipeline"""
        # W&B: Start run (you'll configure this in notebook)
        if self.use_wandb:
            self.wandb.init(
                project="dosewise-medical",
                config={
                    "model_type": "FlexibleLSTM",
                    "input_size": 5,
                    "hidden_size": 150,
                    "num_layers": 3,
                    "sequence_length": 20,
                    "features": ["ART", "ECG_II", "PLETH", "CO2", "PHEN_RATE"],
                },
            )

        print("Loading data from BigQuery...")
        df = self.load_data_from_bigquery()

        # W&B: Log dataset info
        if self.use_wandb:
            self.wandb.log(
                {
                    "data/num_patients": df["id"].nunique(),
                    "data/num_samples": len(df),
                }
            )

        print("Preparing sequences...")
        X, y = self.prepare_sequences(df)

        print(f"Training flexible model on {len(X)} sequences...")
        self.train_model(X, y, epochs=50)

        print("Model training completed!")

        # Generate predictions
        if len(X) > 0:
            print("Generating 10-minute prediction sequence...")
            predictions = self.predict_sequence(X[0], num_predictions=600)

            # Your existing print statements...

        # W&B: Finish run
        if self.use_wandb:
            self.wandb.finish()

In [11]:
# Build the predictor with Weights & Biases logging switched on.
predictor = MedicalPredictor(use_wandb=True)

# Run the whole pipeline: BigQuery load, sequence building, training and the
# prediction rollout. Metrics reach W&B through the log calls made in each step.
predictor.run()

Loading data from BigQuery...
Loaded 80637 rows from 5 patients
Patient IDs: [np.int64(20), np.int64(28), np.int64(61), np.int64(163), np.int64(185)]
Time range: 78414 to 544059
Features summary:
                ART        ECG_II         PLETH           CO2     PHEN_RATE
count  80637.000000  80637.000000  80637.000000  80637.000000  80637.000000
mean      74.036060      0.053053     37.622808     21.535812      3.904897
std       32.811139      0.305834     14.279384     18.631023      5.307602
min     -289.247009     -4.956260     -5.893770      0.000000      0.000000
25%       60.312302     -0.028849     32.814499      1.300000      0.000000
50%       73.149200      0.020523     37.949299     22.400000      0.100000
75%       89.935997      0.089645     44.269001     39.799999      5.100000
max      345.687012      5.145430     96.801804     67.800003     30.000000
Preparing sequences...
Training flexible model on 80587 sequences...
Epoch [0/50], With PHEN: 1.0152, Without PHEN: 1.01

0,1
data/num_patients,▁
data/num_samples,▁
dataset_size,▁
epoch,▁▂▃▃▄▅▆▆▇█
learning_rate,▁▁▁▁▁▁▁▁▁▁
loss_with_phen,██▇▆▄▂▂▂▁▁
loss_without_phen,██▇▆▄▂▂▂▁▁
num_predictions,▁
predictions/phen_effect_mean,▁
predictions/phen_effect_std,▁

0,1
data/num_patients,5
data/num_samples,80637
dataset_size,80587
epoch,45
learning_rate,0.001
loss_with_phen,0.47399
loss_without_phen,0.46836
num_predictions,600
predictions/phen_effect_mean,-0.05514
predictions/phen_effect_std,0.00074


In [None]:
# Write a timestamped checkpoint under trained_model/ and keep the file path
# that save_model returns.
model_path = predictor.save_model("trained_model")

# Pass the FILE path, not the directory: the upload names the bucket object
# after the file's base name.
predictor.save_to_bucket("dosewisedb", model_path)

Model saved to trained_model/model_20251011_025059.pth
Model saved to trained_model/model_20251011_025059.pth
Model uploaded to gs://dosewisedb/models/model_20251011_025059.pth
