### INSTALL PACKAGES ###

In [1]:
# Install survivalnet2 package
!pip install ./survivalnet2

import numpy as np
import os
import pandas as pd
import sys
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model

import survivalnet2
from survivalnet2.data.labels import stack_labels, unstack_labels
from survivalnet2.losses import efron
from survivalnet2.metrics.concordance import HarrellsC
from survivalnet2.visualization import km_plot

# Seed the NumPy and TensorFlow global generators with one shared value
# so that repeated runs draw the same random numbers.
SEED = 51
np.random.seed(SEED)
tf.random.set_seed(SEED)

Processing ./survivalnet2
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting glimr@ git+https://github.com/PathologyDataScience/glimr
  Cloning https://github.com/PathologyDataScience/glimr to /private/var/folders/tz/qttd962d27n1g_l3f83f9s95byzb9n/T/pip-install-ieb1dnau/glimr_4785482cfdee4cdaa873699a831d6db0
  Running command git clone -q https://github.com/PathologyDataScience/glimr /private/var/folders/tz/qttd962

Collecting ray[air,tune]>=2.3.0
  Downloading ray-2.3.1-cp39-cp39-macosx_10_15_x86_64.whl (78.0 MB)
[K     |████████████████████████████████| 78.0 MB 35.6 MB/s eta 0:00:01
Collecting pyarrow>=6.0.1
  Downloading pyarrow-11.0.0-cp39-cp39-macosx_10_14_x86_64.whl (24.5 MB)
[K     |████████████████████████████████| 24.5 MB 27.2 MB/s eta 0:00:01
[?25hCollecting colorful
  Downloading colorful-0.5.5-py2.py3-none-any.whl (201 kB)
[K     |████████████████████████████████| 201 kB 28.0 MB/s eta 0:00:01
[?25hCollecting aiorwlock
  Downloading aiorwlock-1.3.0-py3-none-any.whl (10.0 kB)
Collecting opencensus
  Downloading opencensus-0.11.2-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 37.9 MB/s eta 0:00:01
[?25hCollecting uvicorn
  Downloading uvicorn-0.21.1-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 17.4 MB/s eta 0:00:01
[?25hCollecting fastapi
  Downloading fastapi-0.95.0-py3-none-any.whl (57 kB)
[K     |████████████████████

Collecting typing-extensions>=3.6.6
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting anyio<5,>=3.4.0
  Downloading anyio-3.6.2-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 24.5 MB/s eta 0:00:01
Collecting opencensus-context>=0.1.3
  Downloading opencensus_context-0.1.3-py2.py3-none-any.whl (5.1 kB)
Collecting google-api-core<3.0.0,>=1.0.0
  Downloading google_api_core-2.11.0-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 34.6 MB/s eta 0:00:01
[?25hCollecting google-auth<3,>=1.6.3
  Downloading google_auth-2.17.2-py2.py3-none-any.whl (178 kB)
[K     |████████████████████████████████| 178 kB 28.8 MB/s eta 0:00:01
[?25hCollecting protobuf>=3.9.2
  Downloading protobuf-3.19.6-cp39-cp39-macosx_10_9_x86_64.whl (980 kB)
[K     |████████████████████████████████| 980 kB 30.1 MB/s eta 0:00:01
[?25hCollecting googleapis-common-protos<2.0dev,>=1.56.2
  Downloading googleapis_common_protos-1.59.0-py2.py3-none-a

### DATA PREPROCESSING ###

In [2]:
# Define dimensionality
D = 48
print(f"Dimensionality: {D}")

# Define the batch size you want to use
batch_size = 8

# Create data dirs
data_dir = '/Users/lac5440/Desktop/CPSII_40X'  # Modify this to your data dir

# os.listdir returns every directory entry (including hidden or non-CSV files)
# in arbitrary order. Keep only the CSV files and sort them so that the file
# list, and anything derived from its order, is the same on every run.
csv_names = sorted(
    name for name in os.listdir(data_dir) if name.lower().endswith('.csv')
)
data_files = [os.path.join(data_dir, csv_name) for csv_name in csv_names]


Dimensionality: 48


### RAGGED DATALOADER ###

In [20]:
def dataloader(data_files, seed=None):
    """Load per-file feature tables into a ragged tensor with placeholder labels.

    Each path in ``data_files`` is read with ``pd.read_csv``. Files without
    any data rows are skipped. For the remaining files the first three
    columns are dropped and the rest is cast to float32, giving one
    (rows x features) array per file.

    NOTE: the time and event labels are random placeholders, not values read
    from the files: one integer time in [50, 300) and one event in {0, 1}
    is drawn per loaded file.

    Args:
        data_files: iterable of CSV file paths.
        seed: optional integer. When given, the placeholder labels are drawn
            from a private ``np.random.RandomState(seed)`` so repeated calls
            return the same labels. When ``None`` (default) the global NumPy
            generator is used, exactly as before.

    Returns:
        Tuple ``(rows_tensor, labels)`` where ``rows_tensor`` is the ragged
        tensor built from the per-file arrays (ragged along the row
        dimension) and ``labels`` is the result of ``stack_labels`` applied
        to the time and event tensors.

    Raises:
        ValueError: if no file in ``data_files`` contains at least one row.
    """
    # Fall back to the module-level generator to keep the default behavior.
    rng = np.random if seed is None else np.random.RandomState(seed)

    rows_list = []
    time_list = []
    event_list = []
    for data_file in data_files:
        df = pd.read_csv(data_file)
        if df.shape[0] == 0:
            continue
        df = df.iloc[:, 3:]  # Drop the first three columns
        df = df.astype('float32')  # Convert all columns to float32
        rows_list.append(df.values)
        # Draw order (time, then event, per file) is kept so the default
        # random stream matches the previous implementation.
        time_list.append(rng.randint(50, 300, size=(1, 1)).astype('float32'))
        event_list.append(rng.randint(0, 2, size=(1, 1)).astype('float32'))

    # Fail with a clear message instead of the opaque np.concatenate error.
    if not rows_list:
        raise ValueError("dataloader found no non-empty CSV files in data_files")

    time_list = np.concatenate(time_list, axis=0)
    event_list = np.concatenate(event_list, axis=0)
    labels = stack_labels(tf.convert_to_tensor(time_list), tf.convert_to_tensor(event_list))

    # Convert lists to tensors
    rows_tensor = tf.ragged.constant(rows_list, ragged_rank=1, dtype=tf.float32)

    return rows_tensor, labels

# Load data from csv files: `data` is the ragged feature tensor and `labels`
# holds the stacked time/event labels returned by dataloader
data, labels = dataloader(data_files)

# Convert data to a TensorFlow dataset of (features, label) pairs, sliced
# along the first axis (one element per loaded file)
ds = tf.data.Dataset.from_tensor_slices((data, labels))

# Define a function to transform dense dataset to ragged dataset
def dense_to_ragged(dense, D, batch_size):
    """Batch a dataset into ragged batches.

    Args:
        dense: dataset to batch; must provide ``apply``.
        D: feature dimensionality. Accepted for interface compatibility;
            it is not used by the batching itself.
        batch_size: number of elements per batch. Incomplete final batches
            are dropped.

    Returns:
        The dataset produced by applying the ragged batching transformation.
    """
    # Build the batching transformation first, then apply it to the dataset.
    to_ragged_batches = tf.data.experimental.dense_to_ragged_batch(
        batch_size=batch_size, drop_remainder=True
    )
    return dense.apply(to_ragged_batches)

# Transform dataset to a ragged dataset with the defined batch size
# (the final incomplete batch, if any, is dropped by dense_to_ragged)
ragged = dense_to_ragged(ds, D, batch_size)

### MODEL ARCHITECTURE ###

In [4]:
def build_model(D):
    """Build a two-output model over ragged sets of D-dimensional vectors.

    The input is a ragged batch of shape (batch, None, D). A shared
    10-unit selu Dense layer is applied to every vector, followed by two
    separate linear Dense heads ("time" and "event"). Each head's
    per-vector outputs are averaged over the ragged dimension, giving one
    scalar per sample and head.

    Args:
        D: size of the last (feature) dimension of the input.

    Returns:
        A Keras Model with one ragged input and a tuple of two outputs.
        The model's input and output shapes are printed as a side effect.
    """
    # build a simple 2 layer model
    inputs = tf.keras.layers.Input(shape=(None, D), ragged=True)
    beta1 = tf.keras.layers.Dense(units=10, activation="selu")
    beta_time = tf.keras.layers.Dense(units=1, activation="linear", name="time")
    beta_event = tf.keras.layers.Dense(units=1, activation="linear", name="event")

    # Apply the shared hidden layer once and feed both heads from it,
    # instead of running the same layer on the same input twice.
    hidden = beta1(inputs)
    output1 = beta_time(hidden)
    output2 = beta_event(hidden)

    # Average each head's per-vector outputs over the ragged dimension.
    output1 = tf.keras.layers.GlobalAveragePooling1D()(output1)
    output2 = tf.keras.layers.GlobalAveragePooling1D()(output2)
    model = tf.keras.models.Model(inputs=inputs, outputs=(output1, output2))

    print(model.input_shape)
    print(model.output_shape)
    return model

### MODEL TRAINING ###

In [22]:
# Build the two-output model for D-dimensional feature vectors
model = build_model(D)

# Both outputs are trained with the efron loss; HarrellsC is tracked as metric
model.compile(
    loss=[efron, efron],
    metrics=[HarrellsC()],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

# Train the model on the full data converted to a dense tensor; the batched
# `ragged` dataset built earlier is not used by this call.
# NOTE(review): the model input is declared with ragged=True but a dense
# tensor is fed here; padded rows presumably take part in the average
# pooling - confirm whether `data` should be passed directly instead.
model.fit(data.to_tensor(), labels, epochs=200, verbose=1)

(None, None, 48)
((None, 1), (None, 1))
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200

KeyboardInterrupt: 

# attention model

Below is what I could work out. Once the last dimension is set to 48 in `tf.ragged.constant` above, we can apply layers to the data batches.

Suppose we have `N_i` region vectors for patient `i`. Then with 1694 patients, the data has shape `[1694, None, 48]`, where `None` is the ragged dimension.

This code calculates an attention weight from each 48-element region feature vector, producing a `[1694, None, 1]` tensor of attention weights (one weight per region per subject). We then normalize the weights within each subject so that they sum to 1, producing `normalized`. Within each subject, we then take the weighted sum of the region feature vectors, using the normalized attention weights, to produce a single 48-dimensional feature vector for each subject (`pooled.shape` is `[1694, 1, 48]`). Finally, we apply a simple linear model to the pooled vector to produce the scalar `risk` that is used as input to the `cox` regression loss.

In [58]:
# attention weights: a single Dense unit applied to `data` gives one scalar
# score per region feature vector
att = tf.keras.layers.Dense(units=1, activation="selu", name="att")(data)
# optional - more layers here

# normalize weights to sum to 1
# we have to expand_dims of totals here for broadcasting to work correctly
# NOTE(review): selu outputs can be negative, so a subject's total may be
# close to zero; divide_no_nan only guards an exactly-zero total - confirm
# this is acceptable for the attention weights.
totals = tf.reduce_sum(att, axis=1, name="att_total")
normalized = tf.math.divide_no_nan(att, tf.expand_dims(totals, axis=1), name="normalized")

# use attention weights to calculate weighted sum of regions
# (transpose_a=True multiplies the transposed weights with the region vectors,
# which sums over the region dimension)
pooled = tf.linalg.matmul(normalized, data, transpose_a=True)

# apply a linear layer to the pooled vector to generate the risk value
risk = tf.keras.layers.Dense(units=1, activation="linear", name="risk")(pooled)