In [3]:
import pandas as pd
import numpy as np
import time
import sys
import os
import opticl
import utils_gastric as gi

# Optimization modelling (Pyomo). The package name is `pyomo`; the star import
# supplies ConcreteModel, Var, Objective, Constraint, SolverFactory, value, etc.
# that the cells below use unqualified.
from pyomo import environ
from pyomo.environ import *

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

## Data loading

Load ```tox_summary```, which reports quantiles for various toxicities as candidate upper bounds

In [4]:
# Toxicity quantile table, keyed by outcome name so bounds can be looked up
# with tox_summary.loc[outcome, quantile_column].
tox_summary = (
    pd.read_csv('processed-data/gastric_toxicity_summary.csv')
    .set_index('outcome')
)

Load training and testing datasets for each outcome. Note that in this case, the training data (X) is the same for all outcomes, but our framework is generic to independent training sets. Thus, we load each as a separate file for this example.

In [5]:
outcomes = gi.outcomes

# Each dictionary maps outcome name -> (feature frame X, target series y).
datasets_train = {}
datasets_test = {}

# Training datasets: one CSV per outcome, with the target stored in the column named after it
for o in outcomes:
    train_frame = pd.read_csv(f'processed-data/data_train_{o}.csv')
    datasets_train[o] = (train_frame.drop(columns=[o]), train_frame[o])

# Testing datasets: same layout as the training files
for o in outcomes:
    test_frame = pd.read_csv(f'processed-data/data_test_{o}.csv')
    datasets_test[o] = (test_frame.drop(columns=[o]), test_frame[o])

## Specify the outcomes and relevant parameters for ML training + optimization
```outcome_list``` is a dictionary where each key is an outcome, and each value is a dictionary corresponding to that outcome. The outcome-specific dictionary specifies relevant parameters and data for (1) ML model training and (2) the final optimization problem.

In [40]:
# Outcomes handled in this example
outcomes = gi.outcomes

## ML training configuration
# Candidate algorithms mapped to the grid searched in CV (None -> default grid)
alg_dict = dict.fromkeys(['cart', 'linear', 'rf'])
# Bootstrap iterations: 0 trains a single model per outcome, >= 1 trains bootstrapped models
bs = 0
# With multiple models per outcome: False keeps the single best, True groups them as an ensemble
gr = False
# For a grouped ensemble: proportion of models allowed to violate, or "average" to constrain the mean
viol_rule = 0.5

Add all constraint outcomes to the dictionary. For each toxicity, we want to enforce that the toxicity is below a certain quantile of the toxicities observed in the data (which we loaded in `tox_summary`). Here, we select the 70th percentile, given in column ```quantile_0.7```.

We also specify the training and testing data for ML model training as well as reporting test set metrics. dataset_path specifies the data that will be used to define the trust region in the downstream optimization task.

In [41]:
# Every outcome except the last one in gi.outcomes is embedded as a constraint.
constraints_embed = gi.outcomes[:-1]
# Column of tox_summary that provides each constraint's upper bound.
ub_quantile = 'quantile_0.7'

outcome_list = {}
for outcome in constraints_embed:
    X_train_o, y_train_o = datasets_train[outcome]
    X_test_o, y_test_o = datasets_test[outcome]
    outcome_list[outcome] = {
        # Constraint definition: upper bound only, no contribution to the objective
        'lb': None,
        'ub': tox_summary.loc[outcome, ub_quantile],
        'objective_weight': 0,
        'group_models': gr,
        # ML training settings
        'task_type': 'continuous',
        'alg_list': alg_dict,
        'bootstrap_iterations': bs,
        # Data for model fitting and test-set metrics
        'X_train': X_train_o,
        'y_train': y_train_o,
        'X_test': X_test_o,
        'y_test': y_test_o,
        # Data used to define the trust region in the optimization
        'dataset_path': f'processed-data/data_train_{outcome}.csv',
    }

Add the objective outcome to dictionary. In this case, the only outcome that we are seeking to optimize is overall survival (OS). We want to maximize survival; since the optimization formulation assumes that the objective will be minimized, we set the weight to -1.

We similarly specify the training and testing sets and the trust region dataset path.

In [42]:
# Objective outcome: unbounded, with weight -1 so that minimizing the
# objective maximizes predicted overall survival.
X_train_os, y_train_os = datasets_train['OS']
X_test_os, y_test_os = datasets_test['OS']

outcome_list['OS'] = {
    'lb': None,
    'ub': None,
    'objective_weight': -1,
    'group_models': gr,
    'task_type': 'continuous',
    'alg_list': alg_dict,
    'bootstrap_iterations': bs,
    'X_train': X_train_os,
    'y_train': y_train_os,
    'X_test': X_test_os,
    'y_test': y_test_os,
    'dataset_path': 'processed-data/data_train_OS.csv',
}

In [43]:
# Echo the ML training configuration so it is recorded with the cell output.
print("Algorithms = %s" % alg_dict)
print("Bootstrap iterations = %d" % bs)
print("Violation rule = %s" % str(viol_rule))
# Tag used in the filename of the saved performance table.
code_version = 'AAAI-23_CHEMOexample'

# Tag passed to opticl.train_ml_models.
# NOTE(review): differs from code_version only by the leading 'v' - confirm both tags are intended.
version = 'vAAAI-23_CHEMOexample'

Algorithms = {'cart': None, 'linear': None, 'rf': None}
Bootstrap iterations = 0
Violation rule = 0.5


## Train candidate ML models and select models to embed 

In [44]:
# Train every candidate model for every outcome and persist the performance table.
performance = opticl.train_ml_models(outcome_list, version)
os.makedirs('results', exist_ok=True)
performance.to_csv('results/%s_performance.csv' % (code_version))
# To reuse a previous training run instead of retraining, load the saved table:
# performance = pd.read_csv('results/%s_performance.csv' % (code_version))

print("\nPreparing model master")
# Translate viol_rule into the grouping method and violation limit used for the model master.
constrain_mean = (viol_rule == 'average')
if not constrain_mean:
    gr_method = 'violation'
    max_viol = float(viol_rule)
    gr_string = 'violation_%.2f' % max_viol
    print("Group method = %s (violation limit = %.2f)" % (gr_method, max_viol))
else:
    gr_method = 'average'
    max_viol = None
    gr_string = 'average'
    print("Group method = %s" % (gr_method))

Learning a model for Neutro4
No bootstrap - training on full training data
training Neutro4 with cart
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = cart, metric = None
saving... results/cart_Neutro4_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.0200432326421685
Train R2: 0.3021467451768366
-------------------testing evaluation-----------------------
Test MSE: 0.019041832379617617
Test R2: -0.27527754538206173

training Neutro4 with linear
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = linear, metric = None
saving... results/linear_Neutro4_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.02150490628718667
Train R2: 0.251255068725447
-------------------testing evaluation-----------------------
Tes

saving... results/linear_DLT_PROP_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.024609788630514644
Train R2: 0.1568406391760241
-------------------testing evaluation-----------------------
Test MSE: 0.02830775991392976
Test R2: 0.13945443947251135

training DLT_PROP with rf
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = rf_shallow, metric = None
saving... results/rf_shallow_DLT_PROP_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train MSE: 0.013322680881295186
Train R2: 0.5435497937432003
-------------------testing evaluation-----------------------
Test MSE: 0.025273454578621846
Test R2: 0.23169621323078016

Learning a model for OS
No bootstrap - training on full training data
training OS with cart
------------- Initialize grid  ----------------
------------- Runni

You can optionally inspect the individual models, saved as pickle files. For example, here we look at the linear Neutropenia model.

In [45]:
import pickle as pkl

# NOTE: unpickling executes arbitrary code; only load model files produced by this notebook.
outcome = 'Neutro4'
model_path = 'results/linear_%s_trained.pkl' % outcome
with open(model_path, "rb") as model_file:
    m = pkl.load(model_file)

# Generate predictions for the test-set rows labelled 0 through 5
# (label-based .loc slicing includes both endpoints, so this is six rows)
X_test_outcome = datasets_test[outcome][0]
m.predict(X_test_outcome.loc[0:5, :])

array([0.09976899, 0.10513361, 0.11800877, 0.11800877, 0.1753539 ,
       0.2175087 ])

## Formulate + solve optimization problem

Define the basic conceptual model for the chemo optimization problem, without embedded constraints and objective terms.

In [46]:
def init_conceptual_model(pt, contex_vars):
    """Build the base Pyomo model for the chemo problem, before any learned models are embedded.

    Parameters
    ----------
    pt : mapping-like (e.g. a pandas Series or dict)
        Feature values of the sample being optimized. Its keys define the
        index set of the decision variable ``x`` and supply the values that
        the contextual features are fixed to.
    contex_vars : iterable
        Keys of ``pt`` whose ``x`` entries are fixed to ``pt[key]``
        (the non-optimization, contextual features).

    Returns
    -------
    ConcreteModel
        Model named 'chemo' with variable ``x``, a placeholder objective
        ``OBJ`` (constant 0, minimized), ``Constraint1`` (at most three
        indicator features active) and ``Constraint2`` (context fixed).
    """
    model = ConcreteModel('chemo')

    N = list(pt.keys())
    ########### STEP 1: Define Decision Variables ###########
    # One non-negative x variable per feature of the sample
    model.x = Var(N, domain=NonNegativeReals)

    # Restrict indicator features (name contains '_Ind') to be binary
    x_binary = [i for i in N if '_Ind' in i]
    for i in x_binary:
        model.x[i].domain = Binary

    ########### STEP 2: Define Objective function ###########
    # Defined at function scope (not inside the loop above) so that it exists
    # even when the sample has no indicator features.
    def obj_function(model):
        return 0
    model.OBJ = Objective(rule=obj_function, sense=minimize)

    ###### STEP 3: Add (optionally) any known constraints #####
    # At most three indicator variables may be switched on
    def constraint_rule1(model):
        return sum(model.x[i] for i in x_binary) <= 3
    model.Constraint1 = Constraint(rule=constraint_rule1)

    ###### STEP 4: Fix (optionally) any non-optimization variables #####
    # Pin each contextual feature to the value observed for this sample
    def constraint_rule2(model, i):
        return model.x[i] == pt[i]
    model.Constraint2 = Constraint(contex_vars, rule=constraint_rule2)

    return model

Select a test sample to optimize (i.e. a cohort of patients treated in 2008 or later), given as a row number of the test set.

In [74]:
# Row label of the test-set sample to optimize; it is used below to pull the sample's row with .loc.
patient_ID = 5 # 2 (alternative sample noted by the author)
print("\nPatient %d" % patient_ID)


Patient 5


Specify **global** trust region data and clustering model (if relevant). This will force the solution to lie within the convex hull of the datapoints given in 'data'. It will only enforce the convex hull condition on the features that appear in 'data': features not in the trust region dataset will not be constrained by the convex hull. 

In this case, we enforce the trust region *only* on the treatment features. We do not apply it to the contextual features.

In [68]:
# Global trust region: the treatment columns (gi.T_cols) of the OS training features.
# The columns are taken from the training frame itself, so this cell does not
# depend on variables that are only defined in later cells.
X_train_tr = datasets_train['OS'][0]
tr_data = X_train_tr[[i for i in gi.T_cols if i in X_train_tr.columns]]
trust_region_specs = {'data': tr_data,
                      'clustering_model':None,
                      'enlargement':[0]}

Pull the test sample's data and define model master. If the outcome models are not grouped (i.e., not ensemble), we use the validation metrics from the cross-validation training procedure to select the best model for each outcome. Otherwise, all models will be used.

In [69]:
# Pull the selected test sample (one row of the OS test features) and its feature names.
X_test_os_features = datasets_test['OS'][0]
pt = X_test_os_features.loc[patient_ID]
features = pt.index

# Split the sample's features into treatment (decision) and contextual (fixed) sets.
var_fts = [col for col in gi.T_cols if col in features]
context_fts = [col for col in gi.X_cols if col in features]

In [70]:
# Build the model master table from the outcome specifications, then fill in the
# per-outcome settings for this optimization run.
mm = opticl.initialize_model_master(outcome_list)
# Grouping rule and violation limit derived from viol_rule in the training cell.
mm.loc[outcomes,'group_method'] = gr_method
mm.loc[outcomes,'max_violation'] = max_viol
# NOTE(review): presumably a per-outcome trust-region flag; the global trust region
# is passed separately via trust_region_specs - confirm against opticl.
mm.loc[outcomes, 'trust_region'] = False
# Treatment features are the decision variables; contextual features carry this sample's values.
mm.loc[outcomes, 'var_features'] = [var_fts]
# NOTE(review): column name is spelled 'contex_features' here; presumably what opticl expects - confirm.
mm.loc[outcomes, 'contex_features'] = [{i:pt[i] for i in context_fts}]
# Choose the model(s) to embed for each outcome using the training performance table.
model_master = opticl.model_selection(mm, performance)

Formulate the model and solve!

In [71]:
# Contextual features of this sample are fixed in the base model.
fixed_context = [i for i in gi.X_cols if i in pt.keys()]
conceptual_model = init_conceptual_model(pt, fixed_context)

# Embed the selected ML models and the trust region into the base model.
final_model = opticl.optimization_MIP(conceptual_model, model_master, trust_region_specs)

# Solve with GLPK and report the termination condition.
opt = SolverFactory('glpk')
print('Solving...')
results = opt.solve(final_model)
print('Done!')
print(results.solver.termination_condition)

Generating constraints for the trust region using 122 samples.
... Trust region defined.
Embedding constraints for Neutro4
Adding single model.
Embedding constraints for OTHER_34
Adding single model.
Embedding constraints for GINONV_34
Adding single model.
Embedding constraints for CONSTITUTIONAL_34
Adding single model.
Embedding constraints for INFECTION_34
Adding single model.
Embedding constraints for DLT_PROP
Adding single model.
Embedding objective function for OS
Adding single model.
Solving...
Done!
optimal


## Inspect the solution

What were the contextual features of this patient?

In [72]:
# Report the solved value of each contextual feature.
for name in context_fts:
    print("%s: %.3f" % (name, value(final_model.x[name])))

Asia: 1.000
N_Patient: 14.000
FRAC_MALE: 0.500
AGE_MED: 45.900
Prior_Palliative_Chemo: 0.000
Primary_Stomach: 1.000
Primary_GEJ: 0.000
ECOG_MEAN: 0.905


What drugs are recommended, and in what doses (average and instantaneous)?

In [73]:
# Report only the treatment features with a non-negligible value in the solution.
for name in var_fts:
    dose = value(final_model.x[name])
    if not dose > 1e-6:
        continue
    print("%s: %.3f" % (name, dose))

Fluorouracil_Ind: 1.000
Leucovorin_Ind: 1.000
Paclitaxel_Ind: 1.000
Fluorouracil_Avg: 1500.000
Leucovorin_Avg: 375.000
Paclitaxel_Avg: 43.750
Fluorouracil_Inst: 2000.000
Leucovorin_Inst: 500.000
Paclitaxel_Inst: 175.000


In [75]:
## What is the predicted value of each toxicity and objective?

In [76]:
# Report the predicted value of every outcome in the solution. Outcomes listed
# in tox_summary also show the upper bound that was enforced; outcomes without
# an entry there (the objective) are printed on their own. The explicit
# membership test replaces a bare `except`, so unrelated errors are not hidden.
for i in outcome_list:
    val = value(final_model.y[i])
    if i in tox_summary.index:
        print("%s: %.3f (limit = %.3f)" % (i, val, tox_summary.loc[i, ub_quantile]))
    else:
        print("%s: %.3f" % (i, val))

Neutro4: 0.079 (limit = 0.150)
OTHER_34: 0.091 (limit = 0.101)
GINONV_34: 0.020 (limit = 0.106)
CONSTITUTIONAL_34: 0.060 (limit = 0.082)
INFECTION_34: 0.048 (limit = 0.075)
DLT_PROP: 0.527 (limit = 0.637)
OS: 11.306
