In [1]:
import numpy as np

import scipy.integrate as integrate

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

In [None]:
x = 1
y = x
x = 2
print("x: ", x)
print("y: ", y)

x:  2
y:  1


In [52]:
batchSize = 32
numberOfActions = 4
numberOfEpochs = 4
numberOfPolicyValueIterations = 10
numberOfElementsRequiredInDataset = 10 * batchSize

# Some of the following code is based on a PyTorch tutorial in the official PyTorch website:
# Below is the definition of the neural networks used for the pair-wize classification of the actions
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(2, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 16)
        self.fc4 = nn.Linear(16, 2)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        logits = self.fc4(x)
        return logits

# We use the cross entropy loss function
lossFunction = nn.CrossEntropyLoss()

# Initialization of the classifiers for each pairs of actions:
classifiers = {}

for actionIndex1 in range(numberOfActions):
  for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

    net = Net()
    input = torch.randn(1, 2)
    out = net(input)

    net.zero_grad()
    out.backward(torch.randn(1, 2))

    classifiers[(actionIndex1, actionIndex2)] = net

rng = np.random.default_rng()

# Each action is represented by an index in a dictionary. Such that each action is accessed via this index
actionsDictionary = {}
actionsDictionary.update({0 : 0.1})
actionsDictionary.update({1 : 0.4})
actionsDictionary.update({2 : 0.7})
actionsDictionary.update({3 : 1.0})



# Below are the constants used in the simulations of the cancer treatement plan
a1 = 0.15
a2 = 0.1
b1 = 1.2
b2 = 1.2
c0 = -4
c1 = 1
c2 = 1
d1 = 0.5
d2 = 0.5


# Below is the definition of delta Y as described in the paper. It takes among its arguments the index of an action that corresponds to the amount of chemical given to the patient
def deltaY(XState, YState,
           initialXState,
           actionIndex):

  #print("deltaY here -1: ", actionIndex)
  if(YState > 0):
    indicatorFunctionResult = 1
  else:
    indicatorFunctionResult = 0

  #print("deltaY here 0: ", actionIndex)
  #print("actionsDictionary[actionIndex]: ", actionsDictionary[actionIndex])
  actionsDictionary[actionIndex]
  np.maximum(XState, initialXState)
  return (a1 * np.maximum(XState, initialXState) - b1 * (actionsDictionary[actionIndex] - d1)) * indicatorFunctionResult

# Below is the definition of delta X as described in the paper. It takes among its arguments the index of an action that corresponds to the amount of chemical given to the patient
def deltaX(XState, YState,
           initialYState,
           actionIndex):

  #print("deltaX  here 1: ", actionIndex)

  return a2 * np.maximum(YState, initialYState) + b2 * (actionsDictionary[actionIndex] - d2)

# Below is just adding the deltas to the states, for 1 time step
def simulate1Step(XState, YState,
                  initialXState, initialYState,
                  actionIndex):

  #print("here 3")
  return (XState + deltaX(XState, YState, initialYState, actionIndex), YState + deltaY(XState, YState, initialXState, actionIndex))

# Below is the function that returns (0) if the patient has died during the present point in time in the simulation. If the patient lives, it returns 1
def checkLifeStatus(previousXState, previousYState, presentXState, presentYState):

  def XAsAFunctionOfTime(time):
    return previousXState + time * (presentXState - previousXState)

  def YAsAFunctionOfTime(time):
    return previousYState + time * (presentYState - previousYState)

  def lambdaAsAFunctionOfTime(time):
    return np.exp(c0 + c1 * YAsAFunctionOfTime(time) + c2 * XAsAFunctionOfTime(time))

  lambdaIntegral = integrate.quad(lambdaAsAFunctionOfTime, 0, 1)[0]

  probabilityOfDeath = 1 - np.exp(-lambdaIntegral)

  if(rng.random() < probabilityOfDeath):
    return 0
  else:
    return 1



# A return value of (-1) means that action1 is preferable to action2, a return value of (1) means that action2 is preferable to action1
# We follow here the treatement plan for (1) patient under different starting actions (actionIndex1) and (actionIndex2)
# It does not make sense to take (XState) and (YState) different from (initialXState) and (initialYState) repectively; I just though at first that
# I needed code for simulations which would start somewhere in the middle of the treatement plan (somewhere else than at the beginning); but I didn't change the code to make it cleaner yet.
def evaluatePreference(XState, YState,
                       initialXState, initialYState,
                       actionIndex1, actionIndex2,
                       timeIndex,
                       policy):

  maximumToxicityWithActionIndex1 = XState
  maximumToxicityWithActionIndex2 = XState

  #print("here0")

  previousXStateWithActionIndex1 = XState
  previousXStateWithActionIndex2 = XState

  previousYStateWithActionIndex1 = YState
  previousYStateWithActionIndex2 = YState

  # We simulate (1) time step here:

  (XStateWithActionIndex1, YStateWithActionIndex1) = simulate1Step(XState, YState, initialXState, initialYState, actionIndex1)

  (XStateWithActionIndex2, YStateWithActionIndex2) = simulate1Step(XState, YState, initialXState, initialYState, actionIndex2)


  # We check the life status here:
  lifeStatusWithActionIndex1 = checkLifeStatus(previousXStateWithActionIndex1, previousYStateWithActionIndex1, XStateWithActionIndex1, YStateWithActionIndex1)
  lifeStatusWithActionIndex2 = checkLifeStatus(previousXStateWithActionIndex2, previousYStateWithActionIndex2, XStateWithActionIndex2, YStateWithActionIndex2)

  # If the patient has died for (1) of the actions, then the following logic gives the pareto dominance relationship:
  if(lifeStatusWithActionIndex2 > lifeStatusWithActionIndex1):
    return 1
  elif(lifeStatusWithActionIndex2 < lifeStatusWithActionIndex1):
    return -1
  elif((lifeStatusWithActionIndex2 == 0) and (lifeStatusWithActionIndex1 == 0)):
    return 0

  # We store the maximum toxicity here, that tells us about pareto dominance:
  maximumToxicityWithActionIndex1 = np.maximum(maximumToxicityWithActionIndex1, XStateWithActionIndex1)
  maximumToxicityWithActionIndex2 = np.maximum(maximumToxicityWithActionIndex2, XStateWithActionIndex2)




  timeIndex = timeIndex + 1

  # The following are the remaining time steps in the simulation; they follow the same logic are previously described
  while(timeIndex < 6):

    # The remaining action indices are chosen according to the policy in all remaining simulation steps:
    actionIndex1 = policy(classifiers, XStateWithActionIndex1, YStateWithActionIndex1)
    actionIndex2 = policy(classifiers, XStateWithActionIndex2, YStateWithActionIndex2)

    # We always store the state X and Y for each action indices. (ActionIndex1) and (ActionIndex2) correspond to the initial action taken at the beginning that have to be compared
    previousXStateWithActionIndex1 = XStateWithActionIndex1
    previousXStateWithActionIndex2 = XStateWithActionIndex2

    previousYStateWithActionIndex1 = YStateWithActionIndex1
    previousYStateWithActionIndex2 = YStateWithActionIndex2

    (XStateWithActionIndex1, YStateWithActionIndex1) = simulate1Step(XStateWithActionIndex1, YStateWithActionIndex1, initialXState, initialYState, actionIndex1)
    (XStateWithActionIndex2, YStateWithActionIndex2) = simulate1Step(XStateWithActionIndex2, YStateWithActionIndex2, initialXState, initialYState, actionIndex2)

    lifeStatusWithActionIndex1 = checkLifeStatus(previousXStateWithActionIndex1, previousYStateWithActionIndex1, XStateWithActionIndex1, YStateWithActionIndex1)
    lifeStatusWithActionIndex2 = checkLifeStatus(previousXStateWithActionIndex2, previousYStateWithActionIndex2, XStateWithActionIndex2, YStateWithActionIndex2)

    if(lifeStatusWithActionIndex2 > lifeStatusWithActionIndex1):
      return 1
    elif(lifeStatusWithActionIndex2 < lifeStatusWithActionIndex1):
      return -1
    elif((lifeStatusWithActionIndex2 == 0) and (lifeStatusWithActionIndex1 == 0)):
      return 0

    maximumToxicityWithActionIndex1 = np.maximum(maximumToxicityWithActionIndex1, XStateWithActionIndex1)
    maximumToxicityWithActionIndex2 = np.maximum(maximumToxicityWithActionIndex2, XStateWithActionIndex2)

    timeIndex = timeIndex + 1

  tumorSizeAtTheEndWithActionIndex1 = YStateWithActionIndex1
  tumorSizeAtTheEndWithActionIndex2 = YStateWithActionIndex2

  # The following logic describes the pareto dominance relationship when the patient has survived under the 2 choices of initial actions:
  if((tumorSizeAtTheEndWithActionIndex2 < tumorSizeAtTheEndWithActionIndex1) and (maximumToxicityWithActionIndex2 < maximumToxicityWithActionIndex1)):
    return 1
  elif((tumorSizeAtTheEndWithActionIndex1 < tumorSizeAtTheEndWithActionIndex2) and (maximumToxicityWithActionIndex1 < maximumToxicityWithActionIndex2)):
    return -1
  else:
    return 0


# A return value of (-1) means that policy1 is preferable to policy2, a return value of (1) means that policy2 is preferable to policy1
# The code below is almost identical to the code for function (evaluatePreference) above, except that here, at each time steps, actions are taken
# from each of the (2) different policies (policy1) and (policy2).
def evaluatePreferenceBetween2Policies(XState, YState,
                       initialXState, initialYState,
                       timeIndex,
                       policy1, policy2, classifiers):

  maximumToxicityWithActionIndex1 = XState
  maximumToxicityWithActionIndex2 = XState

  #print("here0")

  previousXStateWithActionIndex1 = XState
  previousXStateWithActionIndex2 = XState

  previousYStateWithActionIndex1 = YState
  previousYStateWithActionIndex2 = YState

  actionIndex1 = policy1(classifiers, XState, YState)
  actionIndex2 = policy2(classifiers, XState, YState)

  (XStateWithActionIndex1, YStateWithActionIndex1) = simulate1Step(XState, YState, initialXState, initialYState, actionIndex1)

  #print("here 0.1")
  (XStateWithActionIndex2, YStateWithActionIndex2) = simulate1Step(XState, YState, initialXState, initialYState, actionIndex2)


  #print("here1")

  lifeStatusWithActionIndex1 = checkLifeStatus(previousXStateWithActionIndex1, previousYStateWithActionIndex1, XStateWithActionIndex1, YStateWithActionIndex1)
  lifeStatusWithActionIndex2 = checkLifeStatus(previousXStateWithActionIndex2, previousYStateWithActionIndex2, XStateWithActionIndex2, YStateWithActionIndex2)

  if(lifeStatusWithActionIndex2 > lifeStatusWithActionIndex1):
    return 1
  elif(lifeStatusWithActionIndex2 < lifeStatusWithActionIndex1):
    return -1
  elif((lifeStatusWithActionIndex2 == 0) and (lifeStatusWithActionIndex1 == 0)):
    return 0

  maximumToxicityWithActionIndex1 = np.maximum(maximumToxicityWithActionIndex1, XStateWithActionIndex1)
  maximumToxicityWithActionIndex2 = np.maximum(maximumToxicityWithActionIndex2, XStateWithActionIndex2)




  timeIndex = timeIndex + 1

  while(timeIndex < 6):

    actionIndex1 = policy1(classifiers, XStateWithActionIndex1, YStateWithActionIndex1)
    actionIndex2 = policy2(classifiers, XStateWithActionIndex2, YStateWithActionIndex2)

    previousXStateWithActionIndex1 = XStateWithActionIndex1
    previousXStateWithActionIndex2 = XStateWithActionIndex2

    previousYStateWithActionIndex1 = YStateWithActionIndex1
    previousYStateWithActionIndex2 = YStateWithActionIndex2

    (XStateWithActionIndex1, YStateWithActionIndex1) = simulate1Step(XStateWithActionIndex1, YStateWithActionIndex1, initialXState, initialYState, actionIndex1)
    (XStateWithActionIndex2, YStateWithActionIndex2) = simulate1Step(XStateWithActionIndex2, YStateWithActionIndex2, initialXState, initialYState, actionIndex2)

    lifeStatusWithActionIndex1 = checkLifeStatus(previousXStateWithActionIndex1, previousYStateWithActionIndex1, XStateWithActionIndex1, YStateWithActionIndex1)
    lifeStatusWithActionIndex2 = checkLifeStatus(previousXStateWithActionIndex2, previousYStateWithActionIndex2, XStateWithActionIndex2, YStateWithActionIndex2)

    if(lifeStatusWithActionIndex2 > lifeStatusWithActionIndex1):
      return 1
    elif(lifeStatusWithActionIndex2 < lifeStatusWithActionIndex1):
      return -1
    elif((lifeStatusWithActionIndex2 == 0) and (lifeStatusWithActionIndex1 == 0)):
      return 0

    maximumToxicityWithActionIndex1 = np.maximum(maximumToxicityWithActionIndex1, XStateWithActionIndex1)
    maximumToxicityWithActionIndex2 = np.maximum(maximumToxicityWithActionIndex2, XStateWithActionIndex2)

    timeIndex = timeIndex + 1

  tumorSizeAtTheEndWithActionIndex1 = YStateWithActionIndex1
  tumorSizeAtTheEndWithActionIndex2 = YStateWithActionIndex2

  if((tumorSizeAtTheEndWithActionIndex2 < tumorSizeAtTheEndWithActionIndex1) and (maximumToxicityWithActionIndex2 < maximumToxicityWithActionIndex1)):
    return 1
  elif((tumorSizeAtTheEndWithActionIndex1 < tumorSizeAtTheEndWithActionIndex2) and (maximumToxicityWithActionIndex1 < maximumToxicityWithActionIndex2)):
    return -1
  else:
    return 0



# # We create of dictionary of the pairs of actions' indices to be able to randomly access the classifiers;
# # in order to randomly take a pareto optimal action if there are multiple of them.

# pairsOfActionsIndices = {}
# numberOfPairsOfActions = 0

# for actionIndex1 in range(4):
#   for actionIndex2 in range(actionIndex1 + 1):

#     pairsOfActionsIndices[pairIndex] = (actionIndex1, actionIndex2)

#     numberOfPairsOfActions = numberOfPairsOfActions + 1

def constantPolicy0(classifiers, XState, YState):
  return 0
def constantPolicy1(classifiers, XState, YState):
  return 1
def constantPolicy2(classifiers, XState, YState):
  return 2
def constantPolicy3(classifiers, XState, YState):
  return 3

def randomPolicy(classifiers, XState, YState):

  return np.random.randint(numberOfActions)

# This is the construction of the policy with the pair-wise classifiers
def policy(classifiers, XState, YState):

  # We first choose a random index:
  randomInitialActionIndex = np.random.randint(numberOfActions)
  bestActionIndex = randomInitialActionIndex

  # (actionIndicesToCheck) gives the series of actions to check successively to find the best action
  actionIndicesToCheck = [0, 1, 2, 3]
  actionIndicesToCheck.remove(bestActionIndex)

  # (classifiersAppliedOnTheStateChoices) is a dictionary containing the classifiers applied on the input state (XState, YState)
  # The results are going to be used to classify the actions
  classifiersAppliedOnTheStateChoices = {}

  for actionIndex1 in range(numberOfActions):
    for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

      classifier = classifiers[(actionIndex1, actionIndex2)]

      # The best action is given by taking the argmax of the classifier applied on the state. A value of (0) means that, for the classifier at hand,
      # (actionIndex1) is pareto dominant to (actionIndex2). A value of (1) give the opposite dominance
      classifiersAppliedOnTheStateChoices[(actionIndex1, actionIndex2)] = torch.argmax(classifier(torch.Tensor([[XState, YState]]))).cpu().numpy()

  for actionIndexToCheck in actionIndicesToCheck:

    # (bestActionIndex) is checked against all possibilities of (actionIndexToCheck) in (actionIndicesToCheck)
    # The key (bestActionIndex, actionIndexToCheck) might not correspond to a classifier in the classifiers dictionary, because switching the 2 indices would have just given the reversed classifier.
    # Since we might not have learned the classifier corresponding to (bestActionIndex, actionIndexToCheck), because of symmetry, we first check we have it in the outer (if) statement
    if (bestActionIndex, actionIndexToCheck) in classifiersAppliedOnTheStateChoices:

      # The following means that actionIndexToCheck is pareto dominant to the previous bestActionIndex. So we store it, and continue the outer (for) loop for the other actionIndicesToCheck.
      # We do this until we have verified all actions and found the dominant one
        if(classifiersAppliedOnTheStateChoices[(bestActionIndex, actionIndexToCheck)] == 1):

          bestActionIndex = actionIndexToCheck
    else:
      if(classifiersAppliedOnTheStateChoices[(actionIndexToCheck, bestActionIndex)] == 0):

          bestActionIndex = actionIndexToCheck

  return bestActionIndex

In [7]:
# This is the policy iteration routine:

for policyValueIterationIndex in range(numberOfPolicyValueIterations):

  # The following dictionary contains the present training data at this specific policy value iteration step
  trainingSetDictionary = {}

  # Below we collect the training sets for each classifiers:
  for actionIndex1 in range(numberOfActions):
    for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

      trainingSetDictionary[(actionIndex1, actionIndex2)] = []

      rolloutIndex = 0

      while(rolloutIndex < numberOfElementsRequiredInDataset):

        # The initial states of the patient are taken to be random, as stated in the paper
        initialXState = rng.random() * 2
        initialYState = rng.random() * 2

        # We specify (2) actions (actionIndex1) and (actionIndex2) that are to be compared by the present state of the present pair-wise classifier
        preferenceViaParetoDominance = evaluatePreference(initialXState, initialYState,
                        initialXState, initialYState,
                        actionIndex1, actionIndex2,
                        0,
                        policy)

        # We only store cases that have a definite pareto dominance for training. A value of (0) returned by (evaluatePreference) means that none of (actionIndex1) or (actionIndex2) is preferable over the other for the present state
        if(preferenceViaParetoDominance != 0):

          # We naturally use one-hot encoding for training via the cross-entropy loss
          onehotEncoding = torch.Tensor([0,0])
          if(preferenceViaParetoDominance == 1):
            onehotEncoding[1] = 1
          else:
            onehotEncoding[0] = 1

          trainingSetDictionary[(actionIndex1, actionIndex2)].append((torch.Tensor([initialXState, initialYState]), onehotEncoding))

          rolloutIndex = rolloutIndex + 1

  # We train below the classifiers with the training elements that we found above:
  for actionIndex1 in range(numberOfActions):
    for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

      training_data = trainingSetDictionary[(actionIndex1, actionIndex2)]
      train_dataloader = DataLoader(training_data, batch_size = batchSize, shuffle = True)

      classifierToTrain = classifiers[(actionIndex1, actionIndex2)]

      optimizer = optim.SGD(classifierToTrain.parameters(), lr=0.01)

      # Training for (1) specific classifier (classifierToTrain):
      for epochNumber in range(numberOfEpochs):
        for (inputState, preference) in train_dataloader:

            optimizer.zero_grad()   # zero the gradient buffers
            output = classifierToTrain(inputState)


            loss = lossFunction(output, preference)
            loss.backward()
            optimizer.step()


In [55]:
# numberOfRolloutsToTestIfLearnedPolicyIsBetterThanRandomPolicy = 500

# finalTumorSizesLearnedPolicy = []
# finalTumorSizesRandomPolicy = []

# maximumToxicityWithLearnedPolicy = []
# maximumToxicityWithRandomPolicy = []

# for rolloutIndex in range(numberOfRolloutsToTestIfLearnedPolicyIsBetterThanRandomPolicy):

#   initialXState = rng.random() * 2
#   initialYState = rng.random() * 2

#   (xStateLearnedPolicy, yStateLearnedPolicy) = (initialXState, initialYState)
#   (xStateRandomPolicy, yStateRandomPolicy) = (initialXState, initialYState)

#   maximumToxicityWithLearnedPolicy.append(0)
#   maximumToxicityWithRandomPolicy.append(0)

#   for timeStepIndex in range(6):

#     actionIndexLearnedPolicy = policy(classifiers, xStateLearnedPolicy, yStateLearnedPolicy)
#     actionIndexRandomPolicy = np.random.randint(4)

#     # print("(initialXState, initialYState): ", (initialXState, initialYState))
#     # print("(xStateLearnedPolicy, yStateLearnedPolicy): ", (xStateLearnedPolicy, yStateLearnedPolicy))
#     # print("(xStateRandomPolicy, yStateRandomPolicy): ", (xStateRandomPolicy, yStateRandomPolicy))

#     (xStateLearnedPolicy, yStateLearnedPolicy) = simulate1Step(xStateLearnedPolicy, yStateLearnedPolicy, initialXState, initialYState, actionIndexLearnedPolicy)
#     (xStateRandomPolicy, yStateRandomPolicy) = simulate1Step(xStateRandomPolicy, yStateRandomPolicy, initialXState, initialYState, actionIndexRandomPolicy)

#     maximumToxicityWithLearnedPolicy[rolloutIndex] = np.maximum(xStateLearnedPolicy, maximumToxicityWithLearnedPolicy[rolloutIndex])
#     maximumToxicityWithRandomPolicy[rolloutIndex] = np.maximum(xStateRandomPolicy, maximumToxicityWithRandomPolicy[rolloutIndex])

#   finalTumorSizesLearnedPolicy.append(yStateLearnedPolicy)
#   finalTumorSizesRandomPolicy.append(yStateRandomPolicy)

# print("Average Final Tumor Sizes for Learned Policy: ", np.mean(np.array(finalTumorSizesLearnedPolicy)))
# print("Average Final Tumor Sizes for Random Policy: ", np.mean(np.array(finalTumorSizesRandomPolicy)))

# print("Average Maximum Toxicity for Learned Policy: ", np.mean(np.array(maximumToxicityWithLearnedPolicy)))
# print("Average Maximum Toxicity for Random Policy: ", np.mean(np.array(maximumToxicityWithRandomPolicy)))

Average Final Tumor Sizes for Learned Policy:  2.8734570180725734
Average Final Tumor Sizes for Random Policy:  1.770044668306209
Average Maximum Toxicity for Learned Policy:  1.247728429787222
Average Maximum Toxicity for Random Policy:  2.3104074160619863


Below we check if the learned policy is pareto dominant to a random policy where each actions is taken randomly at each steps of the simulation:

In [49]:
paretoDominanceBetweenPoliciesArray = []

numberOfValidComparisons = 0

for policyEvaluationIndex in range(100000):

  initialXState = rng.random() * 2
  initialYState = rng.random() * 2

  paretoDominanceBetweenPolicies = evaluatePreferenceBetween2Policies(initialXState, initialYState,
                       initialXState, initialYState,
                       0,
                       randomPolicy, policy, classifiers)

  paretoDominanceBetweenPoliciesArray.append(paretoDominanceBetweenPolicies)

  if(paretoDominanceBetweenPolicies != 0):
    numberOfValidComparisons = numberOfValidComparisons + 1

print("Sum of ParetoDominances: ", np.sum(np.array(paretoDominanceBetweenPoliciesArray)))
print("Number of Valid Comparisons: ", numberOfValidComparisons)

Sum of ParetoDominances:  1722
Number of Valid Comparisons:  74542


We numerically check below the statistical significance of the difference between the random policy and the learned policy

In [47]:
sums = []
numberOfSumsToComputeTheStandardDeviation = 100


for sumIndex in range(numberOfSumsToComputeTheStandardDeviation):
  sum = 0

  for randomNumberIndex in range(74886):
    number = np.random.randint(2)

    if(number == 0):
      sum = sum + 1
    elif(number == 1):
      sum = sum - 1

  if(sumIndex % 10 == 0):
    print("sumIndex: ", sumIndex)

  sums.append(sum)

meanOfSums = 0

standardDeviation = np.sqrt(np.mean(np.power(np.array(sums) - meanOfSums, 2)))

print("standardDeviation: ", standardDeviation)

sumIndex:  0
sumIndex:  10
sumIndex:  20
sumIndex:  30
sumIndex:  40
sumIndex:  50
sumIndex:  60
sumIndex:  70
sumIndex:  80
sumIndex:  90
standardDeviation:  272.9952380537067


We check below how many standard deviations our sum of pareto dominances is away from the average value of random choices, which is 0. (We are seeing how much time it happens that our learned policy is better than the random policy. If none is better than the other statistically, then the value of our sum of pareto dominances should be near the mean value of a random sum (0), sum of random values of (1) or (-1))

In [50]:
(np.sum(np.array(paretoDominanceBetweenPoliciesArray)) - meanOfSums)/standardDeviation

6.30780233485695

The process is clearly well approximated by a Gaussian distribution. And 6 standard deviations or more happens only about twice every 100 000 000 000 000 000 trials (1/(2 * 10^17))

This means that our result is statistically significant.

We compare our learned policy with constant policies, which give a constant amount of chemical to the patients

In [54]:
constantPolicies = []
constantPolicies.append(constantPolicy0)
constantPolicies.append(constantPolicy1)
constantPolicies.append(constantPolicy2)
constantPolicies.append(constantPolicy3)

paretoDominancesBetweenLearnedAndConstantPoliciesArray = []

numberOfValidComparisonsWithConstantPolicies = []

for constantPolicyIndex in range(numberOfActions):

  numberOfValidComparisonsWithConstantPolicies.append(0)
  paretoDominancesBetweenLearnedAndConstantPoliciesArray.append([])

  numberOfValidComparisons = 0

  for policyEvaluationIndex in range(100000):

    initialXState = rng.random() * 2
    initialYState = rng.random() * 2

    paretoDominanceBetweenPolicies = evaluatePreferenceBetween2Policies(initialXState, initialYState,
                        initialXState, initialYState,
                        0,
                        constantPolicies[constantPolicyIndex], policy, classifiers)

    paretoDominancesBetweenLearnedAndConstantPoliciesArray[constantPolicyIndex].append(paretoDominanceBetweenPolicies)

    if(paretoDominanceBetweenPolicies != 0):
      numberOfValidComparisonsWithConstantPolicies[constantPolicyIndex] = numberOfValidComparisonsWithConstantPolicies[constantPolicyIndex] + 1

  print("Sum of ParetoDominances for Constant Policy " + str(constantPolicyIndex) + ": ", np.sum(np.array(paretoDominancesBetweenLearnedAndConstantPoliciesArray[constantPolicyIndex])))
  print("Number of Valid Comparisons: ", numberOfValidComparisons)

Sum of ParetoDominances for Constant Policy 0:  3633
Number of Valid Comparisons:  0
Sum of ParetoDominances for Constant Policy 1:  -262
Number of Valid Comparisons:  0
Sum of ParetoDominances for Constant Policy 2:  2625
Number of Valid Comparisons:  0
Sum of ParetoDominances for Constant Policy 3:  14995
Number of Valid Comparisons:  0


In [55]:
for constantPolicyIndex in range(numberOfActions):
  print("Sum of ParetoDominances for Constant Policy " + str(constantPolicyIndex) + ": ", np.sum(np.array(paretoDominancesBetweenLearnedAndConstantPoliciesArray[constantPolicyIndex])))
  print("Number of Valid Comparisons: ", numberOfValidComparisonsWithConstantPolicies[constantPolicyIndex])

Sum of ParetoDominances for Constant Policy 0:  3633
Number of Valid Comparisons:  74425
Sum of ParetoDominances for Constant Policy 1:  -262
Number of Valid Comparisons:  73318
Sum of ParetoDominances for Constant Policy 2:  2625
Number of Valid Comparisons:  74127
Sum of ParetoDominances for Constant Policy 3:  14995
Number of Valid Comparisons:  79347


We compute below the statistical significance of the results

In [56]:
sums = []
numberOfSumsToComputeTheStandardDeviation = 100


for sumIndex in range(numberOfSumsToComputeTheStandardDeviation):
  sum = 0

  for randomNumberIndex in range(79347):
    number = np.random.randint(2)

    if(number == 0):
      sum = sum + 1
    elif(number == 1):
      sum = sum - 1

  if(sumIndex % 10 == 0):
    print("sumIndex: ", sumIndex)

  sums.append(sum)

meanOfSums = 0

standardDeviation = np.sqrt(np.mean(np.power(np.array(sums) - meanOfSums, 2)))

print("standardDeviation: ", standardDeviation)

sumIndex:  0
sumIndex:  10
sumIndex:  20
sumIndex:  30
sumIndex:  40
sumIndex:  50
sumIndex:  60
sumIndex:  70
sumIndex:  80
sumIndex:  90
standardDeviation:  297.7540595860953


In [57]:
print(3633/297.8)
print(2625/297.8)
print(14995/297.8)

12.199462726662189
8.814640698455339
50.35258562793821


This means that our learned policy is better than giving a low dose level of chemical constantly at a minimum of 12 standard deviations; better than giving a high dose of chemical at a minimum of 8.8 standard deviations; and better than giving an extreme dose of chemical at 50 standard deviations.

In [58]:
sums = []
numberOfSumsToComputeTheStandardDeviation = 100


for sumIndex in range(numberOfSumsToComputeTheStandardDeviation):
  sum = 0

  for randomNumberIndex in range(73318):
    number = np.random.randint(2)

    if(number == 0):
      sum = sum + 1
    elif(number == 1):
      sum = sum - 1

  if(sumIndex % 10 == 0):
    print("sumIndex: ", sumIndex)

  sums.append(sum)

meanOfSums = 0

standardDeviation = np.sqrt(np.mean(np.power(np.array(sums) - meanOfSums, 2)))

print("standardDeviation: ", standardDeviation)

sumIndex:  0
sumIndex:  10
sumIndex:  20
sumIndex:  30
sumIndex:  40
sumIndex:  50
sumIndex:  60
sumIndex:  70
sumIndex:  80
sumIndex:  90
standardDeviation:  317.88954056401417


In [59]:
print(262/317)

0.8264984227129337


This means that our learned policy is worse than giving a medium dose of chemical constantly at about 0.82 standard deviations. Such that our learned policy and giving a medium dose of chemical constantly perform about as well

Our analysis

We save the classifier models below

In [33]:
for key in classifiers:
  torch.save(classifiers[key], "./Classifier Model for Action Indices " + str(key[0]) + " and " + str(key[1]) + ".pt")

Load the classifier models below

In [53]:
classifiers = {}

for actionIndex1 in range(numberOfActions):
  for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

    classifiers[(actionIndex1, actionIndex2)] = torch.load("./Classifier Model for Action Indices " + str(actionIndex1) + " and " + str(actionIndex2) + ".pt")

Here today (2023-12-18)

In [65]:
1742/74886

0.023262024944582432

In [31]:
sum = 0

for randomNumberIndex in range(74886):
  number = np.random.randint(2)

  if(number == 0):
    sum = sum + 1
  elif(number == 1):
    sum = sum - 1

print("sum: ", sum)

sum:  546


In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)