Changes compared to OG work by Guillaume :
- policy is now an epsilon-greedy policy
- we keep 2 sets of actors (original + target)
- we keep past transitions in a replay buffer
- use reward shaping to create a preference-based reward function
- create a Q-network that takes state and policy network outputs (actions) as inputs
- use two networks to make updates (target and main network) for both policy (actor) and Q-network (critic)
- introduce a discount factor to calculate critic target outputs
- refactored `simulate1Step`
- changed how neural nets are initialized

These last two changes help with off-policy learning and stability

In [None]:
!pip install -Uqq ipdb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m1.2 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.6 MB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m0.9/1.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m1.5/1.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np

import scipy.integrate as integrate

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

from collections import deque
from random import sample
from tqdm.notebook import tqdm_notebook
import ipdb  # Use `!pip install -Uqq ipdb`
from typing import Tuple

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# --- Batch / action-set / epoch settings ---
batchSize = 32          # number of replay samples drawn for one training batch
numberOfActions = 4     # size of the discrete action set
numberOfEpochs = 4      # passes over the sampled batch per training round

# --- Exploration, replay and target-network settings ---
epsilon = 0.05                  # exploration probability of the epsilon-greedy policy
buffer_size = 5000              # maximum length of each replay deque
num_rollouts = batchSize * 10   # stored comparisons collected per outer iteration
num_iterations = 30             # outer collect-then-train iterations
tau = 0.005                     # mixing coefficient of the soft target update
gamma = 0.99                    # discount factor


# Below is the definition of the neural networks used for the pairwise classification of the actions
class Net(nn.Module):
    """Small fully connected network used for both the actors and the critics.

    The state input has 2 features. When ``use_action`` is true an additional
    linear branch (``fca``) embeds a 2-feature action; its activation is added
    to the first hidden representation of the state.

    Args:
        outdim: Number of output units of the last linear layer.
        use_action: Whether to create the action branch ``fca``.
    """

    def __init__(self, outdim, use_action=False):
        super().__init__()

        self.use_action = use_action
        self.fc1 = nn.Linear(2, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 16)
        self.fc4 = nn.Linear(16, outdim)
        if use_action:
            self.fca = nn.Linear(2, 16)

    def init_weights(self):
        """Initialise every linear layer: Xavier-uniform weights, biases set to 0.01.

        The previous version tested ``isinstance(self, nn.Linear)`` on the
        network itself, which is never true, so ``net.init_weights()`` silently
        did nothing. Walking ``self.modules()`` reaches each ``nn.Linear``
        sub-layer. Because ``modules()`` also yields the module itself, the
        function still works when handed a single ``nn.Linear`` (for example
        through ``net.apply(Net.init_weights)``).
        """
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Trailing-underscore variant: the non-underscore name is deprecated.
                nn.init.xavier_uniform_(module.weight)
                module.bias.data.fill_(0.01)

    def forward(self, state, action=None):
        """Return the raw (un-normalised) outputs for ``state``.

        Args:
            state: Tensor whose last dimension is 2.
            action: Optional tensor whose last dimension is 2; only used when
                the network was built with ``use_action=True``.
        """
        x = F.relu(self.fc1(state))
        if self.use_action and action is not None:
            # Merge the action embedding into the state embedding by addition.
            x = x + F.relu(self.fca(action))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        logits = self.fc4(x)
        return logits

# We use the cross entropy loss function
lossFunction = nn.CrossEntropyLoss()

# One actor and one critic (Q-net) per unordered pair of actions (i, j) with i < j,
# each with its own target copy.
actors = {}
target_actors = {}
critics = {}
target_critics = {}


for actionIndex1 in range(numberOfActions):
  for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

    actor_net = Net(outdim=2)
    critic_net = Net(outdim=1, use_action=True)
    actor_net.init_weights()
    critic_net.init_weights()

    # The target networks must be separate modules. Storing the same object in
    # both dictionaries (as was done before) makes "target" and "main" share
    # every weight, so the target can never lag behind the trained network.
    # They start from identical weights by copying the state dict.
    target_actor_net = Net(outdim=2)
    target_actor_net.load_state_dict(actor_net.state_dict())
    target_critic_net = Net(outdim=1, use_action=True)
    target_critic_net.load_state_dict(critic_net.state_dict())

    actors[(actionIndex1, actionIndex2)] = actor_net
    target_actors[(actionIndex1, actionIndex2)] = target_actor_net
    critics[(actionIndex1, actionIndex2)] = critic_net
    target_critics[(actionIndex1, actionIndex2)] = target_critic_net


# NumPy random generator used for the initial states and the survival draws
rng = np.random.default_rng()

# Every action is addressed by an integer index; the value is the amount used
# by the simulation for that action.
actionsDictionary = {0: 0.1, 1: 0.4, 2: 0.7, 3: 1.0}


# Constants used in the simulation of the cancer treatment plan
a1, a2 = 0.15, 0.1    # coupling terms of the Y update (a1) and the X update (a2)
b1, b2 = 1.2, 1.2     # action-effect terms of the Y update (b1) and the X update (b2)
c0, c1, c2 = -4, 1, 1  # coefficients inside the exponential hazard of checkLifeStatus
d1, d2 = 0.5, 0.5     # action offsets of the Y update (d1) and the X update (d2)


# Below is just adding the deltas to the states, for 1 time step
def simulate1Step(
    XState, YState, initialXState, initialYState, actionIndex):
  """Advance the (X, Y) state by one time step under the given action.

  X is the quantity tracked as toxicity and Y the one tracked as tumor size
  by the preference functions in this file.

  Args:
    XState, YState: current state.
    initialXState, initialYState: state at the start of the treatment; each
      increment uses the larger of the current and the initial value.
    actionIndex: key into ``actionsDictionary`` (amount of chemical given).

  Returns:
    Tuple ``(new_XState, new_YState)``.
  """
  actionAmount = actionsDictionary[actionIndex]

  # X grows with the larger of the current/initial Y and with the amount above d2.
  xIncrement = a2 * np.maximum(YState, initialYState) + b2 * (actionAmount - d2)

  # Y only moves while it is strictly positive: the indicator freezes it otherwise.
  indicatorFunctionResult = 1 if YState > 0 else 0
  yIncrement = (
      a1 * np.maximum(XState, initialXState) - b1 * (actionAmount - d1)
  ) * indicatorFunctionResult

  return (XState + xIncrement, YState + yIncrement)


# Below is the function that returns (0) if the patient has died during the present point in time in the simulation. If the patient lives, it returns 1
def checkLifeStatus(previousXState, previousYState, presentXState, presentYState):
  """Draw whether the patient survives the step between two states.

  The state is interpolated linearly between the previous (time 0) and the
  present (time 1) values; the hazard ``exp(c0 + c1 * Y + c2 * X)`` is
  integrated over that unit interval and turned into a death probability.

  Returns:
    0 if the random draw says the patient died during the step, 1 otherwise.
  """
  xChange = presentXState - previousXState
  yChange = presentYState - previousYState

  def hazardAt(time):
    xAtTime = previousXState + time * xChange
    yAtTime = previousYState + time * yChange
    return np.exp(c0 + c1 * yAtTime + c2 * xAtTime)

  cumulativeHazard = integrate.quad(hazardAt, 0, 1)[0]
  probabilityOfDeath = 1 - np.exp(-cumulativeHazard)

  # Exactly one random number is consumed per call.
  patientDied = rng.random() < probabilityOfDeath
  return 0 if patientDied else 1



# A return value of (-1) means that action1 is preferable to action2, a return value of (1) means that action2 is preferable to action1, and (0) means that neither is preferred.
# We follow here the treatment plan for one patient under the two different starting actions (actionIndex1) and (actionIndex2)
def evaluatePreference(
    XState, YState, initialXState, initialYState, actionIndex1, actionIndex2,
    policy, actors, timeIndex=0
  ):
  """Compare two starting actions by rolling both out from the same state.

  Branch 1 starts with ``actionIndex1`` and branch 2 with ``actionIndex2``.
  After that first step, each branch picks its actions with
  ``policy(actors, X, Y)``. The first step is always simulated; further steps
  run while the incremented ``timeIndex`` is below 6.

  Returns:
    1 if branch 2 is preferred, -1 if branch 1 is preferred, 0 otherwise.
    A branch is preferred if it alone survives a step; if both die the result
    is 0. When both survive to the end, a branch is preferred only if it has
    both the smaller final Y (tumor size) and the smaller maximum X (toxicity).
  """
  x1, y1 = XState, YState
  x2, y2 = XState, YState

  # Running maximum of X along each branch, starting from the shared state.
  peakToxicity1 = XState
  peakToxicity2 = XState

  # Actions for the upcoming step; only the first pair is imposed by the caller.
  upcomingAction1, upcomingAction2 = actionIndex1, actionIndex2

  while True:
    newX1, newY1 = simulate1Step(x1, y1, initialXState, initialYState, upcomingAction1)
    newX2, newY2 = simulate1Step(x2, y2, initialXState, initialYState, upcomingAction2)

    # Survival is drawn for branch 1 first, then branch 2.
    alive1 = checkLifeStatus(x1, y1, newX1, newY1)
    alive2 = checkLifeStatus(x2, y2, newX2, newY2)

    if alive2 > alive1:
      return 1
    if alive2 < alive1:
      return -1
    if alive2 == 0 and alive1 == 0:
      return 0

    x1, y1 = newX1, newY1
    x2, y2 = newX2, newY2
    peakToxicity1 = np.maximum(peakToxicity1, x1)
    peakToxicity2 = np.maximum(peakToxicity2, x2)

    timeIndex = timeIndex + 1
    if not (timeIndex < 6):
      break

    # Every remaining action comes from the policy, branch 1 queried first.
    upcomingAction1 = policy(actors, x1, y1)
    upcomingAction2 = policy(actors, x2, y2)

  # Both branches survived: strict Pareto dominance on (final Y, maximum X).
  branch2Dominates = (y2 < y1) and (peakToxicity2 < peakToxicity1)
  branch1Dominates = (y1 < y2) and (peakToxicity1 < peakToxicity2)
  if branch2Dominates:
    return 1
  if branch1Dominates:
    return -1
  return 0


# A return value of (-1) means that policy1 is preferable to policy2, a return value of (1) means that policy2 is preferable to policy1
# The code below is almost identical to the code for function (evaluatePreference) above, except that here, at each time steps, actions are taken
# from each of the (2) different policies (policy1) and (policy2).
def evaluatePreferenceBetween2Policies(
    XState, YState, initialXState, initialYState,
    timeIndex, policy1, policy2, actors
  ):
  """Compare two policies by rolling each out from the same starting state.

  Branch 1 always acts with ``policy1(actors, X, Y)`` and branch 2 with
  ``policy2(actors, X, Y)``. The first step is always simulated; further
  steps run while the incremented ``timeIndex`` is below 6.

  Returns:
    1 if policy2 is preferred, -1 if policy1 is preferred, 0 otherwise.
    A branch is preferred if it alone survives a step; if both die the result
    is 0. When both survive to the end, a branch is preferred only if it has
    both the smaller final Y (tumor size) and the smaller maximum X (toxicity).
  """
  x1, y1 = XState, YState
  x2, y2 = XState, YState

  # Running maximum of X along each branch, starting from the shared state.
  peakToxicity1 = XState
  peakToxicity2 = XState

  while True:
    # policy1 is queried before policy2 at every step.
    chosenAction1 = policy1(actors, x1, y1)
    chosenAction2 = policy2(actors, x2, y2)

    newX1, newY1 = simulate1Step(x1, y1, initialXState, initialYState, chosenAction1)
    newX2, newY2 = simulate1Step(x2, y2, initialXState, initialYState, chosenAction2)

    # Survival is drawn for branch 1 first, then branch 2.
    alive1 = checkLifeStatus(x1, y1, newX1, newY1)
    alive2 = checkLifeStatus(x2, y2, newX2, newY2)

    if alive2 > alive1:
      return 1
    if alive2 < alive1:
      return -1
    if alive2 == 0 and alive1 == 0:
      return 0

    x1, y1 = newX1, newY1
    x2, y2 = newX2, newY2
    peakToxicity1 = np.maximum(peakToxicity1, x1)
    peakToxicity2 = np.maximum(peakToxicity2, x2)

    timeIndex = timeIndex + 1
    if not (timeIndex < 6):
      break

  # Both branches survived: strict Pareto dominance on (final Y, maximum X).
  branch2Dominates = (y2 < y1) and (peakToxicity2 < peakToxicity1)
  branch1Dominates = (y1 < y2) and (peakToxicity1 < peakToxicity2)
  if branch2Dominates:
    return 1
  if branch1Dominates:
    return -1
  return 0


# Baseline policy: returns a uniformly random action index; the actors and the state are ignored.
def randomPolicy(actors, XState, YState):
  """Return a random action index in ``[0, numberOfActions)``.

  ``actors``, ``XState`` and ``YState`` are accepted only so the signature
  matches ``policy``; none of them is read.
  """
  drawnActionIndex = np.random.randint(numberOfActions)
  return drawnActionIndex


# This is the construction of the epsilon-greedy policy with the pair-wise actors
def policy(actors, XState, YState, eps_greedy = False):
  """Pick an action index with the pairwise actors (optionally epsilon-greedy).

  A random action index is drawn first. With ``eps_greedy=True`` that index is
  returned directly with probability ``epsilon``. Otherwise it is the initial
  champion of a sequential tournament: every other action, in increasing
  index order, challenges the current champion, and the actor of that pair
  decides who stays.

  Args:
    actors: dict mapping ``(i, j)`` with ``i < j`` to the actor of that pair.
      An arg-max output of 0 means action ``i`` is preferred, 1 means ``j``.
    XState, YState: current state fed to the actors.
    eps_greedy: enable the random exploration branch.

  Returns:
    The selected action index (int).
  """
  # The random draws happen in the same order as before: start index first,
  # then the exploration draw (consumed even when eps_greedy is False).
  bestActionIndex = np.random.randint(numberOfActions)
  if np.random.rand() < epsilon and eps_greedy:
    return bestActionIndex

  # Challengers come from numberOfActions instead of a hard-coded [0, 1, 2, 3].
  challengerIndices = [
      index for index in range(numberOfActions) if index != bestActionIndex
  ]

  stateTensor = torch.Tensor([[XState, YState]])

  # Pure inference: no autograd graph is needed. Only the actors actually
  # consulted by the tournament are evaluated (one per challenger).
  with torch.no_grad():
    for challengerIndex in challengerIndices:
      # Actors are only stored under the ordered key (lower, upper).
      lowerIndex = min(bestActionIndex, challengerIndex)
      upperIndex = max(bestActionIndex, challengerIndex)
      preferredSlot = int(torch.argmax(actors[(lowerIndex, upperIndex)](stateTensor)))
      bestActionIndex = upperIndex if preferredSlot == 1 else lowerIndex

  return bestActionIndex

In [None]:
def checkDoneBatch(
    currentStates: torch.Tensor,
    nextStates: torch.Tensor
  ) -> torch.Tensor:
  '''Run checkLifeStatus on every (current, next) state pair of a batch.

  Each row of both tensors is read as (X, Y). One survival draw is made per
  row, in batch order.

  Returns: float32 tensor with one trailing dimension added; 1 where the
  draw says the patient died, 0 otherwise.
  '''
  statePairs = zip(currentStates.tolist(), nextStates.tolist())
  deathFlags = [
      checkLifeStatus(before[0], before[1], after[0], after[1]) != 1
      for before, after in statePairs
  ]
  flagsTensor = torch.tensor(deathFlags, dtype=torch.float32)
  return flagsTensor.unsqueeze(-1)


def label_smooth_binary_actions(
    actions: torch.Tensor
  ) -> Tuple[torch.Tensor, torch.Tensor]:
  '''Build the two label-smoothed action tensors.

  Only the shape, dtype and device of ``actions`` are used. The first result
  has 0.9 in column 0 and 0.1 in column 1 of every row; the second has the
  two values swapped. Any further columns stay at zero.
  '''
  def _smoothed(firstColumnValue, secondColumnValue):
    filled = torch.zeros_like(actions)
    filled[:, 0] = firstColumnValue
    filled[:, 1] = secondColumnValue
    return filled

  return _smoothed(0.9, 0.1), _smoothed(0.1, 0.9)

In [None]:
# Replay buffer: for each action pair (i, j) with i < j, a bounded deque of
# (actions, state, next state under i, next state under j, preference one-hot).
ReplayBuffer = {}


def _softUpdateTarget(targetNet, mainNet, mixing):
  """Soft (Polyak) update: target <- mixing * main + (1 - mixing) * target.

  The loop runs over the target's own keys so that every parameter of the
  network is updated. New tensors are built first and only then loaded, which
  keeps the update correct even if both arguments are the same module.
  """
  mainState = mainNet.state_dict()
  targetState = targetNet.state_dict()
  for key in targetState:
    targetState[key] = mixing * mainState[key] + (1 - mixing) * targetState[key]
  targetNet.load_state_dict(targetState)


for _ in tqdm_notebook(range(num_iterations)):

  # ---------------------------------------------------------------------------
  # Phase 1: collect num_rollouts informative comparisons into the replay buffer
  # ---------------------------------------------------------------------------
  rolloutIndex = 0

  while(rolloutIndex < num_rollouts):
    # The initial states of the patient are taken to be random, as stated in the paper
    initialXState = rng.random() * 2
    initialYState = rng.random() * 2

    actionIndex1 = policy(actors, initialXState, initialYState, eps_greedy=True)
    actionIndex2 = policy(actors, initialXState, initialYState, eps_greedy=True)
    if actionIndex1 == actionIndex2:
      # A pair needs two distinct actions; draw a new state and try again.
      continue

    # Actors are keyed by (smaller index, larger index).
    actionIndex1, actionIndex2 = min(actionIndex1, actionIndex2), max(actionIndex1, actionIndex2)

    preferenceViaParetoDominance = evaluatePreference(
        initialXState, initialYState,
        initialXState, initialYState,
        actionIndex1, actionIndex2,
        policy, target_actors
    )
    (xState1, yState1) = simulate1Step(initialXState, initialYState, initialXState, initialYState, actionIndex1)
    (xState2, yState2) = simulate1Step(initialXState, initialYState, initialXState, initialYState, actionIndex2)

    # Only comparisons with a strict preference are stored and counted.
    if(preferenceViaParetoDominance != 0):

      # Built on the CPU like every other stored tensor and like the networks;
      # creating it on `device` mixed devices in the loss on CUDA machines.
      onehotEncoding = torch.zeros(2, dtype=torch.float32)
      if(preferenceViaParetoDominance == 1):  # action2 is preferred
        onehotEncoding[1] = 1
      else:  # action1 is preferred
        onehotEncoding[0] = 1

      if (actionIndex1, actionIndex2) not in ReplayBuffer:
        ReplayBuffer[(actionIndex1, actionIndex2)] = deque([], maxlen = buffer_size)
      ReplayBuffer[(actionIndex1, actionIndex2)].append((
          torch.tensor([actionIndex1, actionIndex2], dtype=torch.float32),
          torch.tensor([initialXState, initialYState], dtype=torch.float32),
          torch.tensor([xState1, yState1], dtype=torch.float32),  # action 1 next state
          torch.tensor([xState2, yState2], dtype=torch.float32),  # action 2 next state
          onehotEncoding  # preferences
      ))

      rolloutIndex = rolloutIndex + 1

  # ---------------------------------------------------------------------------
  # Phase 2: train the actor and the critic of every pair that has data
  # ---------------------------------------------------------------------------
  for actionIndex1 in range(numberOfActions):
    for actionIndex2 in range(actionIndex1 + 1, numberOfActions):

      if (actionIndex1, actionIndex2) not in ReplayBuffer:
          continue

      potential_samples = ReplayBuffer[(actionIndex1, actionIndex2)]
      if len(potential_samples) < batchSize:
          # Not enough data for a full batch yet: use everything stored so far.
          train_dataloader = DataLoader(potential_samples, batch_size = len(potential_samples), shuffle = True)
      else:
          training_data = sample(potential_samples, batchSize)
          train_dataloader = DataLoader(training_data, batch_size = batchSize, shuffle = True)

      actorToTrain = actors[(actionIndex1, actionIndex2)]
      criticToTrain = critics[(actionIndex1, actionIndex2)]
      Q_target_network = target_critics[(actionIndex1, actionIndex2)]
      mu_target = target_actors[(actionIndex1, actionIndex2)]

      # One optimizer per network. Previously a single optimizer held only the
      # actor's parameters, so the critic loss never changed the critic and the
      # critic's gradients were never zeroed.
      actor_optimizer = optim.SGD(actorToTrain.parameters(), lr=0.01)
      critic_optimizer = optim.SGD(criticToTrain.parameters(), lr=0.01)

      for epochNumber in range(numberOfEpochs):
        for (actions, inputState, nextState1, nextState2, preferences) in train_dataloader:

            # Bootstrapped critic target, computed with the target networks only.
            with torch.no_grad():
              Q_target_next1 = Q_target_network(nextState1, mu_target(nextState1))
              Q_target_next2 = Q_target_network(nextState2, mu_target(nextState2))
              done1 = checkDoneBatch(inputState, nextState1)
              done2 = checkDoneBatch(inputState, nextState2)
              reward = preferences
              # No bootstrapping through a next state in which the patient died.
              Q_target = reward + torch.cat((
                  gamma * (1 - done1) * Q_target_next1,
                  gamma * (1 - done2) * Q_target_next2
              ), dim=-1)

            # ---- Critic step: regress Q on the two label-smoothed actions ----
            actions1, actions2 = label_smooth_binary_actions(actions)
            Q_pred = torch.cat((
                criticToTrain(inputState, actions1),
                criticToTrain(inputState, actions2)
            ), dim=-1)
            critic_loss = F.mse_loss(Q_pred, Q_target)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # ---- Actor step: raise the preference-weighted Q of the actor output ----
            Q_adjusted = criticToTrain(inputState, actorToTrain(inputState)) * preferences
            actor_loss = -torch.sum(Q_adjusted)
            actor_optimizer.zero_grad()
            # This also writes gradients into the critic; they are discarded by
            # critic_optimizer.zero_grad() before the next critic step.
            actor_loss.backward()
            actor_optimizer.step()

      # Soft target update. The previous formula had the weights swapped
      # (target * tau + main * (1 - tau)), which with tau = 0.005 copied the
      # main network almost entirely, and it skipped the critic's `fca` layer
      # because it iterated over the actor's keys.
      _softUpdateTarget(target_actors[(actionIndex1, actionIndex2)], actorToTrain, tau)
      _softUpdateTarget(target_critics[(actionIndex1, actionIndex2)], criticToTrain, tau)


  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Compare the learned (greedy) policy with the uniformly random policy: both
# start from the same random state and are simulated for 6 steps.
numberOfRolloutsToTestIfLearnedPolicyIsBetterThanRandomPolicy = 500

finalTumorSizesLearnedPolicy = []
finalTumorSizesRandomPolicy = []

maximumToxicityWithLearnedPolicy = []
maximumToxicityWithRandomPolicy = []

for rolloutIndex in range(numberOfRolloutsToTestIfLearnedPolicyIsBetterThanRandomPolicy):

    initialXState = rng.random() * 2
    initialYState = rng.random() * 2

    (xStateLearnedPolicy, yStateLearnedPolicy) = (initialXState, initialYState)
    (xStateRandomPolicy, yStateRandomPolicy) = (initialXState, initialYState)

    # The running maximum starts at the initial toxicity, as evaluatePreference
    # does. Starting at 0 ignored the initial state and could report a
    # "maximum" that was never reached when X went negative.
    maximumToxicityWithLearnedPolicy.append(initialXState)
    maximumToxicityWithRandomPolicy.append(initialXState)

    for timeStepIndex in range(6):

        actionIndexLearnedPolicy = policy(actors, xStateLearnedPolicy, yStateLearnedPolicy)
        # Use the baseline helper instead of a hard-coded action count.
        actionIndexRandomPolicy = randomPolicy(actors, xStateRandomPolicy, yStateRandomPolicy)

        (xStateLearnedPolicy, yStateLearnedPolicy) = simulate1Step(xStateLearnedPolicy, yStateLearnedPolicy, initialXState, initialYState, actionIndexLearnedPolicy)
        (xStateRandomPolicy, yStateRandomPolicy) = simulate1Step(xStateRandomPolicy, yStateRandomPolicy, initialXState, initialYState, actionIndexRandomPolicy)

        maximumToxicityWithLearnedPolicy[rolloutIndex] = np.maximum(xStateLearnedPolicy, maximumToxicityWithLearnedPolicy[rolloutIndex])
        maximumToxicityWithRandomPolicy[rolloutIndex] = np.maximum(xStateRandomPolicy, maximumToxicityWithRandomPolicy[rolloutIndex])

    finalTumorSizesLearnedPolicy.append(yStateLearnedPolicy)
    finalTumorSizesRandomPolicy.append(yStateRandomPolicy)

print("Average Final Tumor Sizes for Learned Policy: ", np.mean(np.array(finalTumorSizesLearnedPolicy)))
print("Average Final Tumor Sizes for Random Policy: ", np.mean(np.array(finalTumorSizesRandomPolicy)))
print("Average Maximum Toxicity for Learned Policy: ", np.mean(np.array(maximumToxicityWithLearnedPolicy)))
print("Average Maximum Toxicity for Random Policy: ", np.mean(np.array(maximumToxicityWithRandomPolicy)))

Average Final Tumor Sizes for Learned Policy:  0.5433815124156679
Average Final Tumor Sizes for Random Policy:  1.8473132806689005
Average Maximum Toxicity for Learned Policy:  4.16757385082077
Average Maximum Toxicity for Random Policy:  2.397687203293951


In [None]:
import numpy as np
from scipy.stats import rankdata, friedmanchisquare, norm

# Example data: three groups measured on the same five blocks. The Friedman
# test pairs the observations by position, so element i of every group
# belongs to block i.
group1 = np.array([10, 12, 15, 14, 16])
group2 = np.array([8, 9, 11, 13, 12])
group3 = np.array([5, 7, 6, 9, 8])
groups = [group1, group2, group3]

# Combine all data
all_data = np.concatenate(groups)

# Ranks over the pooled data (kept for inspection; NOT used by the Friedman
# post-hoc below, which needs ranks computed inside each block)
ranks = rankdata(all_data)

# Reshape ranks according to the original groups
ranks_group1 = ranks[:len(group1)]
ranks_group2 = ranks[len(group1):len(group1) + len(group2)]
ranks_group3 = ranks[len(group1) + len(group2):]

# Friedman test to check if there are significant differences among groups
statistic, p_value = friedmanchisquare(*groups)
alpha = 0.05

if p_value < alpha:
    print("Friedman test: Significant differences exist among groups.")

    k = len(groups)   # Number of groups
    N = len(group1)   # Number of blocks (observations per group), not the pooled count

    # Rank the k groups inside every block, then average per group.
    within_block_ranks = np.array([rankdata(block) for block in zip(*groups)])
    mean_ranks = within_block_ranks.mean(axis=0)

    # Critical difference of mean ranks: z * sqrt(k (k + 1) / (6 N)), with a
    # Bonferroni-corrected two-sided normal quantile for the k (k - 1) / 2
    # pairwise comparisons, i.e. a per-tail level of alpha / (k (k - 1)).
    z_critical = norm.ppf(1 - alpha / (k * (k - 1)))
    CD = z_critical * (k * (k + 1) / (6 * N)) ** 0.5
    print(f"Critical Difference (CD): {CD}")

    # The CD is expressed in rank units, so it is compared with differences
    # of mean ranks (not of raw means).
    group_pairs = [(0, 1), (0, 2), (1, 2)]
    pairwise_diff = []
    for first, second in group_pairs:
        rank_gap = np.abs(mean_ranks[first] - mean_ranks[second])
        pairwise_diff.append((rank_gap, 'Significant' if rank_gap > CD else 'Not significant'))

    print("Pairwise Differences:")
    for (first, second), diff in zip(group_pairs, pairwise_diff):
        # Label each line with the groups that were actually compared.
        print(f"Group {first + 1} vs. Group {second + 1}: Difference = {diff[0]}, {diff[1]}")
else:
    print("Friedman test: No significant differences among groups.")

Friedman test: Significant differences exist among groups.
Critical Difference (CD): 0.7979554672643842
Pairwise Differences:
Group 1 vs. Group 2: Difference = 2.8000000000000007, Significant
Group 2 vs. Group 3: Difference = 6.4, Significant
Group 3 vs. Group 4: Difference = 3.5999999999999996, Significant


In [None]:
group1.mean()

13.4

In [None]:
group2.mean()

10.6

In [None]:
group3.mean()

7.0

In [None]:
ranks

array([ 8. , 10.5, 14. , 13. , 15. ,  4.5,  6.5,  9. , 12. , 10.5,  1. ,
        3. ,  2. ,  6.5,  4.5])