In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from copy import copy, deepcopy
import random
import math
import pickle

In [2]:
"""create action arrays"""
def define_action_space(i):
  actions=[]
  if(i==1):
    a1=np.array([[0],[0],[0],[0]])
    a2=np.array([[1],[0],[0],[0]])
    a3=np.array([[0],[1],[0],[0]])
    a4=np.array([[0],[0],[1],[0]])
    a5=np.array([[0],[0],[0],[1]])
    actions.append(a1)
    actions.append(a2)
    actions.append(a3)
    actions.append(a4)
    actions.append(a5)
  else:
    a1=np.array([[0],[0],[0],[0]])
    a2=np.array([[1],[0],[0],[0]])
    a3=np.array([[0],[0],[0],[1]])
    actions.append(a1)
    actions.append(a2)
    actions.append(a3)
  return actions

In [3]:
"""test create action"""
def test_create_actions():
  action_op1=define_action_space(1)
  actions_op0=define_action_space(0)
  assert(len(action_op1)==5)
  temp_array=np.array([[0],[0],[0],[1]])
  assert(np.array_equal(action_op1[4],temp_array))
  assert(np.array_equal(actions_op0[2],temp_array))
  assert(len(actions_op0)==3)
test_create_actions()

In [4]:
def create_states():
  states=[]
  first_array=np.array([[0],[0],[0],[0]])
  for i in range(16):
    number_list=[]
    byte_string='{0:04b}'.format(i)
    for ch in byte_string:
      number_list.append(int(ch))
    states.append(np.asarray(number_list).reshape(4,1))
  return states
#create_states()


In [5]:
def computeXOR(a,b):
  xor=[]
  for i in range(np.shape(a)[0]):
    if(a[i][0]==b[i][0]):
      ans=0
    else:
      ans=1
    xor.append(ans)
  return np.asarray(xor).reshape(np.shape(a)[0],1)
def computeL1Norm(a):   # assumes input is np array of shape x,1
  norm=0
  for i in range(np.shape(a)[0]):
    norm+=np.abs(a[i][0])
  return norm
def calc_transition_matrices(p,connectivity_matrix,states,actions): # connectivity matrix must be a np array
  rows=len(states)
  transition_matrices=[]
  for action in actions:
    transition_matrix=np.zeros((rows,rows))
    for i in range(rows):
      for j in range(rows):
        csi=np.matmul(connectivity_matrix,states[i])
        csi=np.where(csi > 0, 1, 0)
        xored = computeXOR(csi,action)
        prob=computeL1Norm(states[j]-xored)
        transition_matrix[i][j] = math.pow(p,prob)*math.pow(1-p,4-prob)
    transition_matrices.append(transition_matrix)
  return transition_matrices


In [6]:
def test_compute_XOR():
  arr1=np.array([[0],[0],[1],[1]])
  arr2=np.array([[0],[1],[0],[1]])
  arr3=computeXOR(arr1,arr2)
  assert(arr3.shape==(4,1))
  assert(arr3[0][0]==0)
  assert(arr3[1][0]==1)
  assert(arr3[2][0]==1)
  assert(arr3[3][0]==0)
test_compute_XOR()
def test_computeL1Norm():
  arr=np.array([[0],[1],[2]])
  assert(computeL1Norm(arr)==3)
test_computeL1Norm()
def test_calc_transition_matrices():
  states=create_states()
  actions=define_action_space(0)
  state=states[0]
  action=actions[0]
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  test_txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  assert(len(test_txn_matrices)==3)
  assert(np.shape(test_txn_matrices[0])==(16,16))
  #assert(np.isclose(test_txn_matrices[0][0][0],0.6561))
  #print(test_txn_matrices)
test_calc_transition_matrices()


In [7]:
def calculate_two_state_reward(state1,state2,action):  # state 2 is the destination state
  return computeL1Norm(5*state2)-computeL1Norm(action)

In [8]:
def test_calculate_two_state_reward():
  state1=np.array([[1],[0]])
  state2=np.array([[0],[1]])
  action=np.array([[1],[1]])
  test_ans=calculate_two_state_reward(state1, state2,action)
  assert(test_ans==3)
test_calculate_two_state_reward()

In [9]:
def create_all_two_state_rewards(states,actions):
  rows=len(states)
  two_state_rewards=[]
  for action in actions:
    two_state_reward=np.zeros((rows,rows))
    for i in range(rows):
      for j in range(rows):
        two_state_reward[i][j]=calculate_two_state_reward(states[i],states[j],action)
    two_state_rewards.append(two_state_reward)
  return two_state_rewards


In [10]:
def test_create_all_two_state_rewards():
  states=create_states()
  actions=define_action_space(0)
  test_output=create_all_two_state_rewards(states,actions)
  assert(len(test_output)==3)
  #print(np.size(test_output[0]))
  #print(test_output[1])
  assert(np.shape(test_output[0])==(16,16))
test_create_all_two_state_rewards()

In [11]:
def create_all_single_state_rewards(two_state_rewards,transition_matrices):
  rows=np.shape(two_state_rewards[0])[0]
  single_state_rewards=[]
  constant=np.ones((rows,1))
  for i in range(len(two_state_rewards)):
    single_state_rewards.append(np.matmul(np.multiply(transition_matrices[i],two_state_rewards[i]),constant))
  return single_state_rewards


In [12]:
def test_create_all_single_state_rewards():
  states=create_states()
  actions=define_action_space(1)
  test_two_state_rewards=create_all_two_state_rewards(states,actions)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  test_txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  #print(test_txn_matrices)
  '''print("txn matrix for action 3")
  print(test_txn_matrices[2])
  print("two state reward for action 3")
  print(test_two_state_rewards[2])
  print("single state compare")
  test_value=np.matmul(np.multiply(file,test_two_state_rewards[2]),np.ones((16,1)))
  print("test value")
  print(test_value)'''
  test_single_state_reward = create_all_single_state_rewards(test_two_state_rewards,test_txn_matrices)
  #print(test_single_state_reward)
  #print(test_two_state_rewards[0])
  #print("transition matrix for action 0")
  #print(test_txn_matrices[0])
  #print("output")
  #print("method output")
  #print(test_single_state_reward[2])
  constant=np.ones((16,1))
  test_compare_output=np.matmul(np.multiply(test_txn_matrices[0],test_two_state_rewards[0]),constant)
  assert((test_single_state_reward[0]==test_compare_output).all())
  assert(len(test_single_state_reward)==5)
  assert(np.shape(test_single_state_reward[0])==(16,1))
  print(test_single_state_reward[4])
test_create_all_single_state_rewards()

[[4.50000000e+00]
 [4.50000000e+00]
 [5.55111512e-17]
 [5.55111512e-17]
 [4.50000000e+00]
 [4.50000000e+00]
 [4.50000000e+00]
 [4.50000000e+00]
 [9.00000000e+00]
 [4.50000000e+00]
 [4.50000000e+00]
 [4.50000000e+00]
 [1.35000000e+01]
 [9.00000000e+00]
 [4.50000000e+00]
 [4.50000000e+00]]


In [13]:
def compare_single_state_rewards(p1,p2):
  states=create_states()
  actions=define_action_space(0)
  test_two_state_rewards=create_all_two_state_rewards(states,actions)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  test_txn_matrices=calc_transition_matrices(p1,connectivity_matrix,states,actions)
  test_single_state_reward = create_all_single_state_rewards(test_two_state_rewards,test_txn_matrices)
  states2=create_states()
  actions2=define_action_space(0)
  test_two_state_rewards2=create_all_two_state_rewards(states,actions)
  test_txn_matrices2=calc_transition_matrices(p2,connectivity_matrix,states,actions)
  test_single_state_reward2 = create_all_single_state_rewards(test_two_state_rewards2,test_txn_matrices2)
  output = [test_single_state_reward2[i]-test_single_state_reward[i] for i in range(len(test_single_state_reward))]
  print(output[1])
compare_single_state_rewards(0.05,0.45)

[[ 4.00000000e+00]
 [ 4.00000000e+00]
 [ 3.55271368e-15]
 [ 3.55271368e-15]
 [-4.00000000e+00]
 [-4.00000000e+00]
 [-4.00000000e+00]
 [-4.00000000e+00]
 [ 3.55271368e-15]
 [ 4.00000000e+00]
 [ 4.00000000e+00]
 [ 4.00000000e+00]
 [-4.00000000e+00]
 [ 5.32907052e-15]
 [-4.00000000e+00]
 [-4.00000000e+00]]


In [14]:
def matrix_policy_improv(potential,transition_matrices,gamma,all_one_state_rewards):
  col_number=len(transition_matrices)
  row_number=np.shape(all_one_state_rewards[0])[0]
  all_potentials=np.zeros((row_number,col_number))
  for i in range(col_number):
    reward=all_one_state_rewards[i]
    Vt = (reward+gamma*(np.matmul(transition_matrices[i],potential))).reshape(row_number,)
    #print(np.shape(Vt))
    #print(np.shape(all_potentials))
    all_potentials[:,i]=Vt
  policy = (np.argmax(all_potentials,axis=1)).tolist()
  return policy

In [15]:
from numpy.matrixlib.defmatrix import matrix
def test_matrix_policy_improv():
  single_state_reward=[]
  rsa1=np.array([[0.1],[0.5]])
  rsa2=np.array([[-1],[0.8]])
  ma1=np.array([[0.1,0.8],[0,1]])
  ma2=np.array([[1,0],[0.1,0.9]])
  single_state_reward.append(rsa1)
  single_state_reward.append(rsa2)
  txn_matrices=[]
  txn_matrices.append(ma1)
  txn_matrices.append(ma2)
  potential=np.array([[-10],[5]])
  test_improved_policy=matrix_policy_improv(potential,txn_matrices,0.9,single_state_reward)
  assert(len(test_improved_policy)==2)
  assert(test_improved_policy[0]==0)
  assert(test_improved_policy[1]==0)
test_matrix_policy_improv()

In [16]:
def matrix_policy_evaluation(policy,txn_matrices,single_state_rewards,gamma): # policy is a list
  number_states=np.shape(txn_matrices[0])[0]
  txn_matrix_policy=np.zeros((number_states,number_states))
  reward_matrix_policy=np.zeros((number_states,1))
  identity=np.identity(number_states)
  for i in range(len(policy)):
    action=policy[i]
    txn_matrix_policy[i,:]=txn_matrices[action][i]
    reward_matrix_policy[i,0]=single_state_rewards[action][i]
  #print("txn matrix")
  #print(txn_matrix_policy)
  #print("reward matrix policy")
  #print(reward_matrix_policy)
  potential=np.matmul(np.linalg.inv(identity-gamma*txn_matrix_policy),reward_matrix_policy)
  return potential

In [17]:
def test_matrix_policy_evaluation():
  single_state_reward=[]
  rsa1=np.array([[0.1],[0.5]])
  rsa2=np.array([[-1],[0.8]])
  ma1=np.array([[0.2,0.8],[0,1]])
  ma2=np.array([[1,0],[0.1,0.9]])
  single_state_reward.append(rsa1)
  single_state_reward.append(rsa2)
  txn_matrices=[]
  txn_matrices.append(ma1)
  txn_matrices.append(ma2)
  policy=[0,1]
  potential = matrix_policy_evaluation(policy,txn_matrices,single_state_reward,0.9)
  print(potential)
test_matrix_policy_evaluation()

[[6.53846154]
 [7.30769231]]


In [18]:

def matrix_policy_iteration(intial_policy,txn_matrices,single_state_rewards,gamma):
  diff_exsists=True
  iter=0
  while(diff_exsists):
    iter+=1
    if(iter>1000):
      break
    final_potential = matrix_policy_evaluation(intial_policy,txn_matrices,single_state_rewards,gamma)
    new_policy=matrix_policy_improv(final_potential,txn_matrices,gamma,single_state_rewards)
    diff_exsists=not(new_policy==intial_policy)
    intial_policy=new_policy
  return intial_policy,final_potential,iter
  

In [19]:
def test_matrix_policy_iteration():
  init_policy=[1,0]
  single_state_reward=[]
  rsa1=np.array([[0.1],[0.5]])
  rsa2=np.array([[-1],[0.8]])
  ma1=np.array([[0.2,0.8],[0,1]])
  ma2=np.array([[1,0],[0.1,0.9]])
  single_state_reward.append(rsa1)
  single_state_reward.append(rsa2)
  txn_matrices=[]
  txn_matrices.append(ma1)
  txn_matrices.append(ma2)
  ans_policy,ans_potential,iter=matrix_policy_iteration(init_policy,txn_matrices,single_state_reward,0.9)
  print(ans_policy)
  print(ans_potential)
test_matrix_policy_iteration()

[0, 1]
[[6.53846154]
 [7.30769231]]


In [20]:
def check_conv(potential1,potential2,threshold):
  rows=np.shape(potential1)[0]
  maxlist=[]
  assert(rows==np.shape(potential2)[0])
  #print(np.shape(potential2))
  #print(np.shape(potential1))
  for i in range(rows):
    maxlist.append(np.abs(potential1[i,0]-potential2[i,0]))
  if(max(maxlist)<threshold):
    return True
  return False
def matrix_value_iteration(threshold,gamma,all_one_state_rewards,transition_matrices,intial_potential):
  iter1=0
  col_number=len(transition_matrices)
  row_number=np.shape(all_one_state_rewards[0])[0]
  all_potentials=np.zeros((row_number,col_number))
  conv=False
  while(conv==False):
    iter1+=1
    while(iter1>1000):
      break
    for i in range(col_number):
      reward=all_one_state_rewards[i]
      Vt = (reward+gamma*(np.matmul(transition_matrices[i],intial_potential))).reshape(row_number,)
      all_potentials[:,i]=Vt
    final_potential=np.amax(all_potentials,axis=1).reshape(row_number,1)
    #print("in method {}".format(final_potential))
    conv=check_conv(final_potential,intial_potential,threshold)
    intial_potential=final_potential
  return final_potential,iter1


In [21]:
def check_amax_fxn():
  test_arr=np.array([[2.27,0.6146],[2.81,3.04]])
  arr=np.amax(test_arr,axis=1).reshape(2,1)
  print(arr)
check_amax_fxn()

[[2.27]
 [3.04]]


In [22]:
def test_check_conv():
  pot1=np.array([[1.2],[0]])
  pot2=np.array([[1],[0]])
  assert(check_conv(pot1,pot2,0.01)==False)
test_check_conv()

In [23]:
def test_matrix_value_iteration():
  init_policy=[1,0]
  single_state_reward=[]
  rsa1=np.array([[0.1],[0.5]])
  rsa2=np.array([[-1],[0.8]])
  ma1=np.array([[0.2,0.8],[0,1]])
  ma2=np.array([[1,0],[0.1,0.9]])
  single_state_reward.append(rsa1)
  single_state_reward.append(rsa2)
  txn_matrices=[]
  txn_matrices.append(ma1)
  txn_matrices.append(ma2)
  intial_pot=np.zeros((2,1))
  final_potential,iter=matrix_value_iteration(0.5,0.9,single_state_reward,txn_matrices,intial_pot)
  print("final potential {}")
  print(final_potential)
  final_policy=matrix_policy_improv(final_potential,txn_matrices,0.9,single_state_reward)
  print("final policy")
  print(final_policy)
test_matrix_value_iteration()

final potential {}
[[2.27381558]
 [3.0430418 ]]
final policy
[0, 1]


In [24]:
def evaluate_performance(policy,intial_state_index,transition_matrices,states):  # policy has to be a list or array - say list
  state_index=intial_state_index
  action_to_do=policy[state_index]
  transition_matrix=transition_matrices[action_to_do]
  avg=0
  for i in range(100):
    episode_total=0
    for j in range(200):
      #state_index=np.argmax(transition_matrix[state_index],axis=1)
      state_index=np.random.choice(a=16,p=transition_matrix[state_index])
      episode_total+=computeL1Norm(states[state_index])
    avg+=episode_total/200
  avg=avg/100
  return avg
      


In [25]:
def run_mvi_case1():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  print("printing txn matrix of action 2-action3")
  print(np.linalg.det(txn_matrices[2]-txn_matrices[1]))
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  print(final_potential)
  final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  print(final_policy)
  for i in final_policy:
    print(actions[i])
  #choose random start state
  random_start_state=random.randrange(16)
  print("Number of iteration needed = {}".format(iter_vi_1))
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_mvi_case1()

(3, 16, 16)
printing txn matrix of action 2-action3
0.0
[[190.98514127]
 [190.98514127]
 [190.7057201 ]
 [190.7057201 ]
 [195.11536386]
 [195.11536386]
 [195.11536386]
 [195.11536386]
 [197.08704976]
 [190.98514127]
 [190.98514127]
 [190.98514127]
 [198.80802017]
 [190.86691234]
 [195.11536386]
 [195.11536386]]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
Number of iteration needed = 136
Starting with random state: [[0]
 [1]
 [0]
 [0]]
Average activation obtained = 2.1553000000000004


In [26]:
def run_mpi_case1():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  initial_policy=[0]*16
  final_policy,final_potential,iter=matrix_policy_iteration(initial_policy,txn_matrices,single_state_reward,0.95)
  print("final potential")
  print(final_potential)
  print(final_policy)
  for i in final_policy:
    print(actions[i])
  #choose random start state
  random_start_state=random.randrange(16)
  print("Number of iteration needed = {}".format(iter))
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_mpi_case1()

(3, 16, 16)
final potential
[[191.16747786]
 [191.16747786]
 [190.88805669]
 [190.88805669]
 [195.29770046]
 [195.29770046]
 [195.29770046]
 [195.29770046]
 [197.26938636]
 [191.16747786]
 [191.16747786]
 [191.16747786]
 [198.99035677]
 [191.04924894]
 [195.29770046]
 [195.29770046]]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
Number of iteration needed = 3
Starting with random state: [[0]
 [1]
 [0]
 [1]]
Average activation obtained = 2.1509999999999994


In [27]:
def run_base_case():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  #final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  #print(final_potential)
  #final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  final_policy=[0]*16
  print("final policy: {}".format(final_policy))
  for i in final_policy:
    print("action = {}".format(actions[int(i)]))
  #choose random start state
  random_start_state=random.randrange(16)
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_base_case()

(3, 16, 16)
final policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
Starting with random state: [[1]
 [0]
 [1]
 [1]]
Average activation obtained = 0.47365000000000007


In [28]:
def run_mvi_case2():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.2,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  print(final_potential)
  final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  print(final_policy)
  for i in final_policy:
    print(actions[i])
  #choose random start state
  random_start_state=random.randrange(16)
  print("Number of iteration needed = {}".format(iter_vi_1))
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_mvi_case2()

(3, 16, 16)
[[175.93032265]
 [175.93032265]
 [177.22165643]
 [177.22165643]
 [180.14690192]
 [180.14690192]
 [180.14690192]
 [180.14690192]
 [180.51315569]
 [175.93032265]
 [175.93032265]
 [175.93032265]
 [182.66195534]
 [177.57406848]
 [180.14690192]
 [180.14690192]]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
[[1]
 [0]
 [0]
 [0]]
Number of iteration needed = 134
Starting with random state: [[0]
 [0]
 [1]
 [0]]
Average activation obtained = 1.989


In [29]:
def run_mvi_case3():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.45,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  print(final_potential)
  final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  print(final_policy)
  for i in final_policy:
    print(actions[i])
  #choose random start state
  random_start_state=random.randrange(16)
  print("Number of iteration needed = {}".format(iter_vi_1))
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_mvi_case3()

(3, 16, 16)
[[190.05579393]
 [190.05579393]
 [190.54235058]
 [190.54235058]
 [191.05474104]
 [191.05474104]
 [191.05474104]
 [191.05474104]
 [190.62775028]
 [190.05579393]
 [190.05579393]
 [190.05579393]
 [191.13724578]
 [190.56574015]
 [191.05474104]
 [191.05474104]]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [0]]
Number of iteration needed = 135
Starting with random state: [[1]
 [0]
 [1]
 [0]]
Average activation obtained = 1.9029500000000004


In [30]:
def run_mvi_case4():
  states=create_states()
  actions=define_action_space(1)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  print(final_potential)
  final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  print(final_policy)
  for i in final_policy:
    print(actions[i])
  #choose random start state
  random_start_state=random.randrange(16)
  print("Number of iteration needed = {}".format(iter_vi_1))
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_mvi_case4()

(5, 16, 16)
[[255.45551755]
 [255.45551755]
 [259.7727771 ]
 [259.7727771 ]
 [264.62085953]
 [264.62085953]
 [264.62085953]
 [264.62085953]
 [260.13047744]
 [255.45551755]
 [255.45551755]
 [255.45551755]
 [264.62085953]
 [260.13047744]
 [264.62085953]
 [264.62085953]]
[2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [0]
 [1]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 [1]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
[[0]
 [1]
 [0]
 [0]]
Number of iteration needed = 142
Starting with random state: [[0]
 [1]
 [0]
 [1]]
Average activation obtained = 2.8463500000000006


In [34]:
def run_base_case2():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.2,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  #final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  #print(final_potential)
  #final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  final_policy=[0]*16
  print("final policy: {}".format(final_policy))
  for i in final_policy:
    print("action = {}".format(actions[int(i)]))
  #choose random start state
  random_start_state=random.randrange(16)
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_base_case2()

(3, 16, 16)
final policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
Starting with random state: [[0]
 [1]
 [0]
 [1]]
Average activation obtained = 1.2523499999999996


In [33]:
def run_base_case3():
  states=create_states()
  actions=define_action_space(0)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.45,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  #final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  #print(final_potential)
  #final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  final_policy=[0]*16
  print("final policy: {}".format(final_policy))
  for i in final_policy:
    print("action = {}".format(actions[int(i)]))
  #choose random start state
  random_start_state=random.randrange(16)
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_base_case3()

(3, 16, 16)
final policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
Starting with random state: [[1]
 [0]
 [0]
 [0]]
Average activation obtained = 1.90935


In [35]:
def run_base_case4():
  states=create_states()
  actions=define_action_space(1)
  connectivity_matrix = np.array([[0,0,-1,0],[1,0,-1,-1],[0,1,0,0],[-1,1,1,0]])
  txn_matrices=calc_transition_matrices(0.05,connectivity_matrix,states,actions)
  print(np.shape(txn_matrices))
  #print(txn_matrices[0])
  two_state_rewards=create_all_two_state_rewards(states,actions)
  single_state_reward = create_all_single_state_rewards(two_state_rewards,txn_matrices)
  init_potential=np.zeros((16,1))
  #final_potential,iter_vi_1=matrix_value_iteration(0.01,0.95,single_state_reward,txn_matrices,init_potential)
  #print(final_potential)
  #final_policy=matrix_policy_improv(final_potential,txn_matrices,0.95,single_state_reward)
  final_policy=[0]*16
  print("final policy: {}".format(final_policy))
  for i in final_policy:
    print("action = {}".format(actions[int(i)]))
  #choose random start state
  random_start_state=random.randrange(16)
  print("Starting with random state: {}".format(states[random_start_state]))
  print("Average activation obtained = {}".format(evaluate_performance(final_policy,random_start_state,txn_matrices,states)))
run_base_case4()

(5, 16, 16)
final policy: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
action = [[0]
 [0]
 [0]
 [0]]
Starting with random state: [[1]
 [0]
 [0]
 [0]]
Average activation obtained = 0.48074999999999973
