In [2]:
import numpy as np
import math
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import gym
from gym.spaces import Discrete, Tuple

from env_ipd import MetaGames
from rmax import RmaxAgent, Memory

def round_func(number, radius):
    """Discretise a tensor into integer-valued bin indices.

    Each element is divided by ``radius`` and rounded to the nearest integer
    (``torch.round`` semantics), so the result is the bin *index*
    (e.g. 0, 1, 2, 3 for radius 3) rather than the bin *value* (0, 3, 6, 9).

    Args:
        number: tensor of values to discretise.
        radius: bin width used as the divisor.

    Returns:
        Tensor with the same shape as ``number`` holding the rounded quotients.
    """
    scaled = torch.div(number, radius)
    return scaled.round()

In [None]:
# ---- discounting and R-max bound ----
inner_gamma = 0.9   # discount factor of the inner game
meta_gamma = 0.8    # discount factor of the meta game
R_max = 0.98

# ---- loop sizes ----
max_meta_epi, max_meta_steps = 500, 5
max_inner_epi, max_inner_steps = 10, 5

# ---- remaining hyper-parameters ----
K_epochs = 4    # number of epochs per policy update
epsilon = 0.2
alpha = 0.4
bs = 2          # batch size (needed for MFOS to give good results)
radius = 5      # bin width for discretisation; assumed to be > 1

# Reward log for plotting, indexed as [inner episode, inner step, slot, batch],
# where slot is 0 or 1.
plot_rew = torch.zeros((max_inner_epi, max_inner_steps, 2, bs), device=device)

# ---- environment, replay memory and R-max agent ----
env = MetaGames(bs, "NL", "IPD")
memory = Memory()
rmax = RmaxAgent(
    env,
    R_max,
    meta_gamma,
    max_meta_epi,
    max_meta_steps,
    radius,
    epsilon,
)

In [6]:
# Scratch check whose recorded output is 59049, i.e. 3**10.
# NOTE(review): this cell previously held a torch.ones(...) call with
# unbalanced parentheses (a SyntaxError); balanced, it would have requested a
# tensor of roughly 8e14 elements, so only the size computation is kept.
3 ** 10

59049

In [None]:
# Meta training loop.
# For every meta-episode, run rmax.max_steps meta-steps. Each meta-step plays
# max_inner_epi inner episodes of max_inner_steps steps, writing the observed
# rewards into env.innerr and refreshing env.innerq in place, and then calls
# rmax.update once with the discretised Q-table as the new meta-state.
# The last axis of env.innerr / env.innerq is indexed with 0 and 1 below; the
# original comments treat index 0 as "our agent".
for episode in range(rmax.max_episodes): #for each meta-episode
    print("meta-episode =", episode)
    # Initialise meta-state and meta-action as zeros.
    meta_s = torch.zeros(env.b, env.d, env.num_actions, env.num_agents).to(device)
    memory.states.append(meta_s)
    meta_a = torch.zeros(env.b, env.d, env.num_actions).to(device)
    memory.actions.append(meta_a)
    # NOTE(review): states/actions are appended to memory only here (once per
    # meta-episode), while rewards are appended once per meta-step below --
    # confirm this is what rmax.update expects.

    for step in range(rmax.max_steps):    #for each meta time step
        print("meta-timestep =", step)

        for epi in range(max_inner_epi):              #for each inner episode
            print("inner episode =", epi)
            state = env.reset()   #reset environment

            for t in range(max_inner_steps):                     #for each inner timestep
                #print("inner timestep =", t)
                if t == 0:
                    # First step of an inner episode uses the environment's initial action.
                    best_action = env.init_action
                else:
                    # Ask the environment for the action given the current state
                    # (presumably the arg-max of the inner Q-table for both agents -- TODO confirm).
                    best_action = env.choose_action(state)

                # Advance the inner game by one step. `info` is stored in the
                # index-1 slots below, i.e. it is used like a second reward.
                newstate, reward, info = env.step(best_action)
                # NOTE(review): plot_rew is indexed by (epi, t) only, so each
                # meta-step / meta-episode overwrites the previous values.
                plot_rew[epi,t,0,:] = reward
                plot_rew[epi,t,1,:] = info

                # Update the inner reward table for every batch element i.
                # best_action is indexed as [slot, batch element].
                for i in range(env.b):
                    # Disabled alternative: accumulate discounted rewards instead of overwriting.
                    #env.innerr[i, state[i], best_action[0,i], 0] += (inner_gamma**t) * reward[i]
                    #env.innerr[i, state[i], best_action[1,i], 1] += (inner_gamma**t) * info[i]
                    env.innerr[i, state[i], best_action[0,i], 0] = reward[i]
                    env.innerr[i, state[i], best_action[1,i], 1] = info[i]

                # Update the inner Q-table in a second loop so that it only reads
                # innerr after every batch element has been written:
                # Q[s, a] = r[s, a] + inner_gamma * max_a' Q[s', a'].
                for i in range(env.b):
                    env.innerq[i, state[i], best_action[0,i], 0] = env.innerr[i, state[i], best_action[0,i], 0] + inner_gamma * torch.max(env.innerq[i, newstate[i], :, 0])
                    env.innerq[i, state[i], best_action[1,i], 1] = env.innerr[i, state[i], best_action[1,i], 1] + inner_gamma * torch.max(env.innerq[i, newstate[i], :, 1])

                # Move on to the new state.
                state = newstate

                #print("reward at inner timestep = ", t, ":", plot_rew[epi,t,:,:])

        # NOTE(review): this runs after ALL max_inner_epi inner episodes of the
        # meta-step have finished, not after a single one as the message suggests.
        print("done 1 inner episode,", step)
        # Meta-state = discretised inner-game Q-table (all slots of the last axis).
        new_meta_s = round_func(env.innerq, radius)
        # Meta-action = discretised inner-game Q-table of slot 0 ("our agent").
        new_meta_a = round_func(env.innerq[:,:,:,0], radius)

        # NOTE(review): `reward` here is the value returned by the last inner
        # step of the last inner episode, not a sum over the inner game as the
        # original comment states; `our_REW` is assigned but not used in this cell.
        our_REW = reward                           #intended meta-reward (original comment: sum of our agent's rewards over the inner game)
        memory.rewards.append(reward)

        rmax.update(memory, meta_s, meta_a, new_meta_s)

        # Carry the new meta-state / meta-action into the next meta-step.
        meta_s = new_meta_s
        meta_a = new_meta_a
        
#             if done:
#                 if not(reward==1):
#                     self.R[state][best_action]=-10
#                 break


In [11]:
# Display the reward log, shaped (max_inner_epi, max_inner_steps, 2, bs):
# plot_rew[epi, t, 0, :] holds `reward` and plot_rew[epi, t, 1, :] holds `info`
# as returned by env.step in the training loop.
plot_rew

tensor([[[[1.8000, 0.8000],
          [0.8000, 1.8000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]]],


        [[[0.0000, 0.0000],
          [0.0000, 0.0000]],

         [[0.0000, 0.0000],
          [0.0000, 0.0000]],

        