In [16]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from ipd_environment import MetaGames
from rmax import RmaxAgent, Memory

import gym
from gym.spaces import Discrete, Tuple

import numpy as np
import math

In [17]:
K_epochs = 4  # update policy for K epochs

gamma = 0.99  # discount factor
R_max = 0.98
max_meta_epi=500
max_meta_steps=5 
max_inner_epi = 10
max_inner_steps = 5
epsilon=0.2
alpha = 0.4
bs = 1          #batch size (mus need for mfos for good results)
radius_dp = 1   #no. of decimal point for discretization 

#all agents' trajectory, entries will be like {[s1, a1, r1], [s2, a2, r2],...}
#torch size: {no. of inner episodes, no of inner steps, batch size, 3+no. of other agents*2 (2 since they take up 1 in state, 1 in rewards)}
traj=torch.empty(max_inner_epi, max_inner_steps, bs, (3+1*2)).to(device) 
our_rew = torch.zeros(bs).to(device)     #our agent's reward
oppo_rew = torch.zeros(bs).to(device)    #opponent's reward

# creating environment
#env = MetaGames(b, opponent=args.opponent, game=args.game, mmapg_id=args.mamaml_id)
env = MetaGames(bs, "NL", "IPD", 0)
innerr = torch.zeros(bs, 10**(radius_dp)+1, env.num_actions, 2).to(device)      #reward table with discretized dimensions, (batch_size, discretized states, actions, player)
innerq = torch.zeros(bs, 10**(radius_dp)+1, env.num_actions, 2).to(device)
best_meta_a = torch.zeros(bs, 10**(radius_dp)+1, env.num_actions).to(device)
ref_arr = np.linspace(0, 1, 10**(radius_dp)+1)

#action_dim = env.d
nA = env.d
#state_dim = env.d * 2
nS = env.d * 2

memory = Memory()
rmax = RmaxAgent(env, R_max, gamma, max_meta_epi, max_meta_steps, radius_dp, epsilon = 0.2)

state = env.reset()   #reset environment
dstate_index=[]
for j in torch.round(state, decimals = radius_dp):
    for i,x in enumerate(ref_arr):
        #find dstate index for each agent
        for num_agents in range(state.size(dim=1)):
            if x==j[num_agents-1]:
                dstate_index.append(i)
best_action = torch.argmax(innerq[:, dstate_index[0], :, 0], dim=1).unsqueeze(1)   

#run inner game according to that action, for K episodes & T timesteps, output used to be new_state, reward, done, _ 
newstate, reward, info, _ = env.step(best_action)  

traj[0,0] = torch.cat((state, best_action, reward, info), dim=1)
our_rew += reward.reshape(-1)
oppo_rew += info.reshape(-1)
#find index of discretized state value, logic: match ref_arr to state values
state = newstate

for i in range(bs):
    #update Q table for our agent
    innerq[i, dstate_index[0], best_action[i], 0] += alpha * (innerr[i, dstate_index[0], best_action[i], 0] + rmax.gamma * torch.amax(innerq[i, :, :, 0]) - innerq[i, dstate_index[0], best_action[i], 0])    
    #update Q table for opponent
    innerq[i, dstate_index[1], best_action[i], 1] += alpha * (innerr[i, dstate_index[1], best_action[i], 1] + rmax.gamma * torch.amax(innerq[i, :, :, 1]) - innerq[i, dstate_index[1], best_action[i], 1])        
    #best_meta_a[i] = torch.amax(innerq[i,:,:,0])                 #best-meta-action = max. Q
    
meta_s = torch.round(traj, decimals = radius_dp)                #meta-state = trajectory of all agents
memory.states.append(meta_s)
meta_a = torch.round(innerq[:, :, :, 0], decimals = radius_dp)  #meta-action = Q-table of inner game of our agent
memory.actions.append(meta_a)
meta_r = torch.round(our_rew, decimals = radius_dp)             #meta-reward = sum of inner rewards of our agent over K episodes & T timesteps
memory.rewards.append(meta_r)

best_meta_a = torch.max(meta_a)

In [23]:
innerr

tensor([[[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]],


        [[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]],


        [[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          

In [18]:
rmax.update(memory, torch.flatten(best_meta_a) , torch.flatten(meta_s))

IndexError: tensors used as indices must be long, byte or bool tensors

In [None]:
for episode in range(rmax.max_meta_epi): #for each meta-episode
    print("episode =", episode)
    
    for step in range(rmax.max_meta_steps):    #for each meta time step
        print("timestep =", step)
        state = env.reset()   #reset environment
        
        #find discretized state index that corresponds to the real state value
        dstate_index=[]
        for j in torch.round(state, decimals = radius_dp):
            for i,x in enumerate(ref_arr):
                if x==j:
                    dstate_index.append(i)
                    
        #for each inner episodes            
        for epi in range(max_inner_epi):                 
            for t in range(max_inner_steps):                     #for each inner timestep
                #find action that has max Q value for current state of our agent --> thats why [0]
                #the index represents which action to take already since actions are discrete??????? ASKKKKK
                best_action = torch.argmax(innerq[:, dstate_index[0], :, 0])     
                
                #run inner game according to that action, for K episodes & T timesteps, output used to be new_state, reward, done, _ 
                newstate, reward, info, _ = env.step(best_action.reshape(1))  
                traj[epi].append(state, best_action, [reward,info])
                our_rew += reward.reshape(-1)
                oppo_rew += info.reshape(-1)

                #find index of discretized state value, logic: match ref_arr to state values
                state = newstate
                           
                #find discretized state index that corresponds to the real state value
                dstate=[]
                for j in torch.round(state, decimals = radius_dp):
                    for i,x in enumerate(ref_arr):
                        if x==j:
                            dstate_index.append(i)
                            
                #update Q table for our agent
                innerq[:, dstate_index[0], best_action, 0] += alpha * (innerr[:, dstate_index[0], best_action, 0] + rmax.gamma * torch.amax(innerq[:, :, :, 0], dim=(1,2)) - innerq[:, dstate_index[0], best_action, 0])    
                #update Q table for opponent
                innerq[:, dstate_index[1], best_action, 1] += alpha * (innerr[:, dstate_index[1], best_action, 1] + rmax.gamma * torch.amax(innerq[:, :, :, 1], dim=(1,2)) - innerq[:, dstate_index[1], best_action, 1])        
        print("done 1 inner episode,", step)
        meta_s = torch.round(innerq, decimals = radius_dp).long().to(device)               #meta-state = inner game Q table for all agents
        meta_a = torch.round(innerq[:,:,:,0], decimals = radius_dp).long().to(device)       #meta-action = inner game Q table for our agent
        
        best_meta_a = torch.argmax(rmax.Q[:,torch.flatten(meta_s)]).to(device)  #select meta-action that corresponds to our agent's maxi Q table
        our_REW = our_rew                           #meta-reward = sum of rewards of our agent in inner game of K episodes & T timesteps
        rmax.update(memory, torch.flatten(best_meta_a) , torch.flatten(meta_s))
        
#             if done:
#                 if not(reward==1):
#                     self.R[state][best_action]=-10
#                 break
