In [None]:
class RmaxAgent:
    """Tabular R-max style agent for a discrete-state, discrete-action environment.

    The agent keeps visit counts and reward sums for every state-action pair.
    A pair becomes "known" once it has been tried ``max_visits_per_state``
    times; at that moment the Q-table is re-planned by value-iteration sweeps
    over the empirical model of all known pairs. Unknown pairs keep their
    optimistic initial value ``R_max / (1 - gamma)``.

    The environment is expected to follow the classic Gym interface used by
    this class: ``reset()`` returns an observation, ``step(a)`` returns
    ``(observation, reward, done, info)``, and ``observation_space.n`` /
    ``action_space.n`` give the table sizes.

    Args:
        env: Environment to interact with.
        R_max: Upper bound on the one-step reward, used for the optimistic
            initial Q-values.
        gamma: Discount factor; must be strictly less than 1.
        max_visits_per_state: Visits after which a state-action pair is
            treated as known.
        max_episodes: Number of training episodes.
        max_steps: Maximum number of steps per training episode.
        epsilon: Probability of taking a random action; also used as the
            accuracy term in the planning-sweep count. Must be positive.
    """

    def __init__(self, env, R_max, gamma, max_visits_per_state, max_episodes, max_steps, epsilon=0.2):
        self.env = env  # use the environment we were given, not a module-level global
        self.nS = env.observation_space.n  # number of discrete states
        self.nA = env.action_space.n  # number of discrete actions
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration probability
        self.max_steps = max_steps  # max steps allowed in each episode
        self.max_visits_per_state = max_visits_per_state  # visits before a state-action pair is "known"
        self.max_episodes = max_episodes  # maximum number of episodes to run
        # Q table initialized optimistically so that unknown pairs look attractive.
        self.Q = np.ones((self.nS, self.nA)) * R_max / (1 - self.gamma)
        self.R = np.zeros((self.nS, self.nA))  # summed reward for each state-action pair
        self.nSA = np.zeros((self.nS, self.nA))  # counter for each state-action pair
        self.nSAS = np.zeros((self.nS, self.nA, self.nS))  # counter for each (state, action, next state)
        self.val1 = []  # mean evaluation rewards, one entry per evaluation
        self.val2 = []  # training episode index at which each evaluation was taken
        # Number of value-iteration sweeps per re-plan: ceil(ln(1 / (eps * (1 - gamma))) / (1 - gamma)).
        self.planning_sweeps = int(
            np.ceil(np.log(1 / (self.epsilon * (1 - self.gamma))) / (1 - self.gamma))
        )
        print(self.planning_sweeps)

    def estimate_transition_probabilities(self):
        """Run the training loop, learning the empirical model and the Q-table.

        Every 20th episode (starting with episode 0) the current policy is
        evaluated with ``mean_rewards_per_500`` and the result is appended to
        ``val1`` together with the episode index in ``val2``.
        """
        for episode in range(self.max_episodes):
            # Evaluate BEFORE resetting for training: evaluation steps the same
            # environment, so resetting first would leave `obs` stale.
            if episode % 20 == 0:
                self.val1.append(self.mean_rewards_per_500())
                self.val2.append(episode)

            obs = self.env.reset()

            for _ in range(self.max_steps):
                action = self.choose_action(obs)
                new_obs, reward, done, _info = self.env.step(action)

                if self.nSA[obs, action] < self.max_visits_per_state:
                    self.nSA[obs, action] += 1
                    self.R[obs, action] += reward
                    self.nSAS[obs, action, new_obs] += 1

                    # The pair has just become known: re-plan with the updated model.
                    if self.nSA[obs, action] == self.max_visits_per_state:
                        self._replan()

                if done:
                    # An episode that ends without the goal reward of 1 overwrites the
                    # accumulated reward sum of the last pair with a -10 penalty.
                    if not (reward == 1):
                        self.R[obs, action] = -10
                    break

                obs = new_obs

    def _replan(self):
        """Run in-place value-iteration sweeps over all known state-action pairs.

        For a known pair the backup is
        ``Q[s, a] = R[s, a] / n[s, a] + gamma * sum_s' T[s, a, s'] * max_a' Q[s', a']``
        with ``T[s, a, s'] = nSAS[s, a, s'] / n[s, a]``.
        """
        known = self.nSA >= self.max_visits_per_state
        if not known.any():
            return

        # The counts are fixed during planning, so build the empirical model once.
        visits = np.where(known, self.nSA, 1)  # dummy 1 avoids 0/0 on unknown pairs
        mean_reward = self.R / visits
        transition = self.nSAS / visits[:, :, None]

        known_pairs = np.argwhere(known)  # row-major order: by state, then action
        state_values = self.Q.max(axis=1)
        for _ in range(self.planning_sweeps):
            for state, action in known_pairs:
                self.Q[state, action] = mean_reward[state, action] + self.gamma * (
                    transition[state, action] @ state_values
                )
                # Keep the cached state value in sync so later backups in this
                # sweep see the freshly updated Q entry.
                state_values[state] = self.Q[state].max()

    def mean_rewards_per_500(self, n_episodes=500, max_steps=1000):
        """Return the mean total reward per episode of the current policy.

        Actions are chosen with ``choose_action``, so evaluation is
        epsilon-greedy like training.

        Args:
            n_episodes: Number of evaluation episodes (default 500).
            max_steps: Step cap for each evaluation episode (default 1000).

        Raises:
            ValueError: If ``n_episodes`` is not positive.
        """
        if n_episodes <= 0:
            raise ValueError("n_episodes must be positive")

        total_reward = 0
        for _ in range(n_episodes):
            observation = self.env.reset()
            for _ in range(max_steps):
                action = self.choose_action(observation)
                observation, reward, done, _info = self.env.step(action)
                total_reward += reward
                if done:
                    break
        return total_reward / n_episodes

    def choose_action(self, observation):
        """Pick a random action with probability ``epsilon``, else the greedy one."""
        if np.random.random() > (1 - self.epsilon):
            return self.env.action_space.sample()
        return np.argmax(self.Q[observation])


In [None]:
# R_max=1, gamma=0.98, 25 visits before a pair is known, 500 episodes of at most 10 steps.
rmaxagent = RmaxAgent(env, 1, 0.98, 25, 500, 10)
# Train first: the agent steps `env` during training, so it must still be open.
rmaxagent.estimate_transition_probabilities()
env.close()


In [None]:
# Plot every 7th evaluation point: training episode index against mean evaluation reward.
episode_ticks = rmaxagent.val2[::7]
mean_rewards = rmaxagent.val1[::7]
plt.plot(episode_ticks, mean_rewards)
plt.title("R-Max on a 4x4 environment")
plt.xlabel("Number of Episodes")
plt.ylabel("Average Reward")