<a href="https://colab.research.google.com/github/kailing231/treasurecube/blob/main/TreasureHunt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# environment.py

import numpy as np
import time
import random

from abc import ABC, abstractmethod


class AbstractEnvironment(ABC):
    def __init__(self):
        self.agent_sign = '+'
        self.goal_sign = 'G'
        self.corridor_sign = '-'

    def render(self):
        raise NotImplemented

    def reset(self):
        raise NotImplemented

    def step(self, action):
        raise NotImplemented

class TreasureCube(AbstractEnvironment):
    def __init__(self, max_step=20):
        super(TreasureCube, self).__init__()
        self.dim = 4
        self.max_step = max_step
        self.curr_pos = [0, 0, 0]  # (z, x, y)
        self.time_step = 0
        self.end_pos = [self.dim - 1, self.dim - 1, self.dim - 1]
        self.visual_state = []
        self.seed = None
        self.set_seed()
        self.all_actions = ['right', 'left', 'up', 'down', 'forward', 'backward']
        self.slip_actions = dict()
        self.set_action_rules()

    def reset(self):
        self.curr_pos = [0, 0, 0]
        self.time_step = 0
        self.end_pos = [self.dim - 1, self.dim - 1, self.dim - 1]
        self._reset_visual_states(self.curr_pos, self.end_pos)
        return ''.join(str(pos) for pos in self.curr_pos)

    # def randomState(self): # todo remove
    #     x = random.randrange(0,3)
    #     y = random.randrange(0,3)
    #     z = random.randrange(0,3)
    #     return [x,y,z]

    def getNextState(self, state, action):
        assert action in self.all_actions
        curr_state = list(map(int,str(state)))      
        if action == 'left':
            if curr_state[1] == 0:  # wall
                pass
            else:
                curr_state[1] -= 1
        elif action == 'right':
            if curr_state[1] == self.dim - 1:  # wall
                pass
            elif curr_state[1] == self.dim - 2 and curr_state[0] == self.dim - 1 and curr_state[
                2] == self.dim - 1:
                curr_state[1] += 1
                # is_terminate = True
                # reward = 1
            else:
                curr_state[1] += 1

        elif action == 'forward':
            if curr_state[0] == self.dim - 1:  # wall
                pass
            elif curr_state[0] == self.dim - 2 and curr_state[1] == self.dim - 1 and curr_state[
                2] == self.dim - 1:
                curr_state[0] += 1
                # is_terminate = True
                # reward = 1
            else:
                curr_state[0] += 1

        elif action == 'backward':
            if curr_state[0] == 0:  # wall
                pass
            else:
                curr_state[0] -= 1

        elif action == 'up':
            if curr_state[2] == self.dim - 1:  # wall
                pass
            elif curr_state[2] == self.dim - 2 and curr_state[0] == self.dim - 1 and curr_state[
                1] == self.dim - 1:
                curr_state[2] += 1
                # is_terminate = True
                # reward = 1
            else:
                curr_state[2] += 1

        elif action == 'down':
            if curr_state[2] == 0:
                pass
            else:
                curr_state[2] -= 1
                
        # self._reset_visual_states(self.curr_pos, self.end_pos)
        next_state = ''.join(str(pos) for pos in curr_state)
        return next_state
    
    def step(self, action, stochastic=True):
        in_action = action  # action from agent
        assert action in self.all_actions
        reward = -0.1
        is_terminate = False
        pre_pos = self.curr_pos
        r = random.random()
        if action == 'right':
            if r < 0.1:
                action = 'up'
            elif r < 0.2:
                action = 'down'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'right'
        elif action == 'left':
            if r < 0.1:
                action = 'up'
            elif r < 0.2:
                action = 'down'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'left'
        elif action == 'up':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'up'
        elif action == 'down':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'down'
        elif action == 'forward':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'up'
            elif r < 0.4:
                action = 'down'
            else:
                action = 'forward'
        else:
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'up'
            elif r < 0.4:
                action = 'down'
            else:
                action = 'backward'

        if not stochastic:
            action = in_action

        assert action in self.all_actions
        if action == 'left':
            if self.curr_pos[1] == 0:  # wall
                pass
            else:
                self.curr_pos[1] -= 1
        elif action == 'right':
            if self.curr_pos[1] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[1] == self.dim - 2 and self.curr_pos[0] == self.dim - 1 and self.curr_pos[
                2] == self.dim - 1:
                self.curr_pos[1] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[1] += 1

        elif action == 'forward':
            if self.curr_pos[0] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[0] == self.dim - 2 and self.curr_pos[1] == self.dim - 1 and self.curr_pos[
                2] == self.dim - 1:
                self.curr_pos[0] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[0] += 1
        elif action == 'backward':
            if self.curr_pos[0] == 0:  # wall
                pass
            else:
                self.curr_pos[0] -= 1

        elif action == 'up':
            if self.curr_pos[2] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[2] == self.dim - 2 and self.curr_pos[0] == self.dim - 1 and self.curr_pos[
                1] == self.dim - 1:
                self.curr_pos[2] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[2] += 1
        elif action == 'down':
            if self.curr_pos[2] == 0:
                pass
            else:
                self.curr_pos[2] -= 1

        assert action in self.all_actions
        self.time_step += 1
        if self.time_step == self.max_step - 1:
            is_terminate = True

        self._reset_visual_states(self.curr_pos, self.end_pos)
        return reward, is_terminate, ''.join(str(pos) for pos in self.curr_pos)

    def render(self):
        print(' '.join(['*'] * self.dim))
        for i in range(self.dim):
            for line in self.visual_state[i]:
                print(' '.join(line))
            print(' '.join(['#'] * self.dim))
        print(' '.join(['*'] * self.dim))

    def set_seed(self, seed=10086):
        self.seed = seed
        random.seed(seed)

    def _reset_visual_states(self, agent_pos, goal_pos):
        self.visual_state = [[[self.corridor_sign] * self.dim for _ in range(self.dim)] for _ in range(self.dim)]
        self.visual_state[agent_pos[0]][agent_pos[1]][agent_pos[2]] = self.agent_sign
        self.visual_state[goal_pos[0]][goal_pos[1]][goal_pos[2]] = self.goal_sign

    def set_action_rules(self):
        self.slip_actions['right'] = ['up', 'down', 'forward', 'backward', 'right']
        self.slip_actions['left'] = ['up', 'down', 'forward', 'backward', 'left']
        self.slip_actions['up'] = ['left', 'right', 'forward', 'backward', 'up']
        self.slip_actions['down'] = ['left', 'right', 'forward', 'backward', 'down']
        self.slip_actions['forward'] = ['left', 'right', 'up', 'down', 'forward']
        self.slip_actions['backward'] = ['left', 'right', 'up', 'down', 'backward']



test.py

In [2]:
import argparse
import random

import pandas as pd

In [3]:
# # initial all 0's
# # each coordinate value = [0,3]
# # row = state, no of states = 4*4*4 = 64
# # col = action, no of actions = 6

# env = TreasureCube();
# length = env.dim

# indexnames = []
# for x in range(length):
#     for y in range(length):
#         for z in range(length):
#           indexnames.append(str(x) + str(y) + str(z))

# Qtable = np.zeros([64, 6])
# Qtable = pd.DataFrame(Qtable, index=indexnames, columns=env.all_actions)

In [None]:
# qdf = Qtable.copy()

In [None]:
# # Qtable.loc['000'] # find all action values, of the state
# state = "000"
# # Qtable.loc[state].idxmax()
# action = "left"
# # qdf.loc[state, action] = 999
# qdf.loc[state].max()

In [None]:
# state = env.randomState()
# "".join([str(s) for s in state]) 

In [3]:
class RandomAgent(object):
    def __init__(self):
        self.action_space = ['left','right','forward','backward','up','down'] # in TreasureCube
        self.Q = []

    def take_action(self, state):
        action = random.choice(self.action_space)
        return action

    # implement your train/update function to update self.V or self.Q
    # you should pass arguments to the train function
#    def train(self, state, action, next_state, reward):
    def train(self, state, action, next_state, reward, Qtable):
#      pass
      alpha = 0.5 # learning rate
      gamma = 0.99 # discount factor

      old_value = Qtable.loc[state, action]
      next_max = Qtable.loc[next_state].max()
      new_value = (1.0 - alpha) * old_value + alpha * (reward + gamma * next_max)
      Qtable.loc[state, action] = new_value    


In [None]:
# TODO
max_episode = 500
max_step = 500

env = TreasureCube()
agent = RandomAgent()

# ADDED
length = env.dim
indexnames = []
for x in range(length):
    for y in range(length):
        for z in range(length):
          indexnames.append(str(x) + str(y) + str(z))
Qtable = np.zeros([64, 6])
Qtable = pd.DataFrame(Qtable, index=indexnames, columns=env.all_actions)
# END

for epsisode_num in range(0, max_episode):
  state = env.reset()
  terminate = False
  t = 0   # step number ? TODO remove
  episode_reward = 0

#        alpha = 0.5 # learning rate
#        gamma = 0.99 # discount factor
  epsilon = 0.01 # exploration rate # todo remove

  while not terminate:
      action = agent.take_action(state) # choose a random action
      reward, terminate, next_state = env.step(action)
      episode_reward += reward            

      # you can comment the following two lines, if the output is too much
      # env.render() # comment
      # print(f'step: {t}, action: {action}, reward: {reward}') # comment  

      t += 1
#            agent.train(state, action, next_state, reward)
      agent.train(state, action, next_state, reward, Qtable)
      state = next_state
  print(f'epsisode: {epsisode_num}, total_steps: {t} episode reward: {episode_reward}')

In [5]:
Qtable

Unnamed: 0,right,left,up,down,forward,backward
000,-0.618428,-0.696427,-0.612388,-0.633617,-0.590953,-0.603898
001,-0.500031,-0.581196,-0.511297,-0.671772,-0.504302,-0.570911
002,-0.383995,-0.489429,-0.441792,-0.531209,-0.376972,-0.493124
003,-0.361593,-0.459171,-0.377068,-0.486588,-0.408383,-0.450737
010,-0.523268,-0.690692,-0.521938,-0.634295,-0.527963,-0.585137
...,...,...,...,...,...,...
323,0.447500,0.246872,0.306395,0.146921,0.283750,-0.008175
330,-0.209101,-0.177255,-0.257129,-0.217870,-0.146391,-0.214992
331,-0.014078,0.021561,0.082572,-0.028233,-0.043980,0.092722
332,0.306346,0.035918,0.199868,-0.093750,0.114743,0.095866


In [14]:
# # Evaluate agent's performance after Q-learning
# # Qtable alrdy done

# total_epochs, total_penalties = 0, 0
# episodes = 100

# for _ in range(episodes):
#     state = env.reset()
#     epochs, penalties, reward = 0, 0, 0
    
#     done = False
    
#     while not done:
#         action = Qtable.loc[state].idxmax() # get index of highest action
#         reward, done, next_state = env.step(action)

#         if reward == -10:
#             penalties += 1

#         epochs += 1

#     total_penalties += penalties
#     total_epochs += epochs

# print(f"Results after {episodes} episodes:")
# print(f"Average timesteps per episode: {total_epochs / episodes}")
# print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 18.97
Average penalties per episode: 0.0


In [7]:
# show "path"
# Qtable is populated

env = TreasureCube()

end_state = "333"
is_end_state = False;
i = 1

state = env.reset()

while not is_end_state:
  print("=== Step", i, "and current state =" , state)
  env.render() # render

  # # check if terminal state
  # if(state == end_state):
  #   print("*** Reach end state.")
  #   break

  # choose best action of this state
  action = Qtable.loc[state].idxmax() # get index of highest action

  reward, terminate, next_state = env.step(action)
  # next_state = env.getNextState(state, action) # TODO DOESNT work T_T
  # state = next_state
  # i += 1

  print("=== Take action =", action, "and next state =",next_state)

  # go to next state
  state = next_state
  i += 1

  # check if terminal state
  if(state == end_state):
    print("*** Reach end state.")
    is_end_state = True



=== Step 1 and current state = 000
* * * *
+ - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - G
# # # #
* * * *
=== Take action = forward and next state = 000
=== Step 2 and current state = 000
* * * *
+ - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - G
# # # #
* * * *
=== Take action = forward and next state = 100
=== Step 3 and current state = 100
* * * *
- - - -
- - - -
- - - -
- - - -
# # # #
+ - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - G
# # # #
* * * *
=== Take action = forward and next state = 101
=== Step 4 and current state = 101
* * * *
- - - -
- - - -
- - - -
- - - -
# # # #
- + - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - -
# # # #
- - - -
- - - -
- - - -
- - - G
# # # #
* * * *
=== Take action

In [6]:
Qtable.loc["200"]

right      -0.424323
left       -0.412819
up         -0.414116
down       -0.502229
forward    -0.420291
backward   -0.544757
Name: 200, dtype: float64

In [None]:
# def test_cube(max_episode, max_step):
#     env = TreasureCube(max_step=max_step)
#     agent = RandomAgent() # TODO replace ??

#     for epsisode_num in range(0, max_episode):
#         state = env.reset()
#         terminate = False
#         t = 0
#         episode_reward = 0
#         while not terminate:
#             action = agent.take_action(state)
#             reward, terminate, next_state = env.step(action)
#             episode_reward += reward
#             # you can comment the following two lines, if the output is too much
#             env.render() # comment
#             print(f'step: {t}, action: {action}, reward: {reward}') # comment
#             t += 1
#             agent.train(state, action, next_state, reward)
#             state = next_state
#         print(f'epsisode: {epsisode_num}, total_steps: {t} episode reward: {episode_reward}')


# if __name__ == '__main__':
#     parser = argparse.ArgumentParser(description='Test')
#     parser.add_argument('--max_episode', type=int, default=500)
#     parser.add_argument('--max_step', type=int, default=500)
#     args = parser.parse_args()

#     test_cube(args.max_episode, args.max_step)
