<a href="https://colab.research.google.com/github/kailing231/treasurecube/blob/main/TreasureHunt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# environment.py

import numpy as np
import time
import random

from abc import ABC, abstractmethod


class AbstractEnvironment(ABC):
    def __init__(self):
        self.agent_sign = '+'
        self.goal_sign = 'G'
        self.corridor_sign = '-'

    def render(self):
        raise NotImplemented

    def reset(self):
        raise NotImplemented

    def step(self, action):
        raise NotImplemented

class TreasureCube(AbstractEnvironment):
    def __init__(self, max_step=20):
        super(TreasureCube, self).__init__()
        self.dim = 4
        self.max_step = max_step
        self.curr_pos = [0, 0, 0]  # (z, x, y)
        self.time_step = 0
        self.end_pos = [self.dim - 1, self.dim - 1, self.dim - 1]
        self.visual_state = []
        self.seed = None
        self.set_seed()
        self.all_actions = ['right', 'left', 'up', 'down', 'forward', 'backward']
        self.slip_actions = dict()
        self.set_action_rules()

    def reset(self):
        self.curr_pos = [0, 0, 0]
        self.time_step = 0
        self.end_pos = [self.dim - 1, self.dim - 1, self.dim - 1]
        self._reset_visual_states(self.curr_pos, self.end_pos)
        return ''.join(str(pos) for pos in self.curr_pos)

    # def randomState(self): # todo remove
    #     x = random.randrange(0,3)
    #     y = random.randrange(0,3)
    #     z = random.randrange(0,3)
    #     return [x,y,z]
    
    def step(self, action, stochastic=True):
        in_action = action  # action from agent
        assert action in self.all_actions
        reward = -0.1
        is_terminate = False
        pre_pos = self.curr_pos
        r = random.random()
        if action == 'right':
            if r < 0.1:
                action = 'up'
            elif r < 0.2:
                action = 'down'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'right'
        elif action == 'left':
            if r < 0.1:
                action = 'up'
            elif r < 0.2:
                action = 'down'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'left'
        elif action == 'up':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'up'
        elif action == 'down':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'forward'
            elif r < 0.4:
                action = 'backward'
            else:
                action = 'down'
        elif action == 'forward':
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'up'
            elif r < 0.4:
                action = 'down'
            else:
                action = 'forward'
        else:
            if r < 0.1:
                action = 'left'
            elif r < 0.2:
                action = 'right'
            elif r < 0.3:
                action = 'up'
            elif r < 0.4:
                action = 'down'
            else:
                action = 'backward'

        if not stochastic:
            action = in_action

        assert action in self.all_actions
        if action == 'left':
            if self.curr_pos[1] == 0:  # wall
                pass
            else:
                self.curr_pos[1] -= 1
        elif action == 'right':
            if self.curr_pos[1] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[1] == self.dim - 2 and self.curr_pos[0] == self.dim - 1 and self.curr_pos[
                2] == self.dim - 1:
                self.curr_pos[1] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[1] += 1

        elif action == 'forward':
            if self.curr_pos[0] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[0] == self.dim - 2 and self.curr_pos[1] == self.dim - 1 and self.curr_pos[
                2] == self.dim - 1:
                self.curr_pos[0] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[0] += 1
        elif action == 'backward':
            if self.curr_pos[0] == 0:  # wall
                pass
            else:
                self.curr_pos[0] -= 1

        elif action == 'up':
            if self.curr_pos[2] == self.dim - 1:  # wall
                pass
            elif self.curr_pos[2] == self.dim - 2 and self.curr_pos[0] == self.dim - 1 and self.curr_pos[
                1] == self.dim - 1:
                self.curr_pos[2] += 1
                is_terminate = True
                reward = 1
            else:
                self.curr_pos[2] += 1
        elif action == 'down':
            if self.curr_pos[2] == 0:
                pass
            else:
                self.curr_pos[2] -= 1

        assert action in self.all_actions
        self.time_step += 1
        if self.time_step == self.max_step - 1:
            is_terminate = True

        self._reset_visual_states(self.curr_pos, self.end_pos)
        return reward, is_terminate, ''.join(str(pos) for pos in self.curr_pos)

    def render(self):
        print(' '.join(['*'] * self.dim))
        for i in range(self.dim):
            for line in self.visual_state[i]:
                print(' '.join(line))
            print(' '.join(['#'] * self.dim))
        print(' '.join(['*'] * self.dim))

    def set_seed(self, seed=10086):
        self.seed = seed
        random.seed(seed)

    def _reset_visual_states(self, agent_pos, goal_pos):
        self.visual_state = [[[self.corridor_sign] * self.dim for _ in range(self.dim)] for _ in range(self.dim)]
        self.visual_state[agent_pos[0]][agent_pos[1]][agent_pos[2]] = self.agent_sign
        self.visual_state[goal_pos[0]][goal_pos[1]][goal_pos[2]] = self.goal_sign

    def set_action_rules(self):
        self.slip_actions['right'] = ['up', 'down', 'forward', 'backward', 'right']
        self.slip_actions['left'] = ['up', 'down', 'forward', 'backward', 'left']
        self.slip_actions['up'] = ['left', 'right', 'forward', 'backward', 'up']
        self.slip_actions['down'] = ['left', 'right', 'forward', 'backward', 'down']
        self.slip_actions['forward'] = ['left', 'right', 'up', 'down', 'forward']
        self.slip_actions['backward'] = ['left', 'right', 'up', 'down', 'backward']



test.py

In [2]:
import argparse
import random

import pandas as pd

In [3]:
# # initial all 0's
# # each coordinate value = [0,3]
# # row = state, no of states = 4*4*4 = 64
# # col = action, no of actions = 6

# env = TreasureCube();
# length = env.dim

# indexnames = []
# for x in range(length):
#     for y in range(length):
#         for z in range(length):
#           indexnames.append(str(x) + str(y) + str(z))

# Qtable = np.zeros([64, 6])
# Qtable = pd.DataFrame(Qtable, index=indexnames, columns=env.all_actions)

In [None]:
# qdf = Qtable.copy()

In [None]:
# # Qtable.loc['000'] # find all action values, of the state
# state = "000"
# # Qtable.loc[state].idxmax()
# action = "left"
# # qdf.loc[state, action] = 999
# qdf.loc[state].max()

In [None]:
# state = env.randomState()
# "".join([str(s) for s in state]) 

In [13]:
Qtable.loc["000", "left"]

-0.09900982376070896

In [17]:
class RandomAgent(object):
    def __init__(self):
        self.action_space = ['left','right','forward','backward','up','down'] # in TreasureCube
        length = 4
        indexnames = []
        for x in range(length):
            for y in range(length):
                for z in range(length):
                  indexnames.append(str(x) + str(y) + str(z))
        Qtable = np.zeros([64, 6])
        Qtable = pd.DataFrame(Qtable, index=indexnames, columns=self.action_space)
        self.Q = Qtable
        

    def take_action(self, state):
        action = random.choice(self.action_space)
        return action

    # implement your train/update function to update self.V or self.Q
    # you should pass arguments to the train function
    def train(self, state, action, next_state, reward):
      alpha = 0.5 # learning rate
      gamma = 0.99 # discount factor

      old_value = self.Q.loc[state, action]
      next_max = self.Q.loc[next_state].max()
      # new_value = (2.0 - alpha) * old_value + alpha * (reward + gamma * next_max)
      new_value = old_value + alpha * (reward + gamma * next_max) - old_value
      self.Q.loc[state, action] = new_value

In [18]:
# TODO
max_episode = 500
max_step = 500

env = TreasureCube()
agent = RandomAgent()

# # ADDED
# length = env.dim
# indexnames = []
# for x in range(length):
#     for y in range(length):
#         for z in range(length):
#           indexnames.append(str(x) + str(y) + str(z))
# Qtable = np.zeros([64, 6])
# Qtable = pd.DataFrame(Qtable, index=indexnames, columns=env.all_actions)
# # END

for epsisode_num in range(0, max_episode):
  state = env.reset()
  terminate = False
  t = 0   # step number ? TODO remove
  episode_reward = 0

#        alpha = 0.5 # learning rate
#        gamma = 0.99 # discount factor
  epsilon = 0.01 # exploration rate # todo remove

  while not terminate:
      if random.uniform(0, 1) < epsilon:
        action = agent.take_action(state) # choose a random action
      else:
        action = agent.Q.loc[state].idxmax() # or choose highest action

      reward, terminate, next_state = env.step(action)
      episode_reward += reward            

      # you can comment the following two lines, if the output is too much
      # env.render() # comment
      # print(f'step: {t}, action: {action}, reward: {reward}') # comment  

      t += 1
      agent.train(state, action, next_state, reward)
      state = next_state
  print(f'epsisode: {epsisode_num}, total_steps: {t} episode reward: {episode_reward}')

epsisode: 0, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 1, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 2, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 3, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 4, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 5, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 6, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 7, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 8, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 9, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 10, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 11, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 12, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 13, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 14, total_steps: 19 episode reward: -1.9000000000000006
epsisode: 15, total_

In [19]:
# Qtable
agent.Q

Unnamed: 0,left,right,forward,backward,up,down
000,-0.099010,-0.099010,-0.099009,-0.099010,-0.099010,-0.099010
001,-0.099009,-0.099009,-0.099009,-0.099009,-0.099009,-0.099010
002,-0.099007,-0.099007,-0.099009,-0.099007,-0.099007,-0.099008
003,-0.099008,-0.099005,-0.099005,-0.099007,-0.099005,-0.099005
010,-0.099010,-0.099009,-0.099010,-0.099010,-0.099009,-0.099010
...,...,...,...,...,...,...
323,-0.098946,-0.090198,-0.098922,-0.098922,-0.098922,-0.098967
330,-0.099005,-0.099005,-0.099006,-0.099005,-0.098994,-0.099005
331,-0.099002,-0.098988,-0.098994,-0.099002,-0.099002,-0.098481
332,-0.098922,-0.098922,-0.098880,-0.098922,0.500000,-0.098880


In [15]:
# show "path"
# Qtable is populated

env = TreasureCube()

end_state = "333"
is_end_state = False;
i = 1

state = env.reset()

while not is_end_state:
  print("=== Move", i, "\tcurrent state =" , state)
  # env.render() # render

  if(state == end_state):
    print("*** Reach end state.")
    break

  # choose best action of this state
  action = Qtable.loc[state].idxmax() # get column name of highest action value

  reward, terminate, next_state = env.step(action)

  print("\tTake action =", action, ", next state =",next_state)

  # go to next state
  state = next_state
  i += 1

  # # check if terminal state
  # if(state == end_state):
  #   print("*** Reach end state.")
  #   is_end_state = True



=== Move 1 	current state = 000
	Take action = forward , next state = 000
=== Move 2 	current state = 000
	Take action = forward , next state = 100
=== Move 3 	current state = 100
	Take action = right , next state = 200
=== Move 4 	current state = 200
	Take action = up , next state = 201
=== Move 5 	current state = 201
	Take action = backward , next state = 200
=== Move 6 	current state = 200
	Take action = up , next state = 201
=== Move 7 	current state = 201
	Take action = backward , next state = 101
=== Move 8 	current state = 101
	Take action = forward , next state = 102
=== Move 9 	current state = 102
	Take action = forward , next state = 202
=== Move 10 	current state = 202
	Take action = right , next state = 212
=== Move 11 	current state = 212
	Take action = forward , next state = 312
=== Move 12 	current state = 312
	Take action = backward , next state = 212
=== Move 13 	current state = 212
	Take action = forward , next state = 213
=== Move 14 	current state = 213
	Take action

In [7]:
Qtable.loc["000"]

right      -0.809712
left       -0.800342
up         -0.527926
down       -0.787894
forward    -0.781186
backward   -0.799240
Name: 000, dtype: float64

In [None]:
# def test_cube(max_episode, max_step):
#     env = TreasureCube(max_step=max_step)
#     agent = RandomAgent() # TODO replace ??

#     for epsisode_num in range(0, max_episode):
#         state = env.reset()
#         terminate = False
#         t = 0
#         episode_reward = 0
#         while not terminate:
#             action = agent.take_action(state)
#             reward, terminate, next_state = env.step(action)
#             episode_reward += reward
#             # you can comment the following two lines, if the output is too much
#             env.render() # comment
#             print(f'step: {t}, action: {action}, reward: {reward}') # comment
#             t += 1
#             agent.train(state, action, next_state, reward)
#             state = next_state
#         print(f'epsisode: {epsisode_num}, total_steps: {t} episode reward: {episode_reward}')


# if __name__ == '__main__':
#     parser = argparse.ArgumentParser(description='Test')
#     parser.add_argument('--max_episode', type=int, default=500)
#     parser.add_argument('--max_step', type=int, default=500)
#     args = parser.parse_args()

#     test_cube(args.max_episode, args.max_step)
