In [None]:
"""
1D Maze Environment
"""

import sys
import time
import numpy as np
import pandas as pd

class Maze1DEnv(object):
    """One-dimensional maze whose right-most cell holds a treasure.

    The agent starts in cell 0 and moves one cell per step: action ``0`` moves
    left (clamped at cell 0) and action ``1`` moves right. Entering the last
    cell ends the episode with reward 1; every other transition gives reward 0.

    Keyword Args:
        refresh_interval: Seconds slept at the end of each ``render`` call
            (default 0.5).
        n_states: Number of cells, including the treasure cell (default 5).
            Must be at least 2 so that the start cell is not the treasure.

    Attributes:
        obs: Current observation. An integer cell offset while the episode is
            running, or the string ``'treasure'`` once the episode has ended.

    Raises:
        ValueError: If ``n_states`` is smaller than 2.
    """

    # Sentinel observation reported once the treasure cell has been reached.
    _TERMINAL_OBS = 'treasure'

    def __init__(self, *args, **kwargs):
        # render frequency, default 0.5s for rendering interval
        self._refresh_interval = kwargs.get('refresh_interval', 0.5)

        # 1D maze length, default 5
        self._n_states = kwargs.get('n_states', 5)
        if self._n_states < 2:
            # A shorter maze could never terminate and would break render().
            raise ValueError('n_states must be at least 2', self._n_states)

        # space for actions; an action code is the index into this list
        self._action_space = ['left', 'right']
        self._n_actions = len(self._action_space)

        # observation, default 0 (from the very left side)
        self.obs = 0

    def reset(self):
        """Put the agent back in cell 0 and return that observation."""
        self.obs = 0
        return self.obs

    def step(self, action):
        """Apply one action and return ``(next_obs, reward, done)``.

        Args:
            action: ``0`` to move left, ``1`` to move right.

        Returns:
            A tuple ``(obs_, reward, done)`` where ``obs_`` is the new cell
            offset or ``'treasure'``, ``reward`` is 1 only on the step that
            reaches the treasure, and ``done`` is True on that same step.

        Raises:
            ValueError: If ``action`` is neither 0 nor 1. The state is left
                untouched in that case.
            RuntimeError: If the episode already ended and ``reset`` has not
                been called since.
        """
        if action != 0 and action != 1:
            raise ValueError('invalid action code', action)

        if self.obs == self._TERMINAL_OBS:
            # Without this guard the arithmetic below would fail on the
            # sentinel string with a confusing TypeError.
            raise RuntimeError('episode is finished; call reset() before step()')

        reward = 0
        done = False

        if action == 0:  # 0 for self._action_space[0], 'left'
            obs_ = max(self.obs - 1, 0)
        else:  # 1 for self._action_space[1], 'right'
            obs_ = self.obs + 1
            if obs_ >= self._n_states - 1:
                obs_ = self._TERMINAL_OBS
                reward = 1
                done = True

        # update observation (next state)
        self.obs = obs_

        return obs_, reward, done

    def close(self):
        """No resources are held, so there is nothing to release."""
        pass

    def render(self):
        """Draw the maze on the current stdout line, then sleep.

        Empty cells are drawn as ``_``, the agent as ``-`` and the treasure
        as ``T``. The line starts with ``\\r`` so successive frames overwrite
        each other. When the episode has ended, `` WIN`` is appended instead of
        drawing the agent.
        """
        # init, display like '_____T', T for 'treasure'
        env_list = ['_'] * (self._n_states - 1) + ['T']

        if self.obs == self._TERMINAL_OBS:
            sys.stdout.write('\r%s WIN' % ''.join(env_list))
        else:
            # self.obs is offset of 1D maze
            env_list[self.obs] = '-'
            sys.stdout.write('\r%s' % ''.join(env_list))

        # No newline is written, so flush to make the frame appear promptly.
        sys.stdout.flush()

        time.sleep(self._refresh_interval)

    @property
    def n_actions(self):
        """Number of discrete actions available (length of the action space)."""
        return self._n_actions

In [None]:
"""
Agent with Q Learning algorithm

Q(s, a) <- Q(s, a) + alpha * (R + gamma * maxQ(s_, a_) - Q(s, a))
s <- s_

alpha: learning rate
gamma: reward decay

q_target: R + gamma * maxQ(s_, a_)
q_predict: Q(s, a)
"""

import pandas as pd
import numpy as np


class QLearning(object):
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q table is a ``pandas.DataFrame`` with one row per visited state and
    one column per action. Rows are created lazily, filled with zeros, the
    first time a state is seen.

    Args:
        actions: Sequence of action labels; they become the table columns.
        alpha: Learning rate.
        gamma: Discount factor applied to the best next-state value.
        eplison_greedy: Probability of taking a greedy action; a uniformly
            random action is taken otherwise. (The parameter name is kept
            as-is for backward compatibility.)

    Note:
        State labels should be scalar hashables such as strings or ints,
        because rows are added with ``DataFrame.loc[label] = ...``. The state
        label ``'treasure'`` is treated as terminal by ``learn``.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.9, eplison_greedy=0.9, *args, **kwargs):
        # init available actions
        self._actions = actions

        # init learning paras
        self._alpha = alpha
        self._gamma = gamma
        self._eplison_greedy = eplison_greedy

        # init Q table; np.float64 replaces the removed ``np.float`` alias
        self._q_table = pd.DataFrame(columns=actions, dtype=np.float64)

    def _check_state_available(self, obs):
        """Add a zero-initialised row for ``obs`` if the table lacks one."""
        if obs not in self._q_table.index:
            # Setting with enlargement replaces the removed DataFrame.append.
            self._q_table.loc[obs] = [0.0] * len(self._actions)

    def choose_action(self, obs):
        """Pick an action for state ``obs`` using the epsilon-greedy policy.

        Args:
            obs: Label of the current state; registered in the table if new.

        Returns:
            One of the action labels. Ties between equally valued greedy
            actions are broken uniformly at random.
        """
        self._check_state_available(obs)

        if np.random.uniform() < self._eplison_greedy:
            # exploit: choose among the actions with the highest Q(s, a)
            obs_actions = self._q_table.loc[obs, :]
            best_actions = obs_actions[obs_actions == np.max(obs_actions)].index
            action = np.random.choice(best_actions)
        else:
            # explore: choose a uniformly random action
            action = np.random.choice(self._actions)

        return action

    def learn(self, obs, a, r, obs_):
        """Apply one Q-learning update for the transition (obs, a, r, obs_).

        Q(s, a) <- Q(s, a) + alpha * (q_target - Q(s, a)), where q_target is
        ``r`` when ``obs_`` is ``'treasure'`` and ``r + gamma * max Q(obs_, .)``
        otherwise.

        Args:
            obs: Label of the state the action was taken in.
            a: Action label that was taken.
            r: Reward received.
            obs_: Label of the resulting state.
        """
        # Register both states so the update works even if choose_action
        # was never called for ``obs``.
        self._check_state_available(obs)
        self._check_state_available(obs_)

        if obs_ != 'treasure':
            q_target = r + self._gamma * self._q_table.loc[obs_, :].max()
        else:
            # terminal state: no future value to bootstrap from
            q_target = r

        q_predict = self._q_table.loc[obs, a]

        self._q_table.loc[obs, a] = q_predict + self._alpha * (q_target - q_predict)

    def __str__(self):
        """Return the string form of the underlying Q table."""
        return self._q_table.__str__()

In [None]:
# import numpy as np
# import pandas as pd
# import sys
# import time
# import Maze1DEnv
# import QLearning

# Training configuration.
N_EPISODE = 100  # number of episodes to train for
N_MAX_STEP_PER_EPISODE = 30  # step budget before an episode is abandoned

env = Maze1DEnv(n_states=6, refresh_interval=0.1)
# Action labels are the integer codes 0..n_actions-1 that env.step() expects.
rl = QLearning(list(range(env.n_actions)))

# N_EPISODE episode loop
for i_episode in range(N_EPISODE):
    print("[Episode %d]" % (i_episode + 1))

    obs = env.reset()
    # N_MAX_STEP_PER_EPISODE step loop
    for i_step in range(N_MAX_STEP_PER_EPISODE):
        env.render()

        # choose next action; states are passed as strings so that integer
        # offsets and the 'treasure' sentinel share one label type
        action = rl.choose_action(str(obs))

        # move and feedback from env
        obs_, reward, done = env.step(action)

        # update q_table
        rl.learn(str(obs), action, reward, str(obs_))

        obs = obs_

        if done:
            # The loop renders before stepping, so the terminal frame would
            # otherwise never be drawn; show it once before leaving.
            env.render()

        # show q_table at the end of every episode (success or step budget hit)
        if done or i_step == N_MAX_STEP_PER_EPISODE - 1:
            print('\n%s' % rl)

        # leave the step loop early once the treasure is reached
        if done:
            break

env.close()