In [None]:
"""
2D Maze Environment
"""

import sys
import time
import numpy as np
import pandas as pd
import tkinter as tk


class Maze2DEnv(tk.Tk, object):
    """Grid maze environment rendered in a Tk window.

    The agent (grey square) starts at the top-left cell and moves one cell
    per step. Cells listed in ``maze_objects`` end the episode when entered:
    ``'treasure'`` gives reward ``+1`` and ``'mine'`` gives reward ``-1``.

    Args:
        maze_objects: dict mapping ``(row, col)`` tuples to ``'treasure'`` or
            ``'mine'``.
        shape: ``(rows, cols)`` of the grid.
        refresh_interval: seconds to sleep before each redraw in ``render()``.
        *args, **kwargs: forwarded to ``tk.Tk``.

    Attributes:
        obs: current ``(row, col)`` position of the agent. It is always a
            position tuple, also after a terminal step, so the window can
            still be rendered and ``step()`` can still be called.
    """

    def __init__(self, maze_objects, shape=(4, 5), refresh_interval=0.5, *args, **kwargs):
        super(Maze2DEnv, self).__init__(*args, **kwargs)
        # seconds slept by render() before each redraw
        self._refresh_interval = refresh_interval

        # pixel size of one grid cell
        self._unit_width = 35
        self._unit_height = 35

        # 2D shape as (rows, cols), e.g. 4 x 5
        self._shape = shape

        # actions are addressed by their index in this list (0..3) in step()
        self._action_space = ['left', 'right', 'up', 'down']
        self._n_actions = len(self._action_space)

        self.geometry('%dx%d' % (self._shape[1] * self._unit_width, self._shape[0] * self._unit_height))

        # objects placed in the maze, keyed by (row, col)
        self._maze_objects = maze_objects

        # the agent starts at the top-left cell
        self._origin = (0, 0)
        self.obs = self._origin

        # create the canvas and draw the initial state
        self._draw_maze()

    def _draw_maze(self):
        """Create the canvas sized to the grid, pack it and draw everything once."""
        self.canvas = tk.Canvas(self, bg='white',
                                width=self._shape[1] * self._unit_width,
                                height=self._shape[0] * self._unit_height)
        self.canvas.pack()
        self.redraw_all(self.canvas)

    def redraw_all(self, canvas):
        """Clear ``canvas`` and redraw grid lines, maze objects and the agent."""
        canvas.delete("all")

        total_width = self._shape[1] * self._unit_width
        total_height = self._shape[0] * self._unit_height

        # vertical grid lines
        for x in range(0, total_width, self._unit_width):
            canvas.create_line(x, 0, x, total_height)
        # horizontal grid lines
        for y in range(0, total_height, self._unit_height):
            canvas.create_line(0, y, total_width, y)

        # treasure is yellow; mines (and any other label) are black
        for key, value in self._maze_objects.items():
            color = 'yellow' if value == 'treasure' else 'black'
            canvas.create_rectangle(
                key[1] * self._unit_width + 2, key[0] * self._unit_height + 2,
                (key[1] + 1) * self._unit_width - 2, (key[0] + 1) * self._unit_height - 2,
                fill=color)

        # the agent is drawn last, as a grey square, so it stays visible
        canvas.create_rectangle(
            self.obs[1] * self._unit_width + 2, self.obs[0] * self._unit_height + 2,
            (self.obs[1] + 1) * self._unit_width - 2, (self.obs[0] + 1) * self._unit_height - 2,
            fill='grey')

    def _conflict_check(self, obs):
        """Return ``(conflict, obj)`` for position ``obs``.

        ``conflict`` is True when ``obs`` is a key of the maze objects and
        ``obj`` is the label stored there (``None`` otherwise).
        """
        conflict = obs in self._maze_objects
        obj = self._maze_objects.get(obs, None)
        return conflict, obj

    def reset(self):
        """Move the agent back to the origin, render, and return the position."""
        self.obs = self._origin
        self.render()
        return self.obs

    def step(self, action):
        """Apply ``action`` and return ``(obs_, reward, done)``.

        Args:
            action: 0 = left, 1 = right, 2 = up, 3 = down. Moves that would
                leave the grid are clamped to the border.

        Returns:
            obs_: the new ``(row, col)`` tuple, or the object label
                (``'treasure'`` / ``'mine'``) when the move is terminal.
            reward: ``1`` for treasure, ``-1`` for a mine, ``0`` otherwise.
            done: True when the move ended the episode.

        Raises:
            ValueError: for an unknown action code, or when the entered cell
                holds a label other than ``'treasure'`` / ``'mine'``.
        """
        row, col = self.obs
        last_row = self._shape[0] - 1
        last_col = self._shape[1] - 1

        if action == 0:  # 'left'
            col = max(col - 1, 0)
        elif action == 1:  # 'right'
            col = min(col + 1, last_col)
        elif action == 2:  # 'up'
            row = max(row - 1, 0)
        elif action == 3:  # 'down'
            row = min(row + 1, last_row)
        else:
            raise ValueError('invalid action code', action)

        position = (row, col)
        obs_ = position
        reward = 0
        done = False

        conflict, obj = self._conflict_check(position)
        if conflict:
            if obj == 'treasure':
                reward = 1
            elif obj == 'mine':
                reward = -1
            else:
                raise ValueError('invalid object', obj)
            # terminal states are reported to the caller by their label
            obs_ = obj
            done = True

        # Always keep the position (never the label) as internal state:
        # redraw_all() and the next step() index self.obs as (row, col).
        self.obs = position

        return obs_, reward, done

    def render(self):
        """Sleep for the refresh interval, redraw the canvas and process Tk events."""
        time.sleep(self._refresh_interval)
        self.redraw_all(self.canvas)
        self.update()

    @property
    def n_actions(self):
        """Number of available actions."""
        return self._n_actions

In [None]:
"""
Agent with Q Learning algorithm

Q(s, a) <- Q(s, a) + alpha * (R + gamma * maxQ(s_, a_) - Q(s, a))
s <- s_

alpha: learning rate
gamma: reward decay

q_target: R + gamma * maxQ(s_, a_)
q_predict: Q(s, a)
"""

import pandas as pd
import numpy as np


class QLearning(object):
    """Tabular Q-learning agent.

    The Q table is a pandas DataFrame with one row per visited state and one
    column per action. Rows are created on demand and initialised to zeros.

    Update rule:
        Q(s, a) <- Q(s, a) + alpha * (R + gamma * max_a' Q(s', a') - Q(s, a))

    Args:
        actions: list of action labels (used as the Q-table columns).
        alpha: learning rate.
        gamma: discount applied to the best next-state value.
        eplison_greedy: probability of choosing a greedy action in
            ``choose_action``; otherwise a uniformly random action is taken.
        *args, **kwargs: accepted and ignored.

    Note:
        State labels are used as DataFrame row labels via ``.loc``. The
        training loop in this notebook passes ``str(obs)``.
    """

    # next-state labels for which no future value is bootstrapped in learn()
    _TERMINAL_STATES = ('treasure', 'mine')

    def __init__(self, actions, alpha=0.1, gamma=0.9, eplison_greedy=0.9, *args, **kwargs):
        # available actions
        self._actions = actions

        # learning parameters
        self._alpha = alpha
        self._gamma = gamma
        self._eplison_greedy = eplison_greedy

        # empty Q table; np.float64 replaces the alias np.float removed in NumPy 1.24
        self._q_table = pd.DataFrame(columns=actions, dtype=np.float64)

    def _check_state_available(self, obs):
        """Add an all-zero row for ``obs`` if the Q table does not contain it."""
        if obs not in self._q_table.index:
            # Setting with enlargement; DataFrame.append was removed in pandas 2.0.
            self._q_table.loc[obs] = [0.0] * len(self._actions)

    def choose_action(self, obs):
        """Pick an action for state ``obs`` with an epsilon-greedy policy.

        With probability ``eplison_greedy`` one of the actions with the
        highest Q value is returned (ties broken at random); otherwise an
        action is drawn uniformly from all actions.
        """
        self._check_state_available(obs)

        if np.random.uniform() < self._eplison_greedy:
            # exploit: choose among the best-valued actions of this state
            obs_actions = self._q_table.loc[obs, :]
            best_actions = obs_actions[obs_actions == np.max(obs_actions)].index
            action = np.random.choice(best_actions)
        else:
            # explore: choose any action
            action = np.random.choice(self._actions)

        return action

    def learn(self, obs, a, r, obs_):
        """Update Q(obs, a) from the observed reward ``r`` and next state ``obs_``.

        When ``obs_`` is ``'treasure'`` or ``'mine'`` the target is just ``r``;
        otherwise it is ``r + gamma * max Q(obs_, .)``.
        """
        # make sure both rows exist, so learn() also works for a state that
        # was never passed to choose_action()
        self._check_state_available(obs)
        self._check_state_available(obs_)

        if obs_ in self._TERMINAL_STATES:
            q_target = r
        else:
            q_target = r + self._gamma * self._q_table.loc[obs_, :].max()

        q_predict = self._q_table.loc[obs, a]

        self._q_table.loc[obs, a] = q_predict + self._alpha * (q_target - q_predict)

    def __str__(self):
        """Return the textual form of the Q table."""
        return str(self._q_table)

In [None]:
# Training budget: number of episodes and the step cap inside each episode.
N_EPISODE = 200
N_MAX_STEP_PER_EPISODE = 100


def update():
    """Run the Q-learning training loop on the module-level ``env`` and ``rl``.

    Each episode resets the environment and then repeats render -> choose
    action -> step -> learn until the episode is done or the step cap is
    reached. The Q table is printed at the end of every episode.
    """
    for episode_no in range(1, N_EPISODE + 1):
        print("[Episode %d]" % episode_no)

        state = env.reset()
        for step_idx in range(N_MAX_STEP_PER_EPISODE):
            env.render()

            # the agent indexes its Q table by the string form of the state
            state_key = str(state)
            chosen = rl.choose_action(state_key)

            # apply the move and collect the feedback from the environment
            next_state, reward, finished = env.step(chosen)

            # update the Q table with this transition
            rl.learn(state_key, chosen, reward, str(next_state))

            state = next_state

            # show the Q table once the episode is over (done or out of steps)
            if finished or step_idx == N_MAX_STEP_PER_EPISODE - 1:
                print('\n%s' % rl)

            # leave the step loop early on a terminal state
            if finished:
                break


# Maze layout: (row, col) -> object placed in that cell.
maze_objects = dict([
    ((3, 4), 'treasure'),
    ((1, 2), 'mine'),
    ((2, 2), 'mine'),
])

# Environment window and the agent; actions are the integer codes 0..n_actions-1.
env = Maze2DEnv(maze_objects=maze_objects, refresh_interval=0.1, shape=(4, 5))
rl = QLearning([code for code in range(env.n_actions)])

# Schedule training 100 ms after the Tk event loop starts, then enter the loop.
env.after(100, update)
env.mainloop()

In [None]:
import tkinter as tk
import time

class Demo(tk.Tk):
    """Small Tk window that animates a line segment on a 200x150 canvas."""

    def __init__(self, *args, **kwargs):
        super(Demo, self).__init__(*args, **kwargs)
        # current anchor point of the line
        self.x, self.y = 0, 0
        self.geometry('200x150')
        canvas = tk.Canvas(self, bg='white', width=200, height=150)
        canvas.pack()
        self.canvas = canvas

    def model_update(self):
        """Advance the anchor by (20, 13), wrap it into the canvas and print it."""
        self.x = (self.x + 20) % 200
        self.y = (self.y + 13) % 150
        print(self.x, self.y)

    def redraw(self):
        """Clear the canvas and draw the line from the anchor to anchor + 30."""
        self.canvas.delete("all")
        start_x, start_y = self.x % 200, self.y % 150
        end_x, end_y = (self.x + 30) % 200, (self.y + 30) % 150
        self.canvas.create_line(start_x, start_y, end_x, end_y)

    def render(self):
        """Redraw, let Tk process pending events, then pause for half a second."""
        self.redraw()
        self.update()
        time.sleep(0.5)

top = Demo()


def callback_func():
    """Animate the demo window: 20 rounds of model update followed by render."""
    frame = 0
    while frame < 20:
        top.model_update()
        top.render()
        frame += 1


# Start the animation one second after the Tk event loop begins.
top.after(1000, callback_func)
top.mainloop()