# Grid World render

In [2]:
import gym
import pygame
import sys
import time

# Named RGB triples used when drawing the grid.
colors = dict(
    black=(0, 0, 0),
    white=(255, 255, 255),
    light_white=(200, 200, 200),
    blue=(0, 0, 255),
    green=(51, 204, 51),
)
class GridWorldRenderer():
    """Pygame window that draws a grid-world state as coloured square cells.

    The state is indexed as ``state[row][col]``. A cell whose value equals 1
    is filled blue; every other cell is filled white. Each cell gets a
    one-pixel black border.
    """

    def __init__(self, rows, columns, cell_size = 50):
        """Remember the grid geometry.

        Args:
            rows: number of grid rows (vertical cell count).
            columns: number of grid columns (horizontal cell count).
            cell_size: edge length of one cell in pixels.

        No pygame resources are created here; call ``start`` to open the window.
        """
        self.rows = rows
        self.columns = columns
        self.cell_size = cell_size

    def start(self, state):
        """Initialise pygame, open the window and draw ``state`` once.

        Args:
            state: 2-D grid indexed as ``state[row][col]``.
        """
        pygame.init()

        self._window_width = self.columns * self.cell_size
        self._window_height = self.rows * self.cell_size

        self.screen = pygame.display.set_mode((self._window_width, self._window_height))
        pygame.display.set_caption(f"Grid World {self.rows}x{self.columns}" )
        self.clock = pygame.time.Clock()
        self.screen.fill(colors['white'])

        self.border_color = colors['black']
        self.state = state

        self.update(state)

    def _drawstate(self):
        """Paint every cell of ``self.state`` onto ``self.screen``."""
        # x runs along the window width, so its index is the COLUMN; y runs
        # along the height, so its index is the ROW. The state is stored
        # row-major and must therefore be read as state[row][col].
        for col, x in enumerate(range(0, self._window_width, self.cell_size)):
            for row, y in enumerate(range(0, self._window_height, self.cell_size)):
                cell = pygame.Rect(x, y, self.cell_size, self.cell_size)
                fill = colors['blue'] if self.state[row][col] == 1 else colors['white']
                pygame.draw.rect(self.screen, fill, cell)
                # A width of 1 draws only the outline of the cell.
                pygame.draw.rect(self.screen, colors['black'], cell, 1)

    def update(self, new_state):
        """Redraw the window for ``new_state`` and process pending window events.

        A window-close (QUIT) event calls ``end``, which terminates the
        process via ``sys.exit``.
        """
        # Cap the redraw rate at 60 frames per second.
        self.clock.tick(60)
        # Clear the previous frame.
        self.screen.fill(colors['white'])

        self.state = new_state
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                self.end()

        self._drawstate()
        pygame.display.update()

    def end(self):
        """Print ``exit``, shut pygame down and raise ``SystemExit``.

        Inside a notebook the ``SystemExit`` surfaces as an error in the
        calling cell; code placed after a call to ``end`` does not run.
        """
        print('exit')
        pygame.quit()
        sys.exit()

    def run(self):
        """Placeholder for a blocking draw loop; currently does nothing."""
        pass

# gridworld = GridWorldRenderer(8,8)


In [3]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from tqdm import tqdm 
import random

# Gym env

In [4]:
class QLearningAlgorithm():
    """Tabular Q-learning agent for environments with discrete states and actions.

    The environment must expose ``observation_space.n``, ``action_space.n``,
    ``action_space.sample()``, ``reset()`` returning the initial state id and
    ``step(action)`` returning ``(new_state, reward, done, info)``.
    """

    def __init__(self, env, **kwargs) -> None:
        """Create the agent and an all-zero Q-table.

        Args:
            env: the environment to learn on (see class docstring).
            **kwargs: optional hyperparameter overrides:
                n_training_eps (10000), n_eval_eps (100), max_steps (99),
                learning_rate (0.001), max_epsilon (1.0), min_epsilon (0.005),
                decay_rate (0.0005), gamma (0.95).
        """
        self.n_training_eps = int(self._get(kwargs, "n_training_eps", 10000))
        self.n_eval_eps = int(self._get(kwargs, "n_eval_eps", 100))
        self.max_steps = int(self._get(kwargs, "max_steps", 99))
        self.learning_rate = float(self._get(kwargs, "learning_rate", 0.001))
        self.max_epsilon = float(self._get(kwargs, "max_epsilon", 1.0))
        self.min_epsilon = float(self._get(kwargs, "min_epsilon", 0.005))
        self.decay_rate = float(self._get(kwargs, "decay_rate", 0.0005))
        self.gamma = float(self._get(kwargs, "gamma", 0.95))
        self.env = env
        self._reset_qtable()

    def _reset_qtable(self):
        """Replace the Q-table with zeros sized from the environment's spaces."""
        self.qtable = self._init_qtable(self.env.observation_space.n, self.env.action_space.n)

    def __str__(self):
        """Return a multi-line summary of the Q-table and the hyperparameters."""
        bar = "=" * 20
        return "\n".join([
            bar,
            f"Q-TABLE, shape = {self.qtable.shape}",
            bar,
            str(self.qtable),
            bar,
            "PARAMS",
            bar,
            f'n_training_eps = {self.n_training_eps}',
            f'n_eval_eps = {self.n_eval_eps}',
            f'max_steps = {self.max_steps}',
            f'learning_rate = {self.learning_rate}',
            f'max_epsilon = {self.max_epsilon}',
            f'min_epsilon = {self.min_epsilon}',
            f'decay rate = {self.decay_rate}',
            f'gamma = {self.gamma}',
            f'env = {self.env}',
        ])

    def _get(self, dict, key, default):
        """Return ``dict[key]`` when present, otherwise ``default``."""
        return dict.get(key, default)

    def _init_qtable(self, state_space,action_space):
        """Return a ``(state_space, action_space)`` array of zeros."""
        return np.zeros((state_space, action_space))

    def _epsilon_greedy_policy(self, state, epsilon):
        """Pick an action for ``state``.

        With probability ``epsilon`` a random action is sampled from the
        environment; otherwise the action with the highest Q-value is taken.
        """
        # random.random() is uniform on [0, 1), so epsilon is a true
        # probability: 0 never explores and 1 always explores.
        if random.random() < epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.qtable[state])

    def train(self):
        """Reset the Q-table and run ``n_training_eps`` episodes of Q-learning.

        Epsilon decays exponentially with the episode index from
        ``max_epsilon`` towards ``min_epsilon``. Each episode lasts at most
        ``max_steps`` steps or until the environment reports ``done``.
        """
        loop = tqdm(range(self.n_training_eps))
        # Training always starts from a blank table.
        self._reset_qtable()

        print("=" * 20)
        print("TRAINING")
        print("=" * 20)

        for ep in loop:
            epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_rate * ep)
            state = self.env.reset()

            for _ in range(self.max_steps):
                action = self._epsilon_greedy_policy(state, epsilon)
                new_state, reward, done, info = self.env.step(action)

                # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
                td_target = reward + self.gamma * np.max(self.qtable[new_state])
                self.qtable[state][action] += self.learning_rate * (td_target - self.qtable[state][action])

                state = new_state

                if done:
                    break

            loop.set_description(f"ep = {ep}, eposilon = {epsilon:.2f}")

    def get_action(self, state):
        """Return the greedy (highest Q-value) action for ``state``."""
        return np.argmax(self.qtable[state])


In [5]:
def default_map():
    """Return the built-in placeholder map as a newline-separated string."""
    placeholder = 'SFFH\nH'
    return placeholder

def GET_MAP(map):
    """Return a predefined map layout as a list of row strings.

    Args:
        map: layout name, either ``"4x4"`` or ``"8x8"``.

    Returns:
        A new list with one string per grid row; each character is one cell.

    Raises:
        ValueError: if ``map`` is not a known layout name.
    """
    if map == "4x4":
        return [
            "SFFF",
            "FHFH",
            "FFFH",
            "HFFG"
        ]

    if map == "8x8":
        # No trailing comma after the closing bracket: it would wrap the
        # list in a 1-tuple.
        return [
            "SFFFFFFF",
            "FFFFFFFF",
            "FFFHFFFF",
            "FFFFFHFF",
            "FFFHFFFF",
            "FHHFFFHF",
            "FHFFHFHF",
            "FFFHFFFG",
        ]

    raise ValueError(f"unknown map name {map!r}; expected '4x4' or '8x8'")

class GridWorldEnv(gym.Env):
    """Deterministic grid world with four moves and discrete state ids.

    The agent starts in the top-left cell and the goal is the bottom-right
    cell. State ids are row-major: ``row * cols + col``. ``self.state`` is a
    ``(rows, cols)`` array holding 1 at the agent cell and 0.5 at the goal
    cell (the goal value wins when both coincide).
    """

    def __init__(self, map, reward_dict, **kwargs):
        """Build the environment.

        Args:
            map: sequence of equal-length rows (strings or lists of
                characters), one character per cell.
            reward_dict: maps a cell character to the reward for stepping
                onto it; characters not listed give 0.
            **kwargs: ``max_timestep`` (optional) - number of steps after
                which an episode is cut off; defaults to 100 when missing
                or ``None``.

        Raises:
            ValueError: if the map is empty or its rows differ in length.
        """
        rows, cols, map = self._process_map(map)

        self.map = map
        self.rows, self.cols = rows, cols
        self.agent_pos = [0, 0]
        self.goal_pos = [rows - 1, cols - 1]

        self.reward_dict = reward_dict

        # Default state grid.
        self.state = np.zeros((self.rows, self.cols))
        self.set_state(self.agent_pos, self.goal_pos)

        self.observation_space = spaces.Discrete(rows * cols)
        self.action_space = spaces.Discrete(4)

        # Step counter used to cut off episodes that run too long.
        self.timestep = 0
        max_timestep = kwargs.get('max_timestep')
        self.max_timestep = 100 if max_timestep is None else int(max_timestep)

        self.terminated = False

        # Pygame-based renderer; no window is opened until render().
        self.renderer = GridWorldRenderer(rows, cols)

    def reset(self):
        """Put the agent back at the start and return the initial state id (0)."""
        self.agent_pos = [0, 0]
        self.goal_pos = [self.rows - 1, self.cols - 1]
        self.set_state(self.agent_pos, self.goal_pos)
        self.timestep = 0
        # Clear the flag so a finished episode does not block the next
        # render() loop.
        self.terminated = False
        return 0

    def _process_map(self, map_data):
        """Validate ``map_data`` and split each row into single characters.

        Returns:
            ``(row_count, column_count, grid)`` where ``grid[r][c]`` is the
            character of the cell in row ``r`` and column ``c``.

        Raises:
            ValueError: if there are no rows, the first row is empty, or the
                rows differ in length.
        """
        rows = list(map_data)
        if not rows or len(rows[0]) == 0:
            raise ValueError("map must contain at least one non-empty row")
        cols_n = len(rows[0])
        if any(len(row) != cols_n for row in rows):
            raise ValueError("all map rows must have the same length")
        return len(rows), cols_n, [list(row) for row in rows]

    def set_state(self, agent_pos, goal_pos):
        """Rebuild ``self.state`` for the given positions and return it flattened."""
        self.state = np.zeros((self.rows, self.cols))
        self.state[tuple(agent_pos)] = 1
        # Written second, so the goal value replaces the agent marker when
        # both positions coincide.
        self.state[tuple(goal_pos)] = 0.5
        return self.state.flatten()

    def reached_goal(self, pos):
        """Return True when the map cell at ``pos`` (row, col) is ``'G'``."""
        row, col = pos
        return self.map[row][col] == 'G'

    def get_reward(self, pos):
        """Return the ``reward_dict`` entry for the cell at ``pos``, or 0 if absent."""
        row, col = pos
        return self.reward_dict.get(self.map[row][col], 0)

    def step(self, action):
        """Move the agent one cell and report the outcome.

        Actions: 0 = down (row + 1), 1 = up (row - 1), 2 = right (col + 1),
        3 = left (col - 1). Any other value leaves the agent in place. Moves
        that would leave the grid are clipped to the border.

        Returns:
            ``(state_id, reward, terminated, info)`` where ``state_id`` is
            ``row * cols + col`` of the agent and ``info`` is an empty dict.
        """
        if action == 0:
            self.agent_pos[0] += 1
        elif action == 1:
            self.agent_pos[0] -= 1
        elif action == 2:
            self.agent_pos[1] += 1
        elif action == 3:
            self.agent_pos[1] -= 1

        # Index 0 is the row and is bounded by the row count; index 1 is the
        # column and is bounded by the column count.
        self.agent_pos[0] = int(np.clip(self.agent_pos[0], 0, self.rows - 1))
        self.agent_pos[1] = int(np.clip(self.agent_pos[1], 0, self.cols - 1))

        self.set_state(self.agent_pos, self.goal_pos)

        # Cut the episode off once the step budget is exceeded.
        self.timestep += 1
        self.terminated = self.timestep > self.max_timestep

        reward = self.get_reward(self.agent_pos)

        # NOTE(review): reaching the goal position pays a fixed 1 and
        # overrides whatever reward_dict holds for that cell - confirm that
        # this is intended.
        if np.array_equal(self.agent_pos, self.goal_pos):
            self.terminated = True
            reward = 1

        if self.reached_goal(self.agent_pos):
            self.terminated = True

        info = {}
        # Row-major id of the agent cell; equals argmax of the flattened state.
        state_id = self.agent_pos[0] * self.cols + self.agent_pos[1]
        return state_id, reward, self.terminated, info

    def render(self, agent: "QLearningAlgorithm | None", max_iter: int = 100, delay_ms: int = 500) -> None:
        """Replay one episode in the pygame window.

        Args:
            agent: object with ``get_action(state_id)``; when ``None`` the
                actions are sampled at random from ``action_space``.
            max_iter: maximum number of steps to show.
            delay_ms: pause between steps in milliseconds.

        The renderer's ``end`` raises ``SystemExit`` when the replay finishes
        or the window is closed.
        """
        self.reset()
        self.renderer.start(self.state)
        shown = 0
        while not self.terminated and shown < max_iter:
            self.renderer.update(self.state)
            curr_state = np.argmax(self.state.flatten())
            if agent is None:
                action = self.action_space.sample()
            else:
                action = agent.get_action(curr_state)

            self.step(action)
            pygame.time.wait(delay_ms)
            shown += 1
        try:
            self.renderer.end()
        finally:
            # end() exits the process, so the message must be emitted from
            # a finally block to be reachable.
            print("Terminated")

    def __str__(self):
        """Return a multi-line summary of the map, the state grid and the settings."""
        bar = "=" * 20
        lines = [bar, 'ENV MAP', bar]
        lines.extend(str(row) for row in self.map)
        lines.extend([
            bar,
            'ENV STATE',
            bar,
            str(self.state),
            bar,
            "ENV DESCRIPTION",
            bar,
            f'reward dict = {self.reward_dict}',
            f'timestep = {self.timestep}, max_timestep = {self.max_timestep},',
            f'obs space = {self.observation_space.n}',
            f'actions = {self.action_space.n}',
            f'agent position = {self.agent_pos}',
            f'goal position = {self.goal_pos}',
            f'terminated = {self.terminated}',
        ])
        return "\n".join(lines)


## Run

In [6]:
from src.agent import QLearningAgent
from src.env import GridWorldEnv

In [7]:
# env = gym.make("FrozenLake-v1", map_name="4x4",is_slippery=False)
# Reward table handed to the environment; only the 'G' entry is defined here.
reward_dict = {
    'G': 1,
}
# NOTE(review): the map is passed by name ('4x4') here, while a later cell
# passes GET_MAP('4x4'); confirm that the GridWorldEnv imported from src.env
# resolves map names itself.
env = GridWorldEnv('4x4', reward_dict = reward_dict, max_timestep = 10)

E:\ML\ml_projects\rl\src


In [8]:
# Build the agent on the environment, train it, then save it.
# NOTE(review): QLearningAgent comes from src.agent; what save() writes and
# where is not visible in this notebook.
qlearning = QLearningAgent(env)
# print(qlearning)
qlearning.train()
qlearning.save()


ep = 66, eposilon = 0.97:   1%|          | 60/10000 [00:00<00:16, 594.77it/s]

TRAINING


ep = 4852, eposilon = 0.09:  48%|████▊     | 4822/10000 [00:07<00:07, 648.69it/s]

In [None]:
# print(qlearning)

In [None]:
# Create a fresh environment and replay the trained agent in the renderer.
# NOTE(review): the recorded output of this cell is "exit" followed by
# SystemExit, so nothing placed after render() in this cell would run.
env = GridWorldEnv('4x4', reward_dict = reward_dict, max_timestep = 10)
env.render(qlearning)

exit


SystemExit: 

In [None]:
# Print the agent summary (the recorded output shows the Q-table followed by the hyperparameters).
print(qlearning)

Q-TABLE, shape = (16, 4)
[[0.09949599 0.11322295 0.36305579 0.12115583]
 [0.47276961 0.15376244 0.0712842  0.10412594]
 [0.21747489 0.01687942 0.0089549  0.04121438]
 [0.08598744 0.00277095 0.00310443 0.00603573]
 [0.03180685 0.04334721 0.30378327 0.03944882]
 [0.60242254 0.14529019 0.22189815 0.09074389]
 [0.57151982 0.02734006 0.10009132 0.09252812]
 [0.50228571 0.00580963 0.04459946 0.05191821]
 [0.00381836 0.01686956 0.26355942 0.01684962]
 [0.1245455  0.19352862 0.73239184 0.07149765]
 [0.35437275 0.17116539 0.86928223 0.25285425]
 [0.97256123 0.14824445 0.40984399 0.2756309 ]
 [0.00123246 0.00506839 0.05341668 0.00131691]
 [0.03208811 0.06096813 0.36581754 0.00425051]
 [0.12264679 0.12525629 0.76723571 0.03907166]
 [0.         0.         0.         0.        ]]
PARAMS
n_training_eps = 10000
n_eval_eps = 100
max_steps = 99
learning_rate = 0.001
max_epsilon = 1.0
min_epsilon = 0.005
decay rate = 0.0005
gamma = 0.95
ENV MAP
['S', 'F', 'F', 'F']
['F', 'H', 'F', 'H']
['F', 'F', 'F', '

In [None]:
# Persist the learned Q-table to qtable.npy in the working directory.
# np.save accepts a file name directly and opens/closes the file itself.
np.save('qtable.npy', qlearning.qtable)

In [None]:
#load qtable
with open('qtable.npy', 'rb') as file:
    qtable = np.load(file)

print(qtable)

[[3.95689949e-01 1.38064714e-01 1.06121211e-01 1.35516080e-01]
 [3.18065017e-01 3.82771463e-02 6.15608108e-03 4.34399335e-02]
 [5.39815032e-02 1.68079049e-03 2.32974938e-04 9.53439141e-03]
 [5.84049610e-03 6.89223099e-05 6.28668322e-05 3.17291475e-04]
 [1.44302505e-01 1.20382128e-01 5.09127587e-01 1.66794701e-01]
 [6.42008184e-01 9.20341031e-02 1.17422646e-01 1.66434772e-01]
 [3.29896150e-01 4.05492912e-03 1.06120522e-02 6.45226069e-02]
 [8.74166558e-02 1.67897546e-04 3.32167255e-03 7.29796541e-03]
 [5.84698450e-02 5.28496164e-02 4.20337201e-01 4.89881527e-02]
 [7.74774138e-01 2.30768245e-01 2.95454158e-01 1.24687229e-01]
 [7.21301622e-01 4.26779797e-02 1.00492352e-01 1.30932674e-01]
 [4.38019536e-01 3.81403863e-03 2.98114722e-02 3.52019758e-02]
 [2.77711916e-02 2.98212636e-02 3.99026040e-01 3.47768298e-02]
 [3.79940201e-01 2.80023416e-01 8.91075089e-01 1.22651781e-01]
 [4.78308328e-01 2.60349632e-01 9.84378075e-01 3.37625110e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [None]:
# Scratch check: a (row, col) tuple index selects a single element of a 2-D array.
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(a[0, 0])

1


In [None]:
# Reward table with a single entry for 'G'.
reward_dict = {
    'G': 1,
}
# Here the map is passed as the list of row strings returned by GET_MAP.
env = GridWorldEnv(map = GET_MAP('4x4'), reward_dict = reward_dict, max_timestep = 10)
# Print the environment summary (map, state grid and settings in the recorded output).
print(env)

ENV MAP
['S', 'F', 'F', 'F']
['F', 'H', 'F', 'H']
['F', 'F', 'F', 'H']
['H', 'F', 'F', 'G']
ENV STATE
[[1.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
ENV DESCRIPTION
reward dict = {'G': 1}
timestep = 0, max_timestep = 10,
obs space = 16
actions = 4
agent position = [0, 0]
goal position = [3, 3]
terminated = False



In [None]:
# Open the renderer without an agent.
# NOTE(review): the recorded output of this cell is "exit" followed by SystemExit.
env.render(None)

exit


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Test Renderer

In [1]:
from src.env import GridWorldRenderer, GridWorldEnv
from src.agent import QLearningAgent
import numpy as np
import os

In [25]:
# Grid dimensions for the randomly generated map.
rows = 4
cols = 4

def gen_obj():
    """Draw one random cell character.

    Returns 'R', 'S' or 'B' with probability 0.05 each and '-' with
    probability 0.85.
    """
    choices = ['R', 'S', 'B', '-']
    weights =  [0.05, 0.05, 0.05, 0.85]
    return np.random.choice(choices, p = weights)

# The outer list holds the rows and each inner list holds one cell per column.
map = [[gen_obj() for _ in range(cols)] for _ in range(rows)]
# Fixed markers: 'A' in the top-left cell and 'G' in the bottom-right cell.
map[0][0] = 'A'
map[rows - 1][cols - 1] = 'G'

# Collapse each row into one string.
map = [''.join(row) for row in map]

for row in map:
    print(row)

A--B
--BR
----
B--G


In [2]:
# Reward table for this experiment.
# NOTE(review): presumably the single-letter keys are map characters; how
# 'out-of-bound' and 'terminated' are consumed depends on src.env and is not
# visible here - confirm.
reward_dict = {
    'G': 3,
    'B': 2,
    'R': -1,
    'S': -2,
    'out-of-bound': -5,
    'terminated': -3,
}
# Hand-written 4x4 map, one string per row.
map = [
    "A---",
    "----",
    "BR--",
    "---G",
]
env = GridWorldEnv(map = map, reward_dict = reward_dict, max_timestep = 100)
agent = QLearningAgent(env)
# agent.train()
# agent.save()
# Load a previously saved Q-table instead of retraining, then show it.
agent.load('./qtable.npy')
print(agent.qtable)

# print(env)

[[ 1.94331337e+01  1.73022398e+01  1.71475241e+01  1.73079067e+01]
 [ 1.54318993e+01  1.30066867e+01  8.40089874e+00  1.83807183e+01]
 [ 4.28382953e+00  1.99359493e+00  1.77101893e-01  1.40452154e+01]
 [ 1.21162336e+00 -3.89155858e-01 -4.06209135e-01  8.71076482e-01]
 [ 2.04662011e+01  1.84452127e+01  1.84367522e+01  1.84287353e+01]
 [ 1.77511937e+01  1.66063534e+01  1.50838763e+01  1.94263437e+01]
 [ 6.28635991e+00  5.14440615e+00  1.79720533e+00  1.76086234e+01]
 [ 1.28879245e+00  9.79087665e-02  3.74045197e-03  7.21647972e+00]
 [ 1.94146321e+01  1.94405378e+01  1.84075971e+01  1.74539225e+01]
 [ 1.71506018e+01  1.77314631e+01  1.40113240e+01  2.04503409e+01]
 [ 2.88491502e+00  6.07424896e+00  1.36734599e+00  1.70712707e+01]
 [ 2.49504139e+00  4.39674854e-01  1.66887851e-01  1.69518770e+00]
 [ 1.81152771e+01  2.04553918e+01  1.75986175e+01  1.81169056e+01]
 [ 1.17283095e+01  1.35548018e+01  4.24256163e+00  1.92961728e+01]
 [ 1.21005319e+00  3.17041091e+00  2.86586946e+00  9.63399243e

In [22]:
# Take action 2 once, then show the state grid, the map and the step result.
# NOTE(review): `info` holds the entire return value of step() (a 4-tuple in
# the recorded output), not only the info dict.
info = env.step(2)
print(env.state)
for row in env.map:
    print(row)
print(info)

[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
['A', '-', '-', '-']
['-', '-', '-', '-']
['B', 'R', '-', '-']
['-', '-', '-', 'G']
(15, -1, True, {})


In [3]:
# Replay the agent as a text trace.
# NOTE(review): in the recorded run the agent alternates between states 4 and
# 8 and the cell was stopped with KeyboardInterrupt.
env.render_simple(agent)

MAP
['A', '-', '-', '-']
['-', '-', '-', '-']
['B', 'R', '-', '-']
['-', '-', '-', 'G']
took action =  0  to state  4
[[0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  0  to state  8
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  1  to state  4
[[0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  0  to state  8
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  1  to state  4
[[0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  0  to state  8
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  1  to state  4
[[0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  0  to state  8
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [1.  0.  0.  0. ]
 [0.  0.  0.  0.5]]
took action =  1  to state  4
[[0.  0.  0.  0. ]
 [1.  0

KeyboardInterrupt: 

In [5]:
# Create a new agent with max_steps = 200, print it, train it and save it.
# NOTE(review): the recorded run stops at 0% with "IndexError: list index out
# of range", and the printed env summary shows agent position [4, 1] on a
# 4x4 map.
agent = QLearningAgent(env, max_steps = 200)
print(agent)
agent.train()
agent.save()

Q-TABLE, shape = (16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
PARAMS
n_training_eps = 10000
n_eval_eps = 100
max_steps = 200
learning_rate = 0.001
max_epsilon = 1.0
min_epsilon = 0.005
decay rate = 0.0005
gamma = 0.95
ENV MAP
['A', '-', '-', '-']
['-', '-', '-', '-']
['B', 'R', '-', '-']
['-', '-', '-', 'G']
ENV STATE
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  1.  0.  0.5]]
ENV DESCRIPTION
reward dict = {'G': 3, 'B': 2, 'R': -1, 'S': -2}
timestep = 7, max_timestep = 100,
obs space = 16
actions = 4
agent position = [4, 1]
goal position = [3, 3]
terminated = False
env = 



  0%|          | 0/10000 [00:00<?, ?it/s]

TRAINING





IndexError: list index out of range

In [17]:
# Show the agent's current Q-table.
print(agent.qtable)

[[37.99841867 35.59281339 32.77162579 35.54556581]
 [27.45043182 23.76353979 13.63400298 35.95221693]
 [ 8.62379783  5.2020622   0.83496694 26.37552016]
 [ 0.60434528  0.18024974  0.17498958  5.72524838]
 [39.99923927 36.0894567  36.08305595 37.99398737]
 [35.34697408 30.94700231 27.72537515 37.99700723]
 [16.01948031  8.40583918  3.22011847 34.41264039]
 [ 1.49783784  0.32244212  0.88319057 12.814537  ]
 [37.99902403 37.99904318 36.99887677 39.99935853]
 [36.00245775 36.00404187 34.59107063 39.9991033 ]
 [18.25235434 19.17633682  7.96456602 36.83781842]
 [ 0.72846166  1.06870185  1.80261941 19.18696926]
 [37.98994321 39.99921512 36.07392497 37.98991065]
 [34.33821483 35.36339968 26.9367389  37.99676534]
 [11.51062226 15.71171512  0.79947243 33.82815573]
 [ 0.          0.          0.          0.        ]]


In [24]:
# NOTE(review): the recorded output is AttributeError: 'GridWorldEnv' object
# has no attribute 'render_simple' - the GridWorldEnv in scope for this run
# does not provide that method.
env.render_simple(agent)

AttributeError: 'GridWorldEnv' object has no attribute 'render_simple'