# Grid World RL

* Agent - Jack
* Environment - Mansion with 4x4 rooms
* Rewards - amount of gold collected
* Actions - movement from one room to next one
* State - present room location

Per episode there are fixed number of actions

The env is available in grid_env.py

**Higher exploration rate implies less episodes**

In [1]:
import numpy as np

from grid_env import GridWorldEnv

In [2]:
env = GridWorldEnv()
env.render()

Present Grid: 
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]



In [3]:
observation_space = env.grid.size
action_space = len(env.actions.keys())
print(observation_space)
print(action_space)

16
4


In [4]:
class QLearning():
    def __init__(self, env, exp_rate=0.5, gamma=0.95, lr=0.01):
        self.env = env
        self.exp_rate = exp_rate
        self.gamma = gamma
        self.lr = lr
        
        self.env.exp_rate = self.exp_rate
        
        self.Q = np.zeros((self.env.observation_space, self.env.action_space))
    
    def get_action(self, state, explore = True):
        """Epsilon greedy selection of epsilon in the explore phase.
        Args:
            state (np.ndarray): Environment state.
            explore (bool, optional): True if exploration is required. False if not.
        Returns:
            int: action.
        """
        action = np.random.randint(low=0, high=self.env.action_space)
        if explore:    
            if np.random.uniform(0, 1) < self.exp_rate:
                # exploration
                # action = np.random.choice(self.env.action_space)
                return action
                
            # exploitation: choosing the action which gives most reward. Assuming agent can only see the next state
        if np.argmax(self.Q[state,:]) is not None:
            return np.argmax(self.Q[state, :]) % 4
        else:
            return action
    
    def update(self, transition):
        state, action, reward, new_state = transition
        self.Q[state, action] = self.Q[state, action] + self.lr * (reward + self.gamma * np.max(self.Q[new_state, :]) - self.Q[state, action])
    

In [5]:
episodes = 10000
MAX_ACTIONS = 6

env = GridWorldEnv()
agent = QLearning(env, exp_rate=0.9)
ep_rewards = []
temp_actions = list(env.actions.keys())

In [6]:
for episode in range(episodes):
    episode_rew = 0
    for num_action in range(MAX_ACTIONS):
        pre_state = env.state
        action = agent.get_action(env.state)
#         print(action)
        next_state = env.step(temp_actions[action], update=True)
        reward = env.getReward(env.state)
#         env.render()
#         print(env.state)
        agent.update((pre_state, action, reward, next_state))
        
        episode_rew += reward
    if episode % 20000 == 0:
        env.render()
        print("[DEBUG]: Prev State {}, New State {}".format(pre_state, next_state))
    env.reset() 
    ep_rewards.append(episode_rew)

Present Grid: 
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 3. 0. 0.]
 [0. 0. 0. 0.]]

[DEBUG]: Prev State [1 1], New State [1 2]


In [7]:
print(agent.Q)
# print(env.state)
print(np.argmax(agent.Q[env.state, :]))
print(agent.Q[env.state, :])

[[22.30212538 22.42732241 23.40323986 23.29485789]
 [22.54626504 22.58732761 23.47183111 23.95678438]
 [22.78103494 22.65703613 24.03054301 24.76267889]
 [21.86632459 21.86178346 25.88023347 23.29069452]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]]
2
[[22.30212538 22.42732241 23.40323986 23.29485789]
 [22.30212538 22.42732241 23.40323986 23.29485789]]


In [8]:
env.reset()
for num in range(MAX_ACTIONS):
    pre_state = env.state
    action = agent.get_action(env.state, explore=False)
    next_state = env.step(temp_actions[action], update=True)
    reward = env.getReward(env.state)
    print(next_state)
    env.render()

[0 1]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[1 1]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 2. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[2 1]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 2. 3. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[3 1]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 2. 3. 4.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

[3 2]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 2. 3. 4.]
 [0. 0. 0. 5.]
 [0. 0. 0. 0.]]

[3 3]
Present Grid: 
[[0. 0. 0. 0.]
 [1. 2. 3. 4.]
 [0. 0. 0. 5.]
 [0. 0. 0. 6.]]

