This hands-on project implements a simple grid world environment and a tabular Q-learning agent in Python using reinforcement learning. The agent learns to navigate the grid, avoiding the pitfall and reaching the exit.

# Step 1: Define the GridWorld class

* The GridWorld class represents the environment the agent interacts with
* The environment has a grid structure with an agent, an exit, and a pitfall
* The agent can take actions ('up', 'down', 'left', 'right') to navigate the grid

In [1]:
import random

class GridWorld:
    """Rectangular grid environment with a single exit and a single pitfall.

    Positions are ``(row, col)`` tuples. The agent moves one cell at a time
    with the actions ``'up'``, ``'down'``, ``'left'`` and ``'right'``; a move
    that would leave the grid keeps the agent where it is.

    Args:
        rows: Number of grid rows (default 4).
        cols: Number of grid columns (default 4).
        start_position: Cell the agent occupies initially and after
            ``reset()`` (default ``(0, 0)``).
        exit_position: Goal cell. ``None`` (the default) selects the
            bottom-right corner ``(rows - 1, cols - 1)``.
        pitfall_position: Cell that ends the episode with a negative reward
            (default ``(1, 1)``).

    Raises:
        ValueError: If ``rows`` or ``cols`` is smaller than 1.
    """

    # Row/column offset applied by each supported action.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, rows=4, cols=4, start_position=(0, 0),
                 exit_position=None, pitfall_position=(1, 1)):
        if rows < 1 or cols < 1:
            raise ValueError("rows and cols must both be at least 1")

        # Define the grid world properties
        self.rows = rows
        self.cols = cols
        # Remember the start so reset() cannot drift from the constructor.
        self.start_position = start_position
        self.agent_position = start_position
        if exit_position is None:
            exit_position = (rows - 1, cols - 1)
        self.exit_position = exit_position
        self.pitfall_position = pitfall_position

    def is_terminal(self):
        """Return True when the agent stands on the exit cell."""
        return self.agent_position == self.exit_position

    def is_pitfall(self):
        """Return True when the agent stands on the pitfall cell."""
        return self.agent_position == self.pitfall_position

    def take_action(self, action):
        """Move the agent one cell in the direction named by ``action``.

        Unrecognised actions and moves that would cross the grid boundary
        leave the agent's position unchanged.
        """
        step = self._MOVES.get(action)
        if step is None:
            return

        row, col = self.agent_position
        new_row, new_col = row + step[0], col + step[1]
        if 0 <= new_row < self.rows and 0 <= new_col < self.cols:
            self.agent_position = (new_row, new_col)

    def get_reward(self):
        """Return 1 on the exit, -1 on the pitfall, and 0 anywhere else."""
        # The exit is checked first, so it wins if both cells coincide.
        if self.is_terminal():
            return 1
        if self.is_pitfall():
            return -1
        return 0

    def reset(self):
        """Put the agent back on the configured start cell."""
        self.agent_position = self.start_position


# Step 2: Define the QLearningAgent class

* The QLearningAgent class represents the Q-learning agent
* The agent has Q-values for state-action pairs, and it can choose actions based on an epsilon-greedy strategy
* The Q-values are updated using the Q-learning update rule

In [2]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values are stored in a dict keyed by ``(state, action)``; pairs that
    have never been updated are treated as having a value of 0.0.

    Args:
        actions: Sequence of actions the agent can choose from.
        learning_rate: Weight given to the new estimate in each update
            (default 0.1).
        discount_factor: Weight given to the best next-state value
            (default 0.9).
    """

    def __init__(self, actions, learning_rate=0.1, discount_factor=0.9):
        # Initialize Q-learning agent properties
        self.actions = actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_values = {}

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.q_values.get((state, action), 0.0)

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for an observed transition.

        The new value blends the current estimate with the target
        ``reward + discount_factor * max_a Q(next_state, a)`` using
        ``learning_rate`` as the blending weight.
        """
        max_next_q = max(self.get_q_value(next_state, a) for a in self.actions)
        current_q = self.get_q_value(state, action)
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward + self.discount_factor * max_next_q)
        self.q_values[(state, action)] = new_q

    def choose_action(self, state, epsilon):
        """Pick an action for ``state`` using epsilon-greedy exploration.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value is returned. Ties are
        resolved in favour of the action that appears first in ``actions``.
        """
        if random.uniform(0, 1) < epsilon:
            return random.choice(self.actions)
        # max() returns the first maximal element, matching list order.
        return max(self.actions, key=lambda a: self.get_q_value(state, a))


# Step 3: Train the Q-Learning Agent

* Here, a function to train the Q-learning agent is defined
* It runs a specified number of episodes, where the agent interacts with the environment, chooses actions, and updates Q-values
* The total reward is printed every 100 episodes for monitoring

In [3]:
def train_q_learning_agent(agent, env, epsilon, num_episodes):
    """Train ``agent`` on ``env`` for ``num_episodes`` episodes.

    Every episode starts from a reset environment and runs until the
    environment reports the exit or the pitfall. After each move the agent's
    Q-value for the state it left is updated with the observed reward and the
    state it arrived in. The episode's summed reward is printed for every
    episode whose index is a multiple of 100.

    Args:
        agent: Object providing ``choose_action(state, epsilon)`` and
            ``update_q_value(state, action, reward, next_state)``.
        env: Object providing ``reset()``, ``is_terminal()``,
            ``is_pitfall()``, ``take_action(action)``, ``get_reward()`` and
            an ``agent_position`` attribute.
        epsilon: Exploration rate handed to ``agent.choose_action``.
        num_episodes: Number of episodes to run.
    """
    for episode in range(num_episodes):
        env.reset()
        total_reward = 0

        while True:
            # An episode ends on the exit or on the pitfall.
            if env.is_terminal() or env.is_pitfall():
                break

            state_before = env.agent_position
            chosen = agent.choose_action(state_before, epsilon)
            env.take_action(chosen)

            step_reward = env.get_reward()
            total_reward += step_reward
            agent.update_q_value(state_before, chosen, step_reward, env.agent_position)

        # Progress report on every 100th episode (0, 100, 200, ...).
        if not episode % 100:
            print(f"Episode: {episode}, Total Reward: {total_reward}")

# Step 4: Test the Q-Learning Agent

* Here, a function to test the trained Q-learning agent is defined
* The agent interacts with the environment without exploration (epsilon=0); after each move, the position the agent has reached and the action that led there are printed



In [4]:
def test_q_learning_agent(agent, env, max_steps=100):
    """Run one greedy episode (epsilon=0) and print every move.

    The environment is reset, then the agent repeatedly picks an action and
    the environment applies it until the exit or the pitfall is reached.
    After each move the position reached and the action taken are printed.

    Nothing in this loop updates the agent, so a greedy policy that keeps
    bumping into a wall or walks in a cycle would never finish. ``max_steps``
    bounds the rollout so such a policy produces a message instead of a hang.

    Args:
        agent: Object providing ``choose_action(state, epsilon)``.
        env: Object providing ``reset()``, ``is_terminal()``,
            ``is_pitfall()``, ``take_action(action)`` and an
            ``agent_position`` attribute.
        max_steps: Maximum number of moves before the rollout is abandoned
            (default 100). Pass ``None`` to disable the limit.
    """
    env.reset()
    print("\nTesting the trained agent:")
    steps_taken = 0
    while not env.is_terminal() and not env.is_pitfall():
        if max_steps is not None and steps_taken >= max_steps:
            print(f"Stopped after {max_steps} steps without reaching the exit or the pitfall.")
            break
        action = agent.choose_action(env.agent_position, epsilon=0)  # No exploration during testing
        env.take_action(action)
        steps_taken += 1
        print(f"Current Position: {env.agent_position}, Chosen Action: {action}")

# Step 5: Execution

* Here, the script is executed as the main program
* It creates an instance of the grid world and the Q-learning agent, trains the agent, and then tests its performance in the environment
* The epsilon parameter controls exploration during training

In [5]:
if __name__ == "__main__":
    # The four moves the grid world understands.
    agent_actions = ['up', 'down', 'left', 'right']

    # Build the environment and the learner.
    world = GridWorld()
    q_agent = QLearningAgent(agent_actions)

    # Training: 1000 episodes with a 20% chance of a random move per step.
    train_q_learning_agent(agent=q_agent, env=world, epsilon=0.2, num_episodes=1000)

    # Testing: a single greedy run that prints the path the agent takes.
    test_q_learning_agent(agent=q_agent, env=world)

Episode: 0, Total Reward: -1
Episode: 100, Total Reward: 1
Episode: 200, Total Reward: 1
Episode: 300, Total Reward: 1
Episode: 400, Total Reward: -1
Episode: 500, Total Reward: 1
Episode: 600, Total Reward: 1
Episode: 700, Total Reward: 1
Episode: 800, Total Reward: 1
Episode: 900, Total Reward: 1

Testing the trained agent:
Current Position: (0, 1), Chosen Action: right
Current Position: (0, 2), Chosen Action: right
Current Position: (0, 3), Chosen Action: right
Current Position: (1, 3), Chosen Action: down
Current Position: (2, 3), Chosen Action: down
Current Position: (3, 3), Chosen Action: down
