<a href="https://colab.research.google.com/github/hissain/ml/blob/main/codes/reinforcement/RL_frozen_lake_policy_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class CustomFrozenLakeEnv:
    def __init__(self, grid_size=4):
        self.grid_size = grid_size
        self.action_space = 4  # Actions: 0 (Left), 1 (Down), 2 (Right), 3 (Up)
        self.observation_space = grid_size * grid_size  # One state for each grid cell

        # Define the lake grid (S=start, F=frozen, H=hole, G=goal)
        self.lake = np.array([
            ['S', 'F', 'F', 'F'],
            ['F', 'H', 'F', 'H'],
            ['F', 'F', 'F', 'H'],
            ['H', 'F', 'F', 'G']
        ])
        self.start_pos = (0, 0)
        self.reset()

    def reset(self):
        # Start at the starting position
        self.current_pos = self.start_pos
        return self.get_state()

    def get_state(self):
        # Encode the position as a one-hot vector
        row, col = self.current_pos
        state = np.zeros(self.observation_space)
        state[row * self.grid_size + col] = 1
        return state

    def step(self, action):
        # Action effects
        if action == 0:  # Left
            next_pos = (self.current_pos[0], max(self.current_pos[1] - 1, 0))
        elif action == 1:  # Down
            next_pos = (min(self.current_pos[0] + 1, self.grid_size - 1), self.current_pos[1])
        elif action == 2:  # Right
            next_pos = (self.current_pos[0], min(self.current_pos[1] + 1, self.grid_size - 1))
        elif action == 3:  # Up
            next_pos = (max(self.current_pos[0] - 1, 0), self.current_pos[1])

        # Move to the new position
        self.current_pos = next_pos
        row, col = next_pos
        cell = self.lake[row, col]

        # Determine reward and if the episode is done
        if cell == 'H':  # Fell into a hole
            return self.get_state(), 0.0, True, {}
        elif cell == 'G':  # Reached the goal
            return self.get_state(), 1.0, True, {}
        else:  # Safe on frozen ground
            return self.get_state(), 0.0, False, {}

    def render(self):
        # Display the lake grid with the current position marked
        lake_render = self.lake.copy()
        row, col = self.current_pos
        lake_render[row, col] = 'A'  # Agent's position
        print("\n".join(" ".join(row) for row in lake_render))
        print()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return torch.softmax(self.fc2(x), dim=-1)

class REINFORCEAgent:
    def __init__(self, state_size, action_size, learning_rate=0.01, gamma=0.99):
        self.policy = PolicyNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        state = torch.tensor([state], dtype=torch.float32)
        probs = self.policy(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        self.log_probs.append(dist.log_prob(action))
        return action.item()

    def store_reward(self, reward):
        self.rewards.append(reward)

    def learn(self):
        # Compute discounted rewards
        discounted_rewards = []
        G = 0
        for reward in reversed(self.rewards):
            G = reward + self.gamma * G
            discounted_rewards.insert(0, G)

        # Normalize rewards
        discounted_rewards = torch.tensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

        # Calculate loss
        loss = []
        for log_prob, reward in zip(self.log_probs, discounted_rewards):
            loss.append(-log_prob * reward)
        loss = torch.stack(loss).sum()

        # Backpropagate and optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clear rewards and log_probs
        self.log_probs = []
        self.rewards = []

In [None]:
# Initialize environment and agent
env = CustomFrozenLakeEnv(grid_size=4)
agent = REINFORCEAgent(state_size=env.observation_space, action_size=env.action_space)

# Training parameters
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store_reward(reward)
        episode_reward += reward
        state = next_state

    # Update policy after each episode
    agent.learn()

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}, Total Reward: {episode_reward}")

print("Training completed.")

  state = torch.tensor([state], dtype=torch.float32)


Episode 100, Total Reward: 0.0
Episode 200, Total Reward: 0.0
Episode 300, Total Reward: 0.0
Episode 400, Total Reward: 0.0
Episode 500, Total Reward: 0.0
Episode 600, Total Reward: 0.0
Episode 700, Total Reward: 0.0
Episode 800, Total Reward: 0.0
Episode 900, Total Reward: 0.0
Episode 1000, Total Reward: 0.0
Training completed.


In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Custom Frozen Lake Environment
class CustomFrozenLakeEnv:
    def __init__(self, grid_size=4):
        self.grid_size = grid_size
        self.action_space = 4  # Actions: 0 (Left), 1 (Down), 2 (Right), 3 (Up)
        self.observation_space = grid_size * grid_size  # One state for each grid cell
        self.lake = np.array([
            ['S', 'F', 'F', 'F'],
            ['F', 'H', 'F', 'H'],
            ['F', 'F', 'F', 'H'],
            ['H', 'F', 'F', 'G']
        ])
        self.start_pos = (0, 0)
        self.reset()

    def reset(self):
        self.current_pos = self.start_pos
        return self.get_state()

    def get_state(self):
        row, col = self.current_pos
        state = np.zeros(self.observation_space)
        state[row * self.grid_size + col] = 1
        return state

    def step(self, action):
        if action == 0:
            next_pos = (self.current_pos[0], max(self.current_pos[1] - 1, 0))
        elif action == 1:
            next_pos = (min(self.current_pos[0] + 1, self.grid_size - 1), self.current_pos[1])
        elif action == 2:
            next_pos = (self.current_pos[0], min(self.current_pos[1] + 1, self.grid_size - 1))
        elif action == 3:
            next_pos = (max(self.current_pos[0] - 1, 0), self.current_pos[1])

        self.current_pos = next_pos
        row, col = next_pos
        cell = self.lake[row, col]

        if cell == 'H':
            return self.get_state(), -1.0, True, {}
        elif cell == 'G':
            return self.get_state(), 10.0, True, {}
        else:
            return self.get_state(), -0.01, False, {}

    def render(self):
        lake_render = self.lake.copy()
        row, col = self.current_pos
        lake_render[row, col] = 'A'
        print("\n".join(" ".join(row) for row in lake_render))
        print()

# Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return torch.softmax(self.fc2(x), dim=-1)

# REINFORCE Agent
class REINFORCEAgent:
    def __init__(self, state_size, action_size, learning_rate=0.01, gamma=0.99):
        self.policy = PolicyNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        state = torch.tensor([state], dtype=torch.float32)
        probs = self.policy(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        self.log_probs.append(dist.log_prob(action))
        return action.item()

    def store_reward(self, reward):
        self.rewards.append(reward)

    def learn(self):
        discounted_rewards = []
        G = 0
        for reward in reversed(self.rewards):
            G = reward + self.gamma * G
            discounted_rewards.insert(0, G)

        discounted_rewards = torch.tensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

        loss = []
        for log_prob, reward in zip(self.log_probs, discounted_rewards):
            loss.append(-log_prob * reward)
        loss = torch.stack(loss).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.log_probs = []
        self.rewards = []

# Training Loop
env = CustomFrozenLakeEnv(grid_size=4)
agent = REINFORCEAgent(state_size=env.observation_space, action_size=env.action_space)
num_episodes = 10000

for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store_reward(reward)
        episode_reward += reward
        state = next_state

    agent.learn()

    if (episode + 1) % 100 == 0:
        print(f"Episode {episode + 1}, Total Reward: {episode_reward}")

print("Training completed.")

Episode 100, Total Reward: -1.03
Episode 200, Total Reward: -1.5900000000000003
Episode 300, Total Reward: -1.32
Episode 400, Total Reward: -1.06
Episode 500, Total Reward: -2.280000000000001
Episode 600, Total Reward: -1.01
Episode 700, Total Reward: -2.0100000000000007
Episode 800, Total Reward: -1.16
Episode 900, Total Reward: -1.5000000000000002
Episode 1000, Total Reward: -1.8200000000000005
Episode 1100, Total Reward: -2.310000000000001
Episode 1200, Total Reward: -2.240000000000001
Episode 1300, Total Reward: -3.239999999999996
Episode 1400, Total Reward: -2.130000000000001
Episode 1500, Total Reward: -3.299999999999995
Episode 1600, Total Reward: -1.1199999999999999
Episode 1700, Total Reward: -1.03
Episode 1800, Total Reward: -2.0900000000000007
Episode 1900, Total Reward: -1.8700000000000006
Episode 2000, Total Reward: -1.23
Episode 2100, Total Reward: -3.07
Episode 2200, Total Reward: -1.7500000000000004
Episode 2300, Total Reward: -3.4199999999999924
Episode 2400, Total Rew