<a href="https://colab.research.google.com/github/kunalr33/SOC_RlForAgents/blob/main/week3_taxiProblemRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium
!pip install torch

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manyli

In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Seed every random source used below so that Restart & Run All reproduces
# the same training curves: NumPy drives the epsilon-greedy coin flip and
# torch drives the network weight initialisation.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialize the Taxi environment
env = gym.make('Taxi-v3')
# The action space has its own generator (used by env.action_space.sample()).
env.action_space.seed(SEED)
# Seed the environment's generator once; the later env.reset() calls made by
# the training functions pass no seed and therefore keep using it.
env.reset(seed=SEED)

Using device: cpu


In [None]:
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one value per action.

    Architecture: ``state_size -> 128 -> 128 -> action_size`` with a ReLU after
    each of the two hidden layers and no activation on the output layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for input ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

In [None]:
def to_one_hot(state, state_size):
    """Return a NumPy vector of length ``state_size`` that is 1 at ``state`` and 0 elsewhere."""
    encoding = np.zeros(state_size)
    encoding[state] = 1
    return encoding

In [None]:
def monte_carlo_control(env, num_episodes, gamma=0.99, epsilon=0.1):
    """First-visit Monte Carlo control with a neural-network Q-function.

    For every episode an epsilon-greedy trajectory is collected, the discounted
    return ``G`` is computed backwards through it, and for the first occurrence
    of each (state, action) pair the running average return is used as the
    regression target for the network's output at that action.

    Args:
        env: Environment with discrete ``observation_space`` / ``action_space``
            (both must expose ``.n``) whose ``step`` returns the five-tuple
            ``(obs, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        gamma: Discount factor applied when accumulating the return.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        Tuple ``(Q, cumulative_rewards)``: the trained ``QNetwork`` and a list
        with the undiscounted sum of rewards of each episode.
    """
    state_size = env.observation_space.n
    action_size = env.action_space.n
    Q = QNetwork(state_size, action_size).to(device)
    optimizer = optim.Adam(Q.parameters(), lr=0.001)
    # Tabular running statistics used to build the averaged-return targets.
    returns_sum = torch.zeros((state_size, action_size)).to(device)
    returns_count = torch.zeros((state_size, action_size)).to(device)

    def encode(state):
        """One-hot encode a discrete state as a float tensor on ``device``."""
        return torch.FloatTensor(to_one_hot(state, state_size)).to(device)

    def policy(state):
        """Epsilon-greedy action selection with respect to the current network."""
        if np.random.rand() < epsilon:
            return env.action_space.sample()
        with torch.no_grad():
            return torch.argmax(Q(encode(state))).item()

    cumulative_rewards = []

    for _ in range(num_episodes):
        state, _info = env.reset()
        trajectory = []
        done = False
        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            trajectory.append((state, action, reward))
            state = next_state
            # Stop on truncation as well as termination; otherwise the loop
            # keeps stepping an environment that has already signalled its end.
            done = terminated or truncated

        # Time step at which each (state, action) pair first appears. Looking
        # the pair up here is exact (it does not depend on the reward value)
        # and avoids rescanning the trajectory for every step.
        first_visit = {}
        for t, (s, a, _r) in enumerate(trajectory):
            first_visit.setdefault((s, a), t)

        G = 0
        for t in range(len(trajectory) - 1, -1, -1):
            state, action, reward = trajectory[t]
            G = gamma * G + reward
            if first_visit[(state, action)] != t:
                continue  # not the first visit of this pair in the episode

            returns_sum[state, action] += G
            returns_count[state, action] += 1

            state_tensor = encode(state)
            # Target equals the current prediction everywhere except at the
            # taken action, so only that output receives a non-zero error.
            # Built without gradient tracking because it is a constant.
            with torch.no_grad():
                target = Q(state_tensor).clone()
                target[action] = returns_sum[state, action] / returns_count[state, action]

            loss = torch.mean((Q(state_tensor) - target) ** 2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        cumulative_rewards.append(sum(r for _s, _a, r in trajectory))

    return Q, cumulative_rewards


In [None]:
def q_learning(env, num_episodes, alpha=0.1, gamma=0.99, epsilon=0.1):
    """Online Q-learning with a neural-network Q-function.

    After every environment step the network is regressed towards the
    one-step target ``reward + gamma * max_a Q(next_state, a)`` for the action
    that was taken (just ``reward`` when the transition is terminal).

    Args:
        env: Environment with discrete ``observation_space`` / ``action_space``
            (both must expose ``.n``) whose ``step`` returns the five-tuple
            ``(obs, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run.
        alpha: Learning rate passed to the Adam optimizer.
        gamma: Discount factor of the bootstrap term.
        epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
        Tuple ``(q_network, cumulative_rewards)``: the trained ``QNetwork`` and
        a list with the undiscounted sum of rewards of each episode.
    """
    state_size = env.observation_space.n
    action_size = env.action_space.n
    q_network = QNetwork(state_size, action_size).to(device)
    optimizer = optim.Adam(q_network.parameters(), lr=alpha)
    criterion = nn.MSELoss()

    def encode(state):
        """One-hot encode a discrete state as a float tensor on ``device``."""
        return torch.FloatTensor(to_one_hot(state, state_size)).to(device)

    def policy(state):
        """Epsilon-greedy action selection with respect to the current network."""
        if np.random.rand() < epsilon:
            return env.action_space.sample()
        with torch.no_grad():
            return torch.argmax(q_network(encode(state))).item()

    cumulative_rewards = []

    for _ in range(num_episodes):
        state, _info = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _info = env.step(action)
            # End the episode on truncation too; ignoring it keeps stepping an
            # environment that has already signalled its end.
            done = terminated or truncated
            total_reward += reward

            state_tensor = encode(state)

            # The regression target is a constant, so build it without
            # gradient tracking.
            with torch.no_grad():
                if terminated:
                    # True terminal state: there is no future value to bootstrap.
                    td_target = reward
                else:
                    # Still bootstrap on truncation, because the time limit
                    # is not a terminal state of the task.
                    td_target = reward + gamma * torch.max(q_network(encode(next_state))).item()
                # Equal to the current prediction except at the taken action,
                # so only that output receives a non-zero error.
                target_f = q_network(state_tensor).clone()
                target_f[action] = td_target

            optimizer.zero_grad()
            loss = criterion(q_network(state_tensor), target_f)
            loss.backward()
            optimizer.step()

            state = next_state

        cumulative_rewards.append(total_reward)

    return q_network, cumulative_rewards


In [None]:
# Hyperparameters shared by both training runs below.
alpha = 0.001        # learning rate handed to q_learning
epsilon = 0.1        # exploration probability of the epsilon-greedy policy
gamma = 0.98         # discount factor
num_episodes = 3000  # kept small to reduce runtime

In [None]:
# Train the Q-Learning agent; keyword arguments make the hyperparameter mapping explicit.
q_network, ql_cumulative_rewards = q_learning(
    env,
    num_episodes,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
)

In [None]:
# Train the Monte Carlo agent; keyword arguments make the hyperparameter mapping explicit.
Q_mc, mc_cumulative_rewards = monte_carlo_control(
    env,
    num_episodes,
    gamma=gamma,
    epsilon=epsilon,
)

In [None]:
# Plot the per-episode cumulative reward of both algorithms on one set of axes
fig, ax = plt.subplots(figsize=(12, 8))
for rewards, label in (
    (mc_cumulative_rewards, 'Monte Carlo'),
    (ql_cumulative_rewards, 'Q-Learning'),
):
    ax.plot(rewards, label=label)
ax.set_xlabel('Episodes')
ax.set_ylabel('Cumulative Reward')
ax.set_title('Cumulative Reward over Episodes for Monte Carlo and Q-Learning')
ax.legend()
ax.grid(True)
plt.show()