<a href="https://colab.research.google.com/github/jainprashant03/ML_projects/blob/Tutorials%40Udemy/ChatGPTLunarLander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0
Collecting ale-py>=0.9 (from gymnasium[accept-rom-license,atari])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle

In [None]:
import gymnasium as gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from gym.wrappers import RecordVideo

# Set device: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the Lunar Lander environment
env = gym.make("LunarLander-v3")
# NOTE(review): no seed is set, so runs are not reproducible. gymnasium is
# presumably seeded through env.reset(seed=...) rather than env.seed() —
# confirm against the installed version before adding one.

# Hyperparameters
state_size = env.observation_space.shape[0]  # length of the observation vector (network input width)
action_size = env.action_space.n  # number of discrete actions (network output width)
learning_rate = 0.001  # learning rate handed to the Adam optimizer
gamma = 0.99  # discount factor applied to the next-state value in the TD target
epsilon = 1.0  # initial probability of taking a random action
epsilon_decay = 0.995  # epsilon is multiplied by this factor once per episode
epsilon_min = 0.01  # lower bound that epsilon is clamped to
batch_size = 64  # minimum buffer size before training starts, and size of each sampled batch
max_episodes = 1000  # upper bound on the number of training episodes
max_steps = 1000  # upper bound on the number of steps per episode
memory_size = 100000  # capacity of the replay buffer

# Replay buffer: a bounded deque, so the oldest transitions are dropped
# automatically once memory_size entries have been stored.
memory = deque(maxlen=memory_size)

# Neural network for Q-learning
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Two hidden layers of 128 units with ReLU activations are followed by a
    linear output layer of width ``action_size``.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        # Layers are created in fc1, fc2, fc3 order so parameter names and
        # random initialisation order stay stable.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the raw (unactivated) Q-values for the input ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Initialize the networks
# q_network is the online network; it is the one the optimizer below trains.
q_network = DQN(state_size, action_size).to(device)
# target_network has the same architecture, starts as an exact copy of
# q_network's weights, and is switched to eval mode. It is never handed to
# the optimizer; its weights only change through load_state_dict().
target_network = DQN(state_size, action_size).to(device)
target_network.load_state_dict(q_network.state_dict())
target_network.eval()

# Adam updates q_network's parameters only.
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

# Helper function to sample from memory
def sample(memory, batchSize):
    """Draw a random mini-batch of transitions from the replay buffer.

    Args:
        memory: Sequence of ``(state, action, reward, next_state, done)``
            tuples.
        batchSize: Number of transitions to draw (without replacement).

    Returns:
        A 5-tuple ``(states, actions, rewards, next_states, dones)`` of
        tensors on ``device``. ``actions`` is int64; the others are float32.

    Raises:
        ValueError: If ``batchSize`` is larger than ``len(memory)``, or if
            the stored states do not all have the same shape.
    """
    # Honour the requested size; the previous version ignored this argument
    # and silently read the global batch_size instead.
    batch = random.sample(memory, batchSize)
    states, actions, rewards, next_states, dones = zip(*batch)

    def _to_tensor(values, np_dtype, torch_dtype):
        # Stack into one NumPy array first: building a tensor directly from
        # a tuple of ndarrays is very slow and triggers a PyTorch warning.
        stacked = np.asarray(values, dtype=np_dtype)
        return torch.tensor(stacked, dtype=torch_dtype).to(device)

    return (_to_tensor(states, np.float32, torch.float32),
            _to_tensor(actions, np.int64, torch.int64),
            _to_tensor(rewards, np.float32, torch.float32),
            _to_tensor(next_states, np.float32, torch.float32),
            _to_tensor(dones, np.float32, torch.float32))

# Training loop: epsilon-greedy DQN with experience replay and a target
# network that is re-synchronised with the online network every 10 episodes.
episode_rewards = []
loss_fn = nn.MSELoss()  # built once instead of on every optimisation step
for episode in range(max_episodes):
    # gymnasium's reset() returns (observation, info). Storing the whole
    # tuple as the state is what produced "expected sequence of length 8 at
    # dim 1 (got 2)" when the batch was converted to a tensor.
    state, _ = env.reset()
    total_reward = 0
    for step in range(max_steps):
        # Select action (epsilon-greedy)
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            # Pure inference: no autograd graph is needed to pick an action.
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    state, dtype=torch.float32, device=device
                ).unsqueeze(0)
                q_values = q_network(state_tensor)
            action = torch.argmax(q_values).item()

        # Take action in the environment. step() returns
        # (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Only true termination is stored as the "done" flag: a time-limit
        # truncation must not zero out the bootstrapped next-state value.
        memory.append((state, action, reward, next_state, terminated))

        state = next_state
        total_reward += reward

        # Train the network if enough samples are available
        if len(memory) >= batch_size:
            states, actions, rewards, next_states, dones = sample(memory, batch_size)

            # Q(s, a) for the actions that were actually taken
            q_values = q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            # The TD target is a constant with respect to the optimisation,
            # so keep the target network out of the autograd graph.
            with torch.no_grad():
                next_q_values = target_network(next_states).max(1)[0]
                targets = rewards + (gamma * next_q_values * (1 - dones))

            # Compute loss and backpropagate
            loss = loss_fn(q_values, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # The episode ends on termination or on truncation.
        done = terminated or truncated
        if done:
            break

    # Update epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
    episode_rewards.append(total_reward)

    # Update target network
    if episode % 10 == 0:
        target_network.load_state_dict(q_network.state_dict())

    print(f"Episode {episode + 1}, Reward: {total_reward}, Epsilon: {epsilon:.3f}")

    # Require a full 100-episode window so one lucky early episode cannot
    # end training prematurely.
    if len(episode_rewards) >= 100 and np.mean(episode_rewards[-100:]) >= 200:
        print(f"Solved in {episode + 1} episodes!")
        break

# Plot the total reward of every training episode on a single set of axes.
_, reward_axis = plt.subplots()
reward_axis.plot(episode_rewards)
reward_axis.set_xlabel("Episodes")
reward_axis.set_ylabel("Reward")
reward_axis.set_title("Training Rewards")
plt.show()

# Generate video of the trained agent
# RecordVideo has to come from gymnasium, the package that built the env;
# the top-of-file import takes it from the legacy `gym` package, whose
# wrappers do not match gymnasium's reset()/step() API.
from gymnasium.wrappers import RecordVideo

# The training env was created without a render mode, so it cannot produce
# frames. Close it and build a fresh one that renders to RGB arrays.
# NOTE(review): video encoding presumably needs an extra package such as
# moviepy in the runtime — confirm it is installed.
env.close()
env = RecordVideo(
    gym.make("LunarLander-v3", render_mode="rgb_array"),
    './videos',
    episode_trigger=lambda x: True,  # record every episode
)
state, _ = env.reset()  # reset() returns (observation, info)
done = False
while not done:
    # Greedy policy: always take the highest-valued action, no exploration.
    with torch.no_grad():
        state_tensor = torch.as_tensor(
            state, dtype=torch.float32, device=device
        ).unsqueeze(0)
        q_values = q_network(state_tensor)
    action = torch.argmax(q_values).item()
    # step() returns five values; stop on termination or truncation.
    state, _, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
env.close()  # closing the wrapper finalises the video file

print("Training complete! Video saved in './videos'.")


  return (torch.tensor(states, dtype=torch.float32).to(device),


ValueError: expected sequence of length 8 at dim 1 (got 2)