In [None]:
import random
import numpy as np
from collections import deque
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Shared torch device: the CUDA GPU when one is available, otherwise the CPU.
# The replay buffer and the agent move their tensors/networks onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:

class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action.

    Architecture: ``state_size -> 128 -> 64 -> action_size`` with ReLU after
    each of the two hidden layers and no activation on the output.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        # The attribute names below become the state_dict keys, so they must
        # stay stable for checkpoints saved from this network to load again.
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        """Return the raw Q-value estimates for ``state``."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)


In [None]:

class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self, buffer_size, batch_size):
        self.batch_size = batch_size
        # A deque with maxlen drops the oldest transition once capacity is reached.
        self.memory = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """Store one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self):
        """Draw ``batch_size`` stored transitions (without replacement) as tensors.

        Each field is stacked row-wise with ``np.vstack`` and moved to the
        module-level ``device``. Actions become a ``LongTensor``; every other
        field becomes a ``FloatTensor``.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``.

        Raises:
            ValueError: from ``random.sample`` when fewer than ``batch_size``
                transitions are stored.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        def stack_field(index):
            # One row per sampled transition for the requested tuple position.
            return np.vstack([transition[index] for transition in batch])

        states = torch.FloatTensor(stack_field(0)).to(device)
        actions = torch.LongTensor(stack_field(1)).to(device)
        rewards = torch.FloatTensor(stack_field(2)).to(device)
        next_states = torch.FloatTensor(stack_field(3)).to(device)
        dones = torch.FloatTensor(stack_field(4)).to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)


In [None]:

class Agent:
    """DQN agent with experience replay and a soft-updated target network.

    Args:
        state_size: Length of the observation vector fed to the Q-network.
        action_size: Number of discrete actions.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per learning step.
        gamma: Discount factor used in the Bellman target.
        lr: Learning rate of the Adam optimizer.
        tau: Interpolation factor of the soft target update.
        update_every: Learn once every this many calls to ``step``.
    """

    def __init__(self, state_size, action_size, buffer_size=int(1e5), batch_size=128,
                 gamma=0.99, lr=1e-3, tau=1e-3, update_every=4):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.t_step = 0

        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        # Start the target network as an exact copy of the online network.
        # Without this the two are initialised independently, and with a small
        # tau the early TD targets come from an unrelated random network.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(buffer_size, batch_size)

    def act(self, state, epsilon=0.0):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the online network's Q-values.
        """
        # Decide on exploration first so no forward pass is spent on a step
        # whose action is going to be random anyway.
        if random.random() <= epsilon:
            return random.choice(np.arange(self.action_size))

        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn on every ``update_every``-th call.

        Learning is skipped until the buffer holds at least ``batch_size``
        transitions.
        """
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and len(self.memory) >= self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def learn(self, experiences):
        """Run one gradient step on a batch, then soft-update the target network.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors as produced by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Bootstrap value: max over actions of the target network's Q-values.
        # Evaluated without autograd so no gradient reaches the target network.
        with torch.no_grad():
            q_targets_next = self.qnetwork_target(next_states).max(dim=1, keepdim=True)[0]
        # Bellman target; (1 - dones) removes the bootstrap term where done is 1.
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        # Q-values the online network assigns to the actions actually taken.
        q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        """Blend online weights into the target network.

        θ_target = τ*θ_local + (1 − τ)*θ_target
        """
        with torch.no_grad():
            for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                                 self.qnetwork_local.parameters()):
                target_param.copy_(self.tau * local_param + (1.0 - self.tau) * target_param)

In [None]:
def shaped_reward(next_state, terminated):
    """Compute a dense, shaped reward for an Acrobot transition.

    Args:
        next_state: Observation after the step, indexed as
            ``[cos(theta1), sin(theta1), cos(theta2), sin(theta2), vel1, vel2]``.
        terminated: Whether the environment reported termination (goal reached).

    Returns:
        Sum of a constant -1 step cost, a height term in [0, 2], a penalty
        proportional to the absolute angular velocities, a +100 success bonus
        on termination and a +50 bonus for being almost motionless near the top.
    """
    # Reconstruct the angles from cosine and sine components
    theta1 = np.arctan2(next_state[1], next_state[0])
    theta2 = np.arctan2(next_state[3], next_state[2])
    # Tip height relative to the pivot. With both links hanging down it is -2,
    # with both pointing straight up it is +2, so the range is [-2, 2].
    height = -np.cos(theta1) - np.cos(theta1 + theta2)
    # Normalize over the full 4-unit span so that 0 is the lowest and 1 the
    # highest tip position. Dividing by 2 would yield values up to 2 and make
    # the "near the top" test below fire from roughly horizontal upwards.
    height_norm = (height + 2) / 4

    base = -1.0
    height_reward = 2.0 * height_norm
    vel1 = next_state[4]
    vel2 = next_state[5]
    velocity_penalty = -0.1 * (abs(vel1) + abs(vel2))
    success_bonus = 100.0 if terminated else 0.0

    # Standstill bonus if near the top (height_norm > 0.95) and very low angular velocities.
    # NOTE(review): height_norm > 0.95 means height > 1.8; if the environment
    # already terminates at a lower height this bonus is rarely reachable.
    # Confirm the intended threshold.
    near_top = height_norm > 0.95
    nearly_still = abs(vel1) < 0.05 and abs(vel2) < 0.05
    stillness_bonus = 50.0 if (near_top and nearly_still) else 0.0

    return base + height_reward + velocity_penalty + success_bonus + stillness_bonus


In [None]:

def train_agent(agent, env, n_episodes=2000, max_t=500, eps_start=1.0, eps_end=0.05, eps_decay=0.995):
    """Train ``agent`` on ``env`` with epsilon-greedy exploration and shaped rewards.

    The agent learns from ``shaped_reward`` while the returned scores track the
    raw environment reward, so progress stays comparable to the standard task.

    Args:
        agent: Object exposing ``act(state, eps)``, ``step(...)`` and
            ``qnetwork_local``.
        env: Environment whose ``reset`` returns ``(state, info)`` and whose
            ``step`` returns ``(next_state, reward, terminated, truncated, info)``.
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Initial exploration rate.
        eps_end: Lower bound of the exploration rate.
        eps_decay: Multiplicative decay applied to epsilon after every episode.

    Returns:
        List with the cumulative raw environment reward of each episode played.

    Side effects:
        Prints progress and, once the solve criterion is met, saves the online
        network's weights to ``acrobot_solved.pth`` and stops early.
    """
    scores = []  # Cumulative raw environment reward per episode
    scores_window = deque(maxlen=100)
    eps = eps_start

    for episode in range(1, n_episodes + 1):
        state, _ = env.reset()
        score = 0  # Raw environment reward accumulated in this episode

        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, env_reward, terminated, truncated, _ = env.step(action)

            # Compute shaped reward (which includes a standstill bonus)
            reward = shaped_reward(next_state, terminated)
            # Store only true termination as the done flag. A time-limit
            # truncation does not make next_state terminal, so the Bellman
            # target must still bootstrap from it.
            agent.step(state, action, reward, next_state, terminated)

            state = next_state
            score += env_reward  # accumulate raw environment reward

            if terminated or truncated:
                break

        scores.append(score)
        scores_window.append(score)
        eps = max(eps_end, eps_decay * eps)
        window_mean = np.mean(scores_window)
        print(f'\rEpisode {episode}\tAverage Env Reward: {window_mean:.2f}', end='')
        if episode % 100 == 0:
            print(f'\rEpisode {episode}\tAverage Env Reward: {window_mean:.2f}')

        # Declare the task solved only when the average covers a full window.
        # Otherwise a single lucky early episode would end training at once.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and window_mean >= -100:
            print(f'\nEnvironment solved in {episode} episodes!\tAverage Env Reward: {window_mean:.2f}')
            torch.save(agent.qnetwork_local.state_dict(), 'acrobot_solved.pth')
            break

    return scores


In [None]:
if __name__ == "__main__":
    env = gym.make('Acrobot-v1')
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Create our DQN agent
        agent = Agent(state_size, action_size)

        # Train the agent
        scores = train_agent(agent, env)
    finally:
        # Release the environment even when training raises or is interrupted.
        env.close()

    # Trailing mean over at most `avg_window` episodes ending at episode i.
    # The slice must start at i - avg_window + 1; starting at i - avg_window
    # would average 101 episodes while the legend says 100.
    avg_window = 100
    running_avg = [np.mean(scores[max(0, i - avg_window + 1):i + 1]) for i in range(len(scores))]

    # Plot the training progress (environment rewards)
    plt.figure(figsize=(10, 5))
    plt.plot(scores, label='Episode Score (Env Reward)')
    plt.plot(running_avg, label='Running Average (100 eps)', linestyle='--')
    plt.title('Training Progress on Acrobot-v1')
    plt.xlabel('Episode #')
    plt.ylabel('Score (Env Reward)')
    plt.legend()
    plt.show()


In [None]:
import os
import base64
import imageio
import gym
import torch
from IPython.display import HTML, display


def show_video_of_model(agent, env_name="Acrobot-v1", output_path="acrobot_video.mp4"):
    """Roll out one greedy episode and save its rendered frames as a video.

    Args:
        agent: Object whose ``act(state)`` returns the action to take.
        env_name: Gym environment id, created with ``render_mode="rgb_array"``.
        output_path: Destination file handed to ``imageio.mimsave``.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            action = agent.act(state)
            state, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
        # Frames are captured before each step, so render once more to include
        # the state the episode actually ended in.
        frames.append(env.render())
    finally:
        # Close the environment even if rendering or stepping raised.
        env.close()

    # Save the video using imageio.
    imageio.mimsave(output_path, frames, fps=30)
    print(f"✅ Video saved to: {os.path.abspath(output_path)}")


def show_video(video_path="acrobot_video.mp4"):
    """Embed the MP4 at ``video_path`` inline in the notebook output.

    The file is base64-encoded into an HTML ``<video>`` element. If the file
    does not exist, a short notice is printed instead.
    """
    if not os.path.exists(video_path):
        print("❌ No video found.")
        return

    with open(video_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode("ascii")

    markup = f"""
        <video autoplay loop controls style="height: 400px;">
            <source src="data:video/mp4;base64,{encoded}" type="video/mp4" />
        </video>
        """
    display(HTML(markup))


# Restore the checkpoint written by train_agent. That file is only created when
# the solve criterion was met, so this line fails if training never reached it.
# `agent` must already exist from the training cell. The weights are
# deserialized on the CPU via map_location.
agent.qnetwork_local.load_state_dict(torch.load("acrobot_solved.pth", map_location=torch.device("cpu")))
# Record one episode with the restored weights (agent.act defaults to epsilon=0.0,
# i.e. greedy actions), then embed the resulting file in the notebook output.
show_video_of_model(agent, env_name="Acrobot-v1", output_path="acrobot_video.mp4")
show_video("acrobot_video.mp4")
