In [1]:
!pip install --upgrade gym==0.25.2



In [2]:
!pip install pyvirtualdisplay

Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0


In [3]:
import argparse
import os

import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from time import time
import torch
import torch.nn as nn
import torch.nn.functional as F

import imageio

In [4]:
env_name = None
initial_timestamp = 0.0
np.random.seed(42)
torch.manual_seed(42)

  and should_run_async(code)


<torch._C.Generator at 0x7cf3b85f5210>

In [5]:
class DQNetwork:

    def __init__(self, state_size, action_size, action_high=1.0, action_low=0.0, layer_sizes=(64, 64),
                 batch_norm_options=(True, True), dropout_options=(0, 0), learning_rate=0.0001):
        self.state_size = state_size
        self.action_size = action_size
        self.action_high = action_high
        self.action_low = action_low
        self.layer_sizes = layer_sizes
        self.batch_norm_options = batch_norm_options
        self.dropout_options = dropout_options
        self.learning_rate = learning_rate

        self.build_model()

    def build_model(self):
        layers = []
        # hidden layers

        for layer_count in range(len(self.layer_sizes)):
            if layer_count == 0:
              layers.append(nn.Linear(self.state_size, self.layer_sizes[layer_count]))
            else:
              layers.append(nn.Linear(self.layer_sizes[layer_count - 1], self.layer_sizes[layer_count]))
            layers.append(nn.ReLU())
            if self.batch_norm_options[layer_count]:
                layers.append(nn.BatchNorm1d(self.layer_sizes[layer_count]))
            layers.append(nn.Dropout(self.dropout_options[layer_count]))

        layers.append(nn.Linear(self.layer_sizes[-1], self.action_size))
        self.criterion = nn.MSELoss()
        self.model = nn.Sequential(*layers)
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)

In [6]:
import random
from collections import namedtuple, deque
class DDQNAgent:

    def __init__(self, env, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, callbacks=()):
        self.env = env
        self.env.seed(42)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.Q_targets = 0.0
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.callbacks = callbacks

        layer_sizes = [256, 256]
        batch_norm_options = [False, False]
        dropout_options = [0, 0]

        print("Initialising DDQN Agent with params : {}".format(self.__dict__))

        # Make local & target model
        print("Initialising Local DQNetwork")
        self.local_network = DQNetwork(self.state_size, self.action_size,
                                       layer_sizes=layer_sizes,
                                       batch_norm_options=batch_norm_options,
                                       dropout_options=dropout_options,
                                       learning_rate=lr)

        print("Initialising Target DQNetwork")
        self.target_network = DQNetwork(self.state_size, self.action_size,
                                        layer_sizes=layer_sizes,
                                        batch_norm_options=batch_norm_options,
                                        dropout_options=dropout_options,
                                        learning_rate=lr)

        self.memory = ReplayBuffer(buffer_size=buffer_size, batch_size=batch_size)

    def reset_episode(self):
        state = self.env.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

        self.last_state = next_state

    def act(self, state, eps=0.):
        state = np.reshape(state, [-1, self.state_size])
        state_tensor = torch.Tensor(state)
        action = self.local_network.model(state_tensor)

        if random.random() > eps:
            return torch.argmax(action).item()
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        for itr in range(len(states)):
            state, action, reward, next_state, done = states[itr], actions[itr], rewards[itr], next_states[itr], dones[
                itr]
            state = np.reshape(state, [-1, self.state_size])
            next_state = np.reshape(next_state, [-1, self.state_size])
            next_state_tensor = torch.Tensor(next_state)
            state_tensor = torch.Tensor(state)
            self.Q_targets = self.local_network.model(state_tensor)
            if done:
                self.Q_targets[0][action] = torch.Tensor(reward)
            else:
                next_Q_target = self.target_network.model(next_state_tensor)[0]
                self.Q_targets[0][action] = (reward[0] + gamma * torch.max(next_Q_target))

            self.local_network.model.train()
            self.local_network.optimizer.zero_grad()
            outputs = self.local_network.model(state_tensor)
            loss = self.local_network.criterion(outputs, self.Q_targets)

            # Backward and Optimize
            loss.backward()
            self.local_network.optimizer.step()

    def update_target_model(self):
        torch.save(self.local_network.model, "/content/update_model.pt")
        self.target_network.model = torch.load("/content/update_model.pt")


class ReplayBuffer:

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack([e.action for e in experiences if e is not None])
        rewards = np.vstack([e.reward for e in experiences if e is not None])
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        dones = np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [7]:
from numpy.lib.nanfunctions import nanpercentile
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

def train_model(n_episodes=2000, eps_start=1.0, eps_end=0.001, eps_decay=0.9, target_reward=1000):
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    print("Starting model training for {} episodes.".format(n_episodes))
    consolidation_counter = 0
    for i_episode in range(1, n_episodes + 1):
        init_time = time()
        state = agent.reset_episode()
        score = 0
        done = False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                agent.update_target_model()
                break
        time_taken = time() - init_time
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)
        print('Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}\tState: {}\tMean Q-Target: {:.4f}'
                     '\tEffective Epsilon: {:.3f}\tTime Taken: {:.2f} sec'.format(
            i_episode, np.mean(scores_window), score, state[0], torch.mean(agent.Q_targets), eps, time_taken))
        if i_episode % 100 == 0:
            print(
                'Episode {}\tAverage Score: {:.2f}\tScore: {:.2f}\tState: {}\tMean Q-Target: {:.4f}\tTime Taken: {:.2f} sec '.format(
                    i_episode, np.mean(scores_window), score, state[0], torch.mean(agent.Q_targets), time_taken))
            torch.save(agent.local_network.model, '/content/save_{}_local_model_{}.pt'.format(env_name, initial_timestamp))
            torch.save(agent.target_network.model, '/content/save_{}_target_model_{}.pt'.format(env_name, initial_timestamp))
        if np.mean(scores_window) >= target_reward:
            consolidation_counter += 1
            if consolidation_counter >= 5:
                print("Completed model training with avg reward {} over last {} episodes."
                                    " Training ran for total of {} epsiodes".format(
                    np.mean(scores_window), 100, i_episode))
                return scores
        else:
            consolidation_counter = 0
    print("Completed model training with avg reward {} over last {} episodes."
                        " Training ran for total of {} epsiodes".format(
        np.mean(scores_window), 100, n_episodes))
    return scores


def play_model(actor, env_render=False, return_render_img=False):
    state = env.reset()
    print("Start state : {}".format(state))
    score = 0
    done = False
    images = []
    R = 0
    t = 0
    while not done:
        if env_render:
            if return_render_img:
                images.append(env.render("rgb_array"))
            else:
                env.render()
        state = np.reshape(state, [-1, env.observation_space.shape[0]])
        action = actor.predict(state)
        next_state, reward, done, _ = env.step(np.argmax(action))
        R += reward
        t += 1
        state = next_state
        score += reward
        if done:
            return score, images
    return 0, images

In [8]:
def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    display(display_animation(anim, default_mode='loop'))

In [None]:
#train
env_name = "MountainCar-v0"
env = gym.make(env_name)
agent = DDQNAgent(env, buffer_size=100000, gamma=0.99, batch_size=64, lr=0.0001, callbacks=[])
scores = train_model(n_episodes=2000, target_reward=-110, eps_decay=0.9)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

  deprecation(
  deprecation(
  deprecation(


Initialising DDQN Agent with params : {'env': <TimeLimit<OrderEnforcing<StepAPICompatibility<PassiveEnvChecker<MountainCarEnv<MountainCar-v0>>>>>>, 'batch_size': 64, 'gamma': 0.99, 'tau': 0.001, 'Q_targets': 0.0, 'state_size': 2, 'action_size': 3, 'callbacks': []}
Initialising Local DQNetwork
Initialising Target DQNetwork
Starting model training for 2000 episodes.
Episode 1	Average Score: -200.00	Score: -200.00	State: -0.5150498747825623	Mean Q-Target: -0.8660	Effective Epsilon: 0.900	Time Taken: 31.93 sec
Episode 2	Average Score: -200.00	Score: -200.00	State: -0.5525309443473816	Mean Q-Target: -1.8293	Effective Epsilon: 0.810	Time Taken: 33.67 sec
Episode 3	Average Score: -200.00	Score: -200.00	State: -0.44720834493637085	Mean Q-Target: -2.7870	Effective Epsilon: 0.729	Time Taken: 36.68 sec
Episode 4	Average Score: -200.00	Score: -200.00	State: -0.4854550361633301	Mean Q-Target: -3.7747	Effective Epsilon: 0.656	Time Taken: 35.48 sec
Episode 5	Average Score: -200.00	Score: -200.00	Stat