# install dependancies, takes around 45 seconds

Rendering Dependancies



In [0]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

CarRacing Dependancies

In [1]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install gym[box2d] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (47.1.1)


# Imports and Helper functions


In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'Colab Notebooks/DQN/Breakout/'

In [4]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f5ef81f94e0>

In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [0]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [0]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17658531919813942574
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 17626257282692675058
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 9513092846313748010
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15701463552
locality {
  bus_id: 1
  links {
  }
}
incarnation: 13614745214897403017
physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"
]


Car Racing DQN

In [0]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import numpy as np
import gym
import random
from collections import deque
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import pickle
import time
import sys

#Preprocessing
def gray_scale(img):
    return np.dot(img, [0.299, 0.587, 0.144]).astype(np.uint8)

def down_sample(img):
    return img[::2, ::2]

def crop(img, top=18, bottom=7, left=0, right=0):
    h, w = img.shape
    return img[top : h - bottom, left : w - right]

def preprocess(state, img):
    img = crop(down_sample(gray_scale(img)))
    state = np.roll(state, -1, axis=2)
    state[:, :, 2] = img
    return state  #.reshape(1, 80, 80, 1)

class ReplayMemory(object):
    def __init__(self, list, max_size):
        self.max_size = max_size
        self.cntr = 0
        self.mem = deque(list, maxlen = self.max_size)

    def store(self, phi_St, action, reward, next_phi_St, done):
        self.mem.append((phi_St, action, reward, next_phi_St, done))
        self.cntr += 1

    def sample(self, batch_size):
        batch_size = min(batch_size, self.cntr)
        return batch_size, random.sample(self.mem, batch_size)

class DQNAgent(object):
    def __init__(self):
        self.model_id = 1
        self.input_shape = (80, 80, 4)
        self.action_count = 4
        self.epsilon_end = 0.01
        self.epsilon_step = 0.996 #0.0009 #0.0001
        self.no_of_episodes = 500 #no of episodes
        self.replay_mem_max_size = 14000 #1000000 #replay memory capacity
        self.batch_size = 32 #16 #do trial-error #replay batch size
        self.skip_frames = 0
        self.gamma = 0.99 #discount factor
        self.C = 5000 #10000
        self.l_rate = 0.0005 #0.00025
        self.discrete_action_ids = [i for i in range(self.action_count)]
        self.epi_score = 0
        self.total_score = []
        self.epi_loss = 0
        self.total_loss = []
        #self.val_loss = []
        self.skip_cntr = 0
        root_dir = "/content/gdrive/My Drive/"
        self.base_dir = root_dir + 'Colab Notebooks/DQN/Breakout/'
        self.datapath = self.base_dir + "data/model" + str(self.model_id) + "/"
        self.log_file = open(self.datapath + "dqn_log.txt", "a+")
        self.replay_mem_file = self.datapath + "replay_mem_data.file"
        self.epsilon_file = self.datapath + "epsilon.file"
        self.trained_model_file = self.datapath + "trained_model.h5"
        self.latest_weights_file = self.datapath + "latest_weights.h5"  #My Drive/Colab Notebooks/DQN/data/

        try:
            self.replay_mem = pickle.load(open(self.replay_mem_file, "rb"))
            if len(self.replay_mem.mem) > self.replay_mem_max_size:
                self.replay_mem = ReplayMemory(self.replay_mem.mem, self.replay_mem_max_size)
            print("Replay Memory loaded. Length: ", len(self.replay_mem.mem))
        except Exception as e:
            self.replay_mem = ReplayMemory([], self.replay_mem_max_size)
            print("Error in loading Replay Memory. ", e)

        try:
            self.epsilon = pickle.load(open(self.epsilon_file, "rb"))
            print("Epsilon loaded. Value: ", self.epsilon)
        except Exception as e:
            self.epsilon = 1
            print("Error in loading Epsilon. ", e)

    def create_model(self):
        try:
            self.model = tfk.models.load_model(self.trained_model_file)
            print("Trained model loaded.")
        except Exception as e:
            print("Error in loading model. ", e)
            self.model = tfk.models.Sequential([
            tfkl.Conv2D(16, (8, 8), strides = 4, activation = 'relu', input_shape=self.input_shape), #elu or relu
            tfkl.Conv2D(32, (4, 4), strides = 2, activation = 'relu'),
            tfkl.Conv2D(32, (3, 3), strides = 1, activation = 'relu'),
            tfkl.Flatten(),
            tfkl.Dense(256, activation='relu'),
            tfkl.Dense(self.action_count) #, activation='softmax')
            ])

            #self.model.compile(optimizer=tfk.optimizers.RMSprop(learning_rate=self.l_rate, momentum=0.95), loss='mse')
            self.model.compile(optimizer=tfk.optimizers.Adam(learning_rate=self.l_rate), loss='mse')
                                                
            try:
                self.model.load_weights(self.latest_weights_file)
                print("Latest weights loaded.")
            except Exception as e:
                print("Error in loading Latest weights. ", e)

        self.new_weights = self.old_weights = self.model.get_weights()

    def take_skip_action(self, phi_St):
        if self.skip_cntr % self.skip_frames != 0:
            next_x, reward, done, info = env.step(self.skip_action)
            self.epi_score += reward
            self.skip_cntr += 1
            skipped = True
        else:
            next_x = None
            reward = 0
            done = False
            info = None
            self.skip_cntr = 0
            skipped = False
        return preprocess(phi_St, next_x), reward, done, info, skipped

    def take_e_greedy_action(self, phi_St):
        if random.uniform(0, 1) < self.epsilon:
            action = np.random.choice(self.discrete_action_ids)
        else:
            self.model.set_weights(self.new_weights)
            actions = self.model.predict(phi_St[np.newaxis, : ])
            action = np.argmax(actions)
           # print(actions)
           # print(action)
        self.skip_action = action
        next_x, reward, done, info = env.step(action)
        self.epi_score += reward
        return preprocess(phi_St, next_x), action, reward, done, info
    
    def take_predicted_action(self, phi_St):
        actions = self.model.predict(current_state)
        action = np.argmax(actions)
        next_x, reward, done, info = env.step(self.action)
        self.epi_score += reward
        return preprocess(phi_St, next_x), action, reward, done, info

    def update_epsilon(self):
        #self.epsilon -= self.epsilon_step
        if self.epsilon > self.epsilon_end:
            self.epsilon = self.epsilon * self.epsilon_step 
        else:
            self.epsilon = self.epsilon_end        

    def store_transition(self, phi_St, action, reward, next_phi_St, done):
        self.replay_mem.store(phi_St, action, reward, next_phi_St, float(done))

    def sample_transition(self):
        batch_size, samples = self.replay_mem.sample(self.batch_size)
        phi_Sts, actions, rewards, next_phi_Sts, dones = zip(*samples)
        phi_Sts = np.stack(phi_Sts)
        actions = np.stack(actions)
        rewards = np.stack(rewards)
        next_phi_Sts = np.stack(next_phi_Sts)
        dones = np.stack(dones)
        return batch_size, phi_Sts, actions, rewards, next_phi_Sts, dones

    def optimize_loss(self, batch_size, phi_Sts, actions, rewards, next_phi_Sts, dones):
        self.model.set_weights(self.old_weights)
        phi_Sts = phi_Sts.astype(np.float32) /255.0
        next_phi_Sts = next_phi_Sts.astype(np.float32) / 255.0
        q_vals = self.model.predict(next_phi_Sts)
        q_vals = np.max(q_vals, axis=1)
        batch_list = np.arange(batch_size, dtype=np.int32)
        self.model.set_weights(self.new_weights)
        target_q_vals = self.model.predict(phi_Sts)
        target_q_vals[batch_list, actions] = rewards + (1 - dones) * self.gamma * q_vals
        self.old_weights = self.new_weights
        #print("\ntarget \n", target_q_vals)
        #print("\npredict \n", self.model.predict(phi_Sts))
        #print("prediction states ", phi_Sts)
        #print("prediction next states ", self.model.predict(next_phi_Sts))
        
        hist = self.model.fit(phi_Sts, target_q_vals, verbose=0)
        self.new_weights = self.model.get_weights()
        self.epi_loss += np.sum(hist.history['loss'])
        #self.val_loss.append(np.sum(hist.history['val_loss']))

    def reset_target_params(self, epochs):
        if epochs % self.C == 0:
            self.old_weights = self.new_weights

    def end_episode(self):
        self.total_score.append(self.epi_score)
        self.total_loss.append(self.epi_loss)
        self.epi_score = 0
        self.epi_loss = 0

    def print_status(self, episode, elapsed_time):
        str = "\nepisode: {}  score: {:.5f}  loss: {:.5f}  epsilon: {:.3f}  {:.2f}s {}"
        str = str.format(episode, self.epi_score, self.epi_loss, self.epsilon, \
                         elapsed_time, time.strftime("%Y-%m-%d %H:%M:%S"))
        self.log_file.write(str)
        print(str)

    def close(self):
        self.log_file.close()

    def save_partial_status(self, episode):
        if episode % 10 == 0 or episode == self.no_of_episodes:
            agent.model.save(self.trained_model_file)
            self.log_file.close()
            self.log_file = open(self.datapath + "dqn_log.txt", "a+")
            self.model.save_weights(self.latest_weights_file)
            with open(self.epsilon_file, "wb") as f:
                pickle.dump(self.epsilon, f, pickle.HIGHEST_PROTOCOL)
           # with open(self.replay_mem_file, "wb") as f:
           #     pickle.dump(self.replay_mem, f, pickle.HIGHEST_PROTOCOL)
            print("Mem length: ", len(self.replay_mem.mem), "Size: ", sys.getsizeof(self.replay_mem.mem))
    
    def plot(self):
        if len(self.total_loss) > 1:
            N = np.arange(0, len(self.total_loss))
            plt.figure()
            plt.plot(N, self.total_loss, label = "train_loss")
            #plt.plot(N, self.val_loss, label = "val_loss")
            plt.title("Training Loss")
            plt.xlabel("Epoch #")
            plt.ylabel("Loss")
            plt.legend()
            plt.savefig(self.datapath + 'LossPlot.png')
            plt.close()

        if len(self.total_score) > 1:
            N = np.arange(0, self.no_of_episodes)
            plt.figure()
            plt.plot(N, self.total_score, label = "reward")
            plt.title("")
            plt.xlabel("Episode #")
            plt.ylabel("Reward")
            plt.legend()
            plt.savefig(self.datapath + 'RewardPlot.png')
            plt.close()

TRAINING

In [8]:
agent = DQNAgent()
agent.create_model()

epi_start = 1
epochs = 0
env = gym.make ('BreakoutDeterministic-v4') 
#env = wrap_env(gym.make('CarRacing-v0'))
#print(env.observation_space, env.action_space)

for episode in range(epi_start, agent.no_of_episodes+epi_start):
    start_time = time.time()
    #epsilon = epsilon/2
    done = False
    phi_St = preprocess(np.zeros(agent.input_shape), env.reset())

    while not done:
        epochs += 1

        if agent.skip_frames > 0 :
            phi_St, reward, done, info, skipped = agent.take_skip_action()
            if skipped:
                continue

        next_phi_St, action, reward, done, info = agent.take_e_greedy_action(phi_St)
        agent.store_transition(phi_St, action, reward, next_phi_St, done)
        phi_St = next_phi_St
        batch_size, phi_Sts, actions, rewards, next_phi_Sts, dones = agent.sample_transition()

        agent.optimize_loss(batch_size, phi_Sts, actions, rewards, next_phi_Sts, dones)

        agent.reset_target_params(epochs)

    agent.update_epsilon()
    agent.print_status(episode, time.time() - start_time)
    agent.save_partial_status(episode)
    agent.end_episode()

env.close()
agent.close()
agent.plot()


Replay Memory loaded. Length:  14000
Epsilon loaded. Value:  0.1347935812106403
Trained model loaded.

episode: 1  score: 0.00000  loss: 1.90207  epsilon: 0.134  24.91s 2020-05-29 13:46:29

episode: 2  score: 1.00000  loss: 1.77433  epsilon: 0.134  23.40s 2020-05-29 13:46:52

episode: 3  score: 1.00000  loss: 2.42017  epsilon: 0.133  38.35s 2020-05-29 13:47:30

episode: 4  score: 2.00000  loss: 2.53537  epsilon: 0.133  29.81s 2020-05-29 13:48:00

episode: 5  score: 0.00000  loss: 0.79196  epsilon: 0.132  18.41s 2020-05-29 13:48:19

episode: 6  score: 1.00000  loss: 1.09017  epsilon: 0.132  22.41s 2020-05-29 13:48:41

episode: 7  score: 0.00000  loss: 0.98255  epsilon: 0.131  18.41s 2020-05-29 13:48:59

episode: 8  score: 2.00000  loss: 1.99904  epsilon: 0.131  46.80s 2020-05-29 13:49:46

episode: 9  score: 5.00000  loss: 1.92041  epsilon: 0.130  52.94s 2020-05-29 13:50:39

episode: 10  score: 3.00000  loss: 2.21113  epsilon: 0.129  54.09s 2020-05-29 13:51:33
Mem length:  14000 Size:  1

KeyboardInterrupt: ignored

In [0]:

with open(agent.datapath + "train_score.file", "wb") as f:
    pickle.dump(agent.total_score, f, pickle.HIGHEST_PROTOCOL)

In [0]:
with open(agent.replay_mem_file, "wb") as f:
  pickle.dump(agent.replay_mem, f, pickle.HIGHEST_PROTOCOL)

In [0]:
def moving_avg(list_, n):
    cumsum, moving_aves = [0], []
    for i, x in enumerate(list_, 1):
        cumsum.append(cumsum[i-1] + x)
        if i>=n:
            moving_ave = (cumsum[i] - cumsum[i-n])/float(n)
            moving_aves.append(moving_ave)
    return moving_aves

In [0]:
n = 10
dqn_score = agent.datapath + "train_score_1_500.file"
dqn_score1 = agent.datapath + "train_score.file"
graph_file = agent.datapath + "avg_score_10.png"

dqn_score = pickle.load(open(dqn_score, "rb"))
dqn_score1 = pickle.load(open(dqn_score1, "rb"))
dqn_score.extend(dqn_score1)
dqn_score1 = moving_avg(dqn_score, n)

if len(dqn_score) > 1:
    N = np.arange(0, len(dqn_score))
    plt.figure()
    plt.plot(N, dqn_score, label = "dqn_reward")
    plt.title("")
    plt.xlabel("Episode #")
    plt.ylabel("Reward")
    plt.legend()
    plt.savefig(agent.datapath + "score.png")
    plt.close()

    N = np.arange(0, len(dqn_score1))
    plt.figure()
    plt.plot(N, dqn_score1, label = "dqn_reward")
    plt.title("")
    plt.xlabel("Episode #")
    plt.ylabel("Reward")
    plt.legend()
    plt.savefig(graph_file)
    plt.close()

EVALUATE

In [0]:
#agent = DQNAgent()
agent = tfk.models.load_model(base_dir + "data/model1/trained_model.h5")

epochs = 0
env = wrap_env(gym.make('BreakoutDeterministic-v0')) #gym.make ('CarRacing-v0') 

no_of_episodes = 10
input_shape = (80, 80, 4)

for i in range(no_of_episodes):
    done = False
    phi_St = preprocess(np.zeros(input_shape), env.reset())
    env.render()
    R = 0
    while not done:
        epochs += 1
        phi_St = phi_St.reshape((1, 80, 80, 4))
        actions = agent.predict(phi_St)
        print(actions)
        action = np.argmax(actions)
        next_x, reward, done, info = env.step(action)
        env.render()
        phi_St = preprocess(np.squeeze(phi_St, axis=0), next_x)

        R += reward
    print("Reward: ", R) 

[[282.62753 303.73767 289.52042 285.58115]]
[[348.74066 366.84116 355.5993  352.91125]]
[[264.07202 283.79483 270.51065 266.8302 ]]
[[303.43198 317.28268 309.58194 306.49466]]
[[305.07883 316.57483 310.91708 308.1279 ]]
[[318.66592 332.4384  325.0159  321.87387]]
[[349.9464  360.41318 356.2603  353.41336]]
[[367.99576 379.98926 374.77673 371.65506]]
[[380.36905 388.33072 386.74872 384.09674]]
[[269.3271  282.1892  274.8639  272.05002]]
[[294.1326  302.49924 299.37457 297.03754]]
[[273.89996 285.96088 279.38632 276.6568 ]]
[[247.64674 259.28046 252.70856 250.14621]]
[[194.0812  208.57083 198.80632 196.10214]]
[[181.38734 194.92787 185.80168 183.27457]]
[[180.46657 193.93825 184.85837 182.34409]]
[[186.37437 200.28778 190.91081 188.31413]]
[[114.78713 123.34803 117.57096 115.97281]]
[[107.97338  116.02481  110.590385 109.08728 ]]
[[103.2774   110.97772  105.77944  104.341835]]
[[83.459885 88.25606  85.061714 83.91132 ]]
[[83.459885 88.25606  85.061714 83.91132 ]]
[[83.459885 88.25606  85

In [0]:
show_video()