In [None]:
import gym
import matplotlib.pyplot as plt
import numpy as np
from gym.spaces.box import Box
from gym.core import Wrapper
from gym.core import ObservationWrapper
from PIL import Image
import tensorflow as tf
from keras.layers import Conv2D,Dense,Flatten
import keras
from tqdm import trange
from pandas import DataFrame
import dill
import getopt,sys
import pandas as pd
from IPython.display import clear_output
import random
import cv2
# Start from an empty default graph and open an interactive session;
# later code obtains the session through tf.get_default_session().
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Scratch list that is created and immediately emptied; `a` is not
# referenced again in the visible notebook.
a=[2]
a.clear()

In [None]:

class FrameBuffer(Wrapper):
    """Gym wrapper that stacks the latest `n_frames` observations along the channel axis.

    The newest frame is always placed first on the channel axis and the oldest
    frame is dropped, so the stacked observation has a constant shape.
    """

    def __init__(self, env, n_frames=4, dim_order='tensorflow'):
        """Wrap `env` and allocate an all-zero frame stack.

        env: wrapped environment; its observation_space.shape must be
            (height, width, channels) for 'tensorflow' or
            (channels, height, width) for 'pytorch'.
        n_frames: how many consecutive observations are kept in the stack.
        dim_order: 'tensorflow' (channels last) or 'pytorch' (channels first);
            anything else raises ValueError.
        """
        super(FrameBuffer, self).__init__(env)
        self.dim_order = dim_order
        if dim_order not in ('tensorflow', 'pytorch'):
            raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order))

        frame_shape = env.observation_space.shape
        if dim_order == 'tensorflow':
            height, width, n_channels = frame_shape
            obs_shape = [height, width, n_channels * n_frames]
        else:
            n_channels, height, width = frame_shape
            obs_shape = [n_channels * n_frames, height, width]

        self.observation_space = Box(0.0, 1.0, obs_shape)
        self.framebuffer = np.zeros(obs_shape, 'float32')

    def reset(self):
        """Reset the wrapped env, clear the stack and return it with the first frame inserted."""
        self.framebuffer = np.zeros_like(self.framebuffer)
        first_frame = self.env.reset()
        self.update_buffer(first_frame)
        return self.framebuffer

    def step(self, action):
        """Advance the wrapped env by one step; returns (frame stack, reward, done, info)."""
        frame, reward, done, info = self.env.step(action)
        self.update_buffer(frame)
        return self.framebuffer, reward, done, info

    def update_buffer(self, img):
        """Push `img` to the front of the stack and drop the oldest frame.

        A new array is built on every call (np.concatenate), so a previously
        returned stack is never modified afterwards.
        """
        frame_shape = self.env.observation_space.shape
        if self.dim_order == 'tensorflow':
            axis, n_new = -1, frame_shape[-1]
            older_frames = self.framebuffer[:, :, :-n_new]
        elif self.dim_order == 'pytorch':
            axis, n_new = 0, frame_shape[0]
            older_frames = self.framebuffer[:-n_new]
        self.framebuffer = np.concatenate([img, older_frames], axis=axis)



# ReplayBuffer is adapted from OpenAI Baselines: https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

class ReplayBuffer(object):
    """Fixed-capacity ring buffer of (obs_t, action, reward, obs_tp1, done) transitions."""

    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def __len__(self):
        """Number of transitions currently stored (never more than `size`)."""
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest slot once the buffer is full."""
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        # Wrap around so that writes cycle through the slots in insertion order.
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at `idxes` into five stacked numpy arrays."""
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray converts without copying when it can. The former
            # np.array(..., copy=False) raises ValueError on NumPy >= 2.0
            # whenever a copy is unavoidable (e.g. for scalar actions).
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            rewards.append(reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)

    def sample(self, batch_size):
        """Sample a batch of experiences (uniformly, with replacement).
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        Raises
        ------
        ValueError
            if a non-empty batch is requested while the buffer is empty.
        """
        if batch_size > 0 and not self._storage:
            raise ValueError("cannot sample from an empty ReplayBuffer")
        last = len(self._storage) - 1
        idxes = [random.randint(0, last) for _ in range(batch_size)]
        return self._encode_sample(idxes)



class PreprocessAtari(ObservationWrapper):
    """Observation wrapper that crops, resizes, grayscales and rescales each frame."""

    def __init__(self, env):
        """Wrap `env` and declare a (64, 64, 1) observation space bounded by 0.0 and 1.0."""
        super(PreprocessAtari, self).__init__(env)

        self.img_size = (64, 64)
        side_a, side_b = self.img_size
        self.observation_space = Box(0.0, 1.0, (side_a, side_b, 1))

    def observation(self, img):
        """Convert one raw frame into a float32 array with a single channel.

        Steps: drop the top 34 and bottom 16 rows, resize to `self.img_size`,
        average over the last (channel) axis, then divide by 255.
        Assumes `img` is an (H, W, C) image with 0..255 values -- TODO confirm.
        """
        cropped = img[34:-16, :, :]
        resized = cv2.resize(cropped, self.img_size)
        gray = resized.mean(-1, keepdims=True)
        return gray.astype('float32') / 255.


def makeAtarienv(game:str,n_frames=4):
    """Create a gym environment wrapped with PreprocessAtari and FrameBuffer.

    game: environment id passed to gym.make.
    n_frames: number of consecutive preprocessed frames stacked by FrameBuffer
        (previously this argument was ignored and 4 was always used).
    Returns the wrapped environment (channels-last frame stack).
    """
    env = gym.make(game)
    env = PreprocessAtari(env)
    env = FrameBuffer(env, n_frames=n_frames, dim_order='tensorflow')
    return env

class DQNNetwork:
    """
    Convolutional Q-value network with an epsilon-greedy action sampler.
    """
    def __init__(self, name, state_shape, n_actions, epsilon=0, reuse=False):
        """Build the Keras model and a placeholder/q-value pair inside variable scope `name`.

        name: variable scope; also used to collect the trainable weights.
        state_shape: shape of a single observation (without the batch axis).
        n_actions: size of the output layer (one q-value per action).
        epsilon: exploration probability used by sample_actions.
        reuse: forwarded to tf.variable_scope.
        """
        with tf.variable_scope(name, reuse=reuse):
            layers = [
                Conv2D(16, (3, 3), strides=2, activation='relu', input_shape=state_shape),
                Conv2D(32, (3, 3), strides=2, activation='relu'),
                Conv2D(64, (3, 3), strides=2, activation='relu'),
                Conv2D(128, (3, 3), strides=2, activation='relu'),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(256, activation='relu'),
                Dense(128, activation='relu'),
                Dense(n_actions, activation='linear'),
            ]
            self.network = keras.models.Sequential()
            for layer in layers:
                self.network.add(layer)

            # Graph used for acting: batch of states in, q-values out.
            self.state_t = tf.placeholder('float32', [None] + list(state_shape))
            self.qvalues_t = self.get_symbolic_qvalues(self.state_t)

        self.epsilon = epsilon
        self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)

    def get_symbolic_qvalues(self, state_t):
        """Apply the model to a tf Tensor of states and return the q-value Tensor."""
        return self.network(state_t)

    def get_qvalues(self, state_t):
        """Numpy counterpart of get_symbolic_qvalues; runs in the default session."""
        session = tf.get_default_session()
        feed = {self.state_t: state_t}
        return session.run(self.qvalues_t, feed)

    def sample_actions(self, qvalues):
        """Epsilon-greedy choice: one action per row of `qvalues`.

        With probability `self.epsilon` a row gets a uniformly random action,
        otherwise the argmax of its q-values.
        """
        n_states, n_actions = qvalues.shape
        explore_prob = self.epsilon
        random_picks = np.random.choice(n_actions, size=n_states)
        greedy_picks = qvalues.argmax(axis=-1)
        explore_mask = np.random.choice([0, 1], n_states, p=[1 - explore_prob, explore_prob])
        return np.where(explore_mask, random_picks, greedy_picks)


class DQNAgent:
    """DQN agent: online network, target network, replay buffers and a TD-loss training graph.

    A default TensorFlow session must exist before construction, because the
    constructor initialises variables through tf.get_default_session().
    """

    def __init__(self,env,epsilon=0,gamma=0.99,decrease_epsilon=0.99,load_sess=False,file="",GPU_enable=False,global_step=1,reuse=False):
        """Build both networks and the training graph, then initialise all variables.

        env: environment exposing observation_space.shape and action_space.n.
        epsilon: exploration probability handed to both networks.
        gamma: discount factor used in the TD target.
        decrease_epsilon: factor the online network's epsilon is multiplied by
            in `fit` each time the target network is refreshed.
        load_sess: when True, restore variables from the checkpoint
            "<file>-<global_step>" after initialisation.
        file, global_step: checkpoint location used when load_sess is True.
        GPU_enable: place the graph on "/device:GPU:0" instead of "/cpu:0".
        reuse: accepted for backward compatibility; not used.
        """
        self.env = env
        print(env)
        self.state_shape = env.observation_space.shape
        self.n_actions = self.env.action_space.n
        self.epsilon = epsilon
        self.gamma = gamma
        self.decrease_epsilon = decrease_epsilon
        self.exp_replay = ReplayBuffer(50000)
        self.latest_exp_reply = ReplayBuffer(300)

        device = "/device:GPU:0" if GPU_enable else "/cpu:0"
        with tf.device(device):
            self.network = DQNNetwork("network", self.state_shape, self.n_actions, self.epsilon)
            self.target_network = DQNNetwork("target_network", self.state_shape, self.n_actions, self.epsilon)

            # Shapes come from the agent's own env rather than notebook globals,
            # so the class works regardless of which cells have been run.
            batch_shape = (None,) + tuple(self.state_shape)
            self.obs_ph = tf.placeholder(tf.float32, shape=batch_shape, name="state")
            self.actions_ph = tf.placeholder(tf.int32, shape=[None], name="actions")
            self.rewards_ph = tf.placeholder(tf.float32, shape=[None], name="rewards")
            self.next_obs_ph = tf.placeholder(tf.float32, shape=batch_shape, name="next_ss")
            self.is_done_ph = tf.placeholder(tf.float32, shape=[None], name="done")

            is_not_done = 1 - self.is_done_ph

            # Q(s, a) of the online network for the actions that were taken.
            current_qvalues = self.network.get_symbolic_qvalues(self.obs_ph)
            current_action_qvalues = tf.reduce_sum(
                tf.one_hot(self.actions_ph, self.n_actions) * current_qvalues, axis=1)

            # max_a' Q_target(s', a'); zeroed for terminal transitions.
            next_qvalues_target = self.target_network.get_symbolic_qvalues(self.next_obs_ph)
            next_state_values_target = tf.reduce_max(next_qvalues_target, axis=-1)
            reference_qvalues = self.rewards_ph + self.gamma * next_state_values_target * is_not_done

            # Mean squared TD error; only the online network's weights are trained.
            td_loss = (current_action_qvalues - reference_qvalues) ** 2
            self.td_loss = tf.reduce_mean(td_loss)
            self.train_step = tf.train.AdamOptimizer(1e-3).minimize(self.td_loss, var_list=self.network.weights)

        # Create the target-sync ops once. Creating them on every sync would
        # add new nodes to the graph each time.
        self._sync_target_ops = [
            tf.assign(w_target, w_agent, validate_shape=True)
            for w_agent, w_target in zip(self.network.weights, self.target_network.weights)
        ]

        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        if load_sess == True:
            self.load(file, global_step)

    def load(self,file:str,global_step=1):
        """Restore variables from the checkpoint "<file>-<global_step>" into the default session."""
        sess = tf.get_default_session()
        saver = tf.train.Saver()
        saver.restore(sess, file + "-" + str(global_step))

    def save(self,file:str,global_step=1):
        """Save variables of the default session to `file` with the given global_step."""
        sess = tf.get_default_session()
        saver = tf.train.Saver()
        saver.save(sess, file, global_step=global_step)

    def evaluate(self,n_games=1, greedy=False, t_max=10000):
        """Play `n_games` episodes (at most `t_max` steps each) and return the mean total reward.

        greedy: take the argmax action; otherwise use the online network's
            epsilon-greedy sampler.
        """
        episode_returns = []
        for _game in range(n_games):
            s = self.env.reset()
            total_reward = 0
            for _step in range(t_max):
                qvalues = self.network.get_qvalues([s])
                if greedy:
                    action = qvalues.argmax(axis=-1)[0]
                else:
                    action = self.network.sample_actions(qvalues)[0]
                s, r, done, _info = self.env.step(action)
                total_reward += r
                if done:
                    break
            episode_returns.append(total_reward)
        return np.mean(episode_returns)

    def play_and_record(self, n_steps=1):
        """Play `n_steps` steps from the env's current frame stack and store every transition.

        Starts from self.env.framebuffer, so the env must already have been
        reset. Each transition goes into both replay buffers; the env is reset
        whenever an episode ends. Returns the sum of rewards collected.
        """
        s = self.env.framebuffer
        reward = 0.0
        for _step in range(n_steps):
            qvalues = self.network.get_qvalues([s])
            action = self.network.sample_actions(qvalues)[0]
            next_s, r, done, _info = self.env.step(action)

            self.exp_replay.add(s, action, r, next_s, done)
            self.latest_exp_reply.add(s, action, r, next_s, done)
            reward += r
            s = self.env.reset() if done else next_s
        return reward

    def load_weigths_into_target_network(self):
        """Copy the online network's weights into the target network."""
        tf.get_default_session().run(self._sync_target_ops)

    def sample_batch(self, batch_size,latest=False):
        """Sample a batch and return it as a feed dict for the training placeholders.

        latest: draw from the small recent-transitions buffer instead of the
            main replay buffer.
        """
        source = self.exp_replay if latest == False else self.latest_exp_reply
        obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = source.sample(batch_size)
        return {
            self.obs_ph: obs_batch,
            self.actions_ph: act_batch,
            self.rewards_ph: reward_batch,
            self.next_obs_ph: next_obs_batch,
            self.is_done_ph: is_done_batch,
        }

    @staticmethod
    def _moving_average(x, span, **kw):
        """Exponentially weighted moving average of `x` (pandas ewm), as a numpy array."""
        return DataFrame({'x': np.asarray(x)}).x.ewm(span=span, **kw).mean().values

    @staticmethod
    def _plot_progress(mean_rw_history, td_loss_history):
        """Draw reward history and smoothed TD loss side by side in one figure."""
        plt.figure(figsize=[48, 4])
        plt.subplot(1, 2, 1)
        plt.title("mean reward per game")
        plt.plot(mean_rw_history)
        plt.grid()

        plt.subplot(1, 2, 2)
        plt.title("TD loss history (moving average)")
        plt.plot(DQNAgent._moving_average(np.array(td_loss_history), span=100, min_periods=100))
        plt.grid()
        plt.show()

    def fit(self,t_max=10**5):
        """Run up to `t_max` training iterations.

        Each iteration plays 600 env steps and performs one gradient step on a
        batch of 64 transitions. Every 500 iterations the target network is
        refreshed, epsilon is decayed (floor 0.01) and the agent is evaluated;
        training stops early once the mean of the last 10 evaluations exceeds
        60. Every 100 iterations the progress plots are redrawn.
        """
        mean_rw_history = []
        td_loss_history = []
        sess = tf.get_default_session()
        for i in trange(t_max):
            # play
            self.play_and_record(600)

            # train
            _, loss_t = sess.run([self.train_step, self.td_loss], self.sample_batch(batch_size=64))
            td_loss_history.append(loss_t)

            # adjust agent parameters
            if i % 500 == 0:
                self.load_weigths_into_target_network()
                self.network.epsilon = max(self.network.epsilon * self.decrease_epsilon, 0.01)
                mean_rw_history.append(self.evaluate(n_games=3))

                if np.mean(mean_rw_history[-10:]) > 60:
                    print('Should be ok')
                    break

            if i % 100 == 0:
                clear_output(True)
                print("buffer size = %i, epsilon = %.5f" % (len(self.exp_replay), self.network.epsilon))
                # Fail before plotting so no half-drawn figure is left behind.
                assert not np.isnan(loss_t)
                self._plot_progress(mean_rw_history, td_loss_history)
                # Keep the loss history bounded.
                if len(td_loss_history) > 10000:
                    td_loss_history.clear()






def show(mean_rw_history,td_loss_history):
    """Plot the reward history and the smoothed TD-loss history side by side.

    mean_rw_history: sequence of mean evaluation rewards, plotted as is.
    td_loss_history: sequence of TD losses, plotted as an exponentially
        weighted moving average (span=100, min_periods=100).
    """
    def _ewm_mean(values, span, **kw):
        # Exponentially weighted mean via pandas, returned as a numpy array.
        frame = DataFrame({'x': np.asarray(values)})
        return frame.x.ewm(span=span, **kw).mean().values

    plt.figure(figsize=[12, 4])

    # Left panel: raw evaluation rewards.
    plt.subplot(1, 2, 1)
    plt.title("mean reward per game")
    plt.plot(mean_rw_history)
    plt.grid()

    # Right panel: smoothed TD loss.
    plt.subplot(1, 2, 2)
    plt.title("TD loss history (moving average)")
    smoothed_loss = _ewm_mean(td_loss_history, span=100, min_periods=100)
    plt.plot(smoothed_loss)
    plt.grid()

    plt.show()

In [None]:
# Build the preprocessed, frame-stacked Breakout environment and reset it once;
# DQNAgent.play_and_record starts from env.framebuffer.
env=makeAtarienv("BreakoutDeterministic-v4")
env.reset()
# Module-level action count and observation shape of the wrapped environment.
n_actions = env.action_space.n
state_dim = env.observation_space.shape
# Start with exploration probability 0.5; GPU_enable=True selects "/device:GPU:0".
agent=DQNAgent(env,epsilon=0.5,GPU_enable=True)

In [None]:
# Exploration probability used by the online network's epsilon-greedy sampler.
agent.network.epsilon=0.5

In [None]:
# Run the training loop with the default t_max (10**5 iterations).
agent.fit()

In [None]:
agent.network.epsilon=0 # Disable exploration for evaluation; restore the previous epsilon before resuming training.

In [None]:
# Run 100 single-game evaluations (evaluate() defaults to n_games=1) and
# print the resulting total rewards.
sessions = [agent.evaluate() for _ in range(100)]
print(np.array(sessions))

In [None]:
s=env.reset()
greedy=True
# Upper bound on rendered steps so the cell terminates and the cells below
# can run; raise it (or re-run the cell) to watch for longer.
max_demo_steps=500

for step_idx in range(max_demo_steps):
    clear_output(True)
    qvalues = agent.network.get_qvalues([s])
    action = qvalues.argmax(axis=-1)[0] if greedy else agent.network.sample_actions(qvalues)[0]
    new_s,r,done,info=env.step(action)
    # Draw the current frame, give the display time to update, then release the figure.
    plt.imshow(env.render('rgb_array'))
    plt.pause(0.1)
    plt.close()
    s=new_s

    if done:
        s=env.reset()

In [None]:
#record sessions
import gym.wrappers
env.reset()
env_monitor = gym.wrappers.Monitor(env,directory="videos",force=True)
agent.env=env_monitor
try:
    sessions = [agent.evaluate(n_games=1) for _ in range(100)]
finally:
    # Close the monitor even if evaluation fails, and stop the agent from
    # holding on to the closed monitor afterwards.
    env_monitor.close()
    agent.env=env

In [None]:
#show video
from IPython.display import HTML
import os

# Sorted so the chosen index is reproducible (os.listdir order is arbitrary).
video_names = sorted(filter(lambda s:s.endswith(".mp4"),os.listdir("./videos/")))
if not video_names:
    raise FileNotFoundError("no .mp4 recordings found in ./videos/")
# Prefer the second recording when there is one; try other indices to see other episodes.
video_index = min(1, len(video_names) - 1)
# Print the same file that is embedded below.
print(video_names[video_index])
HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[video_index]))

In [None]:
# Save the session's variables under ./model.ckpt with global_step=2
# (DQNAgent.load restores from "<file>-<global_step>").
agent.save("./model.ckpt",2)

In [None]:
# List the working directory, e.g. to check which checkpoint files were written.
os.listdir("./")

In [None]:
# Colab-only: ask the browser to download ./sample_data.
# NOTE(review): the checkpoint above is saved as ./model.ckpt -- confirm that
# ./sample_data is really the path meant to be downloaded.
from google.colab import files

files.download('./sample_data') 

In [None]:
# Colab-only: open the file-upload dialog (presumably to bring a saved
# checkpoint back into the runtime -- confirm).
from google.colab import files

files.upload()