https://levelup.gitconnected.com/dqn-from-scratch-with-tensorflow-2-eb0541151049

https://towardsdatascience.com/solving-lunar-lander-openaigym-reinforcement-learning-785675066197


https://github.com/ranjitation/DQN-for-LunarLander/blob/master/dqn_agent.py


Action space (discrete):
0 - do nothing
1 - fire left engine
2 - fire main (down) engine
3 - fire right engine

In [None]:
# Mount Google Drive at /content/drive. The save paths configured inside
# trainModel point into this mount.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
# %tensorflow_version 1.x

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
import numpy as np
import gym
import time
from collections import deque
import random
import pandas as pd

In [None]:
# Notebook shell commands (IPython `!` syntax): install system packages and
# the box2d-py Python package.
# NOTE(review): box2d-py is presumably needed by the LunarLander environment
# created in trainModel - confirm.
!apt-get install ffmpeg freeglut3-dev xvfb;  # For visualization

!pip install box2d-py;

In [None]:
class DQNAgent():
  """
  Deep Q-Network agent holding an online network and a soft-updated target
  network.

  Parameters
  ----------
  actionSpace: int
    Size of action space (4)

  stateSpace: int
    Size of state space (8)

  Attributes
  ----------
  DQN: <class 'tensorflow.python.keras.engine.sequential.Sequential'>
    Online DQN (the network that is trained and used to act)

  targetDQN: <class 'tensorflow.python.keras.engine.sequential.Sequential'>
    Target DQN used to compute the bootstrapped targets

  bestDQN: <class 'tensorflow.python.keras.engine.sequential.Sequential'>
    Snapshot of the best model up to the current step in training.
    None until setBestModel is called for the first time.

  epsilon: float
    Exploration factor (probability of taking a random action)

  TAU: float
    Soft update factor

  gamma: float
    Discount rate

  learningRate: float
    Training learning rate

  Methods
  -------
  policy(state)
    Epsilon-greedy action selection.
  buildDQN()
    Builds and compiles a Q-network.
  train(batch, step)
    Fits the online network on one batch of transitions.
  softUpdate()
    Moves the target network towards the online network by a factor of TAU.
  saveModel(modelPath)
    Placeholder; persistence is currently disabled.
  setBestModel()
    Stores a snapshot of the online network's weights in bestDQN.
  """
  def __init__(self,actionSpace,stateSpace):
    """
    Parameters
    ----------
    actionSpace: int
      Size of action space (4)

    stateSpace: int
      Size of state space (8)
    """
    self.actionSpace = actionSpace
    self.stateSpace = stateSpace

    self.epsilon = 0.05
    self.TAU = 0.001
    self.gamma = 0.99
    self.learningRate = 0.0005

    self.DQN = self.buildDQN()
    self.targetDQN = self.buildDQN()
    self.bestDQN = None

  def policy(self,state):
    """
    Epsilon-greedy policy: with probability epsilon a uniformly random action
    is taken, otherwise the action with the highest predicted Q-value.

    Parameters
    ----------
    state: np.ndarray
      The current state (a single, un-batched observation)

    Returns
    -------
    action: int
      Index of the action to take given the input state
    """
    if random.random() < self.epsilon:
      # Exploration does not need the network, so skip the tensor conversion.
      return np.random.randint(self.actionSpace)

    # Add a leading batch dimension before querying the network.
    stateInput = tf.convert_to_tensor(state[None,:],dtype=tf.float32)
    qActions = self.DQN(stateInput)
    return np.argmax(qActions.numpy()[0],axis=0)

  def buildDQN(self):
    """
    Builds the deep Q network: three ReLU hidden layers (128, 64, 32 units)
    followed by a linear layer with one output per action.

    Returns
    ------
    DQN: <class 'tensorflow.python.keras.engine.sequential.Sequential'>
      The compiled model (Adam optimizer, mean squared error loss)
    """
    DQN = Sequential()
    DQN.add(Dense(128,input_dim=self.stateSpace,activation='relu',kernel_initializer='glorot_uniform'))
    DQN.add(Dense(64,activation='relu',kernel_initializer='glorot_uniform'))
    DQN.add(Dense(32,activation='relu',kernel_initializer='glorot_uniform'))
    DQN.add(Dense(self.actionSpace,activation='linear', kernel_initializer='glorot_uniform'))
    DQN.compile(optimizer=Adam(self.learningRate),loss='mse')
    return DQN

  def train(self,batch,step):
    """
    A training step: fits the online network on one batch of transitions.

    Parameters
    ----------
    batch: list
      Batch of frames. Each frame is a sequence of (state, nextState, reward,
      action, done)

    step: int
      Step of total training steps (currently unused, kept for the callers)

    Returns
    -------
    loss: list
      The 'loss' entry of the Keras fit history for this training step.
      An empty list is returned when the batch is empty.
    """
    if not batch:
      return []

    states, nextStates, rewards, actions, dones = zip(*batch)
    stateBatch = np.array(states)
    nextStateBatch = np.array(nextStates)
    actionBatch = np.array(actions,dtype=int)
    rewardBatch = np.array(rewards,dtype=np.float64)
    doneBatch = np.array(dones,dtype=bool)

    # Start from the current predictions so that only the Q-value of the
    # action actually taken contributes to the loss.
    targetQ = np.copy(self.DQN(stateBatch))
    nextQMax = np.amax(self.targetDQN(nextStateBatch),axis=1)

    # Terminal transitions use the raw reward; all others add the discounted
    # best next-state value estimated by the target network.
    actionTargets = np.where(doneBatch,
                             rewardBatch,
                             rewardBatch + self.gamma * nextQMax)
    targetQ[np.arange(len(batch)),actionBatch] = actionTargets

    history = self.DQN.fit(x=stateBatch,
                           y=targetQ,
                           verbose=0,
                           batch_size=None)
    return history.history['loss']

  def softUpdate(self):
    """
    Performs a soft update to the targetDQN:
    target <- TAU * online + (1 - TAU) * target, applied per weight array.
    """
    # Blend array by array; wrapping the whole weight list in one object
    # ndarray can fail when layer shapes share a leading dimension.
    blendedWeights = [
      self.TAU * onlineW + (1.0 - self.TAU) * targetW
      for onlineW, targetW in zip(self.DQN.get_weights(),
                                  self.targetDQN.get_weights())
    ]
    self.targetDQN.set_weights(blendedWeights)

  def saveModel(self,modelPath):
    """
    Placeholder for model persistence. Currently does nothing.

    Parameters
    ----------
    modelPath: string
      Path to save model to
    """
    # TODO: persistence is disabled. The previous (commented-out) version
    # wrote bestDQN.to_json() to modelPath + "model.json" and saved the
    # weights to modelPath + 'DQNWeights.tf' with save_format='tf'.
    return None

  def setBestModel(self):
    """
    Stores a snapshot of the online model's current weights in bestDQN.

    A separate network is used so that further training of the online model
    does not change the stored best model.
    """
    if self.bestDQN is None:
      self.bestDQN = self.buildDQN()
    self.bestDQN.set_weights(self.DQN.get_weights())

In [None]:
class ReplayBuffer():
  """
  Replay buffer class
  Saves training data in a bounded, first-in-first-out store

  Parameters
  ----------
  maxLen: int
    Maximum number of frames kept; the oldest frames are dropped first
    (default 10000)

  Attributes
  ----------
  gameplayExperiences: collections.deque
    Stored frames of gameplay, each a tuple of
    (state, nextState, reward, action, done)

  Methods
  -------
  storeGamePlay(state,nextState,reward,action,done)
    Appends one frame of gameplay to gameplayExperiences

  batchGamePlay(batchSize=32)
    Draws a random batch of frames from gameplayExperiences
  """
  def __init__(self,maxLen=10000):
    self.gameplayExperiences = deque(maxlen=maxLen)

  def storeGamePlay(self,state,nextState,reward,action,done):
    """
    Stores a frame of gameplay in gameplayExperiences

    Parameters
    ----------
    state: numpy ndarray
      The state of the environment
    nextState: numpy ndarray
      The state of the environment following the current action
    reward: float
      Reward for taking the current action in the current state
    action: int
      Action taken in state
    done: boolean
      True if episode completed
      False if the episode is still running
    """
    self.gameplayExperiences.append((state,nextState,reward,action,done))

  def batchGamePlay(self,batchSize=32):
    """
    Draws a random batch (without replacement) from gameplayExperiences

    Parameters
    ----------
    batchSize: int
      Maximum number of frames to draw (default 32). Fewer frames are
      returned when the buffer holds fewer than batchSize frames.

    Returns
    ------
    batch: list
      A batch of gameplay frames
    """
    batchSize = min(batchSize, len(self.gameplayExperiences))
    # random.sample already returns a new list, so no extra copy is needed.
    return random.sample(self.gameplayExperiences, batchSize)

In [None]:
def evaluateDQN(env, agent, numberOfEpisodes):
  """
  Runs numberOfEpisodes test episodes with the agent's policy and returns the
  mean total reward per episode.

  Parameters
  ----------
  env:
    Environment exposing reset() -> observation and
    step(action) -> (observation, reward, done, info)

  agent:
    Agent to test; must expose policy(observation) -> action

  numberOfEpisodes: int
    Number of episodes to evaluate the agent on (at least 1)

  Returns
  -------
  meanReward: float
    Mean reward over the test episodes

  Raises
  ------
  ValueError
    If numberOfEpisodes is less than 1
  """
  if numberOfEpisodes < 1:
    raise ValueError('numberOfEpisodes must be at least 1')

  rewardSum = 0.0
  for _ in range(numberOfEpisodes):
    obs = env.reset()
    done = False
    episodeReward = 0.0
    while not done:
      obs, reward, done, _info = env.step(agent.policy(obs))
      episodeReward += reward
    rewardSum += episodeReward
  return rewardSum / numberOfEpisodes

In [None]:
def collectGamePlay(env,agent,buffer):
  """
  Plays one full episode and records every transition in the buffer.

  Starting from env.reset(), the agent's policy picks an action for each
  state, the environment is stepped, and the resulting
  (state, nextState, reward, action, done) frame is stored. The function
  returns once the environment reports the episode as done.
  """
  current = env.reset()
  while True:
    chosen = agent.policy(current)
    following,reward,finished,_ = env.step(chosen)
    buffer.storeGamePlay(current,following,reward,chosen,finished)
    if finished:
      break
    current = following

def trainModel(numSteps, evaluationInterval):
  """
  Trains a DQN agent on LunarLander-v2 for a user defined number of steps.

  Each step plays one full episode into the replay buffer, fits the online
  network on one sampled batch and soft-updates the target network. Every
  evaluationInterval steps (starting at step 0) the agent is evaluated over
  5 episodes; training stops early once the mean reward reaches 200.

  Parameters
  ----------
  numSteps: int
    Number of steps to train model - 100000 recommended

  evaluationInterval: int
    Frequency of validation testing, in steps (at least 1)

  Returns
  -------
  rewardData: pandas.DataFrame
    One row per evaluation with columns 'meanReward' and 'episodeNumber'
    (the step at which the evaluation ran)

  Raises
  ------
  ValueError
    If evaluationInterval is less than 1
  """
  if evaluationInterval < 1:
    raise ValueError('evaluationInterval must be at least 1')

  #paths for model saving
  modelPath = '/content/drive/My Drive/Github/RLProject/SavedModels/'
  checkpointPath = '/content/drive/My Drive/Github/RLProject/SavedModels/DQNCheckpoint'

  #Initialise lunar lander environment
  env = gym.make('LunarLander-v2')
  env.seed(0)
  #Find size of state and action space
  stateSpace = env.observation_space.shape[0]
  actionSpace = env.action_space.n
  #Init agent and replay buffer
  agent = DQNAgent(actionSpace, stateSpace)
  buffer = ReplayBuffer()

  # Evaluations are collected as rows and turned into a frame at the end, so
  # the log can never run out of pre-allocated rows.
  evaluationLog = []
  bestReward = -np.inf

  for step in range(numSteps):
    if step % 1000 == 0:
      print(f'Step num:{step}')

    #Fill buffer and train model
    collectGamePlay(env, agent, buffer)
    agent.train(buffer.batchGamePlay(), step)
    #Perform soft update
    agent.softUpdate()

    if step % evaluationInterval == 0:
      meanReward = evaluateDQN(env, agent, 5)
      evaluationLog.append((meanReward, step))
      print('validating')

      #Threshold of completion according to open ai gym
      #Stop training at 200 reward
      if meanReward >= 200:
        agent.setBestModel()
        agent.saveModel(modelPath)
        print('Reward 200 reached - training complete')
        print('Best model saved')
        break

      #keep track of the best model seen so far
      if meanReward > bestReward:
        agent.setBestModel()
        bestReward = meanReward
        print('New best score')

    #save for checkpoint if there is a crash
    if step % 5000 == 0:
      agent.saveModel(checkpointPath + f'/Checkpoint@step{step}')
  else:
    # Step budget used up without reaching the reward threshold:
    # save the best model found (the early-stop path has already saved).
    agent.saveModel(modelPath)

  return pd.DataFrame(evaluationLog, columns=['meanReward', 'episodeNumber'])

In [None]:
if __name__ == "__main__":
  # Script entry point: train for 100000 steps, validating every 500 steps.
  trainModel(numSteps=100000, evaluationInterval=500)