In [None]:
!apt install swig cmake libopenmpi-dev zlib1g-dev
!pip install stable-baselines==2.5.1 box2d box2d-kengz



Reading package lists... Done
Building dependency tree       
Reading state information... Done
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2).
zlib1g-dev set to manually installed.
libopenmpi-dev is already the newest version (2.1.1-8).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 59 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,

In [None]:
import random
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from google.colab import files


In [None]:

class DQNAgent:
  """Deep Q-learning agent with experience replay and a soft-updated target network.

  The agent owns two Keras networks with the same architecture: ``model``
  (trained on replayed transitions and used to pick actions) and
  ``targetmodel`` (used to bootstrap the TD target and slowly blended
  towards ``model`` after each training step).
  """

  def __init__(self, environment):
    """Set the hyperparameters, build both networks and create the replay buffer.

    Args:
      environment: object exposing ``observation_space.shape`` and
        ``action_space.n`` / ``action_space.sample()`` (a gym-style env).
    """
    self.env = environment
    # Hyperparameters are assigned before any network is built so that
    # start_model() can read them instead of hard-coding its own copies.
    self.epsilon = 1.0
    self.epsilon_decay = 0.99
    #experiment 1, epsilon decay .999
    #experiment 2, epsilon decay .995
    #experiment 3, epsilon decay .99
    self.alpha = .001        # learning rate handed to the optimizer
    self.gamma = 0.99        # discount factor of the TD target
    self.tau = .001          # soft-update mixing factor for the target network
    self.train_every = 4     # train on every N-th call to replay_memory()
    self.memorySize = 5000
    # Hidden-layer width. This attribute used to be 32 but was never read;
    # the network has always been built with 64 units, so it now records
    # (and drives) the width that is actually used.
    self.layerNodes = 64
    self.batch_size = 64
    #training model
    self.model = self.start_model()
    #target model
    self.targetmodel = self.start_model()
    # Start the target network from the same weights as the online network
    # rather than from an unrelated random initialisation.
    self.targetmodel.set_weights(self.model.get_weights())
    # Alias kept for backward compatibility: a third, never-used network
    # used to be built under this name.
    self.target_model = self.targetmodel
    self.replay = deque(maxlen = self.memorySize)
    self.stepcount = 0

  def start_model(self):
    """Build and compile the Q-network with Keras.

    Two ReLU hidden layers of ``self.layerNodes`` units followed by a linear
    layer with one output per action, trained with mean squared error and
    Adam at learning rate ``self.alpha``.

    Returns:
      The compiled ``Sequential`` model.
    """
    n_inputs = self.env.observation_space.shape[0]
    n_actions = self.env.action_space.n

    model = Sequential()
    model.add(Dense(self.layerNodes, input_dim=n_inputs, activation="relu"))
    model.add(Dense(self.layerNodes, activation="relu"))
    model.add(Dense(n_actions))
    # NOTE(review): `lr=` is the keyword this notebook was written against;
    # newer Keras releases call it `learning_rate` - confirm before upgrading.
    model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.alpha))
    return model

  def update_replay(self, state, action, reward, new_state, done):
    """Append one transition to the replay buffer.

    The buffer is a bounded deque, so the oldest transition is dropped once
    ``memorySize`` entries are stored.
    """
    self.replay.append([state, action, reward, new_state, done])

  def take_action(self, current_state):
    """Pick an action epsilon-greedily.

    Args:
      current_state: observation in the batched form the network expects
        (the caller reshapes it to ``[1, n_features]``).

    Returns:
      A random action with probability ``self.epsilon``, otherwise the index
      of the largest Q-value predicted by ``self.model``.
    """
    if np.random.random() < self.epsilon:
      return self.env.action_space.sample()
    q_values = self.model.predict(current_state, verbose=0)[0]
    return int(np.argmax(q_values))

  def replay_memory(self):
    """Train on a random minibatch and soft-update the target network.

    Work is only done on every ``self.train_every``-th call, and only once
    the buffer holds at least ``self.batch_size`` transitions. For each
    sampled transition the target for the taken action is ``reward`` if the
    transition was terminal, else ``reward + gamma * max_a Q_target(s', a)``;
    the other actions keep the online network's current predictions so they
    contribute no error.
    """
    self.stepcount = (self.stepcount + 1) % self.train_every
    if self.stepcount != 0:
      return
    if len(self.replay) < self.batch_size:
      return

    batch = random.sample(self.replay, self.batch_size)
    # vstack accepts both (1, n) and (n,) shaped states and yields (batch, n).
    states = np.vstack([transition[0] for transition in batch])
    actions = np.array([transition[1] for transition in batch], dtype=int)
    rewards = np.array([transition[2] for transition in batch], dtype=float)
    next_states = np.vstack([transition[3] for transition in batch])
    dones = np.array([transition[4] for transition in batch], dtype=bool)

    # One batched forward pass per network instead of one call per sample.
    next_q = np.asarray(self.targetmodel.predict(next_states, verbose=0))
    targets = np.where(dones, rewards, rewards + self.gamma * next_q.max(axis=1))

    # Copy so the network's returned array is never modified in place.
    batch_forecasts = np.array(self.model.predict(states, verbose=0))
    batch_forecasts[np.arange(len(batch)), actions] = targets
    self.model.fit(states, batch_forecasts, epochs=1, verbose=0)

    # Soft update, layer by layer. The per-layer weight arrays have different
    # shapes, so they cannot be packed into a single ndarray.
    blended = [
      self.tau * online + (1.0 - self.tau) * target
      for online, target in zip(self.model.get_weights(),
                                self.targetmodel.get_weights())
    ]
    self.targetmodel.set_weights(blended)


    
      



In [None]:
def plotchart(xval,yval,xlabel,ylabel,title,filename):
  """Draw a line chart of yval against xval, save it to disk and display it.

  Args:
    xval: x coordinates passed to ``plt.plot``.
    yval: y coordinates passed to ``plt.plot``.
    xlabel: label for the x axis.
    ylabel: label for the y axis.
    title: chart title.
    filename: path handed to ``plt.savefig``.
  """
  plt.plot(xval, yval)

  # Annotate the chart.
  plt.title(title)
  plt.xlabel(xlabel)
  plt.ylabel(ylabel)

  # Write the file first, then display, then close the figure.
  # (In Colab, files.download(filename) could be called here to fetch the image.)
  plt.savefig(filename)
  plt.show()
  plt.close()

# Create the LunarLander task and keep the object returned by `.unwrapped`
# (presumably the bare environment without gym's wrappers - verify against
# the installed gym version).
environment = gym.make('LunarLander-v2').unwrapped

# Reset once, then hand the environment to the agent, which reads its
# observation and action spaces to size the networks.
environment.reset()
agent = DQNAgent(environment = environment)

In [None]:

# Training configuration.
NUM_EPISODES = 3000      # upper bound on training episodes
MAX_STEPS = 1000         # step cap per episode
STATE_SIZE = environment.observation_space.shape[0]
SCORE_WINDOW = 100       # number of most recent episodes averaged for the solved check
SOLVED_SCORE = 200       # stop once the windowed average reaches this value

done = False
scores = []
for e in range(NUM_EPISODES):

  state = environment.reset()
  state = np.reshape(state, [1, STATE_SIZE])
  episode_score = 0
  for step in range(MAX_STEPS):
    action = agent.take_action(state)
    next_state, reward, done, _ = environment.step(action)
    episode_score += reward
    next_state = np.reshape(next_state, [1, STATE_SIZE])
    agent.update_replay(state, action, reward, next_state, done)
    agent.replay_memory()
    state = next_state
    if done:
      print("episode: {}/{}, score: {}, e: {:.2}".format(e, NUM_EPISODES, episode_score, agent.epsilon))
      break
  scores.append(episode_score)

  if len(scores) >= SCORE_WINDOW:
    # Average over the last SCORE_WINDOW episodes, including the one that
    # just finished, computed once per episode.
    rolling_average = np.mean(scores[-SCORE_WINDOW:])
    if e % 20 == 0:
      print("rolling average")
      print(rolling_average)
    if rolling_average >= SOLVED_SCORE:
      # Solved: keep the trained network and the full learning curve.
      agent.model.save("model.h5")
      plotchart(np.arange(len(scores)),scores,"episode number","episode score","Scores per episode, learning rate"+ str(agent.alpha),"trainingscoresperepisode"+str(e)+".png")
      break

  # Periodic snapshot of the learning curve.
  if e % 200 == 0:
    plotchart(np.arange(len(scores)),scores,"episode number","episode score","Scores per episode","trainingscoresperepisode"+str(e)+".png")

  # Decay exploration once per episode until it reaches the floor.
  if agent.epsilon > .01:
    agent.epsilon *= agent.epsilon_decay

environment.close()

In [None]:
!cd
!pwd

/content
