In [1]:
!pip install -q tensorflow gym

In [2]:
import os
import gym
import random
import pickle
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from time import sleep
from gym import wrappers
from collections import deque

from tensorflow.keras import models, layers
from tensorflow.keras import backend as K

# Reset Keras' global state so that re-running this cell does not pile up
# models/layers left over from an earlier run in the same kernel.
tf.keras.backend.clear_session()

In [3]:
def huber_loss(y, q_value):
  """Huber loss with a threshold of 1 between targets and predictions.

  Absolute errors up to 1 are penalised quadratically (0.5 * e^2); whatever
  exceeds 1 is penalised linearly. The result is averaged over all elements.

  Args:
    y: tensor of target values.
    q_value: tensor of predicted values, broadcast-compatible with `y`.

  Returns:
    Scalar tensor holding the mean loss.
  """
  abs_error = K.abs(y - q_value)
  # Portion of the error that falls inside the quadratic zone [0, 1].
  within_threshold = K.clip(abs_error, 0.0, 1.0)
  # Remainder beyond the threshold, penalised linearly.
  beyond_threshold = abs_error - within_threshold

  return K.mean(0.5 * K.square(within_threshold) + beyond_threshold)

In [4]:
class DQN:
  """Deep Q-Network agent with an online network and a soft-updated target network.

  The agent stores transitions in a bounded replay memory, picks actions
  epsilon-greedily from the online network (`model`) and bootstraps its
  training targets from a slowly tracking copy (`target_model`).
  """

  def __init__(self, env, gamma=0.85, epsilon=1.0, memory=2000):
    """Create the agent and both of its networks.

    Args:
      env: gym-style environment; its `observation_space.shape` and
        `action_space.n` determine the network input/output sizes.
      gamma: discount factor applied to the bootstrapped future value.
      epsilon: initial probability of taking a random action.
      memory: maximum number of transitions kept in the replay memory.
    """
    self.env     = env
    self.memory  = deque(maxlen=memory)

    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_min = 0.01     # exploration never drops below this
    self.epsilon_decay = 0.995  # multiplicative decay applied on every act()
    self.learning_rate = 0.001  # NOTE: not passed to the optimizer, see create_model()
    self.tau = .125             # soft-update rate used by target_train()

    self.model        = self.create_model()
    self.target_model = self.create_model()

  def create_model(self):
    """Build and compile the Q-network: state in, one Q-value per action out."""
    state_shape  = self.env.observation_space.shape

    model = models.Sequential([
      layers.Dense(24, input_dim=state_shape[0], activation='relu'),
      layers.Dense(48, activation='relu'),
      layers.Dense(24, activation='relu'),
      layers.Dense(self.env.action_space.n),
    ])

    # The optimizer is created from its string name, so self.learning_rate is
    # not applied here. No 'accuracy' metric: the network regresses Q-values,
    # for which a classification accuracy carries no meaning.
    model.compile(optimizer='adam', loss='mse')

    return model

  def act(self, state):
    """Pick an action epsilon-greedily and decay epsilon by one step.

    Args:
      state: observation shaped (1, state_dim), as fed to the network.

    Returns:
      A random action with probability epsilon, otherwise the index of the
      highest Q-value predicted by the online network.
    """
    self.epsilon *= self.epsilon_decay
    self.epsilon = max(self.epsilon_min, self.epsilon)

    if np.random.random() < self.epsilon:
      return self.env.action_space.sample()

    return np.argmax(self.model.predict(state, verbose=0)[0])

  def remember(self, state, action, reward, new_state, done):
    """Append one transition to the replay memory (oldest entries fall out)."""
    self.memory.append([state, action, reward, new_state, done])

  def replay(self):
    """Train the online network on 32 transitions sampled from memory.

    Does nothing until the memory holds at least one full batch. The target
    network is not modified while this method runs, so its predictions for
    the whole batch are computed with two batched calls instead of two calls
    per sample. The online network is then fitted with `batch_size=1` and
    `shuffle=False`, i.e. it still performs one gradient step per sampled
    transition, in sampling order, but inside a single `fit` call.
    """
    batch_size = 32

    if len(self.memory) < batch_size:
      return

    samples = random.sample(self.memory, batch_size)
    states, actions, rewards, new_states, dones = zip(*samples)

    # Each stored state is shaped (1, state_dim); stack them into a batch.
    states = np.vstack(states)
    new_states = np.vstack(new_states)
    actions = np.asarray(actions, dtype=int)
    rewards = np.asarray(rewards, dtype=float)
    dones = np.asarray(dones, dtype=bool)

    # Non-taken actions keep the target network's current estimate as their
    # regression target. np.array() guarantees a writable copy.
    targets = np.array(self.target_model.predict(states, verbose=0))
    future_q = np.max(self.target_model.predict(new_states, verbose=0), axis=1)

    # Terminal transitions get the bare reward; otherwise bootstrap from the
    # best next-state value.
    targets[np.arange(batch_size), actions] = np.where(
        dones, rewards, rewards + future_q * self.gamma)

    self.model.fit(states,
                   targets,
                   batch_size=1,
                   shuffle=False,
                   verbose=0)

  def target_train(self):
    """Move the target network a fraction `tau` towards the online network."""
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()

    blended = [
      online * self.tau + target * (1 - self.tau)
      for online, target in zip(weights, target_weights)
    ]

    self.target_model.set_weights(blended)

  def get_memory(self):
    """Return the replay memory object itself (not a copy)."""
    return self.memory

  def load_memory(self, memory):
    """Replace the replay memory with `memory` (e.g. one restored from disk)."""
    self.memory = memory

  def save_model(self, fn):
    """Save the online network to `fn`."""
    self.model.save(fn)

  def load_model(self, fn):
    """Load the online network from `fn` and align the target network to it.

    Only the online network is persisted by save_model(). Without the sync
    below, the target network would keep its random initial weights and the
    first replay() calls after a restore would bootstrap from them.
    """
    self.model = models.load_model(fn)
    self.target_model.set_weights(self.model.get_weights())

In [None]:
def _snapshot_path(output_dir, kind, ext, episode):
  """Build the path of a snapshot artifact, e.g. mountaincar-model_ep50.h5."""
  file_name = 'mountaincar-{}_ep{}.{}'.format(kind, episode, ext)
  return os.path.join(output_dir, file_name)


def _load_pickle(file_path):
  """Unpickle `file_path`, closing the file handle afterwards."""
  # NOTE: unpickling can execute arbitrary code; only load snapshot files
  # that this notebook wrote itself.
  with open(file_path, 'rb') as fh:
    return pickle.load(fh)


def _dump_pickle(obj, file_path):
  """Pickle `obj` to `file_path`, closing the file handle afterwards."""
  with open(file_path, 'wb') as fh:
    pickle.dump(obj, fh)


def train_model(env, episodes=2500, checkpoint=None):
  """Train a DQN agent on `env`, snapshotting every 50 episodes.

  Training stops after `episodes` episodes or as soon as 10 consecutive
  episodes finish before the per-episode step limit.

  Args:
    env: gym-style environment whose `step` returns
      (observation, reward, done, info) and whose `reset` returns the
      initial observation.
    episodes: total number of episodes to run, counting any episodes
      already covered by `checkpoint`.
    checkpoint: episode number of a previously written snapshot to resume
      from, or None to start from scratch. The model, replay memory and
      results files for that episode must exist in the output directory.

  Side effects:
    Prints per-episode progress and writes model/memory/results snapshots
    into the hard-coded output directory.
  """
  trial_len = 200
  output_dir = '/content/gdrive/My Drive/ML/mountaincar'

  agent = DQN(env)
  avg_reward = None
  results = []

  i = 0
  wins = 0

  if checkpoint is not None:
    agent.load_model(_snapshot_path(output_dir, 'model', 'h5', checkpoint))
    agent.load_memory(
        _load_pickle(_snapshot_path(output_dir, 'memory', 'p', checkpoint)))
    results = _load_pickle(_snapshot_path(output_dir, 'results', 'p', checkpoint))

    # `results` holds the running average after each episode, so its last
    # entry lets the average continue instead of restarting from scratch.
    if results:
      avg_reward = results[-1]

    i = checkpoint

  while i < episodes:
    cur_state = env.reset().reshape(1, -1)
    cur_reward = 0
    solved = False

    for step in range(trial_len):
      action = agent.act(cur_state)
      new_state, reward, done, _ = env.step(action)

      new_state = new_state.reshape(1, -1)
      agent.remember(cur_state, action, reward, new_state, done)

      agent.replay()       # internally iterates default (prediction) model
      agent.target_train() # iterates target model

      cur_state = new_state
      cur_reward += reward

      if done:
        # Ending on the very last allowed step counts as running out of
        # time, not as a success.
        solved = step < trial_len - 1
        break

    # Exponential moving average of the episode reward.
    if avg_reward is not None:
      avg_reward = avg_reward * 0.9 + cur_reward * 0.1
    else:
      avg_reward = cur_reward

    results.append(avg_reward)

    print('Episode %d | Reward %.2f | Average Reward %.2f' %
            (i+1, cur_reward, avg_reward))

    if (i+1) % 50 == 0:
      print('Save snapshot model episode {}'.format(i+1))

      agent.save_model(_snapshot_path(output_dir, 'model', 'h5', i+1))
      _dump_pickle(agent.get_memory(),
                   _snapshot_path(output_dir, 'memory', 'p', i+1))
      _dump_pickle(results, _snapshot_path(output_dir, 'results', 'p', i+1))

    # Validate episode: track the streak of consecutive early finishes.
    if solved:
      print('Succeeded in {} episodes'.format(i+1))
      wins += 1
    else:
      wins = 0

    if wins == 10:
      break

    i += 1


def play(env, filepath, episodes=100):
  """Render and record a saved model acting greedily in `env`.

  Args:
    env: gym-style environment; it is wrapped in a Monitor that writes its
      recordings to the 'video' directory (existing contents are overwritten).
    filepath: path of the saved Keras model to load.
    episodes: number of episodes to play.

  Side effects:
    Opens a render window, writes recordings to 'video' and closes the
    wrapped environment when finished, including when an error interrupts
    playback, so the recordings are finalised.
  """
  env = wrappers.Monitor(env, 'video', force=True)

  agent = DQN(env)
  agent.load_model(filepath)

  try:
    for _ in range(episodes):
      state = env.reset().reshape(1, -1)
      done = False
      while not done:
        env.render()
        # Always take the best-valued action. agent.act() would start from a
        # fresh agent's epsilon and play mostly random moves at first.
        action = np.argmax(agent.model.predict(state, verbose=0)[0])
        state, _, done, _ = env.step(action)
        state = state.reshape(1, -1)
        sleep(0.01)
  finally:
    env.close()
        

In [None]:
# Set to True to train from the episode-50 snapshot; otherwise replay a saved model.
train = False

env = gym.make('MountainCar-v0')

# Show the action space, the observation space and the observation bounds.
obs_space = env.observation_space
for described in (env.action_space, obs_space, obs_space.low, obs_space.high):
  print(described)

if not train:
  play(env, 'output/mountaincar-best-score.h5')
else:
  train_model(env, checkpoint=50)

Discrete(3)
Box(2,)
[-1.2  -0.07]
[0.6  0.07]
