In [4]:
# Install OpenAI Gym dependencies
# !pip install Box2D
# !pip install box2d-py
# !pip install gym[all]
# !pip install Box2D
# !pip install box2d box2d-kengz
# Imports
import gym
import pygame
import random
from gym.utils.play import play
import numpy as np

# DL libraries
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model


In [5]:
# Create the Gym environment.
# Environment list, source: https://www.gymlibrary.ml/environments
env_name = 'LunarLander-v2'
env = gym.make(env_name)

In [14]:
# Agent class
class Agent():
    """Baseline agent that picks actions uniformly at random.

    The observation passed to `get_action` is ignored; the agent only needs
    to know how many discrete actions the environment exposes.
    """

    def __init__(self, env):
        """Read the number of discrete actions from `env.action_space.n`."""
        self.n_actions = env.action_space.n
        print("Action size:", self.n_actions)

    def get_action(self, state):
        """Return a random action index in `[0, n_actions)`; `state` is unused."""
        candidates = range(self.n_actions)
        return random.choice(candidates)

In [15]:
agent = Agent(env)  # Instantiate agent class and pass env
state = env.reset()  # Reset env and assign starting state

# Roll the random policy for a fixed number of rendered steps.
for _ in range(200):
    action = agent.get_action(state)  # Sample action
    state, reward, done, info = env.step(action)  # Step
    env.render()

    if done:
        # step() must not be called again once an episode has ended,
        # so start a fresh episode before continuing.
        state = env.reset()

Action size: 4


## Deep deterministic policy gradient

In [4]:
# Ornstein-Uhlenbeck (OU) action noise: https://en.wikipedia.org/wiki/Ornstein%E2%80%93Uhlenbeck_process
class Action_noise(object):
    """Temporally correlated noise from a discretized Ornstein-Uhlenbeck process.

    Each call advances the process by one step of size `dt`:

        x_next = x_prev + theta * (mu - x_prev) * dt
                        + sigma * sqrt(dt) * N(0, 1)

    Args:
        mu: Mean the process reverts to; its shape sets the noise shape.
        sigma: Scale of the random (diffusion) term.
        theta: Rate of mean reversion (drift term).
        dt: Time step.
        x0: Optional starting value; zeros shaped like `mu` when None.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process one step and return the new noise value."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = (
            self.sigma
            * np.sqrt(self.dt)
            * np.random.normal(size=np.shape(self.mu))
        )
        # Both terms must be summed in a single expression; a line that merely
        # starts with `+` is a separate statement whose value is thrown away.
        x = self.x_prev + drift + diffusion
        self.x_prev = x
        return x

    def reset(self):
        """Restore the process to `x0`, or to zeros shaped like `mu`."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros_like(self.mu)
    
    
class ReplayBuffer(object):
    """Pre-allocated storage for states, next states and actions.

    Only the constructor exists in this draft: it allocates the arrays but
    defines no methods for storing or sampling transitions.
    NOTE(review): a later cell defines another class named `ReplayBuffer`,
    which replaces this one once that cell runs.

    Args:
        max_size: Number of transitions the buffer can hold.
        input_shape: Iterable with the shape of a single observation.
        n_actions: Number of action components stored per transition.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape))
        self.new_state_memory = np.zeros((self.mem_size, *input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))


## Deep Q Learning

In [6]:
class ReplayBuffer(object):
    """Circular buffer of transitions with one-hot encoded actions.

    Once `max_size` transitions have been stored, new ones overwrite the
    oldest slots. `input_shape` is used directly as the second array
    dimension of the state arrays.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate zero-filled arrays for every transition component."""
        self.mem_size = max_size
        self.mem_counter = 0
        self.input_shape = input_shape

        observation_dims = (self.mem_size, input_shape)
        self.state_memory = np.zeros(observation_dims)
        self.new_state_memory = np.zeros(observation_dims)
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=np.int8)
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.mem_counter % self.mem_size

        self.state_memory[slot] = state
        self.new_state_memory[slot] = new_state
        self.reward_memory[slot] = reward
        # Stored as a "not done" mask: 1 for ongoing episodes, 0 for terminal.
        self.terminal_memory[slot] = 1 - int(done)

        # Encode the integer action as a one-hot row.
        one_hot = np.zeros(self.action_memory.shape[1])
        one_hot[action] = 1
        self.action_memory[slot] = one_hot

        self.mem_counter += 1

    def sample_buffer(self, batch_size):
        """Return `batch_size` randomly chosen stored transitions.

        Returns:
            Tuple `(states, actions, rewards, new_states, terminal)` where
            `terminal` holds the stored "not done" mask.
        """
        filled = min(self.mem_counter, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

In [7]:
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile a fully connected Q-network.

    The network is two ReLU hidden layers followed by a linear output layer
    with one unit per action, trained with Adam on mean squared error.

    Args:
        lr: Learning rate applied to the Adam optimizer.
        n_actions: Number of output units (one Q-value per action).
        input_dims: Length of the flat observation vector.
        fc1_dims: Units in the first hidden layer.
        fc2_dims: Units in the second hidden layer.

    Returns:
        The compiled Keras `Sequential` model.
    """
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims, )),
        Activation('relu'),
        Dense(fc2_dims),
        Activation('relu'),
        Dense(n_actions)
    ])
    model.compile(optimizer='adam', loss='mse')
    # The string shortcut builds Adam with its default learning rate, so the
    # `lr` argument would otherwise be ignored; apply it explicitly.
    model.optimizer.learning_rate = lr

    return model
    

In [8]:
class Agent(object):
    """Epsilon-greedy DQN agent with a replay buffer and one Q-network.

    Args:
        alpha: Learning rate handed to `build_dqn`.
        gamma: Discount factor for future rewards.
        n_actions: Number of discrete actions.
        epsilon: Probability of taking a random action.
        batch_size: Number of transitions sampled per learning step.
        input_dims: Length of the observation vector.
        mem_size: Capacity of the replay buffer.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon,
                 batch_size, input_dims, mem_size=1000000):
        self.action_space = np.arange(n_actions)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.batch_size = batch_size

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = build_dqn(alpha, n_actions, input_dims, fc1_dims=256, fc2_dims=256)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def select_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        # The network expects a batch dimension, so add a leading axis.
        state = state[np.newaxis, :]

        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)

        return action

    def learn(self):
        """Run one Q-learning update on a sampled batch.

        Does nothing until the buffer holds at least `batch_size` transitions.
        """
        if self.memory.mem_counter < self.batch_size:
            return

        # The buffer's last return value is a "not done" mask (1 - done), so
        # multiplying by it removes the bootstrap term for terminal states.
        state, action, reward, new_state, not_done = \
            self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; recover the integer action per sample.
        action_values = np.array(self.action_space, dtype=np.int8)
        action_indices = np.dot(action, action_values)

        q_eval = self.q_eval.predict(state)
        q_next = self.q_eval.predict(new_state)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # Bellman target: reward plus discounted best Q-value of the NEXT state.
        td = reward + self.gamma * np.max(q_next, axis=1) * not_done

        q_target[batch_index, action_indices] = td

        _ = self.q_eval.fit(state, q_target, verbose=0)
        
        

In [None]:
agent = Agent(gamma=0.99, epsilon=0.2, alpha=0.001, input_dims=8, n_actions=4, batch_size=64)

scores = []        # total reward of each episode
eps_history = []   # exploration rate in effect for each episode
n_episodes = 200

for ep in range(n_episodes):
    done = False
    score = 0
    state = env.reset()

    # Play one episode, learning from a sampled batch after every step.
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)
        score += reward
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.learn()
        env.render()

    scores.append(score)
    eps_history.append(agent.epsilon)

    # Running average over the most recent 100 episodes (fewer early on).
    avg_score = np.mean(scores[-100:])
    print('episode:', ep, 'score:', score, 'avg score:', avg_score)
    

2022-05-26 13:40:10.318845: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


episode: 0 score: -47.46484308701936 avg score: -47.46484308701936
episode: 1 score: -390.41372714862183 avg score: -218.9392851178206
episode: 2 score: -388.5405597279773 avg score: -275.47304332120615
episode: 3 score: -223.89711063754453 avg score: -262.57906015029073
episode: 4 score: -126.7909455450974 avg score: -235.42143722925206
episode: 5 score: -164.53477143093636 avg score: -223.6069929295328
episode: 6 score: -85.10293986001446 avg score: -203.8206996338873
episode: 7 score: -185.56695896706148 avg score: -201.53898205053406
episode: 8 score: -213.32680943457092 avg score: -202.84874064876036
episode: 9 score: -309.2935324454599 avg score: -213.4932198284303
episode: 10 score: -67.89996178325342 avg score: -200.25746909705057
episode: 11 score: -199.76625786188362 avg score: -200.21653482745333
episode: 12 score: -100.77233882651353 avg score: -192.56698128891952
episode: 13 score: -357.0734686548025 avg score: -204.31744467219687
episode: 14 score: -104.72036726869653 avg