In [39]:
# Install OpenAI Gym dependencies
# !pip install Box2D
# !pip install box2d-py
# !pip install gym[all]
# !pip install Box2D
# !pip install box2d box2d-kengz
# Imports
import random

import gym
import numpy as np
import pygame
from gym.utils.play import play

# DL libraries
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from keras.optimizers import Adam

# loss function
from sklearn.metrics import mean_squared_error as MSE



## Papers

In [22]:
# Create the Gym environment that the training loop further down interacts with.
# Environment catalogue: https://www.gymlibrary.ml/environments
env_name = 'LunarLander-v2'
env = gym.make(env_name) 

In [23]:
# # Agent class
# class Agent():
#     def __init__(self, env):
#         self.n_actions = env.action_space.n
#         print("Action size:", self.n_actions)
    
#     def get_action(self, state): 
#         action = random.choice(range(self.n_actions))
#         return action

In [24]:
# agent = Agent(env) # Instantiate agent class and pass env
# state = env.reset() # Reset env and assign starting state

# for _ in range(200):
#     action = agent.get_action(state) # Sample action
#     state, reward, done, info = env.step(action) # Step
#     env.render()

## Deep Q Learning

In [25]:
class ReplayBuffer(object):
    """Fixed-capacity circular store of agent transitions.

    Each slot holds a state, the one-hot encoded action, the reward, the
    following state and a terminal mask equal to ``1 - done`` (so the mask is
    0 for terminal transitions and 1 otherwise). Once ``max_size`` transitions
    have been written, the oldest slots are overwritten.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Pre-allocate zero-filled storage.

        Args:
            max_size: number of slots in the buffer.
            input_shape: length of a single state vector (used as the second
                array dimension).
            n_actions: number of discrete actions, i.e. the width of the
                one-hot action rows.
        """
        self.mem_size = max_size
        self.input_shape = input_shape
        # Counts every transition ever stored; it is NOT capped at mem_size.
        self.mem_counter = 0

        state_dims = (max_size, input_shape)
        self.state_memory = np.zeros(state_dims)
        self.new_state_memory = np.zeros(state_dims)
        self.action_memory = np.zeros((max_size, n_actions), dtype=np.int8)
        self.reward_memory = np.zeros(max_size)
        self.terminal_memory = np.zeros(max_size, dtype=np.float32)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full.

        Args:
            state: state observed before acting.
            action: integer index of the action taken.
            reward: reward received for the action.
            new_state: state observed after acting.
            done: truthy when the transition ended the episode.
        """
        slot = self.mem_counter % self.mem_size

        self.state_memory[slot] = state
        self.new_state_memory[slot] = new_state
        self.reward_memory[slot] = reward
        # Stored inverted so it can be multiplied straight into a target.
        self.terminal_memory[slot] = 1 - int(done)

        n_actions = self.action_memory.shape[1]
        one_hot = np.zeros(n_actions)
        one_hot[action] = 1
        self.action_memory[slot] = one_hot

        self.mem_counter += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random.

        Indices are drawn with ``np.random.choice`` over the filled part of
        the buffer, so the same transition may appear more than once.

        Returns:
            Tuple ``(states, actions, rewards, new_states, terminal)`` of
            arrays whose first dimension is ``batch_size``.
        """
        filled = min(self.mem_counter, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

In [26]:
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile the fully connected Q-network.

    The network maps a state vector of length ``input_dims`` through two
    ReLU hidden layers to ``n_actions`` linear outputs (one estimated value
    per action) and is compiled with a mean-squared-error loss.

    Args:
        lr: learning rate handed to the Adam optimizer.
        n_actions: number of output units.
        input_dims: length of the input state vector.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims, )),
        Activation('relu'),
        Dense(fc2_dims),
        Activation('relu'),
        Dense(n_actions)
    ])
    # Build the optimizer explicitly: the string 'adam' always uses the
    # optimizer's default learning rate and would silently ignore ``lr``.
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse')

    return model
    

In [27]:
class DQN_Agent(object):
    """Epsilon-greedy Deep Q-Learning agent with experience replay.

    A single network (``q_eval``) is used both to choose actions and to
    bootstrap the learning targets; there is no separate target network.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon,
                 batch_size, input_dims, mem_size=1000000):
        """Create the replay buffer and the Q-network.

        Args:
            alpha: learning rate forwarded to ``build_dqn``.
            gamma: discount factor applied to the bootstrapped value.
            n_actions: number of discrete actions.
            epsilon: probability of picking a uniformly random action.
            batch_size: number of transitions sampled per learning step.
            input_dims: length of a state vector.
            mem_size: capacity of the replay buffer.
        """
        self.action_space = np.arange(n_actions)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.batch_size = batch_size

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = build_dqn(alpha, n_actions, input_dims, fc1_dims=256, fc2_dims=256)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def select_action(self, state):
        """Pick an action epsilon-greedily for a single state.

        Args:
            state: one state vector (1-D array-like).

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.random() < self.epsilon:
            return int(np.random.choice(self.action_space))

        # The network expects a batch dimension, so wrap the single state.
        batch = np.asarray(state)[np.newaxis, :]
        q_values = self.q_eval.predict(batch, verbose=0)
        return int(np.argmax(q_values))

    def learn(self):
        """Run one Q-learning update on a batch sampled from the buffer.

        Does nothing until at least ``batch_size`` transitions are stored.
        For each sampled transition the target for the action that was taken is
        ``reward + gamma * max_a Q(new_state, a) * not_terminal``; the
        targets of all other actions are left at the current prediction so
        they contribute no error.
        """
        if self.memory.mem_counter < self.batch_size:
            return

        state, action, reward, new_state, not_terminal = \
            self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; recover the integer index per row.
        action_indices = np.argmax(action, axis=1)

        q_eval = self.q_eval.predict(state, verbose=0)
        q_next = self.q_eval.predict(new_state, verbose=0)

        q_target = q_eval.copy()
        batch_index = np.arange(len(action_indices), dtype=np.int32)

        # Bootstrap from the best *value* of the next state. ``not_terminal``
        # is 0 for terminal transitions, which removes the bootstrap term.
        td = reward + self.gamma * np.max(q_next, axis=1) * not_terminal

        q_target[batch_index, action_indices] = td

        self.q_eval.fit(state, q_target, verbose=0)
        

## Function approximation using gradient TD

In [52]:
class FA_Agent(object):
    """Linear function-approximation agent trained with semi-gradient TD(0).

    The state value is approximated as ``v(s) = weights . s`` with one weight
    per state feature. Only a state-value function is learned here, so the
    class has no action-value model of its own; greedy action selection needs
    a ``q_eval`` model to be attached to the instance.
    """

    def __init__(self, n_actions, state_dims, alpha, epsilon, gamma, step_size = 0.01):
        """Initialise hyper-parameters and random weights.

        Args:
            n_actions: number of discrete actions.
            state_dims: length of a state vector (number of weights).
            alpha: learning rate used by ``update``.
            epsilon: probability of picking a uniformly random action.
            gamma: discount factor.
            step_size: stored on the instance; not used by ``update``.
        """
        self.action_space = np.arange(n_actions)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.state_dims = state_dims
        self.weights = np.random.rand(self.state_dims)
        self.step_size = step_size
        # Placeholder: no explicit policy object exists yet.
        self.policy = None

    def calculate_loss(self):
        """Placeholder; no loss is computed yet (returns ``None``)."""
        return None

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: current state, passed unchanged to ``q_eval.predict``.

        Returns:
            The chosen action index as a plain ``int``.

        Raises:
            NotImplementedError: on the greedy branch when no ``q_eval``
                model has been attached to the instance.
        """
        if np.random.random() < self.epsilon:
            return int(np.random.choice(self.action_space))

        q_model = getattr(self, "q_eval", None)
        if q_model is None:
            raise NotImplementedError(
                "FA_Agent has no action-value model (q_eval); "
                "greedy action selection is not available"
            )
        actions = q_model.predict(state)
        return int(np.argmax(actions))

    def update(self, state, new_state, reward, done):
        """Apply one semi-gradient TD(0) step to the linear value weights.

        The target is ``reward`` for a terminal transition and
        ``reward + gamma * v(new_state)`` otherwise. For a linear
        approximator the gradient of ``v(state)`` with respect to the weights
        is ``state`` itself.

        Args:
            state: state vector before the transition.
            new_state: state vector after the transition.
            reward: reward received.
            done: truthy when the transition ended the episode.

        Returns:
            The TD error used for the update.
        """
        state = np.asarray(state, dtype=float)
        value = float(np.dot(self.weights, state))

        if done:
            target = reward
        else:
            next_value = float(np.dot(self.weights, np.asarray(new_state, dtype=float)))
            target = reward + self.gamma * next_value

        td_error = target - value
        self.weights = self.weights + self.alpha * td_error * state
        return td_error

FA_Agent(n_actions=4, state_dims=8, alpha=0.001, epsilon=0.2, gamma=0.99)

<__main__.FA_Agent at 0x7fd9f5eb96d0>

In [41]:
# Display the imported loss function (sklearn's mean_squared_error, aliased as MSE) to inspect its signature.
MSE

<function sklearn.metrics._regression.mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True)>

In [53]:
# Train the DQN agent on the environment created above.
agent = DQN_Agent(gamma=0.99, epsilon=0.2, alpha=0.001, input_dims=8, n_actions=4, batch_size=64)

scores = []          # total reward of each finished episode
eps_history = []     # exploration rate in effect for each episode
n_episodes = 1500
max_timesteps = 1000  # hard cap on steps per episode

for ep in range(n_episodes):
    done = False
    score = 0
    state = env.reset()
    timestep = 0

    # The step cap lives in the loop condition: setting ``done`` by hand
    # before env.step() would be overwritten by the value step() returns.
    while not done and timestep < max_timesteps:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        score += reward
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.learn()
        env.render()
        timestep += 1

    scores.append(score)
    eps_history.append(agent.epsilon)
    print('timestep:', timestep)

    # Moving average over (at most) the 100 most recent episodes.
    avg_score = np.mean(scores[-100:])
    print('episode:', ep, 'score:', score, 'avg score:', avg_score)
    

KeyboardInterrupt: 