### Imports

In [1]:
import gym
import numpy as np
import random
import collections
import keras
from keras.layers import Dense
from collections import deque

Using TensorFlow backend.


#### Set up agent environment

In [8]:
def cartpole(param_name:str, param_vals):
    
    
    params = {
        'learning_rate': 0.001, 
        'gamma': 0.95, 
        'epsilon': 1.0, 
        'epsilon_min': 0.01, 
        'epsilon_decay': 0.995
    }
    
    print('\n\nPARAM: {}'.format(param_name))
    
    env = gym.make('CartPole-v0')
    
    for param_val in param_vals:
        
        obs_space = env.observation_space.shape[0]
        action_space = env.action_space.n    
        
        params[param_name] = param_val
        
        print('\nVAL: {}'.format(param_val))
        
        dqn = DQN(
            obs_space, 
            action_space, 
            params['learning_rate'], 
            params['gamma'], 
            params['epsilon'], 
            params['epsilon_min'], 
            params['epsilon_decay']
        )
        
        scores = []
        
        for episode in range(50):
            state = env.reset()
            state.reshape([1, obs_space])

            time_step = 0
            done = False

            while not done:
                env.render()

                action = dqn.act(state)

                observation, reward, done, info = env.step(action)

                #if done:
                #    print('Terminal observation: ', observation)

                time_step += 1

                dqn.experience_replay()

            scores.append(time_step)

            mean_survival_time = np.mean(scores)
        

        print('Mean survival time: ', mean_survival_time)
    
        env.close()

#### Set up neural network architecture

In [10]:
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

class DQN:
    
    def __init__(self, 
                 obs_space, 
                 action_space, 
                 learning_rate = 0.001, 
                 gamma = 0.95, 
                 epsilon = 1.0, 
                 epsilon_min = 0.01,
                 epsilon_decay = 0.995
                ):
        self.expl_rate = 1.0
        self.action_space = action_space
        
        self.memory = deque(maxlen = 2000)
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        
        self.model = Sequential()
        self.model.add(Dense(24, input_dim = obs_space, activation = 'relu'))
        self.model.add(Dense(24, activation = 'relu'))
        self.model.add(Dense(self.action_space, activation = 'linear'))
        
        self.model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_space)
        
        q_vals = self.model.predict(state)
        
        print(q_vals)
        
    def experience_replay(self):
        if len(self.memory) < 20:
            return
        
        batch = random.sample(self.memory, 20)
        
        for state, action, reward, state_next, done in batch:
            q_update = reward
            
            if not done:
                q_update = (reward + gamma * np.amax(self.model.predict(state_next)[0]))
                
            q_values = self.model.predict(state)
            
            q_values[0][action] = q_update
            
            self.model.fit(state, q_values, verbose = 0)
            
            self.epsilon *= self.epsilon_decay
            self.epsilon = np.argmax(self.epsilon, self.epsilon_min)
            

# test param configurations in standalone runs (some configs will be duplicated as they match the default vals)
cartpole('learning_rate', [0.001, 0.01, 0.1, 1.0, 10.0])
cartpole('gamma', [0.25, 0.5, 0.75, 0.95])
cartpole('epsilon', [1.0, 1.5, 10.0])
cartpole('epsilon_min', [0.001, 0.01, 0.1, 1.0])
cartpole('epsilon_decay', [0.75, 0.9, 0.995, 1.0])



PARAM: learning_rate

VAL: 0.001
Mean survival time:  23.7

VAL: 0.01
Mean survival time:  20.22

VAL: 0.1
Mean survival time:  23.58

VAL: 1.0
Mean survival time:  20.28

VAL: 10.0
Mean survival time:  19.44


PARAM: gamma

VAL: 0.25
Mean survival time:  20.84

VAL: 0.5
Mean survival time:  18.98

VAL: 0.75
Mean survival time:  23.42

VAL: 0.95
Mean survival time:  20.54


PARAM: epsilon

VAL: 1.0
Mean survival time:  20.08

VAL: 1.5
Mean survival time:  22.54

VAL: 10.0
Mean survival time:  22.06


PARAM: epsilon_min

VAL: 0.001
Mean survival time:  21.84

VAL: 0.01
Mean survival time:  24.3

VAL: 0.1
Mean survival time:  23.18

VAL: 1.0
Mean survival time:  22.56


PARAM: epsilon_decay

VAL: 0.75
Mean survival time:  21.96

VAL: 0.9
Mean survival time:  24.56

VAL: 0.995
Mean survival time:  21.26

VAL: 1.0
Mean survival time:  20.3
