In [3]:
import gym
from AgentRL.agents.deep_q_network import DQN
from AgentRL.common.buffers.prioritised_buffer import prioritised_replay_buffer
from AgentRL.common.buffers.standard_buffer import standard_replay_buffer

import torch

def train(env, *, episodes=1000, timestep_limit=100, display_freq=10,
          seed=1, training=True, render=False):
    """Run a DQN agent on ``env`` and print the mean episode reward.

    A ``DQN`` agent backed by a prioritised replay buffer is created with the
    fixed network/algorithm settings shown in the ``DQN(...)`` call below.
    Every episode runs until the environment reports ``done`` or until
    ``timestep_limit`` steps have been taken, whichever comes first. The
    environment is closed before this function returns, including when the
    loop is interrupted or raises.

    Args:
        env: Environment object. The code below uses ``seed(seed)``,
            ``reset()`` (returning the state), ``step(action=...)`` (returning
            a 4-tuple ``(next_state, reward, done, info)``), ``render``,
            ``close``, ``observation_space.shape`` and ``action_space.n``.
        episodes: Number of episodes to run.
        timestep_limit: Maximum number of steps taken in a single episode.
        display_freq: The mean reward is printed every ``display_freq``
            episodes, averaged over the episodes since the previous print.
            Must be non-zero.
        seed: Seed handed to the environment, the replay buffer and the agent.
        training: When true, every transition is pushed to the agent and
            ``agent.update()`` is called after each step.
        render: When true, ``env.render(mode='close')`` is called each step.

    Returns:
        None. Progress is reported with ``print``.
    """
    running_reward = []

    env.seed(seed)

    # initialise the agent
    # (alternative from the original: standard_replay_buffer(max_size=50_000, seed=seed))
    buffer = prioritised_replay_buffer(max_size=50_000, seed=seed)
    agent = DQN(
        state_dim=env.observation_space.shape[0],
        action_num=env.action_space.n,
        replay_buffer=buffer,

        algorithm_type='duelling',
        hidden_dim=16,
        learning_rate=5e-4,
        batch_size=32,
        gamma=0.95,

        target_update_method='hard',
        tau=0.01,  # for soft
        target_update_freq=20,  # for hard

        exploration_method="noisy_network",

        categorical=True,
        v_range=(0, 200),
        atom_size=51,

        seed=seed,
    )

    try:
        for ep in range(1, episodes + 1):

            # reset the state
            state, done = env.reset(), False
            counter = 0
            episode_reward = 0

            # run the episode
            while not done:

                # `action` is indexable; its first element is what the
                # environment receives, while the whole object is stored.
                action = agent.get_action(state=state.flatten())
                next_state, reward, done, _ = env.step(action=action[0])

                # render the environment
                if render:
                    env.render(mode='close')

                # the reported total uses the unscaled reward
                episode_reward += reward

                if training:
                    # The stored reward is scaled by 1/100. The stored `done`
                    # is the environment's own flag: a stop forced by
                    # `timestep_limit` below is not recorded as terminal.
                    agent.push(state=state, action=action,
                               next_state=next_state, reward=reward / 100, done=done)

                    agent.update()

                # update the state
                state = next_state
                counter += 1

                # terminate when the per-episode step limit is reached
                if counter >= timestep_limit:
                    done = True

            # episode finished: record its reward and report periodically
            running_reward.append(episode_reward)

            if ep % display_freq == 0:
                print('Ep {} - Mean Reward {}'.format(ep, sum(running_reward) / len(running_reward)))
                running_reward = []
    finally:
        # close the display even if training was interrupted or failed
        env.close()
    
if __name__ == "__main__":

    # Create the environment the agent is run on.
    env = gym.make("CartPole-v0")

    # Start the run. A Ctrl-C (KeyboardInterrupt) is treated as a request to
    # stop: it is swallowed here and the environment is closed so that any
    # window it opened is shut.
    try:
        train(env)
    except KeyboardInterrupt:
        env.close()

Ep 10 - Mean Reward 23.2
Ep 20 - Mean Reward 15.3
Ep 30 - Mean Reward 12.9
Ep 40 - Mean Reward 17.1
Ep 50 - Mean Reward 60.8
Ep 60 - Mean Reward 85.3
Ep 70 - Mean Reward 60.8
Ep 80 - Mean Reward 68.3
Ep 90 - Mean Reward 82.0
Ep 100 - Mean Reward 90.1
Ep 110 - Mean Reward 81.7
Ep 120 - Mean Reward 81.7
Ep 130 - Mean Reward 96.7
Ep 140 - Mean Reward 94.3
Ep 150 - Mean Reward 66.9
Ep 160 - Mean Reward 85.4
Ep 170 - Mean Reward 78.7
Ep 180 - Mean Reward 87.2
Ep 190 - Mean Reward 76.3
Ep 200 - Mean Reward 81.0
Ep 210 - Mean Reward 76.3
Ep 220 - Mean Reward 55.2
Ep 230 - Mean Reward 82.2
Ep 240 - Mean Reward 60.9
Ep 250 - Mean Reward 99.6
Ep 260 - Mean Reward 100.0
Ep 270 - Mean Reward 98.7
Ep 280 - Mean Reward 47.1
Ep 290 - Mean Reward 40.2
Ep 300 - Mean Reward 24.8
Ep 310 - Mean Reward 85.6
Ep 320 - Mean Reward 56.3
Ep 330 - Mean Reward 44.8
Ep 340 - Mean Reward 83.3
Ep 350 - Mean Reward 53.0
Ep 360 - Mean Reward 71.6
Ep 370 - Mean Reward 78.4
Ep 380 - Mean Reward 56.1
Ep 390 - Mean Reward