In [1]:
import random
import gym
import sys
import numpy as np
from collections import deque,namedtuple
import os
import time 
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.optim import Adam
from agent import *
# The bare 'seaborn' style name was renamed to 'seaborn-v0_8' in newer
# matplotlib releases; use whichever of the two this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [2]:
def plot_agent_rewards(agent):
    """Plot an agent's reward per episode and print its training settings.

    Args:
        agent: Object exposing ``rewards`` (the sequence that is plotted),
            ``get_train_time()`` (a number, printed with an ``s`` suffix),
            and the attributes ``n_episodes``, ``batch_size``, ``gamma``,
            ``lr`` and ``decay``.

    Returns:
        None. The figure is shown with ``plt.show()`` and the summary is
        written to standard output.
    """
    _, ax = plt.subplots(1, figsize=(10, 8))
    ax.plot(agent.rewards)
    ax.set_title("Agent Reward Progression across Episodes", fontsize=16)
    ax.set_ylabel('Reward', fontsize=14)
    ax.set_xlabel('Episodes', fontsize=14)
    plt.show()

    # Summary of the run that produced the curve above.
    training_time = agent.get_train_time()
    print('Training Time: {:.2f}s'.format(training_time))
    print("Number of episodes: ", agent.n_episodes)
    print("Batch Size: ", agent.batch_size)
    print("Discount Factor: ", agent.gamma)
    print("Learning Rate: ", agent.lr)
    # Label previously read "Epsilong Decay" (typo).
    print("Epsilon Decay: ", agent.decay)


In [3]:
# Seed the random number generators imported by this notebook so that the
# hyperparameter experiments below are easier to compare across re-runs.
# NOTE(review): the gym environment itself is not seeded here; add
# environment seeding with whatever API the installed gym version provides.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create the CartPole environment and read the sizes of its observation
# vector and discrete action set; both are used to size the networks.
env = gym.make('CartPole-v1')
s = env.reset()
state_sz = env.observation_space.shape[0]
action_sz = env.action_space.n

In [4]:
# Build two separately constructed DQN instances with identical arguments:
# the first becomes dq_network, the second target_network.
# NOTE(review): 256 is presumably the hidden layer width; confirm against
# the DQN definition in agent.py.
dq_network, target_network = (DQN(256, state_sz, action_sz) for _ in range(2))

In [5]:
# Hyperparameters for the first experiment (agent1).
n_episodes, batch_size = 500, 128
gamma = 0.98   # discount factor
lr = 1e-4      # learning rate
eps = 1.0      # not referenced by any other cell visible in this notebook
decay = 0.95   # epsilon decay

In [6]:
# Build the first agent from the environment and the two networks, then
# hand it the training hyperparameters in a separate step.
agent1 = Agent(env, dq_network, target_network)
agent1.init_hyperparameters(
    n_episodes,
    batch_size,
    gamma,
    lr,
    decay,
)

In [7]:
# Print the state-space and action-space sizes the agent is working with.
agent1.print_env_settings()

State space:  4
Action space:  2


In [None]:
# Train agent1; the cell output reports the episode number, transition count
# and episode reward as training progresses.
agent1.train()

  s = torch.FloatTensor([t.s for t in batch])


Episode:  100
Transition Count:  312
Episode Reward:  312.0
Episode:  200
Transition Count:  238
Episode Reward:  238.0


In [None]:
# Plot agent1's reward per episode and print its training summary.
plot_agent_rewards(agent1)

The agent learns to reach the top reward of 500, but then sometimes drops in between and at some point even repeatedly misses the target. A possible reason is the high value of gamma, which may make the value updates a bit too slow.

In [None]:
# Hyperparameters for the second experiment (agent2).
n_episodes, batch_size = 500, 256
gamma = 0.95   # discount factor
lr = 1e-3      # learning rate
decay = 0.99   # epsilon decay

# Reset the environment before the next agent is built.
s = env.reset()

In [None]:
# Give agent2 its own newly constructed networks. Passing dq_network and
# target_network again would hand agent2 the very same network objects that
# agent1 was built with (and presumably trained in place), so the
# hyperparameter comparison would not start from fresh weights.
dq_network2 = DQN(256, state_sz, action_sz)
target_network2 = DQN(256, state_sz, action_sz)
# NOTE(review): agent1 is built with Agent(env, net, target) followed by
# init_hyperparameters(...); confirm this longer constructor form is
# supported by agent.py.
agent2 = Agent(env, dq_network2, target_network2, n_episodes, batch_size, gamma, lr, decay, action_sz, state_sz)

In [None]:
# Train agent2 with the second set of hyperparameters.
agent2.train()

In [None]:
# Plot agent2's reward per episode and print its training summary.
plot_agent_rewards(agent2)

Here our agent makes some progress but never reaches the maximum reward. Perhaps the discount factor is too high, or the learning rate is too high, so the agent cannot properly find the optimal solution.

In [None]:
# Hyperparameters for the third experiment (agent3).
n_episodes, batch_size = 500, 128
gamma = 0.99   # discount factor
lr = 1e-2      # learning rate
decay = 0.98   # epsilon decay

# Reset the environment before the next agent is built.
s = env.reset()

In [None]:
# Give agent3 its own newly constructed networks. Passing dq_network and
# target_network again would hand agent3 the very same network objects that
# the earlier agents were built with (and presumably trained in place), so
# the hyperparameter comparison would not start from fresh weights.
dq_network3 = DQN(256, state_sz, action_sz)
target_network3 = DQN(256, state_sz, action_sz)
# NOTE(review): agent1 is built with Agent(env, net, target) followed by
# init_hyperparameters(...); confirm this longer constructor form is
# supported by agent.py.
agent3 = Agent(env, dq_network3, target_network3, n_episodes, batch_size, gamma, lr, decay, action_sz, state_sz)

In [None]:
# Train agent3 with the third set of hyperparameters.
agent3.train()

In [None]:
# Plot agent3's reward per episode and print its training summary.
plot_agent_rewards(agent3)

In this case the learning rate is much higher and the agent finds the optimal solution quickly. However, because the learning rate is so high, the Adam optimizer can jump out of a good local minimum and end up in a worse one. While it seems great to have found the solution quickly, the result is not very robust and the agent keeps failing in some episodes. Ideally we would want a similar discount factor but a lower learning rate, perhaps with more episodes, to make sure the agent learns well.