In [3]:
# Pendulum implementation
import gym
import os

import numpy as np
import random
import math

# Elementary implementation of Deep Q-learning Agent
# from https://keon.io/deep-q-learning/
from dqn import DQNAgent

# Implementation of a visualization of the Q function
from helpers import plot_q

In [4]:
%matplotlib inline

from pylab import rcParams
# Default size for every figure drawn in this notebook (matplotlib reads this as width, height).
rcParams['figure.figsize'] = 20, 20

# initialize gym environment and the agent
env = gym.make('Pendulum-v0')
# Number of discrete actions offered to the agent; the training cell maps each
# action index onto a torque value before passing it to env.step().
nactions = 3
# The first argument (2) matches the length of the vector returned by
# simplify_pendulum_obs — presumably DQNAgent's state size; confirm against dqn.py.
agent = DQNAgent(2, nactions)
# NOTE(review): the training cell reassigns `episodes` before using it, so this
# value only matters if that cell is changed.
episodes = 10000

def simplify_pendulum_obs(obs):
    """Collapse a 3-element observation into a 2-element ``[angle, velocity]`` vector.

    The first two entries of ``obs`` are interpreted as the cosine and sine of
    the pendulum angle, and the third entry is passed through unchanged.

    The angle is recovered with ``arctan2(sin, cos)`` rather than
    ``arccos(cos) * sign(sin)`` because the latter:
      * returns 0 instead of pi when sin is exactly 0 and cos is -1
        (``np.sign(0) == 0`` wipes out the angle), and
      * returns NaN when rounding pushes the cosine slightly outside [-1, 1].

    Parameters
    ----------
    obs : sequence of three numbers
        ``[cos(theta), sin(theta), third_component]``.

    Returns
    -------
    numpy.ndarray
        Shape ``(2,)``: ``[theta, obs[2]]`` with ``theta`` in ``[-pi, pi]``.
    """
    # arctan2 takes (y, x) == (sin, cos) and resolves the quadrant itself.
    theta = np.arctan2(obs[1], obs[0])
    return np.array([theta, obs[2]])

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
# Make sure the "frames" directory exists (presumably the output location for
# plot_q — confirm against helpers.py).  exist_ok avoids the check-then-create race.
os.makedirs("frames", exist_ok=True)

episodes = 1000000
# Half-width of the discrete action range: action index `action_scale` maps to
# zero torque, index 0 to -2.0 and index action_size-1 to +2.0 (see env.step below).
action_scale = (agent.action_size - 1)/2

# Per-episode history, one [total reward, replay loss] row per finished episode.
# Rows are gathered in a plain list and stacked into `A` once at the end;
# stacking inside the loop would re-copy the whole history every episode.
A = np.reshape(np.array([]), [0, 2])
history = []

try:
    # Run the episodes
    for e in range(episodes):
        # Reset the environment and reduce the raw observation to a (1, 2) state row.
        state = env.reset()
        state = simplify_pendulum_obs(state)
        state = np.reshape(state, [1, 2])

        # Sum of the rewards collected in this episode.  For Pendulum each
        # step's reward is a cost (<= 0 per the gym docs), so values closer
        # to zero are better.
        total = 0
        # Hard cap of 500 steps; the loop exits earlier if the env reports done.
        for time_t in range(500):
            # env.render()  # uncomment to watch the pendulum

            # Ask the agent for a discrete action index; it is kept wrapped in
            # a list because that is the form handed to agent.remember().
            action = [agent.act(state)]

            # Map the action index linearly onto a torque in [-2.0, 2.0].
            # NOTE(review): 2.0 is presumably Pendulum's max torque — confirm.
            torque = (action[0] - action_scale) * 2.0 / float(action_scale)

            # Advance the simulation by one step with that torque.
            next_state, reward, done, _ = env.step([torque])
            next_state = simplify_pendulum_obs(next_state)
            next_state = np.reshape(next_state, [1, 2])

            # Store the transition for later replay.
            agent.remember(state, action, reward, next_state, done)

            # The next state becomes the current state for the following step.
            state = next_state
            total += reward

            # Stop as soon as the environment signals the end of the episode.
            if done:
                break

        # One replay/training call per episode on a batch of 32 stored transitions.
        loss = agent.replay(32)
        history.append([total, loss])

        if e % 5 == 0:
            print("episode: {}/{}, score: {}, loss: {}".format(e, episodes, total, loss))
            # plot_q(agent, e/5)  # uncomment to visualize the Q function
finally:
    # Build `A` even when the run is interrupted (e.g. the kernel is stopped),
    # so the episodes completed so far remain available to later cells.
    if history:
        A = np.vstack([A, np.array(history)])


episode: 0/1000000, score: -907.6615850044843, loss: 14.780041937556916
episode: 5/1000000, score: -1069.8956949966423, loss: 54.87046575397562
episode: 10/1000000, score: -744.7788432757548, loss: 11.037022604403319
episode: 15/1000000, score: -1319.0793536228437, loss: 5.872732219802856
episode: 20/1000000, score: -1181.2258767659475, loss: 69.37678545166273
episode: 25/1000000, score: -1594.1169634103612, loss: 46.90653177897184
episode: 30/1000000, score: -1395.6776969036046, loss: 94.25358753220644
episode: 35/1000000, score: -1222.671427428283, loss: 3.178452866821317
episode: 40/1000000, score: -1190.1301596348835, loss: 11.828887262148783
episode: 45/1000000, score: -1617.5213815309462, loss: 9.000350598558725
episode: 50/1000000, score: -1284.1187743820146, loss: 18.16108563677699
episode: 55/1000000, score: -1170.243072492357, loss: 10.396541607825839
episode: 60/1000000, score: -1284.8032817533544, loss: 12.182793427957222
episode: 65/1000000, score: -856.0998557328621, loss

In [None]:
# Show the training history gathered by the loop above: one [total, loss] row per episode.
print(A)