In [1]:
import random
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep

class agent(object):
    """Linear softmax policy trained with a Monte-Carlo policy gradient.

    The policy scores each action ``a`` in state ``s`` as
    ``theta . phi(s, a) / temperature`` and turns the scores into
    probabilities with a softmax. One episode of ``(state, action, reward)``
    triples is buffered via :meth:`store` and consumed by :meth:`train`,
    which takes a single gradient-ascent step on ``theta``.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        alpha: Step size used by :meth:`train`.
        gamma: Discount factor applied to future rewards.
        temperature: Divisor applied to the action scores before the softmax.
    """

    def __init__(self, state_size, action_size, alpha=0.001, gamma=1,
                 temperature=100):
        self.state_size = state_size
        self.action_size = action_size
        # Buffers for the episode currently being collected
        self.states = []
        self.actions = []
        self.rewards = []
        # One weight per (action, state-feature) pair, stored flat
        self.theta = np.random.random(state_size * action_size)
        self.alpha = alpha
        self.gamma = gamma
        self.temperature = temperature

    def store(self, state, action, reward):
        """Append one transition to the current episode's buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def _phi(self, s, a):
        """Return the flat feature vector for taking action ``a`` in ``s``.

        The state is copied into the slot belonging to ``a``; every other
        slot is zero.
        """
        encoded = np.zeros([self.action_size, self.state_size])
        encoded[a] = s
        return encoded.flatten()

    def _preferences(self, s):
        """Return the temperature-scaled score of every action in ``s``."""
        # Row ``a`` of the reshaped weights lines up with the non-zero slot
        # of ``_phi(s, a)``, so this equals theta . phi(s, a) for each a.
        weights = self.theta.reshape(self.action_size, self.state_size)
        return weights.dot(np.asarray(s)) / self.temperature

    def _softmax(self, s, a):
        """Return the *unnormalized* softmax weight of action ``a`` in ``s``.

        Kept for callers that used it directly. :meth:`pi` does not use it
        because the raw exponential overflows for large scores.
        """
        return np.exp(self.theta.dot(self._phi(s, a)) / self.temperature)

    def pi(self, s):
        """Return the action probabilities in state ``s`` as a 1-D array."""
        prefs = self._preferences(s)
        # Subtracting the max leaves the softmax unchanged but keeps exp()
        # from overflowing to inf (which would produce nan probabilities).
        shifted = np.exp(prefs - np.max(prefs))
        return shifted / np.sum(shifted)

    def act(self, state):
        """Sample an action index according to the current policy."""
        probs = self.pi(state)
        return np.random.choice(self.action_size, p=probs)

    def _gradient(self, s, a):
        """Return the gradient of ``log pi(a | s)`` with respect to theta.

        For a linear softmax this is ``phi(s, a)`` minus the feature vector
        averaged over actions under the current policy.
        """
        # outer(probs, s) stacks probs[b] * s in row b, which is exactly
        # sum_b probs[b] * phi(s, b) once flattened.
        expected = np.outer(self.pi(s), np.asarray(s)).ravel()
        return self._phi(s, a) - expected

    def _G(self, t):
        """Return the discounted sum of stored rewards from step ``t`` on."""
        G = 0
        for tau in range(t, len(self.rewards)):
            G = G + self.gamma**(tau - t) * self.rewards[tau]
        return G

    def _returns(self):
        """Return the discounted return of every stored step in one pass."""
        returns = np.zeros(len(self.rewards))
        running = 0.0
        # Walk backwards so each return reuses the one after it:
        # G_t = r_t + gamma * G_{t+1}
        for t in reversed(range(len(self.rewards))):
            running = self.rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def train(self):
        """Take one policy-gradient step on the stored episode, then clear it.

        Rewards are used as stored; no mean/std normalization is applied.
        """
        grad = np.zeros(self.action_size * self.state_size)
        discount = 1.0  # gamma ** t, built up incrementally
        for s, a, G in zip(self.states, self.actions, self._returns()):
            grad = grad + G * discount * self._gradient(s, a)
            discount = discount * self.gamma

        # Rebind rather than update in place so outside references to the
        # previous parameter array are left untouched.
        self.theta = self.theta + self.alpha * grad

        self.states = []
        self.actions = []
        self.rewards = []

    
    
# Build the CartPole environment and a fresh agent sized to match it.
env = gym.make('CartPole-v1')
state = env.reset()
score = 0      # reward accumulated in the episode currently running
episode = 0    # number of finished episodes
prev_frame = None  # not referenced by the code visible in this section
# Read both dimensions from the environment rather than hard-coding them,
# so the agent's parameter vector always matches the observation layout.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
g = agent(state_size, action_size)


# TRAIN
# Collect one complete episode at a time with the current policy, then let
# the agent update its parameters from that episode.
MAX_EPISODES = 30000
while episode < MAX_EPISODES:
    # env.render()
    done = False
    while not done:
        # Sample an action from the agent's policy for the current state
        action = g.act(state)
        next_state, reward, done, info = env.step(action)
        if done:
            # The step that ends the episode is recorded with a penalty.
            # NOTE(review): `done` may also be set when the environment's
            # time limit ends the episode - confirm the penalty is wanted
            # in that case too.
            reward = -10
        score += reward
        g.store(state, action, reward)
        state = next_state

    # Episode finished: learn from it and start the next one
    episode += 1
    g.train()
    if episode % 100 == 0:
        clear_output(wait=True)
        print('Episode: {} Score: {}'.format(episode, score))
    score = 0
    state = env.reset()
        
# TEST
# Render the trained policy for a fixed number of episodes. No learning
# happens here: transitions are not stored and train() is never called.
episode = 0
score = 0  # start clean even if the previous loop was interrupted mid-episode
state = env.reset()
MAX_EPISODES = 100
try:
    while episode < MAX_EPISODES:  # episode loop
        env.render()
        # Sample an action from the agent's policy for the current state
        action = g.act(state)
        next_state, reward, done, info = env.step(action)
        if done:
            # Same terminal penalty as in training, so scores are comparable
            reward = -10
        score = score + reward
        state = next_state
        sleep(0.01)  # slow the rollout down enough to watch it

        if done:
            episode = episode + 1
            if episode % 100 == 0:
                clear_output(wait=True)
                print('Episode: {} Score: {}'.format(episode, score))
            score = 0
            state = env.reset()
finally:
    # Release the render window even when the run is interrupted
    # (e.g. by KeyboardInterrupt).
    env.close()


Episode: 30000 Score: 304.0


KeyboardInterrupt: 