In [23]:
import random
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep

class agent(object):
    """Tabular actor-critic agent acting with a softmax policy.

    The policy is parameterised by ``theta``, which holds one weight per
    (state, action) pair. The features ``phi(s, a)`` are one-hot over
    state-action pairs, so the preference of action ``a`` in state ``s`` is
    the single entry of ``theta`` selected by ``phi(s, a)``. Preferences are
    divided by a fixed constant before being exponentiated.

    ``Q`` and ``V`` are tables updated one transition at a time by
    ``train_Q``; ``train`` weights each policy-gradient term of the stored
    trajectory by ``Q[s, a] - V[s]``.
    """

    # Divisor applied to the preferences before exponentiation.
    _TEMPERATURE = 100

    def __init__(self, state_size, action_size):
        """Create zero-initialised tables for the given problem size.

        Args:
            state_size: number of discrete states.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        # Trajectory buffers, filled by store() and emptied by train().
        self.states = []
        self.actions = []
        # train() resets this list, so create it up front as well.
        self.rewards = []
        self.theta = np.zeros(state_size*action_size)
        self.Q = np.zeros([state_size, action_size])
        self.V = np.zeros(state_size)
        self.alpha_theta = 0.0001  # step size for theta
        self.alpha_Q = 0.0001      # step size for Q and V
        self.gamma = 0.9           # discount factor

    def store(self, state, action):
        """Append one visited (state, action) pair to the trajectory."""
        self.states.append(state)
        self.actions.append(action)

    def _preferences(self, s):
        """Return the ``action_size`` entries of ``theta`` for state ``s``.

        ``feature_vector`` flattens a ``[state_size, action_size]`` array in
        row-major order, so row ``s`` of ``theta`` reshaped the same way holds
        exactly the weights that ``theta.dot(feature_vector(s, a))`` selects.
        """
        return self.theta.reshape(self.state_size, self.action_size)[s]

    def _softmax(self, s, a):
        """Return the unnormalised softmax weight of action ``a`` in ``s``.

        This is the raw exponential and can overflow for very large
        preferences; ``policy`` computes the normalised probabilities in a
        numerically stable way instead of summing these values.
        """
        return np.exp(self._preferences(s)[a] / self._TEMPERATURE)

    def policy(self, s):
        """Return the action probabilities in state ``s``.

        Returns:
            1-D array of length ``action_size`` that sums to 1.
        """
        logits = self._preferences(s) / self._TEMPERATURE
        # Subtracting the maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (which would normalise to NaN).
        weights = np.exp(logits - np.max(logits))
        return weights / np.sum(weights)

    def act(self, state):
        """Sample an action index from the policy in ``state``."""
        probs = self.policy(state)
        return random.choices(range(self.action_size), weights=probs)[0]

    def feature_vector(self, s, a):
        """Return the flattened one-hot encoding ``phi(s, a)``.

        Returns:
            1-D array of length ``state_size * action_size`` with a single 1
            at the position of the pair ``(s, a)``.
        """
        encoded = np.zeros([self.state_size, self.action_size])
        encoded[s, a] = 1
        return encoded.flatten()

    def _gradient(self, s, a):
        """Return ``phi(s, a)`` minus the policy-expected feature in ``s``.

        With one-hot features only row ``s`` is non-zero: it equals the
        one-hot of ``a`` minus the action probabilities. The row is filled
        directly instead of summing ``action_size`` dense one-hot vectors.
        """
        grad = np.zeros((self.state_size, self.action_size))
        grad[s] = -self.policy(s)
        grad[s, a] += 1
        return grad.ravel()

    def train(self):
        """Apply one policy update from the stored trajectory, then clear it.

        Each step ``t`` contributes
        ``(Q[s, a] - V[s]) * gamma**t * _gradient(s, a)``; the sum is scaled
        by ``alpha_theta`` and added to ``theta``.
        """
        grad = np.zeros(self.action_size * self.state_size)
        for t, (state, action) in enumerate(zip(self.states, self.actions)):
            advantage = self.Q[state, action] - self.V[state]
            grad = grad + advantage * (self.gamma**t) * self._gradient(state, action)

        self.theta = self.theta + self.alpha_theta * grad

        self.states = []
        self.actions = []
        self.rewards = []

    def train_Q(self, state, action, reward, next_state, next_action, done):
        """Move ``Q[state, action]`` and ``V[state]`` toward one-step targets.

        Args:
            state, action: the transition's starting state and the action taken.
            reward: reward received for the transition.
            next_state, next_action: successor state and the action chosen
                there; only read when ``done`` is false.
            done: when true, the targets are the bare ``reward`` with no
                bootstrapped term.
        """
        if done:
            q_target = reward
            v_target = reward
        else:
            q_target = reward + self.gamma * self.Q[next_state, next_action]
            v_target = reward + self.gamma * self.V[next_state]

        # Exponential moving average toward the target with rate alpha_Q.
        self.Q[state, action] = (1 - self.alpha_Q) * self.Q[state, action] + self.alpha_Q * q_target
        self.V[state] = (1 - self.alpha_Q) * self.V[state] + self.alpha_Q * v_target
    
    

# Training: run the agent on the environment for MAX_EPISODES episodes.
# env = gym.make('FrozenLake-v0')
env = gym.make("Taxi-v3")
score = 0
episode = 0
# Read the table sizes from the environment's discrete spaces instead of
# hard-coding them, so switching the environment above needs no other edits.
state_size = int(env.observation_space.n)
action_size = int(env.action_space.n)
MAX_EPISODES = 100000

g = agent(state_size, action_size)

state = env.reset()
action = g.act(state)

while episode < MAX_EPISODES:  # one environment step per iteration
    # env.render()
    next_state, reward, done, info = env.step(action)  # apply the action sampled from the policy
    score = score + reward
    g.store(state, action)
    # The next action is chosen before the value update because train_Q
    # takes both the next state and the next action.
    next_action = g.act(next_state)
    g.train_Q(state, action, reward, next_state, next_action, done)
    state = next_state
    action = next_action

    if done:
        episode = episode + 1
        g.train()  # policy update from the stored trajectory
        if episode % 100 == 0:
            clear_output(wait=True)
            print('Episode: {} Score: {}'.format(episode, score))
        score = 0
        state = env.reset()
        # next_action was sampled for the finished episode's last state;
        # re-sample so the new episode starts with an action drawn for its
        # own initial state.
        action = g.act(state)
        
# TEST: render a few episodes with the current policy; no training calls here.
episode = 0
# Start from a clean score; a value left over from an interrupted training
# run would otherwise leak into the first evaluation episode.
score = 0
state = env.reset()
MAX_EPISODES = 10
while episode < MAX_EPISODES:  # one environment step per iteration
    env.render()
    action = g.act(state)
    next_state, reward, done, info = env.step(action)  # apply the action sampled from the policy
    score = score + reward
    state = next_state
    sleep(0.01)  # brief pause between rendered frames

    if done:
        episode = episode + 1
        # Report every episode: with only 10 episodes an
        # `episode % 100 == 0` check would never fire.
        print('Episode: {} Score: {}'.format(episode, score))
        score = 0
        state = env.reset()
        


Episode: 99500 Score: -785


KeyboardInterrupt: 