# T3 de Inteligência Artificial

Aluno: Leonardo Souza

Professor: Eduardo Bezerra

Link do vídeo: 

O objetivo deste trabalho é a implementação do algoritmo Q-Learning pelo método de aproximação linear.

O algoritmo será testado dentro do ambiente Taxi-v3 do OpenGym, para tal, será necessário instalar algumas dependências:

    pip3 install gym==0.17.3



### Testando a criação do ambiente Taxi-v3

In [3]:
import gym
import numpy as np

env = gym.make("Taxi-v3").env

env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



## Implementação do Q-Learning Linear

### Definição dos hyperparâmetros

learning_rate = 0.7

gamma = 0.618

decay_rate = 0.0001

features:

    - feature_manhattan_distance_taxi_passenger
    
    - feature_manhattan_distance_taxi_destiny


#### features:

In [413]:

def feature_manhattan_distance_taxi_passenger(state, action):
    s_  =  env.P[state][action][0][1]

    l, c, p, d = env.unwrapped.decode(s_)

    if p > 3:
        return 0
    
    p = env.unwrapped.locs[p]
    v = (abs(l - p[0]) + abs(c - p[1]))

    if v == 0:
        return 0

    return 1/v


def feature_manhattan_distance_taxi_destiny(state, action):
    s_  =  env.P[state][action][0][1]
    
    l, c, p, d = env.unwrapped.decode(s_)
    
    if p < 4:
        return 0
    
    d = env.unwrapped.locs[d]
    v = (abs(l - d[0]) + abs(c - d[1]))

    if v == 0:
        return 0

    return 1/v


### Implementação das Classe QLearningLinear

In [414]:
# Class QLearningLinear
class QLearningLinear(object):
    
    def __init__(self, features:list , n_actions=6, learning_rate = 0.7, gamma=0.618, decay_rate=0.0001):
        # cria um numpy array de pesos sendo 1:n, onde n representa o número de features
        self.weights  = np.random.rand(len(features))

        self.features = features

        self.learning_rate = learning_rate
        self.gamma = gamma
        self.max_epsilon = 1.0
        self.min_epsilon = 0.01
        self.epsilon = 1.0
        self.n_actions = n_actions
        self.decay_rate = decay_rate

    def choose_action(self, state, explore=True):
        exploration_tradeoff = np.random.uniform(0, 1)
    
        if explore and exploration_tradeoff < self.epsilon:
            return np.random.randint(self.n_actions) 

        max_q = round(max([self.Q(state, action) for action in range(6) ]))

        if 0 <= max_q  <= 5:
            return max_q
        else:
            return np.random.randint(self.n_actions) 

    def Q(self, state, action):
        """
            Q(S,A) 
        """

        f_values = [f(state, action) for f in  self.features]
        f_values = np.array(f_values)

        return (self.weights  * f_values).sum()
        
    def learn(self, state, action, reward, next_state, done, episode):
        '''
            alhpa == learning rate
            gamma == decaiment

            wi <= wi + alpha * [r + gamma * max_Q(s',a') - Q(s,a)] * fi(s, a)
        '''

        # neighbors = [ env.P[next_state][_][0][1] for _ in env.P[next_state]]
        # max_q = [self.Q(v, None)  for v in neighbors ]
        # max_q = max(max_q)
        # neighbors = [ env.P[next_state][_][0][1] for _ in env.P[next_state]]

        max_q = [self.Q(next_state, action)  for action in range(6) ]
        max_q = max(max_q)

        q = self.Q(state, action)
        difference = (reward + self.gamma * max_q - q)
        
        fi_ = np.array([f(state, action) for f in  self.features])

        self.weights  = self.weights  + self.learning_rate * difference * fi_

        if done:
            self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * \
            np.exp(-self.decay_rate * episode)


In [419]:
qll = QLearningLinear([feature_manhattan_distance_taxi_passenger, feature_manhattan_distance_taxi_destiny], learning_rate=0.7)
state = env.reset()
print(qll, state)

<__main__.QLearningLinear object at 0x7f6bbf14c880> 94


In [420]:

def train_fit(agent, env, epoch, debug=True):
    initial_state = env.reset()
    state = initial_state
    hist = []
    rewards = []

    while True:
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)
        agent.learn(state, action, reward, new_state, done, epoch)

        state = new_state
        
        # hist.append([state, action, weights ])
        rewards.append(reward)

        if done == True:
          break
      
    mean = np.array(reward).mean()
    
    if debug:
      print("epoch {} reward mean: {}".format(epoch, mean))

    return mean

In [423]:
# Testando o histórico de mudanças
epochs = 50
for epoch in range(epochs):
    train_fit(qll, env, epoch)


epoch 0 reward mean: 20.0
epoch 1 reward mean: 20.0
epoch 2 reward mean: 20.0
epoch 3 reward mean: 20.0
epoch 4 reward mean: 20.0
epoch 5 reward mean: 20.0
epoch 6 reward mean: 20.0
epoch 7 reward mean: 20.0
epoch 8 reward mean: 20.0
epoch 9 reward mean: 20.0
epoch 10 reward mean: 20.0
epoch 11 reward mean: 20.0
epoch 12 reward mean: 20.0
epoch 13 reward mean: 20.0
epoch 14 reward mean: 20.0
epoch 15 reward mean: 20.0
epoch 16 reward mean: 20.0
epoch 17 reward mean: 20.0
epoch 18 reward mean: 20.0
epoch 19 reward mean: 20.0
epoch 20 reward mean: 20.0
epoch 21 reward mean: 20.0
epoch 22 reward mean: 20.0
epoch 23 reward mean: 20.0
epoch 24 reward mean: 20.0
epoch 25 reward mean: 20.0
epoch 26 reward mean: 20.0
epoch 27 reward mean: 20.0
epoch 28 reward mean: 20.0
epoch 29 reward mean: 20.0
epoch 30 reward mean: 20.0
epoch 31 reward mean: 20.0
epoch 32 reward mean: 20.0
epoch 33 reward mean: 20.0
epoch 34 reward mean: 20.0
epoch 35 reward mean: 20.0
epoch 36 reward mean: 20.0
epoch 37 re

In [429]:
total_actions, total_penalties = 0, 0
NUM_EPISODES = 10

for e in range(NUM_EPISODES):

    state = env.reset()
    actions = 0
    penalties = 0
    reward = 0
    
    count = 1000

    while True and count > 0:
        action = qll.choose_action(state, explore = False)
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        actions += 1

        if done:
          break

        count -=1

    total_penalties += penalties
    total_actions += actions

print("**********************************")
print("Resultados")
print("**********************************")
print("Média de ações por episódio: {}".format(total_actions / NUM_EPISODES))
print("Penalidade média por episódio: {}".format(total_penalties / NUM_EPISODES))

**********************************
Resultados
**********************************
Média de ações por episódio: 5000.0
Penalidade média por episódio: 838.6
