## Importation des bibliothèques nécessaires

In [18]:
import numpy as np
import gym
import matplotlib.pyplot as plt

## Importation et initialisation aléatoire de l’environnement Mountain Car

In [19]:
# Create the Mountain Car environment and reset it once.
# reset() is left as the last expression so the notebook displays the value
# it returns (the starting observation).
ENV_ID = 'MountainCar-v0'
env = gym.make(ENV_ID)
env.reset()

array([-0.44685972,  0.        ])

## Définir la fonction Q-learning

In [20]:
def QLearning(env, learning, discount, epsilon, min_eps, episodes,
              render_last=20):
    """Train a tabular, epsilon-greedy Q-learning agent on ``env``.

    The two-component continuous observation is discretized by shifting it by
    ``env.observation_space.low``, scaling the components by 10 and 100
    respectively, and rounding to the nearest integer; the resulting pair
    indexes the first two axes of the Q table.

    Parameters
    ----------
    env : environment exposing ``observation_space.low/high``,
        ``action_space.n``, ``reset()``, ``step(action)``, ``render()`` and
        ``close()``. Both the 4-tuple ``step`` / bare-observation ``reset``
        API and the 5-tuple ``step`` / ``(observation, info)`` ``reset`` API
        are accepted.
    learning : float
        Learning rate applied to the temporal-difference error.
    discount : float
        Discount factor applied to the best next-state Q value.
    epsilon : float
        Initial exploration probability.
    min_eps : float
        Exploration probability reached after linear decay over ``episodes``.
    episodes : int
        Number of training episodes. ``0`` (or a negative value) trains
        nothing and returns an empty list.
    render_last : int, optional
        Number of final episodes during which ``env.render()`` is called on
        every step (default 20, the value previously hard-coded).

    Returns
    -------
    list
        Mean total reward of each consecutive block of 100 episodes. Episodes
        left over after the last complete block of 100 are not reported.

    Notes
    -----
    ``env`` is closed before returning, including when training is
    interrupted by an exception. Progress is printed every 100 episodes.
    """
    obs_low = env.observation_space.low
    obs_high = env.observation_space.high
    n_actions = env.action_space.n
    # Bins per unit of each observation component.
    scale = np.array([10, 100])

    def discretize(observation):
        """Convert a continuous observation into integer Q-table indices."""
        return np.round((observation - obs_low) * scale, 0).astype(int)

    # Determine the size of the discretized state space.
    num_states = discretize(obs_high) + 1

    # Initialize the Q table with small random values.
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1], n_actions))

    # Variables used to track rewards.
    reward_list = []
    ave_reward_list = []

    # Per-episode epsilon reduction; guarded so episodes == 0 does not divide
    # by zero.
    reduction = (epsilon - min_eps) / episodes if episodes > 0 else 0.0

    try:
        for i in range(episodes):
            tot_reward = 0

            # Newer Gym/Gymnasium versions return (observation, info).
            reset_result = env.reset()
            if isinstance(reset_result, tuple):
                state = reset_result[0]
            else:
                state = reset_result
            state_adj = discretize(state)

            done = False
            while not done:
                # Render the environment during the last `render_last` episodes.
                if i >= (episodes - render_last):
                    env.render()

                # Epsilon-greedy action selection.
                if np.random.random() < 1 - epsilon:
                    action = np.argmax(Q[state_adj[0], state_adj[1]])
                else:
                    action = np.random.randint(0, n_actions)

                # Take the action; accept both 4- and 5-element step results.
                step_result = env.step(action)
                if len(step_result) == 5:
                    state2, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    state2, reward, done, _ = step_result

                state2_adj = discretize(state2)

                if done and state2[0] >= 0.5:
                    # Episode ended with position >= 0.5: no future value to
                    # bootstrap from, so store the reward directly.
                    Q[state_adj[0], state_adj[1], action] = reward
                else:
                    # Standard Q-learning update towards the TD target.
                    delta = learning * (reward +
                                        discount * np.max(Q[state2_adj[0],
                                                            state2_adj[1]]) -
                                        Q[state_adj[0], state_adj[1], action])
                    Q[state_adj[0], state_adj[1], action] += delta

                tot_reward += reward
                state_adj = state2_adj

            # Linearly decay epsilon, never dropping below min_eps because of
            # floating-point accumulation.
            if epsilon > min_eps:
                epsilon = max(min_eps, epsilon - reduction)

            # Report the mean reward once per block of 100 episodes.
            reward_list.append(tot_reward)
            if (i + 1) % 100 == 0:
                ave_reward = np.mean(reward_list)
                ave_reward_list.append(ave_reward)
                reward_list = []
                print('Episode {} Average Reward: {}'.format(i+1, ave_reward))
    finally:
        # Release the environment even if training was interrupted.
        env.close()

    return ave_reward_list

## Exécution du Q-learning

In [26]:
# Train for 5000 episodes; hyperparameters are spelled out by name so the
# call is readable without looking up the signature.
rewards = QLearning(
    env,
    learning=0.3,
    discount=0.9,
    epsilon=0.8,
    min_eps=0,
    episodes=5000,
)

Episode 100 Average Reward: -200.0
Episode 200 Average Reward: -200.0
Episode 300 Average Reward: -200.0
Episode 400 Average Reward: -200.0
Episode 500 Average Reward: -200.0
Episode 600 Average Reward: -200.0
Episode 700 Average Reward: -200.0
Episode 800 Average Reward: -200.0
Episode 900 Average Reward: -200.0
Episode 1000 Average Reward: -200.0
Episode 1100 Average Reward: -200.0
Episode 1200 Average Reward: -200.0
Episode 1300 Average Reward: -200.0
Episode 1400 Average Reward: -200.0
Episode 1500 Average Reward: -200.0
Episode 1600 Average Reward: -200.0
Episode 1700 Average Reward: -200.0
Episode 1800 Average Reward: -200.0
Episode 1900 Average Reward: -200.0
Episode 2000 Average Reward: -200.0
Episode 2100 Average Reward: -200.0
Episode 2200 Average Reward: -200.0
Episode 2300 Average Reward: -200.0
Episode 2400 Average Reward: -200.0
Episode 2500 Average Reward: -200.0
Episode 2600 Average Reward: -200.0
Episode 2700 Average Reward: -200.0
Episode 2800 Average Reward: -200.0
E

## Statistiques

In [24]:
# Plot the averaged rewards against the episode count at which each average
# was recorded (100, 200, ...), save the figure to disk, then close it.
episode_marks = np.arange(1, len(rewards) + 1) * 100
ax = plt.gca()
ax.plot(episode_marks, rewards)
ax.set_xlabel('Episodes')
ax.set_ylabel('Average Reward')
ax.set_title('Average Reward vs Episodes')
ax.figure.savefig('rewards.jpg')
plt.close(ax.figure)