In [1]:
from __future__ import annotations
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm
import gymnasium as gym
import pickle

In [2]:
# Create the MountainCar-v0 environment used by the rest of the notebook.
# render_mode="human" asks Gymnasium to draw the environment on screen while it runs.
env = gym.make('MountainCar-v0', render_mode="human")

In [3]:
num_bins = 20

# Bin edges for the two observation components (position, velocity).
# The Q-table is keyed by these edge values, so they must not change
# independently of the stored table.
bins_position = np.linspace(-1.2, 0.6, num_bins)
bins_velocity = np.linspace(-0.07, 0.07, num_bins)


def discretize(obs):
    """Map a continuous observation onto the Q-table's discrete state key.

    Args:
        obs: Indexable observation whose first element is the position and
            whose second element is the velocity.

    Returns:
        A ``(position_edge, velocity_edge)`` tuple taken from
        ``bins_position`` and ``bins_velocity``.

    Notes:
        ``np.digitize`` returns ``i`` such that ``bins[i-1] <= x < bins[i]``,
        so an in-range value is represented by the *upper* edge of its
        interval. That mapping is kept as-is so keys stay compatible with an
        already trained table.

        For values at or above the last edge, ``np.digitize`` returns
        ``len(bins)``, which is one past the end of the edge array. The index
        is therefore clamped to the last bin instead of raising IndexError.
    """
    position_idx = min(np.digitize(obs[0], bins_position), len(bins_position) - 1)
    velocity_idx = min(np.digitize(obs[1], bins_velocity), len(bins_velocity) - 1)
    return (bins_position[position_idx], bins_velocity[velocity_idx])

**The agent - based on Q-learning**

To ensure that the agent explores the environment, the solution applied here is the ``epsilon-greedy`` strategy: a random action is picked with probability ``epsilon`` and the greedy action (currently valued as the best) with probability ``1 - epsilon``.

In [4]:
class QAgent:
    """Epsilon-greedy agent backed by a tabular Q-function over discretized states."""

    def __init__(
        self,
        num_bins: int,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Build the Q-table and store the learning and exploration settings.

        The table is pre-populated (it does not start empty): one all-zero
        array of action values is created for every (position, velocity) pair
        formed from the first ``num_bins`` entries of the module-level
        ``bins_position`` and ``bins_velocity``. The array length is read
        from the module-level ``env.action_space.n``.
        """
        self.num_bins = num_bins

        # One independent zero array per discretized state.
        self.q_values = {
            (bins_position[pos_idx], bins_velocity[vel_idx]): np.zeros(env.action_space.n)
            for pos_idx in range(self.num_bins)
            for vel_idx in range(self.num_bins)
        }

        # Learning settings.
        self.lr = learning_rate
        self.discount_factor = discount_factor

        # Exploration settings.
        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs):
        """Choose an action for the discretized state ``obs``.

        With probability ``epsilon`` a random action is sampled from the
        module-level ``env``'s action space (exploration); otherwise the
        index of the highest Q-value stored for ``obs`` is returned as an int.
        """
        explore = np.random.random() < self.epsilon
        if explore:
            return env.action_space.sample()

        greedy_action = np.argmax(self.q_values[obs])
        return int(greedy_action)
    

**The testing settings:**

In [5]:
# Hyperparameters for the test run.
n_episodes = 10_000
learning_rate = 0.01

# Exploration settings: start_epsilon is kept very low because this notebook
# only tests the agent. Start and final value are identical.
start_epsilon = 0.1
final_epsilon = 0.1
# Passed to the agent and stored there; no code visible in this notebook applies it.
epsilon_decay = start_epsilon / (n_episodes / 2)

# Build the agent; its Q-table is replaced with the loaded one further down.
agent = QAgent(
    num_bins=num_bins,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

**Load the pickle file:**

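Deserialize the pickled Q-table (`q_table.pkl`). Pickle restores the original Python objects (tuple keys and NumPy arrays), so no further format conversion is needed.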

In [6]:
# Read the stored Q-table from q_table.pkl (relative path, resolved against the
# current working directory).
# NOTE(security): pickle.load can execute arbitrary code during deserialization,
# so only load a q_table.pkl that comes from a trusted source.
with open('q_table.pkl', 'rb') as f:
    data_loaded = pickle.load(f)

In [7]:
# Sanity check: print the first ten entries of the loaded table.
# Each key is a discretized (position, velocity) state and each value holds
# the Q-values of that state's actions.
for key, value in list(data_loaded.items())[:10]:
    print(key, ':', value)

(-1.2, -0.07) : [0. 0. 0.]
(-1.2, -0.06263157894736843) : [0. 0. 0.]
(-1.2, -0.05526315789473685) : [0. 0. 0.]
(-1.2, -0.04789473684210527) : [0. 0. 0.]
(-1.2, -0.04052631578947369) : [0. 0. 0.]
(-1.2, -0.03315789473684211) : [0. 0. 0.]
(-1.2, -0.02578947368421053) : [0. 0. 0.]
(-1.2, -0.01842105263157895) : [0. 0. 0.]
(-1.2, -0.01105263157894737) : [0. 0. 0.]
(-1.2, -0.00368421052631579) : [0. 0. 0.]


In [8]:
# Replace the agent's zero-initialized Q-table with the loaded one.
# This rebinds the attribute without copying, so agent.q_values and
# data_loaded refer to the same dict object afterwards.
agent.q_values = data_loaded

**Test example:**

In [9]:
# Play a single episode with the loaded Q-table.
terminated = False
truncated = False
try:
    obs, info = env.reset()
    obs = discretize(obs)
    # An episode ends when the environment reports `terminated` OR `truncated`
    # (e.g. a step limit). Checking only `terminated` would keep stepping
    # forever whenever the agent never reaches the goal.
    while not (terminated or truncated):
        # No explicit env.render() here: the environment was created with
        # render_mode="human", which already renders as part of step().
        action = agent.get_action(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        obs = discretize(obs)
finally:
    # Always release the render window, even if the episode raised.
    env.close()

**Source for Q-Learning implementation:**

https://gymnasium.farama.org/tutorials/training_agents/blackjack_tutorial/#sphx-glr-tutorials-training-agents-blackjack-tutorial-py (02-06-2024)
