# Offline RL

*By*

*Henri Lemoine; 261056402; henri.lemoine@mail.mcgill.ca*

*Frederic Baroz; 261118133; frederic.baroz@mail.mcgill.ca*

***TO DISCUSS***
- use discretized states for logistic regression?
- in fitted Q-learning, r is the reward r from the dataset, right?
- shuffle the dataset before training, or not?
- what are the horizontal lines... the returns from when we collect the dataset?
- find a way to include the fitted MLP Q-learning agent

## 2.&nbsp;Imports

In [2]:
import gymnasium as gym

import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


import matplotlib.pyplot as plt

## 3.&nbsp;Constants

In [3]:
# AGENT HYPERPARAMETERS AND SETTINGS
# Defaults used by the agents defined below
DEFAULT_GAMMA = 0.98  # default discount factor (actor-critic, fitted Q-learning and MLP agents)
DEFAULT_K = 10  # default number of passes over the dataset for the fitted Q-learning agent
LEARNING_RATES = [0.1, 0.01]  # for both fitted q-learning agent and logistic regression agent
LR_EPOCHS = 5000  # default number of epochs for logistic regression agent
MLP_EPOCHS = 1  # default number of epochs for MLP agent
EPISODE_CAP = 1000  # episodes are stopped once their cumulative reward exceeds this value

BATCH_SIZE = 0  # batch size for MLP agent (0=no batch is used)
FC1_DIMS = 256  # number of dimensions for first fully connected layer of QNetwork
FC2_DIMS = 256  # number of dimensions for second fully connected layer of QNetwork

# Shuffling
SHUFFLE_BEFORE_TRAINING = True  # read by the train() method of every offline agent

# Discretizing state
USE_DISCRETIZED_STATES = True  # for both logistic regression agent and MLP agent
STORE_STATES_AS_DISCRETIZED = True  # NOTE(review): not referenced by any code visible in this file
NB_BINS = 10  # number of bins per state component in the one-hot encoding

# PLOTTING SETTINGS
# Use standard deviation (True) or standard error (False)
USE_STD: bool = True

# KEYING CONSTANTS
# These integers are only used as dictionary keys / selectors, not as quantities.
# Dataset sizes
SIZE_100 = 0
SIZE_250 = 1
SIZE_500 = 2

# Dataset conditions
CONDITION_EXPERT = 0
CONDITION_MIXT = 1
CONDITION_RANDOM = 2

# Agent types
LR_AGENT = 0
Q_AGENT = 1
MLP_Q_AGENT = 2

## Utility

In [4]:
# PLOTS
def plot_simple_returns(returns, title):
    """Display a line plot of per-episode returns in a new 10x4 figure.

    Args:
        returns: sequence of returns, one value per episode.
        title: text used as the figure title.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(returns)
    # Decorations are independent of each other; title first, then axes
    plt.title(title)
    plt.xlabel("Episodes")
    plt.ylabel("Cumulative undiscounted return")
    plt.show()


def plot_bars(results):
    """Placeholder for a bar-plot helper; currently does nothing and returns None."""
    return None


# DISCRETIZE
def discretize_state(state, nb_bins, nb_states):
    cart_pos, cart_vel, pole_angle, pole_vel = state
    cart_pos = np.digitize(cart_pos, np.linspace(-2.4, 2.4, nb_bins - 1))
    cart_vel = np.digitize(cart_vel, np.linspace(-3.5, 3.5, nb_bins - 1))
    pole_angle = np.digitize(pole_angle, np.linspace(-0.4, 0.4, nb_bins - 1))
    pole_vel = np.digitize(pole_vel, np.linspace(-3.5, 3.5, nb_bins - 1))
    arr = np.zeros(nb_bins * nb_states)
    arr[cart_pos + nb_bins * 0] = 1
    arr[cart_vel + nb_bins * 1] = 1
    arr[pole_angle + nb_bins * 2] = 1
    arr[pole_vel + nb_bins * 3] = 1
    return arr  # (40,)


# BUILDING DATASETS
def make_dataset(expert_data, random_data, nb_episodes, condition):
    """Build a dataset from the first `nb_episodes` episodes of the source data.

    For each episode index in [0, nb_episodes), all rows of that episode are
    taken from the expert data (CONDITION_EXPERT), from the random data
    (CONDITION_RANDOM), or from one of the two chosen by a fair coin flip
    (CONDITION_MIXT).

    Args:
        expert_data: 2-D array of transitions; column 10 holds the episode index.
        random_data: 2-D array with the same layout as `expert_data`.
        nb_episodes: number of episodes to include.
        condition: one of the CONDITION_* constants.

    Returns:
        2-D float array with the selected rows, in episode order.

    Raises:
        Exception: if `nb_episodes` exceeds the number of episodes available
            in either source dataset.
    """
    episode_col = 10

    def count_episodes(data):
        # Episode indices are 0-based, so the count is the largest index + 1
        return int(data[:, episode_col].max()) + 1 if data.shape[0] else 0

    # Compare against the number of EPISODES, not the number of rows (samples)
    if nb_episodes > min(count_episodes(expert_data), count_episodes(random_data)):
        raise Exception(
            "Sorry, nb_episodes cannot be greater than the number of episodes in the source datasets (expert or random)."
        )

    # Start from an empty float array so the result keeps the same shape/dtype
    # rules as repeated appends, then concatenate once at the end
    chunks = [np.zeros((0, expert_data.shape[1]))]
    for i in range(nb_episodes):
        # Choose if data comes from expert or random agent
        source_data = expert_data
        if condition == CONDITION_RANDOM:
            source_data = random_data
        elif condition == CONDITION_MIXT:
            if np.random.randint(2, size=1)[0] == 1:
                source_data = random_data

        # Selecting all samples for this episode index
        chunks.append(source_data[source_data[:, episode_col] == i])

    return np.concatenate(chunks, axis=0)


def make_all_datasets(expert_data, random_data):
    """Build the nine datasets (3 conditions x 3 sizes), keyed by (size, condition).

    Datasets are generated condition by condition (expert, mixt, random) and,
    within a condition, from the smallest to the largest size.
    """
    sizes = ((SIZE_100, 100), (SIZE_250, 250), (SIZE_500, 500))
    conditions = (CONDITION_EXPERT, CONDITION_MIXT, CONDITION_RANDOM)

    return {
        (size_key, condition): make_dataset(expert_data, random_data, nb_episodes, condition)
        for condition in conditions
        for size_key, nb_episodes in sizes
    }


# CALCULATE STD
def get_error(returns):
    """Return the spread of `returns`.

    This is the standard deviation when USE_STD is true, otherwise the
    standard deviation divided by sqrt(len(returns)).
    """
    spread = np.std(returns)
    if not USE_STD:
        spread = spread / np.sqrt(len(returns))
    return spread


# NAMING STUFF
def uc_first(s):
    """Return `s` with its first character upper-cased; the rest is unchanged.

    An empty string is returned as-is instead of raising IndexError.
    """
    # Slicing (rather than indexing) keeps this safe for the empty string
    return s[:1].upper() + s[1:]


def name_agent(agent_ix, ucfirst=False):
    """Return the display name of an agent type.

    Args:
        agent_ix: one of LR_AGENT, Q_AGENT or MLP_Q_AGENT. Any other value
            falls back to the fitted Q-learning name.
        ucfirst: when true, upper-case the first character of the name.
    """
    if agent_ix == LR_AGENT:
        r = "logistic regression agent"
    elif agent_ix == MLP_Q_AGENT:
        # Previously the MLP agent was reported under the fitted Q-learning name
        r = "MLP Q-learning agent"
    else:
        r = "fitted Q-learning agent"

    if ucfirst:
        r = uc_first(r)
    return r


def name_size(size_ix, ucfirst=False):
    """Return the display name of a dataset size key (anything unknown is "large")."""
    label = (
        "small dataset"
        if size_ix == SIZE_100
        else ("medium dataset" if size_ix == SIZE_250 else "large dataset")
    )
    return uc_first(label) if ucfirst else label


def name_condition(condition_ix, ucfirst=False):
    """Return the display name of a dataset condition key (anything unknown is "pure random")."""
    label = (
        "pure expert"
        if condition_ix == CONDITION_EXPERT
        else ("mixt" if condition_ix == CONDITION_MIXT else "pure random")
    )
    return uc_first(label) if ucfirst else label


def name_algorithm(agent_ix, size_ix, condition_ix, ucfirst=False):
    """Return "<agent> on <condition> <size>", optionally with a capitalised first letter."""
    label = f"{name_agent(agent_ix)} on {name_condition(condition_ix)} {name_size(size_ix)}"
    return uc_first(label) if ucfirst else label


def name_bar(agent_ix, condition_ix, ucfirst=False):
    """Return the bar label "<agent> on <condition> dataset", optionally capitalised."""
    label = f"{name_agent(agent_ix)} on {name_condition(condition_ix)} dataset"
    return uc_first(label) if ucfirst else label

## 4.&nbsp;Expert and random agents

TODO: rename replay_buffer to dataset (because it is not really a replay buffer)



In [5]:
class RandomAgent:
    """Agent that picks actions uniformly at random and can log its transitions.

    Logged transitions are stored in `replay_buffer`, one row per step:
    columns 0-3 state, 4 action, 5-8 next state, 9 reward, 10 episode index.
    """

    def __init__(self, env):
        self.env = env
        self.nb_actions = self.env.action_space.n
        self.replay_buffer = np.zeros((0, 11))

    def run_episode(self, episode_ix, use_replay_buffer):
        """Run one episode and return its cumulative (undiscounted) reward.

        Args:
            episode_ix: index written in the last column of every logged row.
            use_replay_buffer: when true, the episode's transitions are added
                to `replay_buffer` once the episode is over.
        """
        done = False
        total_reward = 0
        # Rows are collected in a list and appended in one go at the end of the
        # episode, which avoids re-copying the whole buffer on every step
        transitions = []

        state = self.env.reset()[0]

        while not done and total_reward <= EPISODE_CAP:  # ensure episode is finite
            # Select uniformly random action
            action = np.random.choice(self.nb_actions)
            # Take action in environment
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated
            # Increment reward
            total_reward += reward

            # Store s, a, sprime, r and episode_index
            if use_replay_buffer:
                transitions.append([*state[:4], action, *next_state[:4], reward, episode_ix])
            # update state
            state = next_state

        if transitions:
            self.replay_buffer = np.append(self.replay_buffer, transitions, axis=0)

        return total_reward

    def run(self, nb_episodes, use_replay_buffer=False, do_print=False):
        """Run `nb_episodes` episodes and return the list of their returns.

        Args:
            nb_episodes: number of episodes to run.
            use_replay_buffer: forwarded to `run_episode`.
            do_print: when true, print progress roughly every 10% of the run.
        """
        start_time = time.time()

        if do_print:
            print("Starting RandomAgent run.")

        # At least 1, otherwise the modulo below divides by zero for runs
        # shorter than 10 episodes
        print_interval = max(1, nb_episodes // 10)

        rewards = []
        for i in range(nb_episodes):
            # Running episode
            reward = self.run_episode(episode_ix=i, use_replay_buffer=use_replay_buffer)
            rewards.append(reward)

            # Printing progress
            if do_print and (i + 1) % print_interval == 0:
                print(f"Episode {(i+1)}/{nb_episodes} ({((i+1)/nb_episodes * 100):.2f}%) complete.")
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

        return rewards

In [6]:
class ActorCriticAgent:
    """One-step actor-critic agent with linear function approximation.

    States are one-hot encoded with `discretize_state`. The policy is a
    softmax over `theta^T x` and the state value is `w^T x`. The agent learns
    online while it runs, and can log its transitions in `replay_buffer`
    (columns 0-3 state, 4 action, 5-8 next state, 9 reward, 10 episode index).
    """

    def __init__(self, env, alpha_theta=0, alpha_w=0, gamma=None):
        """
        Args:
            env: environment exposing `observation_space`, `action_space`,
                `reset()` and `step()`.
            alpha_theta: step size of the policy (actor) parameters.
            alpha_w: step size of the value (critic) parameters.
            gamma: discount factor; DEFAULT_GAMMA when None.
        """
        self.env = env
        self.alpha_theta = alpha_theta
        self.alpha_w = alpha_w
        self.gamma = DEFAULT_GAMMA if gamma is None else gamma
        self.nb_bins = NB_BINS

        self.nb_states = self.env.observation_space.shape[0]  # 4
        self.nb_actions = self.env.action_space.n  # 2

        self.theta = np.random.uniform(
            -0.001, 0.001, (self.nb_states * self.nb_bins, self.nb_actions)
        )  # (40, 2)
        self.w = np.random.uniform(-0.001, 0.001, (self.nb_states * self.nb_bins,))  # (40,)

        self.replay_buffer = np.zeros((0, 11))

    def get_policy(self, state):
        """Return the action probabilities pi(.|state, theta) for an encoded state."""

        def softmax(x):
            # Subtracting the max keeps the exponentials numerically stable
            e_x = np.exp(x - np.max(x))
            return e_x / e_x.sum(axis=0)

        return softmax(np.dot(self.theta.T, state).reshape(-1))

    def run_episode(self, use_replay_buffer, episode_ix):
        """Run and learn from one episode; return its cumulative reward.

        Args:
            use_replay_buffer: when true, the episode's transitions are added
                to `replay_buffer` once the episode is over.
            episode_ix: index written in the last column of every logged row.
        """
        done = False
        total_reward = 0
        I = 1  # discount accumulated since the start of the episode (gamma^t)
        transitions = []

        # Initialize S (first state of episode)
        state = self.env.reset()[0]

        # Loop while S is not terminal (for each time step)
        while not done and total_reward <= EPISODE_CAP:  # ensure episode is finite
            # Discretizing the state
            discr_state = discretize_state(state, nb_bins=self.nb_bins, nb_states=self.nb_states)

            # Choose action: A ~ pi(.|s, theta)
            policy = self.get_policy(discr_state)
            action = np.random.choice(self.nb_actions, p=policy)

            # Take action A, observe S', R
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated

            # v_hat(S', w) is 0 when S' is terminal. A truncation is only a
            # time limit, not a terminal state, so we still bootstrap there.
            if terminated:
                next_value = 0.0
            else:
                next_discr_state = discretize_state(
                    next_state, nb_bins=self.nb_bins, nb_states=self.nb_states
                )
                next_value = float(np.dot(next_discr_state, self.w))

            # delta <- R + gamma * v_hat(S', w) - v_hat(S, w)
            delta = reward + self.gamma * next_value - float(np.dot(discr_state, self.w))

            # Update w: w <- w + alpha_w * delta * grad v_hat(S, w)
            self.w += self.alpha_w * delta * discr_state

            # For a linear softmax policy,
            #   grad ln pi(A|S, theta) = x * (one_hot(A) - pi(.|S))^T
            one_hot_action = np.zeros(self.nb_actions)
            one_hot_action[action] = 1.0
            grad_log_pi = np.dot(
                discr_state.reshape(-1, 1), (one_hot_action - policy).reshape(1, -1)
            )

            # Update theta: theta <- theta + alpha_theta * I * delta * grad ln pi(A|S, theta)
            self.theta += self.alpha_theta * I * delta * grad_log_pi

            I *= self.gamma

            # Store s, a, sprime, r and episode index
            if use_replay_buffer:
                transitions.append([*state[:4], action, *next_state[:4], reward, episode_ix])

            state = next_state
            total_reward += reward

        # Single append per episode instead of re-copying the buffer on every step
        if transitions:
            self.replay_buffer = np.append(self.replay_buffer, transitions, axis=0)

        return total_reward

    def run(self, nb_episodes, use_replay_buffer=False, do_print=False):
        """Run `nb_episodes` episodes and return the list of their returns.

        Args:
            nb_episodes: number of episodes to run.
            use_replay_buffer: forwarded to `run_episode`.
            do_print: when true, print progress roughly every 10% of the run.
        """
        start_time = time.time()

        if do_print:
            print("Starting ActorCriticAgent run.")

        # At least 1, otherwise the modulo below divides by zero for runs
        # shorter than 10 episodes
        print_interval = max(1, nb_episodes // 10)

        rewards = []
        for i in range(nb_episodes):
            # Running episode
            reward = self.run_episode(use_replay_buffer=use_replay_buffer, episode_ix=i)
            rewards.append(reward)

            # Printing progress
            if do_print and (i + 1) % print_interval == 0:
                print(f"Episode {(i+1)}/{nb_episodes} ({((i+1)/nb_episodes * 100):.0f}%) complete.")
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

        return rewards

## Instantiating objects

In [7]:
# Environment shared by every agent in this notebook
env = gym.make("CartPole-v1")

# The same step size is used for the actor (theta) and the critic (w)
alpha = 1 / 8
actor_critic_agent = ActorCriticAgent(env, alpha_theta=alpha, alpha_w=alpha)
random_agent = RandomAgent(env)

## 5.&nbsp;Pre-train expert agent

In [None]:
# Pre-training actor-critic agent over 1000 episodes
# (transitions are not logged during this phase: use_replay_buffer=False)

start_time = time.time()
nb_episodes = 1000
pre_training_returns = actor_critic_agent.run(nb_episodes, use_replay_buffer=False, do_print=True)
exec_time = time.time() - start_time
print(f"Pre-training finished in {exec_time:.0f} seconds.")

In [None]:
# Showing return over episodes after expert agent pre-training
# Plot the per-episode returns recorded during pre-training
plot_simple_returns(
    returns=pre_training_returns, title=f"Expert agent (actor-critic, alpha={alpha}) pre-training"
)

## 6.&nbsp;Make datasets

###  Collect data from expert and random agent

In [None]:
# Number of episodes collected from each source agent
nb_episodes = 500

# Collecting total data for expert agent (we also store returns for plots)
# The actor-critic agent keeps updating its parameters while it collects.
expert_returns = actor_critic_agent.run(nb_episodes, use_replay_buffer=True, do_print=True)
expert_data = actor_critic_agent.replay_buffer

# Collecting total data for random agent (we also store returns for plots)
random_returns = random_agent.run(nb_episodes, use_replay_buffer=True, do_print=True)
random_data = random_agent.replay_buffer

### Make 9 datasets according to size and condition

In [11]:
# Dictionary of the nine datasets, keyed by (size key, condition key)
all_datasets = make_all_datasets(expert_data, random_data)

### Quick look at datasets

In [None]:
# Print the number of episodes and samples of every dataset
for (size_ix, condition_ix), dataset in all_datasets.items():
    print(f"{name_size(size_ix, True)} {name_condition(condition_ix)}:")
    # Column 10 holds the 0-based episode index, hence the +1
    print(f"  > Number of episodes: {np.max(dataset[:,10]+1):.0f}")
    print(f"  > Number of samples: {dataset.shape[0]}")
    # Blank line after the last size of each condition
    if size_ix == SIZE_500:
        print("")

## Offline agents

### Logistic regression agent

In [13]:
# Logistic regression model
class LogisticRegression(nn.Module):
    """Binary logistic regression: a single linear unit followed by a sigmoid."""

    def __init__(self, nb_features):
        super().__init__()
        self.linear = nn.Linear(nb_features, 1)

    def forward(self, x):
        """Return the predicted probability, with a trailing dimension of size 1."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [14]:
# Logistic regression agent
class LogisticRegressionAgent:
    """Imitation agent: logistic regression predicting the dataset's action from the state.

    The model is trained offline on (state, action) pairs and then acts
    greedily (action 1 when the predicted probability exceeds 0.5).
    """

    def __init__(self, env, learning_rate, use_discretized_states):
        """
        Args:
            env: environment used to evaluate the trained agent.
            learning_rate: SGD step size.
            use_discretized_states: when true, states are one-hot encoded with
                `discretize_state`; otherwise the raw 4 values are used.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.use_discretized_states = use_discretized_states

        self.nb_states = self.env.observation_space.shape[0]  # 4
        self.nb_actions = self.env.action_space.n  # 2

        self.lr_model = None

    def get_states(self, dataset):
        """Return the state features of `dataset` (columns 0-3) as a float32 tensor."""
        # Getting s from our dataset (first 4 columns)
        states = dataset[:, 0:4]

        if self.use_discretized_states:
            # Pre-allocate the result and fill it row by row; growing an array
            # with np.append copies it on every sample (quadratic time)
            res_states = np.zeros((states.shape[0], NB_BINS * self.nb_states))
            for row, state in enumerate(states):
                res_states[row] = discretize_state(
                    state, nb_bins=NB_BINS, nb_states=self.nb_states
                )
        else:
            # Otherwise, use continuous features (4 columns per sample)
            res_states = states

        return torch.from_numpy(res_states.astype(np.float32))

    def get_actions(self, dataset):
        """Return the actions of `dataset` (column 4) as a float32 column tensor."""
        actions = torch.from_numpy(dataset[:, 4].astype(np.float32))
        # Making targets a column vector so they match the model output shape
        return actions.view(actions.shape[0], 1)

    def train(self, dataset, nb_epochs=None, do_print=False, algo_name="[unknown algorithm]"):
        """Fit a fresh logistic regression model on `dataset` with full-batch SGD.

        Args:
            dataset: 2-D array of transitions (state in columns 0-3, action in
                column 4). It is not modified.
            nb_epochs: number of gradient steps; LR_EPOCHS when None.
            do_print: when true, print progress roughly every 10% of the epochs.
            algo_name: name used in the progress messages.
        """
        start_time = time.time()

        nb_epochs = LR_EPOCHS if nb_epochs is None else nb_epochs

        if do_print:
            print(f"Starting training of {algo_name}.")

        # Shuffle a copy so the caller's (shared) dataset keeps its row order
        if SHUFFLE_BEFORE_TRAINING:
            dataset = np.random.permutation(dataset)

        # Getting features and target
        if do_print:
            print("Encoding features...")
        states = self.get_states(dataset)
        actions = self.get_actions(dataset)
        if do_print:
            print("> Finished.")

        # Create model
        self.lr_model = LogisticRegression(nb_features=states.shape[1])

        # Loss function and optimizer
        criterion = nn.BCELoss()
        optimizer = torch.optim.SGD(self.lr_model.parameters(), lr=self.learning_rate)

        # At least 1, otherwise the modulo below divides by zero for fewer
        # than 10 epochs
        print_interval = max(1, nb_epochs // 10)

        # Training loop
        if do_print:
            print("Starting training loop...")
        for epoch in range(nb_epochs):
            # Forward pass
            y_predicted = self.lr_model(states)
            # Loss
            loss = criterion(y_predicted, actions)
            # Backward pass
            loss.backward()
            # Update weights
            optimizer.step()
            # Zero gradients
            optimizer.zero_grad()

            # Print progress
            if do_print and (epoch + 1) % print_interval == 0:
                print(
                    f"Epoch {(epoch+1)}/{nb_epochs} ({((epoch+1)/nb_epochs * 100):.0f}%) complete. Loss = {loss.item():.4f}."
                )
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

    def run_episode(self):
        """Run one episode with the trained model; return its cumulative reward."""
        done = False
        total_reward = 0

        state = self.env.reset()[0]

        while not done and total_reward <= EPISODE_CAP:  # ensure episode is finite
            # Encode the state the same way as during training
            features = state
            if self.use_discretized_states:
                features = discretize_state(state, nb_bins=NB_BINS, nb_states=self.nb_states)
            torch_features = torch.from_numpy(features.astype(np.float32))

            # Predict action
            with torch.no_grad():
                y_predicted = self.lr_model(torch_features)
            predicted_action = 1 if y_predicted[0] > 0.5 else 0

            # Take action and observe reward and next state
            next_state, reward, terminated, truncated, _ = self.env.step(predicted_action)

            done = terminated or truncated
            state = next_state

            # Increment reward
            total_reward += reward

        return total_reward

    def run(self, nb_episodes, do_print=False, algo_name="[unknown algorithm]"):
        """Run `nb_episodes` evaluation episodes and return the list of returns.

        Raises:
            Exception: if the agent has not been trained yet.
        """
        if self.lr_model is None:
            raise Exception("Sorry, you have to train the agent before you run it.")

        start_time = time.time()

        if do_print:
            print(f"Starting run of {algo_name}.")

        # At least 1, otherwise the modulo below divides by zero for runs
        # shorter than 10 episodes
        print_interval = max(1, nb_episodes // 10)

        rewards = []
        for i in range(nb_episodes):
            # Running episode
            reward = self.run_episode()
            rewards.append(reward)

            # Printing progress
            if do_print and (i + 1) % print_interval == 0:
                print(f"Episode {(i+1)}/{nb_episodes} ({((i+1)/nb_episodes * 100):.0f}%) complete.")
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

        return rewards


### Fitted Q-learning Agent

In [15]:
class FittedQLearningAgent:
    """Offline Q-learning agent with a linear Q-function over one-hot encoded states.

    `w` has one column of weights per action; Q(s, a) = x(s) . w[:, a].
    """

    def __init__(self, env, learning_rate, gamma=None):
        """
        Args:
            env: environment used to evaluate the trained agent.
            learning_rate: step size of the weight updates.
            gamma: discount factor; DEFAULT_GAMMA when None.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = DEFAULT_GAMMA if gamma is None else gamma

        self.nb_states = self.env.observation_space.shape[0]  # 4
        self.nb_actions = self.env.action_space.n  # 2

        self.w = np.random.uniform(-0.001, 0.001, size=(self.nb_states * NB_BINS, self.nb_actions))

    def train(self, dataset, k=None, do_print=False, algo_name="[unknown algorithm]"):
        """Sweep `k` times over `dataset`, applying one Q-learning update per sample.

        Args:
            dataset: 2-D array of transitions (state in columns 0-3, action in
                column 4, next state in columns 5-8, reward in column 9).
                It is not modified.
            k: number of passes over the dataset; DEFAULT_K when None.
            do_print: when true, print progress messages.
            algo_name: name used in the progress messages.
        """
        start_time = time.time()

        k = DEFAULT_K if k is None else k

        # Shuffle a copy so the caller's (shared) dataset keeps its row order
        if SHUFFLE_BEFORE_TRAINING:
            dataset = np.random.permutation(dataset)

        if do_print:
            print(f"Starting training of {algo_name}.")

        nb_samples = dataset.shape[0]
        # At least 1, otherwise the modulo below divides by zero for datasets
        # with fewer than 10 samples
        print_interval = max(1, nb_samples // 10)

        # The encodings do not depend on the weights, so they are computed once
        # instead of once per pass
        discr_states = [
            discretize_state(sample[0:4], nb_bins=NB_BINS, nb_states=self.nb_states)
            for sample in dataset
        ]
        discr_next_states = [
            discretize_state(sample[5:9], nb_bins=NB_BINS, nb_states=self.nb_states)
            for sample in dataset
        ]
        actions = dataset[:, 4].astype(int)
        rewards = dataset[:, 9]

        # Train over all dataset k times
        for i in range(k):
            if do_print:
                print(f"Starting k-iteration {(i+1)}/{k}.")

            samples = zip(discr_states, actions, discr_next_states, rewards)
            for j, (discr_state, action, discr_next_state, reward) in enumerate(samples):
                # Calculate q and next_q values
                q_value = np.dot(discr_state, self.w[:, action])
                next_q_values = np.dot(discr_next_state, self.w)
                # NOTE(review): the dataset has no terminal flag, so the target
                # bootstraps from the next state on every transition.
                error = reward + self.gamma * np.max(next_q_values) - q_value
                # Update weights
                self.w[:, action] += self.learning_rate * error * discr_state

                if do_print and (j + 1) % print_interval == 0:
                    print(
                        f"  Sample {(j+1)}/{dataset.shape[0]} ({((j+1)/dataset.shape[0] * 100):.0f}%) commplete"
                    )

            if do_print:
                print(f"> K-iteration {(i+1)}/{k} ({(((i+1)/k)*100):.0f}) complete.")

        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

    def select_action(self, discr_state):
        """Return a greedy action for an encoded state, breaking ties at random."""
        q_values = np.dot(self.w.T, discr_state)
        best_actions = np.flatnonzero(np.isclose(q_values, q_values.max()))
        return np.random.choice(best_actions)

    def run_episode(self):
        """Run one episode with the greedy policy; return its cumulative reward."""
        done = False
        total_reward = 0

        state = self.env.reset()[0]

        while not done and total_reward <= EPISODE_CAP:  # ensure episode is finite
            # Discretize state
            discr_state = discretize_state(state, nb_bins=NB_BINS, nb_states=self.nb_states)
            # Get next action according to greedy policy
            action = self.select_action(discr_state)

            # Take action and observe reward and next state
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated
            state = next_state

            # Increment reward
            total_reward += reward

        return total_reward

    def run(self, nb_episodes, do_print=False, algo_name="[unknown algorithm]"):
        """Run `nb_episodes` evaluation episodes and return the list of returns."""
        start_time = time.time()

        if do_print:
            print(f"Starting run of {algo_name}.")

        # At least 1, otherwise the modulo below divides by zero for runs
        # shorter than 10 episodes
        print_interval = max(1, nb_episodes // 10)

        rewards = []
        for i in range(nb_episodes):
            # Running episode
            reward = self.run_episode()
            rewards.append(reward)

            # Printing progress
            if do_print and (i + 1) % print_interval == 0:
                print(f"Episode {(i+1)}/{nb_episodes} ({((i+1)/nb_episodes * 100):.0f}%) complete.")
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

        return rewards

### MLP Q-learning agent

In [16]:
class QNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to one Q-value per action.

    The network owns its Adam optimizer and MSE loss, and moves itself to the
    GPU when one is available.
    """

    def __init__(self, learning_rate, input_dims, fc1_dims, fc2_dims, nb_actions):
        super().__init__()

        # Keep the construction arguments around as attributes
        self.learning_rate = learning_rate
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.nb_actions = nb_actions

        # input -> fc1 -> fc2 -> one output per action
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, nb_actions)

        # Training utilities
        self.loss = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

        # Run on the first GPU when there is one, on the CPU otherwise
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    def forward(self, state):
        """Return the raw (non-activated) Q-values for `state`."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [17]:
class QNetworkAgent:
    """Offline Q-learning agent whose Q-function is an MLP (`QNetwork`)."""

    def __init__(self, env, learning_rate, use_discretized_states, gamma=None):
        """
        Args:
            env: environment used to evaluate the trained agent.
            learning_rate: learning rate of the network's optimizer.
            use_discretized_states: when true, states are one-hot encoded with
                `discretize_state`; otherwise the raw 4 values are used.
            gamma: discount factor; DEFAULT_GAMMA when None.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = DEFAULT_GAMMA if gamma is None else gamma
        self.use_discretized_states = use_discretized_states

        self.nb_states = self.env.observation_space.shape[0]  # 4
        self.nb_actions = self.env.action_space.n  # 2

        self.qnetwork = None

    def get_states(self, dataset):
        """Return (states, next_states) feature arrays built from `dataset`.

        States come from columns 0-3 and next states from columns 5-8.
        """
        if self.use_discretized_states:
            # Pre-allocate and fill row by row; growing arrays with np.append
            # copies them on every sample (quadratic time)
            width = NB_BINS * self.nb_states
            res_states = np.zeros((dataset.shape[0], width))
            res_next_states = np.zeros((dataset.shape[0], width))
            for i in range(dataset.shape[0]):
                res_states[i] = discretize_state(
                    dataset[i, 0:4], nb_bins=NB_BINS, nb_states=self.nb_states
                )
                res_next_states[i] = discretize_state(
                    dataset[i, 5:9], nb_bins=NB_BINS, nb_states=self.nb_states
                )
        else:
            # Otherwise, use continuous features (4 columns per sample)
            res_states = dataset[:, 0:4]
            res_next_states = dataset[:, 5:9]

        return res_states, res_next_states

    def get_actions(self, dataset):
        """Return the actions of `dataset` (column 4) as a NumPy array."""
        return dataset[:, 4]

    def get_rewards(self, dataset):
        """Return the rewards of `dataset` (column 9) as a NumPy array."""
        return dataset[:, 9]

    def train(
        self,
        dataset,
        nb_epochs=None,
        batch_size=None,
        do_print=False,
        algo_name="[unknown algorithm]",
    ):
        """Train a fresh `QNetwork` on `dataset`, one gradient step per epoch.

        Args:
            dataset: 2-D array of transitions (state in columns 0-3, action in
                column 4, next state in columns 5-8, reward in column 9).
                It is not modified.
            nb_epochs: number of gradient steps; MLP_EPOCHS when None.
            batch_size: samples drawn (without replacement) per step;
                BATCH_SIZE when None. 0 means the whole dataset, and values
                larger than the dataset are capped to its size.
            do_print: when true, print progress messages.
            algo_name: name used in the progress messages.
        """
        start_time = time.time()

        nb_epochs = MLP_EPOCHS if nb_epochs is None else nb_epochs
        batch_size = BATCH_SIZE if batch_size is None else batch_size

        if do_print:
            print(f"Starting training of {algo_name}.")

        # Shuffle a copy so the caller's (shared) dataset keeps its row order
        if SHUFFLE_BEFORE_TRAINING:
            dataset = np.random.permutation(dataset)

        # Getting the state, next_state, action and reward (states can be encoded) into memory
        state_data, next_state_data = self.get_states(dataset)
        action_data = self.get_actions(dataset)
        reward_data = self.get_rewards(dataset)
        nb_samples = dataset.shape[0]

        # The input width follows the features actually used (40 when one-hot
        # encoded with the defaults, 4 for raw states)
        self.qnetwork = QNetwork(
            learning_rate=self.learning_rate,
            input_dims=state_data.shape[1],
            fc1_dims=FC1_DIMS,
            fc2_dims=FC2_DIMS,
            nb_actions=self.nb_actions,
        )
        device = self.qnetwork.device

        # At least 1, otherwise the modulo below divides by zero for fewer
        # than 10 epochs
        print_interval = max(1, nb_epochs // 10)

        for epoch in range(nb_epochs):
            # Zeroing out the gradient
            self.qnetwork.optimizer.zero_grad()

            # Row indices of this step's batch
            if batch_size > 0:
                batch = np.random.choice(nb_samples, min(batch_size, nb_samples), replace=False)
            else:
                batch = np.arange(nb_samples)

            states = torch.tensor(state_data[batch].astype(np.float32)).to(device)
            next_states = torch.tensor(next_state_data[batch].astype(np.float32)).to(device)
            rewards = torch.tensor(reward_data[batch].astype(np.float32)).to(device)
            # Actions are stored as floats in the dataset; indices must be integers
            actions = torch.tensor(action_data[batch].astype(np.int64)).to(device)

            # Q(s, a) of the action that was actually taken in each sample
            q_eval = self.qnetwork.forward(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # The bootstrap target is treated as a constant: no gradient must
            # flow through it.
            # NOTE(review): the dataset has no terminal flag, so the target
            # bootstraps from the next state on every transition.
            with torch.no_grad():
                q_next = self.qnetwork.forward(next_states)
                q_target = rewards + self.gamma * torch.max(q_next, dim=1)[0]

            loss = self.qnetwork.loss(q_eval, q_target)
            loss.backward()
            self.qnetwork.optimizer.step()

            if do_print and (epoch + 1) % print_interval == 0:
                print(
                    f"Epoch {(epoch+1)}/{nb_epochs} ({((epoch+1)/nb_epochs * 100):.0f}%) complete. Loss = {loss.item():.4f}."
                )

        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

    def run_episode(self):
        """Run one episode with the greedy policy; return its cumulative reward."""
        done = False
        total_reward = 0

        state = self.env.reset()[0]

        while not done and total_reward <= EPISODE_CAP:  # ensure episode is finite
            # Encode the state the same way as during training
            features = state
            if self.use_discretized_states:
                features = discretize_state(state, nb_bins=NB_BINS, nb_states=self.nb_states)

            # Predict action (batch of one state)
            with torch.no_grad():
                torch_state = torch.tensor(np.array([features]).astype(np.float32)).to(
                    self.qnetwork.device
                )
                actions = self.qnetwork.forward(torch_state)
                action = torch.argmax(actions).item()

            # Take action and observe reward and next state
            next_state, reward, terminated, truncated, _ = self.env.step(action)

            done = terminated or truncated
            state = next_state

            # Increment reward
            total_reward += reward

        return total_reward

    def run(self, nb_episodes, do_print=False, algo_name="[unknown algorithm]"):
        """Run `nb_episodes` evaluation episodes and return the list of returns.

        Raises:
            Exception: if the agent has not been trained yet.
        """
        if self.qnetwork is None:
            raise Exception("Sorry, you have to train the agent before you run it.")

        start_time = time.time()

        if do_print:
            print(f"Starting run of {algo_name}.")

        # At least 1, otherwise the modulo below divides by zero for runs
        # shorter than 10 episodes
        print_interval = max(1, nb_episodes // 10)

        rewards = []
        for i in range(nb_episodes):
            # Running episode
            reward = self.run_episode()
            rewards.append(reward)

            # Printing progress
            if do_print and (i + 1) % print_interval == 0:
                print(f"Episode {(i+1)}/{nb_episodes} ({((i+1)/nb_episodes * 100):.0f}%) complete.")
        if do_print:
            exec_time = time.time() - start_time
            print(f"> Finished ({exec_time:.0f} s.).\n")

        return rewards

In [None]:
# Quick experiment: train the MLP Q-learning agent on the small expert dataset
dataset = all_datasets[(SIZE_100, CONDITION_EXPERT)]
qn = QNetworkAgent(env, learning_rate=0.01, use_discretized_states=True)
qn.train(dataset=dataset, batch_size=100, nb_epochs=100, do_print=True)


# Evaluate the trained agent and report the mean return and its error
returns = qn.run(nb_episodes=1000, do_print=True, algo_name="q network")
print("mean return")
print(np.mean(returns))
print("error:")
print(get_error(returns))

# Per-episode returns of the evaluation run
plt.plot(returns)
plt.show()


In [None]:
# Longer evaluation (10000 episodes) of the same trained MLP agent
returns = qn.run(nb_episodes=10000, do_print=True, algo_name="q network")
print("mean return")
print(np.mean(returns))
print("error:")
print(get_error(returns))

## Train offline agents

In [None]:
# Train one logistic regression agent and one fitted Q-learning agent for every
# combination of learning rate, dataset size and dataset condition.
dataset_sizes = [SIZE_100, SIZE_250, SIZE_500]
dataset_conditions = [CONDITION_EXPERT, CONDITION_MIXT, CONDITION_RANDOM]

start_time = time.time()

# Trained agents, keyed by (learning rate index, size key, condition key, agent type)
algorithms = {}
for learning_rate_ix, learning_rate in enumerate(LEARNING_RATES):
    for dataset_size_ix in dataset_sizes:
        for dataset_condition_ix in dataset_conditions:
            lr_agent = LogisticRegressionAgent(
                env, learning_rate=learning_rate, use_discretized_states=USE_DISCRETIZED_STATES
            )
            lr_agent.train(
                all_datasets[(dataset_size_ix, dataset_condition_ix)],
                do_print=True,
                algo_name=name_algorithm(LR_AGENT, dataset_size_ix, dataset_condition_ix),
            )

            q_agent = FittedQLearningAgent(env, learning_rate=learning_rate)
            q_agent.train(
                all_datasets[(dataset_size_ix, dataset_condition_ix)],
                do_print=True,
                algo_name=name_algorithm(Q_AGENT, dataset_size_ix, dataset_condition_ix),
            )

            algorithms[(learning_rate_ix, dataset_size_ix, dataset_condition_ix, LR_AGENT)] = (
                lr_agent
            )
            algorithms[(learning_rate_ix, dataset_size_ix, dataset_condition_ix, Q_AGENT)] = q_agent

exec_time = time.time() - start_time
print(f"Training finished in {exec_time:.0f} seconds.")

## Test offline agents

In [None]:
# Evaluate every trained agent; `returns` uses the same keys as `algorithms`
start_time = time.time()
nb_episodes = 100
returns = {}
for algo_key, agent in algorithms.items():
    # algo_key = (learning rate index, size key, condition key, agent type)
    returns[algo_key] = agent.run(
        nb_episodes=nb_episodes,
        do_print=True,
        algo_name=name_algorithm(algo_key[3], algo_key[1], algo_key[2]),
    )

exec_time = time.time() - start_time
print(f"Testing finished in {exec_time:.0f} seconds.")

## Plot results

In [None]:
def plot_results(returns):
    """Plot mean test returns as grouped bars, one subplot per learning rate.

    Each subplot groups its bars by dataset size. Within a group there is one
    bar per (dataset condition, agent type) pair, labelled by ``name_bar`` and
    drawn with the error bar computed by ``get_error``. Two dashed horizontal
    lines mark the mean of the module-level ``expert_returns`` and
    ``random_returns`` as baselines.

    Args:
        returns: Mapping keyed by ``(learning_rate_ix, size_ix, condition_ix,
            agent_ix)`` whose values are the collections of returns to
            summarise for that configuration.
    """
    # squeeze=False always yields a 2-D array of axes, so indexing keeps
    # working when LEARNING_RATES holds a single value.
    fig, axes = plt.subplots(
        nrows=len(LEARNING_RATES), ncols=1, figsize=(16, 12), squeeze=False
    )
    fig.suptitle("Figure title")

    sizes = [SIZE_100, SIZE_250, SIZE_500]
    conditions = [CONDITION_RANDOM, CONDITION_MIXT, CONDITION_EXPERT]
    agents = [LR_AGENT, Q_AGENT]  # TODO: add MLP AGENT

    x = np.arange(len(sizes))
    width = 0.1

    for lr_ix, lr in enumerate(LEARNING_RATES):
        ax = axes[lr_ix, 0]
        ax.set_title(f"Title {lr}")

        ax.axhline(
            y=np.mean(expert_returns),
            color="b",
            linestyle="--",
            label="Expert agent (actor-critic)",
        )
        ax.axhline(y=np.mean(random_returns), color="r", linestyle="--", label="Random agent")

        # One series per bar label; each series holds one value per dataset size.
        mean_returns = {}
        error_returns = {}
        for condition_ix in conditions:
            for agent_ix in agents:
                bar_label = name_bar(agent_ix, condition_ix, True)
                series = [returns[(lr_ix, size_ix, condition_ix, agent_ix)] for size_ix in sizes]
                mean_returns[bar_label] = [np.mean(values) for values in series]
                error_returns[bar_label] = [get_error(values) for values in series]

        for bar_ix, (bar_label, means) in enumerate(mean_returns.items()):
            ax.bar(
                x + width * bar_ix,
                means,
                width,
                yerr=error_returns[bar_label],
                label=bar_label,
            )

        # Bars are centre-aligned at offsets 0..n-1 (in units of `width`), so
        # the middle of each group lies at (n - 1) / 2 bar widths from x.
        ax.set_xticks(
            x + width * (len(mean_returns) - 1) / 2,
            [name_size(size_ix, True) for size_ix in sizes],
        )
        ax.legend(loc="upper left", ncols=3)
        ax.set_ylim(0, 250)
        ax.set_ylabel("Cumulative undiscounted return")

    plt.show()


# Draw the summary figure from the `returns` dict filled in by the test cell.
plot_results(returns)

## Report