<a href="https://colab.research.google.com/github/hmerkle/deepRL/blob/main/DRL23_HW04_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- inner loop: create steps that you put in replay buffer
- train q-network: set y_j

# pip install and imports

In [135]:
#!pip install "gymnasium[accept-rom-license, atari]"

In [136]:
#!pip install --upgrade ipykernel

In [137]:
import tensorflow as tf
import gymnasium as gym
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# code

In [138]:
class ExperienceReplayBuffer:
    """
    max_size: maximum size of the replay buffer
    env_name: name of the environment
    parallel_game_unrolls: number of parallel games to play ( we run multiple independent copies of same env in parallel with gym.vactor.VectorEnv)
    observation_preprocessing_function: function that preprocesses the observation
    """
    def __init__(self, max_size:int, env_name: str, parallel_game_unrolls: int, observation_preprocessing_function: callable, unroll_steps: int):
        self.max_size = max_size
        self.env_name = env_name
        self.parallel_game_unrolls = parallel_game_unrolls
        self.unroll_steps = unroll_steps
        self.observation_preprocessing_function = observation_preprocessing_function
        self.envs = gym.vector.make(env_name, self.parallel_game_unrolls)
        self.num_possible_actions = self.envs.single_action_space.n
        self.current_states, _ = self.envs.reset()
        self.data = [] # fill this up step by step

    def fill_with_samples(self, dqn_network, epsilon: float):
        states_list = [] # length of this list is unroll_steps
        actions_list = []
        rewards_list = []
        subsequent_states_list = []
        terminateds_list = [] # each timestep is a batch

        # in each of these steps we take 1 step in each of all our parallel_game_unrolls environments
        # adds new samples into ERP
        for i in range(self.unroll_steps):

            actions = self.sample_epsilon_greedy_action(dqn_network, epsilon)
            print("actions: ", actions.shape)
            next_states, rewards, terminateds, _, _ = self.envs.step(actions)

            # put state, action, reward, next observation into erp
            observations = self.observation_preprocessing_function(self.current_states)
            states_list.append(self.current_states)
            #print("states_list: ", states_list[0])
            actions_list.append(actions)
            rewards_list.append(rewards)
            subsequent_states_list.append(next_states)
            terminateds_list.append(terminateds)

            self.current_states = next_states

        # create a generator that yields a sample for each of the timesteps
        def data_generator():

            # walk through all timesteps created and for each timestep walk through all respective game to create 1 samle for each of these
            for states_batch, actions_batch, rewards_batch, subsequent_states_batch, terminateds_batch in zip(states_list, actions_list, rewards_list, subsequent_states_list, terminateds_list):
              print("shapes batches: s, a, r, ss, t ", states_batch.shape, actions_batch.shape, rewards_batch.shape, subsequent_states_batch.shape, terminateds_batch.shape)
              for game_idx in range(self.parallel_game_unrolls):
                state = states_batch[game_idx, :, :, :]
                action = actions_batch[game_idx]
                reward = rewards_batch[game_idx]
                subsequent_state = subsequent_states_batch[game_idx, :, :, :]
                terminated = terminateds_batch[game_idx]
                # state = states_batch(game_idx, _, _, _)# batch, h, w, c
                # #print("inner state: ", state.shape, states_batch[game_idx, :, :, :].shape)
                # action = actions_batch(game_idx)
                # #print("inner action: ", action.shape, actions_batch[game_idx].shape)
                # reward = rewards_batch(game_idx)
                # subsequent_state = subsequent_states_batch(game_idx)#[game_idx, :, :, :]
                # #print("inner state: ", subsequent_state.shape, subsequent_states_batch[game_idx, :, :, :].shape)
                # terminated = terminateds_batch(game_idx)
                print("shapes: s, a, r, ss, t ", state.shape, action.shape, reward.shape, subsequent_state.shape, terminated.shape)
                yield (state, action, reward, subsequent_state, terminated)

        print("self.envs.action_space.shape: ", self.envs.action_space.shape)
        dataset_tensor_specs = (tf.TensorSpec(shape=(210, 160, 3), dtype=tf.uint8),
                                tf.TensorSpec(shape=self.envs.action_space.shape, dtype=tf.int32),
                                tf.TensorSpec(shape = (), dtype=tf.float32),
                                tf.TensorSpec(shape=(210, 160, 3), dtype=tf.uint8),
                                tf.TensorSpec(shape = (), dtype=tf.bool))

        # create for each step, we create sperate dataset
        new_samples_dataset = tf.data.Dataset.from_generator(data_generator, output_signature=dataset_tensor_specs)
        print("self.observation_preprocessing_function(state): ", states_list[0].shape, (self.observation_preprocessing_function(states_list[0])).shape)
        # Apply preprocessing first
        new_samples_dataset = new_samples_dataset.map(lambda state, action, reward, subsequent_state, terminated:(self.observation_preprocessing_function(state), action, reward, self.observation_preprocessing_function(subsequent_state), terminated)) #TODO
        # shuffle on each iteration
        new_samples_dataset = new_samples_dataset.cache()#.shuffle(buffer_size=self.parallel_game_unrolls*self.unroll_steps, reshuffle_each_iteration=True)

        # make sure cache is applied!
        count = 0
        for elem in new_samples_dataset:
          if count < 2:
            print(elem)
          continue
        # print("new_samples_dataset secured")

        self.data.append(new_samples_dataset)
        datapoints_in_data = len(self.data) * self.parallel_game_unrolls * self.unroll_steps # last two calculate how many s, a, s', t tuples in one element of datalist
        if datapoints_in_data > self.max_size:
          self.data.pop(0)


    def create_dataset(self):
        # creates a tf.data.Dataset object from the ERP
        erp_dataset = tf.data.Dataset.sample_from_datasets(self.data, weights = [1/float(len(self.data)) for _ in self.data], stop_on_empty_dataset = False) # choose from datasets? #TODO
        return erp_dataset

    def sample_epsilon_greedy_action(self, dqn_network, epsilon: float):
        # samples actions from dqn_network using epsilon greedy fashion
        observations = self.observation_preprocessing_function(self.current_states)
        #print("self.current_states: ", (self.current_states).shape)
        q_values = dqn_network(observations) # tensor of type tf.float32, shape (parallel_game_unrolls, num_actions)
        print("q_values: ", q_values.shape)
        greedy_actions = tf.argmax(q_values, axis=1) # tensor of type tf.int64, shape (parallel_game_unrolls, )
        print("greedy_actions: ", greedy_actions.shape)
        random_actions = tf.random.uniform(shape=(self.parallel_game_unrolls,), minval=0, maxval=self.num_possible_actions, dtype=tf.int64) # tensor of type tf.int64, shape (parallel_game_unrolls, )

        # sample random actions with probability epsilon
        # prob 1-epsilon to take greedy action -> (p > epsilon) is true
        # prob epsilon to take random action
        greedy_actions_mask = tf.random.uniform(shape=(self.parallel_game_unrolls,), minval=0, maxval=1, dtype=tf.float32) > epsilon # tensor of type tf.bool, shape (parallel_game_unrolls, )
        actions = tf.where(greedy_actions_mask, greedy_actions, random_actions).numpy() # tensor of type tf.int64, shape (parallel_game_unrolls, )
        return actions

In [139]:
def observation_preprocessing_function(observation):
    # (210, 160, 3) to (84, 84, 3)
    #converts an observation to a tensor of type tf.float32, size(84,84)
    observation = tf.image.resize(observation, size=(84,84))#, resize=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    observation = tf.cast(observation, dtype=tf.float32) /128.0 - 1.0 # values between -1 and 1

    return observation

In [140]:
def create_dqn_network(num_actions: int):
    #create an input for tf functional model api
    input_layer = tf.keras.Input(shape=(84, 84, 3), dtype= tf.float32)

    x = tf.keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu')(input_layer)
    x = tf.keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.Conv2D(filters=16, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.MaxPool2D(pool_size=2)(x)

    x = tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.Conv2D(filters=32, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.MaxPool2D(pool_size=2)(x)

    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.MaxPool2D(pool_size=2)(x)

    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu')(x)
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x) + x #residual connection
    x = tf.keras.layers.GlobalAvgPool2D()(x)

    x = tf.keras.layers.Dense(units=64, activation='relu')(x) + x #residual connection
    x = tf.keras.layers.Dense(units=num_actions, activation='linear')(x)

    model = tf.keras.Model(inputs=input_layer, outputs=x)

    return model

In [141]:
# def train_dqn(train_dqn_network, target_network, dataset, optimizer, gamma: float, num_training_steps:int, batch_size: int):

#     dataset = dataset.batch(batch_size).prefetch(4)

#     @tf.function
#     def training_step(q_targets, observations, actions):
#         with tf.GradientTape() as tape:
#             q_predictions_all_actions = train_dqn_network(observations) #shape of q_predictions is (batch_size, num_actions)
#             q_predictions = tf.gather(q_predictions_all_actions, actions, batch_dims=1) # find q-values at right indices from q_predictions_all_actions
#             loss = tf.reduce_mean(tf.square(q_predictions - q_targets))
#         gradients = tape.gradient(loss, train_dqn_network.trainable_variables)
#         optimizer.apply_gradients(zip(gradients, train_dqn_network.trainable_variables))
#         return loss.numpy()

#     losses = []
#     q_values = []
#     #some preparations
#     for i, state_transition in enumerate(dataset):
#         state, action, reward, subsequent_state, terminated = state_transition
#         #calculate q_targets
#         q_vals =  target_network(subsequent_state) #TODO: check if this is correct
#         q_values.append(q_vals.numpy())
#         max_q_values = tf.reduce_max(q_vals, axis=1)
#         use_subsequent_state = tf.where(terminated, tf.zeros_like(max_q_values, dtype=tf.float32), tf.ones_like(max_q_values, dtype=tf.float32))
#         # calculate q_targets
#         q_target = reward + (gamma*max_q_values*use_subsequent_state)
#         #train on data
#         loss = training_step(q_target, observations=state, actions=action).numpy()
#         losses.append(loss)
#         if i >= num_training_steps:
#             break

#     return np.mean(losses), np.mean(q_values)

In [142]:
def train_dqn(train_network, target_network, dataset, optimizer, gamma: float, num_training_steps:int, batch_size: int):

    dataset = dataset.batch(batch_size).prefetch(4)

    @tf.function
    def training_step(q_targets, observations, actions):
        with tf.GradientTape() as tape:
            q_predictions_all_actions = train_network(observations) #shape of q_predictions is (batch_size, num_actions)
            q_predictions = tf.gather(q_predictions_all_actions, actions, batch_dims=1) # find q-values at right indices from q_predictions_all_actions
            loss = tf.reduce_mean(tf.square(q_predictions - q_targets))
        gradients = tape.gradient(loss, train_network.trainable_variables)
        optimizer.apply_gradients(zip(gradients, train_network.trainable_variables))
        return loss.numpy()

    losses = []
    q_values = []
    #some preparations
    for i, state_transition in enumerate(dataset):
        state, action, reward, subsequent_state, terminated = state_transition
        #calculate q_targets
        q_vals =  target_network(subsequent_state) #TODO: check if this is correct
        q_values.append(q_vals.numpy())
        max_q_values = tf.reduce_max(q_vals, axis=1)
        use_subsequent_state = tf.where(terminated, tf.zeros_like(max_q_values, dtype=tf.float32), tf.ones_like(max_q_values, dtype=tf.float32))
        # calculate q_targets
        q_target = reward + (gamma*max_q_values*use_subsequent_state)
        #train on data
        loss = training_step(q_target, observations=state, actions=action).numpy()
        losses.append(loss)
        if i >= num_training_steps:
            break

    return np.mean(losses), np.mean(q_values)

In [143]:
def test_dqn_network(test_dqn_network, env_name: str, num_parallel_game_tests:int, gamma: float, preprocessing_function: callable, test_epsilon: float=0.05):
    envs = gym.vector.make(env_name, num_envs=num_parallel_game_tests)
    num_possible_actions = envs.single_action_space.n
    states,_ = envs.reset()

    #episodes finished is a numpy vector of shape (num_parallel_tests), filled with Booleans, starting with all False
    episodes_finished = np.zeros(num_parallel_game_tests, dtype=bool)
    returns = np.zeros(num_parallel_game_tests)

    done=False
    timestep = 0
    while not done:
        states = preprocessing_function(states)
        q_values = test_dqn_network(states)
        greedy_actions = tf.argmax(q_values, axis=1) # tensor of type tf.int64, shape (num_parallel_game_tests,)
        random_actions = tf.random.uniform(shape=(num_parallel_game_tests,), minval=0, maxval=num_possible_actions, dtype=tf.int64)
        epsilon_sampling = tf.random.uniform(shape=(num_parallel_game_tests,), minval=0, maxval=1, dtype=tf.float32) > test_epsilon
        actions = tf.where(epsilon_sampling, greedy_actions, random_actions).numpy() # type numpy.ndarray, shape (num_parallel_game_tests,)
        states, rewards, terminateds, _, _ = envs.step(actions)

        #compute pointwise or between episodes_finished and terminateds
        episodes_finished = np.logical_or(episodes_finished, terminateds) # if an episode is finished, it stays finished
        returns += ((gamma**timestep)*rewards)*(np.logical_not(episodes_finished).astype(np.float32)) # only add rewards if episode is not finished -> multiply with 0 if episode is finished
        timestep += 1
        #done if all episodes are finished
        done = np.all(episodes_finished)
        test_steps += 1
        if test_steps%100 == 0:
            print("Test steps: ", test_steps, np.sum(episodes_finished)/num_parallel_game_tests, terminateds.shape, episodes_finished.shape)

    return np.mean(returns)

In [144]:
# # Take a fraction of the source network weights and copy weights to target network weights
# def polyak_averaging_weigths(source_network, target_network, polyak_averaging_factor: float):
#     source_network_weights = source_network.get_weights()
#     target_network_weights = target_network.get_weights()
#     averaged_weights = []

#     for source_network_weight, target_network_weight in zip(source_network_weights, target_network_weights):
#         fraction_kept_weights = polyak_averaging_factor*target_network_weight # weight that are kept from previous iteration
#         fraction_updated_weights = (1 - polyak_averaging_factor) * source_network_weight
#         average_weight = fraction_kept_weights + fraction_updated_weights
#         averaged_weights.append(average_weight)
#     target_network.set_weights(averaged_weights)

In [145]:
# Take a fraction of the source network weights and copy weights to target network weights
def polyak_averaging_weigths(source_network, target_network, polyak_averaging_factor: float):
    source_network_weights = source_network.get_weights()
    target_network_weights = target_network.get_weights()
    averaged_weights = []

    for source_network_weight, target_network_weight in zip(source_network_weights, target_network_weights):
        fraction_kept_weights = polyak_averaging_factor*source_network_weight # weight that are kept from previous iteration
        fraction_updated_weights = (1 - polyak_averaging_factor) * target_network_weight
        average_weight = fraction_kept_weights + fraction_updated_weights
        averaged_weights.append(average_weight)
    target_network.set_weights(averaged_weights)

In [146]:
def visualise_results(result_df, step):
    # create a figure with 3 subplots
    fig, axis = plt.subplots(3, 1)
    # include row idxs explicitely in results_df
    result_df['step'] = result_df.index
    # plot average returns
    sns.lineplot(x = "step", y = "average_returns", data = result_df, ax = axis[0])
    # plot average losses
    sns.lineplot(x = "step", y = "average_losses", data = result_df, ax = axis[1])
    # plot average q-values
    sns.lineplot(x = "step", y = "average_q_values", data = result_df, ax = axis[2])
    # # save the figure
    # plt.savefig("average_q_values.png")
    # # create a timestring from the timestamp
    # timestring = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # # save the figure
    # plt.savefig(f'./results/{timestring}_results_step{step}.png')
    # plt.close(fig)

In [147]:
def dqn():
    ENVIRONMENT_NAME = 'ALE/Breakout-v5'
    NUM_ACTIONS = gym.make(ENVIRONMENT_NAME).action_space.n
    ERP_SIZE= 1000#100000 # experience replay buffer size
    PARALLEL_GAME_UNROLLS = 1#24#128 # number of parallel games to play
    UNROLL_STEPS = 1 # number of steps to unroll each game
    EPSILON = 0.2
    GAMMA = 0.995
    NUM_TRAINING_STEPS_PER_ITERATION = 16
    TRAIN_BATCH_SIZE = 128#512
    NUM_TRAINING_ITERS = 3#50000
    TEST_EVERY_N_STEPS = 50
    TEST_NUM_PARALLEL_ENVS = 1#24#128
    PREFILL_STEPS = 1#24#100
    POLYAK_AVERAGING_FACTOR = 0.99

    # container containing all  s, a, r, s', t transitions
    # With deep Q-networks, we often utilize this technique called experience replay during training. With experience replay, we store
    # the agent's experiences at each time step in a data set called the replay memory
    erp = ExperienceReplayBuffer(max_size=ERP_SIZE,
                                env_name=ENVIRONMENT_NAME,
                                parallel_game_unrolls=PARALLEL_GAME_UNROLLS,
                                observation_preprocessing_function=observation_preprocessing_function,
                                unroll_steps=UNROLL_STEPS)

    #input is some image from game and outputs atari output
    # dqn that is trained
    dqn_agent = create_dqn_network(num_actions=NUM_ACTIONS)
    #this is the target network, used to calculate the q_estimation targets
    target_network = create_dqn_network(num_actions=NUM_ACTIONS)

    #dqn_agent(tf.random.uniform(shape=(1, 84, 84, 3)))
    #dqn_agent.summary()

    #copy over the weights from the dqn_agent to the target_network via polyak averaging with factor 0.8
    polyak_averaging_weigths(source_network=dqn_agent, target_network=target_network, polyak_averaging_factor=0.0)#0.8)

    dqn_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

    #prefill the replay buffer -> less  bias
    prefill_exploration_epsilon = 1.0
    for prefill_step in range(PREFILL_STEPS):
        erp.fill_with_samples(dqn_agent, prefill_exploration_epsilon)

    #test the agent
    return_tracker = []
    dqn_prediction_error = []
    average_q_values = []

    for step in range(NUM_TRAINING_ITERS):
        print("Training iteration: ", step)
        #step 1: put some s, a, r, s' transitions into the replay buffer
        erp.fill_with_samples(dqn_agent, EPSILON)
        dataset = erp.create_dataset()

        #step 2: train on some samples from the replay buffer
        average_loss, average_q_value = train_dqn(dqn_agent, target_network, dataset, optimizer=dqn_optimizer, gamma=GAMMA, num_training_steps=NUM_TRAINING_STEPS_PER_ITERATION, batch_size=TRAIN_BATCH_SIZE)
        #update the target network via polyak averaging
        erp.polyak_averaging_weights(source_network=dqn_agent, target_network=target_network, polyak_averaging_factor=POLYAK_AVERAGING_FACTOR)

        #step 3: test the agent
        if step % TEST_EVERY_N_STEPS == 0:  #test every N steps
            average_returns = test_dqn_network(dqn_agent, ENVIRONMENT_NAME, num_parallel_tests=TEST_NUM_PARALLEL_ENVS, gamma=GAMMA )
            return_tracker.append(average_returns)
            dqn_prediction_error.append(average_loss)
            average_q_values.append(average_q_value)
            #print average loss, average returns, average q_values
            print(f'average return: {average_returns}, TESTING: average loss: {average_loss}, average q_value-estimate: {average_q_value}')

            #put all the result lists into a dataframe (by transforming them into a dict first)
            results_dict = {'average_return': return_tracker, 'average_loss': dqn_prediction_error, 'average_q_values': average_q_values }
            results_df = pd.Dataframe(results_dict)

            #visualise the results with sns
            # create 3 subplots
            visualise_results(results_df, step)

In [148]:
if __name__ == '__main__':
    dqn()

q_values:  (1, 4)
greedy_actions:  (1,)
actions:  (1,)
self.envs.action_space.shape:  (1,)
self.observation_preprocessing_function(state):  (1, 210, 160, 3) (1, 84, 84, 3)
shapes batches: s, a, r, ss, t  (1, 210, 160, 3) (1,) (1,) (1, 210, 160, 3) (1,)
shapes: s, a, r, ss, t  (210, 160, 3) () () (210, 160, 3) ()


InvalidArgumentError: ignored

In [None]:
# def dqn():
#     ENVIRONMENT_NAME = 'ALE/Breakout-v5'
#     NUM_ACTIONS = gym.make(ENVIRONMENT_NAME).action_space.n
#     ERP_SIZE= 1000#100000 # experience replay buffer size
#     PARALLEL_GAME_UNROLLS = 24#128 # number of parallel games to play
#     UNROLL_STEPS = 4 # number of steps to unroll each game
#     EPSILON = 0.2
#     GAMMA = 0.995
#     NUM_TRAINING_STEPS_PER_ITERATION = 16
#     TRAIN_BATCH_SIZE = 128#512
#     NUM_TRAINING_ITERS = 3#50000
#     TEST_EVERY_N_STEPS = 50
#     TEST_NUM_PARALLEL_ENVS = 24#128
#     PREFILL_STEPS = 24#100
#     POLYAK_AVERAGING_FACTOR = 0.99

#     # container containing all  s, a, r, s', t transitions
#     erp = ExperienceReplayBuffer(max_size=ERP_SIZE,
#                                 env_name=ENVIRONMENT_NAME,
#                                 parallel_game_unrolls=PARALLEL_GAME_UNROLLS,
#                                 observation_preprocessing_function=observation_preprocessing_function,
#                                 unroll_steps=UNROLL_STEPS)

#     #input is some image from game and outputs atari output
#     # dqn that is trained
#     dqn_agent = create_dqn_network(num_actions=NUM_ACTIONS)
#     #dqn_agent.summary()
#     dqn_agent(tf.random.uniform(shape=(1, 84, 84, 3)))

#     #this is the target network, used to calculate the q_estimation targets
#     target_network = create_dqn_network(num_actions=NUM_ACTIONS)
#     #copy over the weights from the dqn_agent to the target_network via polyak averaging with factor 0.8
#     polyak_averaging_weigths(source_network=dqn_agent, target_network=target_network, polyak_averaging_factor=0.0)#0.8)

#     dqn_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

#     #prefill the replay buffer -> less  bias
#     prefill_exploration_epsilon = 1.0
#     for prefill_step in range(PREFILL_STEPS):
#         erp.fill_with_samples(dqn_agent, prefill_exploration_epsilon)

#     #test the agent
#     return_tracker = []
#     dqn_prediction_error = []
#     average_q_values = []

#     for step in range(NUM_TRAINING_ITERS):
#         print("Training iteration: ", step)
#         #step 1: put some s, a, r, s' transitions into the replay buffer
#         erp.fill_with_samples(dqn_agent, EPSILON)
#         dataset = erp.create_dataset()

#         #step 2: train on some samples from the replay buffer
#         average_loss, average_q_value = train_dqn(dqn_agent, target_network, dataset, optimizer=dqn_optimizer, gamma=GAMMA, num_training_steps=NUM_TRAINING_STEPS_PER_ITERATION, batch_size=TRAIN_BATCH_SIZE)
#         #update the target network via polyak averaging
#         erp.polyak_averaging_weights(source_network=dqn_agent, target_network=target_network, polyak_averaging_factor=POLYAK_AVERAGING_FACTOR)

#         #step 3: test the agent
#         if step % TEST_EVERY_N_STEPS == 0:  #test every N steps
#             average_returns = test_dqn_network(dqn_agent, ENVIRONMENT_NAME, num_parallel_tests=TEST_NUM_PARALLEL_ENVS, gamma=GAMMA )
#             return_tracker.append(average_returns)
#             dqn_prediction_error.append(average_loss)
#             average_q_values.append(average_q_value)
#             #print average loss, average returns, average q_values
#             print(f'average return: {average_returns}, TESTING: average loss: {average_loss}, average q_value-estimate: {average_q_value}')

#             #put all the result lists into a dataframe (by transforming them into a dict first)
#             results_dict = {'average_return': return_tracker, 'average_loss': dqn_prediction_error, 'average_q_values': average_q_values }
#             results_df = pd.Dataframe(results_dict)

#             #visualise the results with sns
#             # create 3 subplots
#             visualise_results(results_df, step)