In [None]:
# Import necessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from collections import namedtuple, deque

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Define the neural network model
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.to(device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

cpu


In [None]:
# Define the DQN agent class
class DQNAgent:
    # Initialize the DQN agent
    def __init__(self, state_size, action_size, seed, lr):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr)

        self.memory = ReplayBuffer(action_size, buffer_size=int(1e5), batch_size=64, seed=seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % 4
        if self.t_step == 0:
            if len(self.memory) > 64:
                experiences = self.memory.sample()
                self.learn(experiences, gamma=0.99)

    # Choose an action based on the current state
    def act(self, state, eps=0.):
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_tensor)
        self.qnetwork_local.train()

        if np.random.random() > eps:
            return action_values.argmax(dim=1).item()
        else:
            return np.random.randint(self.action_size)

    # Learn from batch of experiences
    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = zip(*experiences)
        states = torch.from_numpy(np.vstack(states)).float().to(device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
        next_states = torch.from_numpy(np.vstack(next_states)).float().to(device)
        dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).float().to(device)

        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau=1e-3)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [None]:
# Initialize the environment and the agent
import gym
from collections import deque
import random

# Set up the environment
env = gym.make("CartPole-v1")

# Define training parameters
num_episodes = 250
max_steps_per_episode = 200
epsilon_start = 1.0
epsilon_end = 0.2
epsilon_decay_rate = 0.99
gamma = 0.9
lr = 0.0025
buffer_size = 10000
buffer = deque(maxlen=buffer_size)
batch_size = 128
update_frequency = 10


# Initialize the DQNAgent
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
new_agent = DQNAgent(input_dim, output_dim, seed=170715, lr = lr)

  deprecation(
  deprecation(


In [None]:
# Training loop
for episode in range(num_episodes):
    # Reset the environment
    state = env.reset()
    epsilon = max(epsilon_end, epsilon_start * (epsilon_decay_rate ** episode))

    # Run one episode
    for step in range(max_steps_per_episode):
        # Choose and perform an action
        action = new_agent.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)

        buffer.append((state, action, reward, next_state, done))

        if len(buffer) >= batch_size:
            batch = random.sample(buffer, batch_size)
            # Update the agent's knowledge
            new_agent.learn(batch, gamma)

        state = next_state

        # Check if the episode has ended
        if done:
            break

    if (episode + 1) % update_frequency == 0:
        print(f"Episode {episode + 1}: Finished training")

  if not isinstance(terminated, (bool, np.bool8)):


Episode 10: Finished training
Episode 20: Finished training
Episode 30: Finished training
Episode 40: Finished training
Episode 50: Finished training
Episode 60: Finished training
Episode 70: Finished training
Episode 80: Finished training
Episode 90: Finished training
Episode 100: Finished training
Episode 110: Finished training
Episode 120: Finished training
Episode 130: Finished training
Episode 140: Finished training
Episode 150: Finished training
Episode 160: Finished training
Episode 170: Finished training
Episode 180: Finished training
Episode 190: Finished training
Episode 200: Finished training
Episode 210: Finished training
Episode 220: Finished training
Episode 230: Finished training
Episode 240: Finished training
Episode 250: Finished training


In [None]:
# Evaluate the agent's performance
test_episodes = 100
episode_rewards = []
counter = 0
for episode in range(test_episodes):
    state = env.reset()
    episode_reward = 0
    done = False

    while not done:
        action = new_agent.act(state, eps=0.)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state

    episode_rewards.append(episode_reward)
    print(f"Average reward over {test_episodes} test episodes: {episode_reward:.2f}")
    counter = counter + 1

# average_reward = sum(episode_rewards) / test_episodes
# print(f"Average reward over {test_episodes} test episodes: {average_reward:.2f}")


Average reward over 100 test episodes: 124.00
Average reward over 100 test episodes: 124.00
Average reward over 100 test episodes: 124.00
Average reward over 100 test episodes: 125.00
Average reward over 100 test episodes: 117.00
Average reward over 100 test episodes: 102.00
Average reward over 100 test episodes: 125.00
Average reward over 100 test episodes: 111.00
Average reward over 100 test episodes: 118.00
Average reward over 100 test episodes: 109.00
Average reward over 100 test episodes: 105.00
Average reward over 100 test episodes: 107.00
Average reward over 100 test episodes: 115.00
Average reward over 100 test episodes: 128.00
Average reward over 100 test episodes: 112.00
Average reward over 100 test episodes: 116.00
Average reward over 100 test episodes: 109.00
Average reward over 100 test episodes: 127.00
Average reward over 100 test episodes: 115.00
Average reward over 100 test episodes: 121.00
Average reward over 100 test episodes: 116.00
Average reward over 100 test episo

In [None]:
from torch import nn
from torch.nn import functional


class Model(nn.Module):
    def __init__(self, input_features, output_values):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features=input_features, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=output_values)

    def forward(self, x):
        x = functional.selu(self.fc1(x))
        x = functional.selu(self.fc2(x))
        x = self.fc3(x)
        return x

## DQN without Tile coding

In [None]:
from torch import nn
from torch.nn import functional


class Model(nn.Module):
    def __init__(self, input_features, output_values):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features=input_features, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=output_values)

    def forward(self, x):
        x = functional.selu(self.fc1(x))
        x = functional.selu(self.fc2(x))
        x = self.fc3(x)
        return x

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
# from Model import Model
import gym
from collections import deque
import random

# Parameters
use_cuda = True
episode_limit = 100
target_update_delay = 2  # update target net every target_update_delay episodes
test_delay = 10
learning_rate = 1e-3
epsilon = 1  # initial epsilon
min_epsilon = 0.1
epsilon_decay = 0.9 / 2.5e3
gamma = 0.7
memory_len = 10000

env = gym.make('CartPole-v1')
n_features = len(env.observation_space.high)
n_actions = env.action_space.n

memory = deque(maxlen=memory_len)
# each memory entry is in form: (state, action, env_reward, next_state)
device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
policy_net = Model(n_features, n_actions).to(device)
target_net = Model(n_features, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()


def get_states_tensor(sample, states_idx):
    sample_len = len(sample)
    states_tensor = torch.empty((sample_len, n_features), dtype=torch.float32, requires_grad=False)

    features_range = range(n_features)
    for i in range(sample_len):
        for j in features_range:
            states_tensor[i, j] = sample[i][states_idx][j].item()

    return states_tensor


def normalize_state(state):
    state[0] /= 2.5
    state[1] /= 2.5
    state[2] /= 0.3
    state[3] /= 0.3


def state_reward(state, env_reward):
    return env_reward - (abs(state[0]) + abs(state[2])) / 2.5


def get_action(state, e=min_epsilon):
    if random.random() < e:
        # explore
        action = random.randrange(0, n_actions)
    else:
        state = torch.tensor(state, dtype=torch.float32, device=device)
        action = policy_net(state).argmax().item()

    return action


def fit(model, inputs, labels):
    inputs = inputs.to(device)
    labels = labels.to(device)
    train_ds = TensorDataset(inputs, labels)
    train_dl = DataLoader(train_ds, batch_size=5)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.train()
    total_loss = 0.0

    for x, y in train_dl:
        out = model(x)
        loss = criterion(out, y)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    return total_loss / len(inputs)


def optimize_model(train_batch_size=100):
    train_batch_size = min(train_batch_size, len(memory))
    train_sample = random.sample(memory, train_batch_size)

    state = get_states_tensor(train_sample, 0)
    next_state = get_states_tensor(train_sample, 3)

    q_estimates = policy_net(state.to(device)).detach()
    next_state_q_estimates = target_net(next_state.to(device)).detach()
    next_actions = policy_net(next_state.to(device)).argmax(dim=1)

    for i in range(len(train_sample)):
        next_action = next_actions[i].item()
        q_estimates[i][train_sample[i][1]] = (state_reward(next_state[i], train_sample[i][2]) +
                                              gamma * next_state_q_estimates[i][next_action].item())

    fit(policy_net, state, q_estimates)


def train_one_episode():
    global epsilon
    current_state = env.reset()
    normalize_state(current_state)
    done = False
    score = 0
    reward = 0
    while not done:
        action = get_action(current_state, epsilon)
        next_state, env_reward, done, _ = env.step(action)
        normalize_state(next_state)
        memory.append((current_state, action, env_reward, next_state))
        current_state = next_state
        score += env_reward
        reward += state_reward(next_state, env_reward)

        optimize_model(100)

        epsilon -= epsilon_decay

    return score, reward


def test():
    state = env.reset()
    normalize_state(state)
    done = False
    score = 0
    reward = 0
    while not done:
        action = get_action(state)
        state, env_reward, done, _ = env.step(action)
        normalize_state(state)
        score += env_reward
        reward += state_reward(state, env_reward)

    return score, reward


def main():
    best_test_reward = 0

    for i in range(episode_limit):
        score, reward = train_one_episode()

        print(f'Episode {i + 1}: score: {score} - reward: {reward}')

        if i % target_update_delay == 0:
            target_net.load_state_dict(policy_net.state_dict())
            target_net.eval()

        if (i + 1) % test_delay == 0:
            test_score, test_reward = test()
            print(f'Test Episode {i + 1}: test score: {test_score} - test reward: {test_reward}')
            if test_reward > best_test_reward:
                print('New best test reward. Saving model')
                best_test_reward = test_reward
                torch.save(policy_net.state_dict(), 'policy_net.pth')

    if episode_limit % test_delay != 0:
        test_score, test_reward = test()
        print(f'Test Episode {episode_limit}: test score: {test_score} - test reward: {test_reward}')
        if test_reward > best_test_reward:
            print('New best test reward. Saving model')
            best_test_reward = test_reward
            torch.save(policy_net.state_dict(), 'policy_net.pth')

    print(f'best test reward: {best_test_reward}')


if __name__ == '__main__':
    main()

  if not isinstance(terminated, (bool, np.bool8)):


Episode 1: score: 37.0 - reward: 33.238720370084046
Episode 2: score: 16.0 - reward: 13.892036166787149
Episode 3: score: 32.0 - reward: 29.029974260926245
Episode 4: score: 10.0 - reward: 8.443620854616166
Episode 5: score: 49.0 - reward: 41.380140855908394
Episode 6: score: 26.0 - reward: 22.595840267091987
Episode 7: score: 16.0 - reward: 13.488393396139147
Episode 8: score: 45.0 - reward: 37.11015791669488
Episode 9: score: 47.0 - reward: 40.29508927743883
Episode 10: score: 32.0 - reward: 29.469418792799114
Test Episode 10: test score: 140.0 - test reward: 124.13258359879252
New best test reward. Saving model
Episode 11: score: 29.0 - reward: 26.417702896893026
Episode 12: score: 14.0 - reward: 12.099806751310824
Episode 13: score: 14.0 - reward: 12.030390596389772
Episode 14: score: 12.0 - reward: 10.210883410274983
Episode 15: score: 25.0 - reward: 21.62835376858711
Episode 16: score: 16.0 - reward: 14.277893885225055
Episode 17: score: 32.0 - reward: 26.96623806804419
Episode 1

## DQN with Tile coding

In [1]:
import numpy as np
class TileCoder_Carpole:
    def __init__(self, num_tiles_per_dim, num_tilings, state_limits, clip_limits=None):
        self.num_tiles_per_dim = num_tiles_per_dim
        self.num_tilings = num_tilings
        self.state_limits = np.array(state_limits)
        self.clip_limits = np.array(clip_limits) if clip_limits is not None else state_limits
        self.tile_widths = np.zeros(len(state_limits))
        self.offsets = np.zeros(len(state_limits))
        for i, (clip_limit, state_limit) in enumerate(zip(self.clip_limits, self.state_limits)):
            # Use the clipped limit for calculating tile width, but keep the original limit for clipping states
            self.tile_widths[i] = (clip_limit[1] - clip_limit[0]) / (num_tiles_per_dim - 2)
            self.offsets[i] = self.tile_widths[i] / (2 * num_tilings)
        self.total_tiles = num_tilings * num_tiles_per_dim * len(state_limits)

    def get_tile_indices(self, state):
        tile_indices = np.zeros(self.total_tiles)
        for tiling in range(self.num_tilings):
            for dim, (state_dim, limits, clip_limit, offset, tile_width) in enumerate(
                zip(state, self.state_limits, self.clip_limits, self.offsets, self.tile_widths)
            ):
                clipped_state_dim = np.clip(state_dim, clip_limit[0], clip_limit[1])
                norm_state_dim = ((clipped_state_dim - clip_limit[0]) + (tiling * offset)) / tile_width
                # Calculate tile index for the dimension, with special handling for outermost tiles
                tile = int(norm_state_dim) + 1
                tile = max(1, min(self.num_tiles_per_dim - 2, tile))
                index = (tiling * self.num_tiles_per_dim * len(self.state_limits)) + (dim * self.num_tiles_per_dim) + tile
                tile_indices[index] = 1
        return tile_indices

# Parameters for tile coding
num_tiles_per_dim = 100
num_tilings = 5
state_limits = [[-2.4, 2.4], [-np.inf, np.inf], [-12 * 2 * np.pi / 360, 12 * 2 * np.pi / 360], [-np.inf, np.inf]]
clip_limits = [[-2.4, 2.4], [-10, 10], [-12 * 2 * np.pi / 360, 12 * 2 * np.pi / 360], [-10, 10]]

tile_coder = TileCoder_Carpole(num_tiles_per_dim, num_tilings, state_limits, clip_limits)

In [2]:
from torch import nn
from torch.nn import functional


class Model(nn.Module):
    def __init__(self, input_features, output_values):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features=input_features, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=output_values)

    def forward(self, x):
        x = functional.selu(self.fc1(x))
        x = functional.selu(self.fc2(x))
        x = self.fc3(x)
        return x


import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
# from Model import Model
import gym
from collections import deque
import random

# Parameters
use_cuda = True
episode_limit = 50
target_update_delay = 2  # update target net every target_update_delay episodes
test_delay = 10
learning_rate = 1e-4
epsilon = 1  # initial epsilon
min_epsilon = 0.1
epsilon_decay = 0.9 / 2.5e3
gamma = 0.99
memory_len = 10000

env = gym.make('CartPole-v1')
n_features = len(env.observation_space.high)
n_actions = env.action_space.n

memory = deque(maxlen=memory_len)
# each memory entry is in form: (state, action, env_reward, next_state)
device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
# policy_net = Model(n_features, n_actions).to(device)
# target_net = Model(n_features, n_actions).to(device)
input_features = num_tiles_per_dim * num_tilings * len(state_limits)
output_values = env.action_space.n
policy_net = Model(input_features, output_values).to(device)
target_net = Model(input_features, output_values).to(device)

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()


def get_states_tensor(sample, states_idx):
    sample_len = len(sample)
    states_tensor = torch.empty((sample_len, n_features), dtype=torch.float32, requires_grad=False)

    features_range = range(n_features)
    for i in range(sample_len):
        for j in features_range:
            states_tensor[i, j] = sample[i][states_idx][j].item()

    return states_tensor


def normalize_state(state):
    state[0] /= 2.5
    state[1] /= 2.5
    state[2] /= 0.3
    state[3] /= 0.3


def state_reward(state, env_reward):
    return env_reward - (abs(state[0]) + abs(state[2])) / 2.5


def get_action(state, e=min_epsilon):
    if random.random() < e:
        action = random.randrange(0, n_actions)
    else:
        encoded_state = tile_coder.get_tile_indices(state)
        encoded_state = torch.tensor(encoded_state, dtype=torch.float32, device=device)
        action = policy_net(encoded_state).argmax().item()

    return action


def fit(model, inputs, labels):
    inputs = inputs.to(device)
    labels = labels.to(device)
    train_ds = TensorDataset(inputs, labels)
    train_dl = DataLoader(train_ds, batch_size=5)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.train()
    total_loss = 0.0

    for x, y in train_dl:
        out = model(x)
        loss = criterion(out, y)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    return total_loss / len(inputs)


def optimize_model(train_batch_size=100):
    train_batch_size = min(train_batch_size, len(memory))
    train_sample = random.sample(memory, train_batch_size)

    state = torch.tensor([tile_coder.get_tile_indices(s[0]) for s in train_sample], dtype=torch.float32, device=device)
    next_state = torch.tensor([tile_coder.get_tile_indices(s[3]) for s in train_sample], dtype=torch.float32, device=device)

    q_estimates = policy_net(state).detach()
    next_state_q_estimates = target_net(next_state).detach()
    next_actions = policy_net(next_state).argmax(dim=1)

    for i in range(len(train_sample)):
        next_action = next_actions[i].item()
        q_estimates[i][train_sample[i][1]] = (state_reward(next_state[i], train_sample[i][2]) +
                                              gamma * next_state_q_estimates[i][next_action].item())

    fit(policy_net, state, q_estimates)




def train_one_episode():
    global epsilon
    current_state = env.reset()
    normalize_state(current_state)
    done = False
    score = 0
    reward = 0
    while not done:
        action = get_action(current_state, epsilon)
        next_state, env_reward, done, _ = env.step(action)
        normalize_state(next_state)
        encoded_current_state = tile_coder.get_tile_indices(current_state)
        encoded_next_state = tile_coder.get_tile_indices(next_state)
        memory.append((encoded_current_state, action, env_reward, encoded_next_state))
        current_state = next_state
        score += env_reward
        reward += state_reward(next_state, env_reward)

        optimize_model(100)

        epsilon -= epsilon_decay

    return score, reward

def test():
    state = env.reset()
    normalize_state(state)
    done = False
    score = 0
    reward = 0
    while not done:
        encoded_state = tile_coder.get_tile_indices(state)
        action = get_action(encoded_state)
        state, env_reward, done, _ = env.step(action)
        normalize_state(state)
        score += env_reward
        reward += state_reward(state, env_reward)

    return score, reward



def main():
    best_test_reward = 0

    for i in range(episode_limit):
        score, reward = train_one_episode()

        print(f'Episode {i + 1}: score: {score} - reward: {reward}')

        if i % target_update_delay == 0:
            target_net.load_state_dict(policy_net.state_dict())
            target_net.eval()

        if (i + 1) % test_delay == 0:
            test_score, test_reward = test()
            print(f'Test Episode {i + 1}: test score: {test_score} - test reward: {test_reward}')
            if test_reward > best_test_reward:
                print('New best test reward. Saving model')
                best_test_reward = test_reward
                torch.save(policy_net.state_dict(), 'policy_net.pth')

    if episode_limit % test_delay != 0:
        test_score, test_reward = test()
        print(f'Test Episode {episode_limit}: test score: {test_score} - test reward: {test_reward}')
        if test_reward > best_test_reward:
            print('New best test reward. Saving model')
            best_test_reward = test_reward
            torch.save(policy_net.state_dict(), 'policy_net.pth')

    print(f'best test reward: {best_test_reward}')


if __name__ == '__main__':
    main()

  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):
  state = torch.tensor([tile_coder.get_tile_indices(s[0]) for s in train_sample], dtype=torch.float32, device=device)


Episode 1: score: 15.0 - reward: 13.411069809645413
Episode 2: score: 21.0 - reward: 18.320616906136276
Episode 3: score: 9.0 - reward: 7.589578223228456
Episode 4: score: 12.0 - reward: 10.726375594735146
Episode 5: score: 13.0 - reward: 11.346395973861219
Episode 6: score: 14.0 - reward: 11.841252374649049
Episode 7: score: 21.0 - reward: 17.887440127134322
Episode 8: score: 34.0 - reward: 31.232086889818312
Episode 9: score: 21.0 - reward: 18.859454522281887
Episode 10: score: 19.0 - reward: 16.29662732183933
Test Episode 10: test score: 9.0 - test reward: 7.869649133086204
New best test reward. Saving model
Episode 11: score: 41.0 - reward: 37.69408140778541
Episode 12: score: 20.0 - reward: 18.092632233724
Episode 13: score: 12.0 - reward: 10.34507518559694
Episode 14: score: 30.0 - reward: 27.526338087767364
Episode 15: score: 13.0 - reward: 11.583654091134667
Episode 16: score: 14.0 - reward: 12.761727516725658
Episode 17: score: 13.0 - reward: 11.040456775575878
Episode 18: sco

## Fix