In [1]:
# Importing the necessary libraries
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt

# Defining some constants for the simulation
GRAVITY = 9.8 # m/s^2
MASS = 10 # kg
LENGTH = 1 # m
RADIUS = 0.2 # m
INCLINATION = np.pi / 6 # rad
SLOPE = 0.1 # rad

# Defining the action space as a discrete space of four actions
ACTION_SPACE = ['pedal forward', 'pedal backward', 'turn left', 'turn right']

# Defining the observation space as a box space of six variables and eight pixels
OBS_SPACE = gym.spaces.Box(low=np.array([-5, -5, -np.pi, -np.inf, -np.inf, -np.pi, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 
                           high=np.array([5, 5, np.pi, np.inf, np.inf, np.pi, 1, 1, 1, 1, 1, 1, 1, 1, 14.14214]), 
                           dtype=np.float32)

# Defining a custom environment class that simulates a bicycle in 2D
class BicycleEnv(gym.Env):
    def __init__(self):
        # Initializing the action and observation spaces
        self.action_space = gym.spaces.Discrete(len(ACTION_SPACE))
        self.observation_space = OBS_SPACE
        
        # Initializing the state variables
        self.x = 0 # horizontal position
        self.y = 0 # vertical position
        self.theta = 0 # angle of the bicycle
        self.omega = 0 # angular velocity of the bicycle
        self.v = 0 # linear velocity of the bicycle
        self.phi = 0 # angle of the handlebar
        
        # Initializing the pixels around the bicycle
        self.pixels = np.zeros(8) # binary values indicating if there is an obstacle (1) or not (0) in each direction

        # initializing the distance to the target
        self.distance = 0
        
        # Initializing the obstacles and the target
        self.obstacles = [] # list of tuples (x,y) representing the positions of the red circles
        self.target = None # tuple (x,y) representing the position of the green circle
        
        # Initializing the surface slope
        self.slope = SLOPE
        
        # Initializing the reward and done flags
        self.reward = 0 
        self.done = False
        
    def reset(self):
        # Resetting the state variables to random values within the observation space bounds
        self.x = np.random.uniform(-4.5, 4.5)
        self.y = np.random.uniform(-4.5, 4.5)
        self.theta = np.random.uniform(-np.pi/2, np.pi/2)
        self.omega = np.random.uniform(-np.pi/4, np.pi/4)
        self.v = np.random.uniform(-1, 1)
        self.phi = np.random.uniform(-np.pi/4, np.pi/4)
        # self.distance = np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2) results in nonetype object is not subscriptable, so first check if all variables are not None
        if self.target is not None and self.x is not None and self.y is not None:
            self.distance = np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2)
        else:
            self.distance = 14.14214
        
        # Resetting the pixels around the bicycle to zeros
        self.pixels.fill(0)
        
        # Resetting the obstacles and the target to random positions within the environment bounds
        self.obstacles.clear()
        for _ in range(10):
            x_obstacle = np.random.uniform(-4.5, 4.5)
            y_obstacle = np.random.uniform(-4.5, 4.5)
            self.obstacles.append((x_obstacle, y_obstacle))
        
        x_target = np.random.uniform(-4.5, 4.5)
        y_target = np.random.uniform(-4.5, 4.5)
        self.target = (x_target, y_target)
        
        # Resetting the reward and done flags to zero and False respectively
        self.reward = 0 
        self.done = False
        
        # Returning the initial observation as a numpy array
        # return np.array([self.x, self.y, self.theta, self.omega,
        #                  self.v,self.phi] + list(self.pixels) )

        # append the distance which is a float to the returned array after self.pixels

        # print dimensions of the returned array    
        # print ("return 1 out is ")
        # print(np.array([self.x, self.y, self.theta, self.omega,
        #                  self.v,self.phi] + list(self.pixels) + [self.distance]).shape)
        return np.array([self.x, self.y, self.theta, self.omega,
                            self.v,self.phi] + list(self.pixels) + [self.distance])
    
    def step(self, action):
        # Applying the action to the state variables
        if action == 0: # pedal forward
            self.v += 0.1 # increase the linear velocity by 0.1 m/s
        elif action == 1: # pedal backward
            self.v -= 0.1 # decrease the linear velocity by 0.1 m/s
        elif action == 2: # turn left
            self.phi += np.pi / 12 # increase the handlebar angle by pi/12 rad
        elif action == 3: # turn right
            self.phi -= np.pi / 12 # decrease the handlebar angle by pi/12 rad
        
        # Clamping the state variables to the observation space bounds
        self.v = np.clip(self.v, -1, 1)
        self.phi = np.clip(self.phi, -np.pi/4, np.pi/4)
        
        # Updating the state variables using the equations of motion
        self.x += self.v * np.cos(self.theta) * 0.1 # update the horizontal position using the linear velocity and the angle of the bicycle
        self.y += self.v * np.sin(self.theta) * 0.1 # update the vertical position using the linear velocity and the angle of the bicycle
        self.omega += (GRAVITY * np.sin(self.slope) - GRAVITY * np.cos(self.slope) * np.sin(self.theta + INCLINATION) + 
                       MASS * RADIUS * self.v**2 * np.sin(self.phi) / LENGTH) * 0.1 # update the angular velocity using the gravity, mass, length, radius, linear velocity, handlebar angle and inclination of the bicycle
        self.theta += self.omega * 0.1 # update the angle of the bicycle using the angular velocity
        
        # Checking for collisions with obstacles or boundaries
        for obstacle in self.obstacles:
            if np.sqrt((self.x - obstacle[0])**2 + (self.y - obstacle[1])**2) <= 2.5:
                # The bicycle has hit an obstacle
                self.reward -= 10 # give a negative reward of -10

                if np.sqrt((self.x - obstacle[0])**2 + (self.y - obstacle[1])**2) <= 1.25:
                    self.reward -= 12

                    if np.sqrt((self.x - obstacle[0])**2 + (self.y - obstacle[1])**2) <= 0.625:
                        self.reward -= 15

                        if np.sqrt((self.x - obstacle[0])**2 + (self.y - obstacle[1])**2) <= RADIUS:
                            self.reward -= 20
                            self.done = True # end the episode
                
        
        if abs(self.x) >= 5 or abs(self.y) >= 5:
            # The bicycle has reached a boundary
            self.reward -= 10 # give a negative reward of -10
            self.done = True # end the episode
        
        # Checking for reaching the target
        if np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2) <= 2.5:
            # The bicycle has reached the target
            self.reward += 30 # give a positive reward of +100

            if np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2) <= 1.25:
                self.reward += 34

                if np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2) <= 0.625:
                    self.reward += 36

                    if np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2) <= RADIUS:
                        self.reward += 100
                        self.done = True # end the episode

        
        # Checking for falling down
        if abs(self.theta) >= np.pi / 2:
            # The bicycle has fallen down
            self.reward -= 10 # give a negative reward of -10
            self.done = True # end the episode
        
        # Updating the pixels around the bicycle based on the environment state
        self.pixels.fill(0) # reset the pixels to zeros
        
        # Defining a helper function to check if a given position is within the environment bounds and not occupied by an obstacle or a target
        def is_free(x, y):
            if abs(x) >= 5 or abs(y) >= 5:
                return False
            
            for obstacle in self.obstacles:
                if np.sqrt((x - obstacle[0])**2 + (y - obstacle[1])**2) <= RADIUS:
                    return False
            
            if np.sqrt((x - self.target[0])**2 + (y - self.target[1])**2) <= RADIUS:
                return False
            
            return True
        
        # Checking for each direction around the bicycle
        if is_free(self.x + RADIUS, self.y): # right
            self.pixels[0] = 1
        if is_free(self.x + RADIUS, self.y + RADIUS): # right-up
            self.pixels[1] = 1
        if is_free(self.x, self.y + RADIUS): # up
            self.pixels[2] = 1
        if is_free(self.x - RADIUS, self.y + RADIUS): # left-up
            self.pixels[3] = 1
        if is_free(self.x - RADIUS, self.y): # left
            self.pixels[4] = 1
        if is_free(self.x - RADIUS, self.y - RADIUS): # left-down
            self.pixels[5] = 1
        if is_free(self.x, self.y - RADIUS): # down
            self.pixels[6] = 1
        if is_free(self.x + RADIUS, self.y - RADIUS): # right-down
            self.pixels[7] = 1

        # Calculating the distance to the target
        if self.target is not None and self.x is not None and self.y is not None:
            self.distance = np.sqrt((self.x - self.target[0])**2 + (self.y - self.target[1])**2)
        else:
            self.distance = 14.14214
        
        
        # Returning the observation, reward, done flag and an empty info dictionary as a tuple
        # return (np.array([self.x, self.y, self.theta, self.omega,
        #                   self.v,self.phi] + list(self.pixels)) , 
        #         self.reward, 
        #         self.done, 
        #         {})

        # append  distance, which is a float value, to the returned array. distance is appended after the pixels but before the reward
        return (np.array([self.x, self.y, self.theta, self.omega,
                            self.v,self.phi] + list(self.pixels) + [self.distance]),
                self.reward,
                self.done,
                {})
    
    
    def render(self):
        # Rendering the environment using matplotlib.pyplot
        import matplotlib.pyplot as plt

        # clear previous plot
        plt.clf()
        
        
        # Creating a figure and an axis object
        fig, ax = plt.subplots()
        
        # Setting the axis limits and labels
        ax.set_xlim(-5, 5)
        ax.set_ylim(-5, 5)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        
        # Plotting the obstacles as red circles
        for obstacle in self.obstacles:
            ax.add_patch(plt.Circle(obstacle, RADIUS, color='red'))
        
        # Plotting the target as a green circle
        ax.add_patch(plt.Circle(self.target, RADIUS, color='green'))
        
        # Plotting the bicycle as a blue line with a black dot for the handlebar
        x_bike = [self.x - LENGTH / 2 * np.cos(self.theta), 
                  self.x + LENGTH / 2 * np.cos(self.theta)]
        y_bike = [self.y - LENGTH / 2 * np.sin(self.theta), 
                  self.y + LENGTH / 2 * np.sin(self.theta)]
        x_handle = x_bike[1] + RADIUS * np.cos(self.theta + self.phi)
        y_handle = y_bike[1] + RADIUS * np.sin(self.theta + self.phi)
        
        ax.plot(x_bike, y_bike, color='blue', linewidth=3)
        ax.plot(x_handle, y_handle, color='black', marker='o')
        
        # Showing the figure
        plt.show()
        
# Defining a custom policy network class that approximates the probability distribution of actions given observations using a two-layer neural network with ReLU activation function
class PolicyNetwork(nn.Module):
    def __init__(self):
        # Initializing the parent class
        super(PolicyNetwork, self).__init__()
        
        # Defining the network layers and and parameters


        self.fc1 = nn.Linear(OBS_SPACE.shape[0], 64) # first fully connected layer with 64 hidden units
        self.fc2 = nn.Linear(64, len(ACTION_SPACE)) # second fully connected layer with output units equal to the number of actions
        
        # Defining the optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=0.01) # using Adam optimizer with learning rate of 0.01
        
    def forward(self, x):
        # Forward pass of the network
        x = F.relu(self.fc1(x)) # applying ReLU activation function to the output of the first layer
        x = self.fc2(x) # passing the output to the second layer
        x = x + 1e-8 # adding a small epsilon value to avoid NaN values

        # PRINT the shape of x

        return F.log_softmax(x, dim=-1).exp() # applying softmax function to get a probability distribution over actions
    
    # def act(self, state):
    #     # Choosing an action based on the current state
    #     state = torch.from_numpy(state).float().unsqueeze(0) # converting the state to a torch tensor and adding a batch dimension
    #     probs = self.forward(state).squeeze() # getting the probability distribution and removing the batch dimension
    #     m = Categorical(probs) # creating a categorical distribution object
    #     action = m.sample() # sampling an action from the distribution
    #     return action.item(), m.log_prob(action) # returning the action and its log probability
    
    def act(self, state):
    # Choosing an action based on the current state using the policy network
      state = torch.from_numpy(state).float().unsqueeze(0) # converting the state to a torch tensor and adding a batch dimension
      probs = self.forward(state).squeeze() # getting the probability distribution and removing the batch dimension
      probs[probs != probs] = 1e-8 # replacing any NaN values with a small positive value
      m = Categorical(probs) # creating a categorical distribution object
      action = m.sample() # sampling an action from the 
      # print the dimension of action
    #   print (" action shape")
    #   print (action.shape)
    

      return action.item(), m.log_prob(action) # returning the action and its log probability


# Defining a custom value network class that approximates the expected return given observations using a two-layer neural network with ReLU activation function
class ValueNetwork(nn.Module):
    def __init__(self):
        # Initializing the parent class
        super(ValueNetwork, self).__init__()

        # print (OBS_SPACE.shape[0]) : the output is 15

        
        # Defining the network layers and parameters
        self.fc1 = nn.Linear(OBS_SPACE.shape[0], 64) # first fully connected layer with 64 hidden units
        self.fc2 = nn.Linear(64, 1) # second fully connected layer with output unit equal to one
        
        
        # Defining the optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=0.01) # using Adam optimizer with learning rate of 0.01
        
    def forward(self, x):
        # Forward pass of the network
        x = F.relu(self.fc1(x)) # applying ReLU activation function to the output of the first layer
        x = self.fc2(x) # passing the output to the second layer
        return x # returning the expected return
    
# Defining some hyperparameters for the proximal policy optimization algorithm
GAMMA = 0.99 # discount factor for future rewards
LAMBDA = 0.95 # parameter for generalized advantage estimation
EPSILON = 0.2 # parameter for clipping the ratio of probabilities
BATCH_SIZE = 64 # batch size for mini-batch updates
EPOCHS = 10 # number of epochs for each update

# Creating an instance of the custom environment
env = BicycleEnv()

# Creating an instance of the policy network
policy_net = PolicyNetwork()

# Creating an instance of the value network
value_net = ValueNetwork()

# Defining some variables to store the trajectories and statistics
states = [] # list of states visited by the agent
actions = [] # list of actions taken by the agent
rewards = [] # list of rewards received by the agent
log_probs = [] # list of log probabilities of actions taken by the agent
values = [] # list of expected returns estimated by the value network
returns = [] # list of actual returns calculated from rewards
advantages = [] # list of advantages calculated from returns and values

episode_reward = 0 # cumulative reward for each episode
episode_rewards = [] # list of episode rewards

# Defining a helper function to calculate discounted returns from rewards
def calculate_returns(rewards):
    returns = []
    R = 0 
    for r in reversed(rewards):
        R = r + GAMMA * R 
        returns.insert(0, R)
    return returns

# Defining a helper function to calculate generalized advantages from returns and values
def calculate_advantages(returns, values):
    advantages = []
    A = 0 
    for i in reversed(range(len(returns))):
        delta = returns[i] - values[i]
        A = delta + GAMMA * LAMBDA * A 
        advantages.insert(0, A)
    return advantages

# Defining a helper function to update the policy and value networks using mini-batch gradient descent
def update_networks(states, actions, log_probs, returns, advantages):
    # Converting the trajectories to torch tensors
    states = torch.tensor(states, dtype=torch.float)
    actions = torch.tensor(actions)
    log_probs = torch.tensor(log_probs)
    returns = torch.tensor(returns, dtype=torch.float)
    advantages = torch.tensor(advantages, dtype=torch.float)
    
    # Normalizing the advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    
    # Creating a dataset and a data loader from the trajectories
    dataset = torch.utils.data.TensorDataset(states, actions, log_probs, returns, advantages)
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    # Looping over the number of epochs
    for _ in range(EPOCHS):
        # Looping over the mini-batches
        for state_batch, action_batch, log_prob_batch, return_batch, advantage_batch in data_loader:
            # Calculating the current log probabilities and values for the mini-batch states
            # current_log_probs = policy_net(state_batch).log_prob(action_batch).unsqueeze(-1)
            current_probs = policy_net(state_batch)
            current_log_probs = torch.log(current_probs.gather(1, action_batch.unsqueeze(-1)))


            current_values = value_net(state_batch)
            
            # Calculating the ratio of probabilities
            ratio = torch.exp(current_log_probs - log_prob_batch)
            
            # Calculating the surrogate losses for the policy and value networks
            policy_loss = -torch.min(ratio * advantage_batch,
                                     torch.clamp(ratio, 1 - EPSILON, 1 + EPSILON) * advantage_batch).mean()
            value_loss = F.mse_loss(current_values, return_batch)
            
            # Updating the policy network parameters
            policy_net.optimizer.zero_grad()
            policy_loss.backward()
            policy_net.optimizer.step()
            
            # Updating the value network parameters
            value_net.optimizer.zero_grad()
            value_loss.backward()
            value_net.optimizer.step()

# Defining the number of episodes to train the agent
NUM_EPISODES = 100000

# buffer to store the episode rewards each episode
new_episode_rewards = []

# Looping over the episodes
for i in range(NUM_EPISODES):
    # Resetting the environment and getting the initial state
    state = env.reset()
    
    # Looping until the episode ends
    while True:
        # Choosing an action based on the current state using the policy network
        action, log_prob = policy_net.act(state)
        
        # Taking the action in the environment and getting the next state, reward and done flag
        next_state, reward, done, _ = env.step(action)
        
        # Estimating the expected return for the current state using the value network
        value = value_net(torch.from_numpy(state).float().unsqueeze(0)).item()
        
        # Storing the state, action, reward, log probability and value in the trajectories
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        log_probs.append(log_prob)
        values.append(value)
        
        # Updating the episode reward
        episode_reward += reward

        # # buffer to store the episode rewards each episode
        # new_episode_rewards = []
        
        # Updating the state
        state = next_state
        
        # Rendering the environment if it is a multiple of 100 episodes
        if (i + 1) % 5000 == 0:
            env.render()
        
        # Checking if the episode has ended
        if done:
            # Calculating the discounted returns and generalized advantages from the trajectories
            returns = calculate_returns(rewards)
            advantages = calculate_advantages(returns, values)
            
            # Updating the policy and value networks using the trajectories
            update_networks(states, actions, log_probs, returns, advantages)
            
            # Clearing the trajectories and statistics
            states.clear()
            actions.clear()
            rewards.clear()
            log_probs.clear()
            values.clear()
            returns.clear()
            advantages.clear()
            
            # Storing the episode reward in the list
            episode_rewards.append(episode_reward)
            
            # Printing the episode reward
            print(f'Episode {i + 1}: Reward = {episode_reward}')

            # Storing the episode reward in the list
            new_episode_rewards.append(episode_reward)

            

            # plot episode rewards every 100 episodes
            if (i + 1) % 100 == 0:
                plt.plot(new_episode_rewards)
                plt.xlabel('Episode')
                plt.ylabel('Reward')
                # title of plot is the number of episodes that have elapsed so far
                plt.title('Episode Rewards over Time: ' + str(i + 1) + ' Episodes')
                # plt.show()
                # save plot to file with filename based on number of episodes that have elapsed so far
                plt.savefig('episode_rewards_' + str(i + 1) + '.png')
            
            # Saving the policy and value networks every 10000 episodes

            if (i + 1) % 10000 == 0:
                torch.save(policy_net.state_dict(), 'policy_net_' + str(i + 1) + '.pth')
                torch.save(value_net.state_dict(), 'value_net_' + str(i + 1) + '.pth')





            

            
            
            # Resetting the episode reward
            episode_reward = 0
            
            # Breaking out of the loop
            break

# Plotting the episode rewards over time
import matplotlib.pyplot as plt

plt.plot(episode_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Episode Rewards over Time')
plt.show()

  states = torch.tensor(states, dtype=torch.float)
  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 1: Reward = -1484
Episode 2: Reward = -11486
Episode 3: Reward = -4187
Episode 4: Reward = -560
Episode 5: Reward = -22498


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 6: Reward = -960285


  value_loss = F.mse_loss(current_values, return_batch)


Episode 7: Reward = 1063218
Episode 8: Reward = -1930


  value_loss = F.mse_loss(current_values, return_batch)


Episode 9: Reward = 201067


  value_loss = F.mse_loss(current_values, return_batch)


Episode 10: Reward = -214614


  value_loss = F.mse_loss(current_values, return_batch)


Episode 11: Reward = -3282341


  value_loss = F.mse_loss(current_values, return_batch)


Episode 12: Reward = -329781
Episode 13: Reward = -96532
Episode 14: Reward = -670


  value_loss = F.mse_loss(current_values, return_batch)


Episode 15: Reward = -146375
Episode 16: Reward = 285978
Episode 17: Reward = -560
Episode 18: Reward = -29234


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 19: Reward = -195316


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 20: Reward = -171203
Episode 21: Reward = -9280
Episode 22: Reward = -10
Episode 23: Reward = -7080


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 24: Reward = 739464


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 25: Reward = -258510
Episode 26: Reward = -1360
Episode 27: Reward = 9164
Episode 28: Reward = -57
Episode 29: Reward = -2530


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 30: Reward = 2324
Episode 31: Reward = -5960
Episode 32: Reward = -1370


  value_loss = F.mse_loss(current_values, return_batch)


Episode 33: Reward = 13431
Episode 34: Reward = -114
Episode 35: Reward = -2275380
Episode 36: Reward = -13210


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 37: Reward = 183590


  value_loss = F.mse_loss(current_values, return_batch)


Episode 38: Reward = -440406
Episode 39: Reward = -1660
Episode 40: Reward = -1265
Episode 41: Reward = -2568312
Episode 42: Reward = -3550
Episode 43: Reward = -118466
Episode 44: Reward = -637770
Episode 45: Reward = -85428


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 46: Reward = -727561
Episode 47: Reward = -122605
Episode 48: Reward = -2060
Episode 49: Reward = -78524
Episode 50: Reward = -2133


  value_loss = F.mse_loss(current_values, return_batch)


Episode 51: Reward = -106650
Episode 52: Reward = -1900
Episode 53: Reward = -26950
Episode 54: Reward = -23225


  value_loss = F.mse_loss(current_values, return_batch)


Episode 55: Reward = -2751765


  value_loss = F.mse_loss(current_values, return_batch)


Episode 56: Reward = -1243687


  value_loss = F.mse_loss(current_values, return_batch)


Episode 57: Reward = -401050


  value_loss = F.mse_loss(current_values, return_batch)


Episode 58: Reward = -4605978
Episode 59: Reward = -66787
Episode 60: Reward = -10


  value_loss = F.mse_loss(current_values, return_batch)


Episode 61: Reward = -1000


  value_loss = F.mse_loss(current_values, return_batch)


Episode 62: Reward = -635895
Episode 63: Reward = -1100
Episode 64: Reward = 294


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 65: Reward = -53525
Episode 66: Reward = -67
Episode 67: Reward = -460
Episode 68: Reward = -1045
Episode 69: Reward = -550527
Episode 70: Reward = 34160


  value_loss = F.mse_loss(current_values, return_batch)


Episode 71: Reward = -340300
Episode 72: Reward = -51757
Episode 73: Reward = -38839
Episode 74: Reward = -10
Episode 75: Reward = -5090
Episode 76: Reward = -10
Episode 77: Reward = -298115
Episode 78: Reward = -3384786
Episode 79: Reward = -1003979
Episode 80: Reward = -10
Episode 81: Reward = -62416
Episode 82: Reward = -7300
Episode 83: Reward = 37830


  value_loss = F.mse_loss(current_values, return_batch)


Episode 84: Reward = -47690
Episode 85: Reward = -19440


  value_loss = F.mse_loss(current_values, return_batch)


Episode 86: Reward = -580922
Episode 87: Reward = 1972


  value_loss = F.mse_loss(current_values, return_batch)


Episode 88: Reward = -169492
Episode 89: Reward = -945
Episode 90: Reward = -121607


  value_loss = F.mse_loss(current_values, return_batch)


Episode 91: Reward = -97485
Episode 92: Reward = -44638
Episode 93: Reward = 530
Episode 94: Reward = -12097
Episode 95: Reward = -30248
Episode 96: Reward = -9590
Episode 97: Reward = -122272
Episode 98: Reward = -4993
Episode 99: Reward = -397038
Episode 100: Reward = 127565
Episode 101: Reward = -1120
Episode 102: Reward = 135887
Episode 103: Reward = -8694
Episode 104: Reward = -460
Episode 105: Reward = 12333
Episode 106: Reward = -2474
Episode 107: Reward = -49310


  value_loss = F.mse_loss(current_values, return_batch)


Episode 108: Reward = -4720
Episode 109: Reward = -301063
Episode 110: Reward = -5670


  value_loss = F.mse_loss(current_values, return_batch)


Episode 111: Reward = -534802
Episode 112: Reward = -209618


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 113: Reward = -10
Episode 114: Reward = -410
Episode 115: Reward = 890
Episode 116: Reward = -134868
Episode 117: Reward = -970662
Episode 118: Reward = -131
Episode 119: Reward = -1266710
Episode 120: Reward = -1750


  value_loss = F.mse_loss(current_values, return_batch)


Episode 121: Reward = -33454
Episode 122: Reward = -267728
Episode 123: Reward = 40634
Episode 124: Reward = -37
Episode 125: Reward = -20970
Episode 126: Reward = -20810
Episode 127: Reward = -44970
Episode 128: Reward = -57
Episode 129: Reward = -150666
Episode 130: Reward = 1330
Episode 131: Reward = -356186
Episode 132: Reward = -10
Episode 133: Reward = -802165
Episode 134: Reward = -10
Episode 135: Reward = -1923631
Episode 136: Reward = -10
Episode 137: Reward = -22960
Episode 138: Reward = -10103
Episode 139: Reward = -42220
Episode 140: Reward = -2590


  value_loss = F.mse_loss(current_values, return_batch)
  value_loss = F.mse_loss(current_values, return_batch)


Episode 141: Reward = -5030
Episode 142: Reward = -670
Episode 143: Reward = -63453
Episode 144: Reward = -41350
Episode 145: Reward = -24237


  value_loss = F.mse_loss(current_values, return_batch)


Episode 146: Reward = -164234


  value_loss = F.mse_loss(current_values, return_batch)


Episode 147: Reward = -82299
Episode 148: Reward = -96919
Episode 149: Reward = -377147
Episode 150: Reward = -680873
Episode 151: Reward = -129410
Episode 152: Reward = -17120
Episode 153: Reward = 2420


  value_loss = F.mse_loss(current_values, return_batch)


Episode 154: Reward = -2738473
Episode 155: Reward = -1462
Episode 156: Reward = -1977
Episode 157: Reward = -1110
Episode 158: Reward = 14690


  value_loss = F.mse_loss(current_values, return_batch)


Episode 159: Reward = -130752
Episode 160: Reward = -2530
Episode 161: Reward = -4980
Episode 162: Reward = -92896
Episode 163: Reward = -306881
Episode 164: Reward = -24320
Episode 165: Reward = -23263
Episode 166: Reward = -1110
Episode 167: Reward = -83260
Episode 168: Reward = -113352
Episode 169: Reward = -244263
Episode 170: Reward = -880


  value_loss = F.mse_loss(current_values, return_batch)


Episode 171: Reward = -2775
Episode 172: Reward = -21588
Episode 173: Reward = -3115


  value_loss = F.mse_loss(current_values, return_batch)


Episode 174: Reward = 319800
Episode 175: Reward = -290520


  value_loss = F.mse_loss(current_values, return_batch)


Episode 176: Reward = -58532
Episode 177: Reward = 7478
Episode 178: Reward = -239148
Episode 179: Reward = -596076
Episode 180: Reward = -13427
Episode 181: Reward = -1564
Episode 182: Reward = -10
Episode 183: Reward = -680
Episode 184: Reward = -10
Episode 185: Reward = -10
Episode 186: Reward = -40960
Episode 187: Reward = -58325
Episode 188: Reward = -2690
Episode 189: Reward = -152410
Episode 190: Reward = -120
Episode 191: Reward = -1228665
Episode 192: Reward = -6690
Episode 193: Reward = -10
Episode 194: Reward = -216340
Episode 195: Reward = -133988
Episode 196: Reward = -1340355
Episode 197: Reward = -203220
Episode 198: Reward = 650
Episode 199: Reward = -167422
Episode 200: Reward = -212986


  value_loss = F.mse_loss(current_values, return_batch)


Episode 201: Reward = -1512
Episode 202: Reward = -84366
Episode 203: Reward = -439664
Episode 204: Reward = -6090


  value_loss = F.mse_loss(current_values, return_batch)


Episode 205: Reward = -805142
Episode 206: Reward = -2232
Episode 207: Reward = -700
Episode 208: Reward = -66955
Episode 209: Reward = -1230
Episode 210: Reward = -124228
Episode 211: Reward = -910
Episode 212: Reward = -286660
Episode 213: Reward = -1548
Episode 214: Reward = -1110
Episode 215: Reward = -1660
Episode 216: Reward = -48042


  value_loss = F.mse_loss(current_values, return_batch)


Episode 217: Reward = -27660
Episode 218: Reward = 247622
Episode 219: Reward = -1990
Episode 220: Reward = -1662
Episode 221: Reward = -278507
Episode 222: Reward = -581620
Episode 223: Reward = -10
Episode 224: Reward = -4130
Episode 225: Reward = -277387
Episode 226: Reward = -174836
Episode 227: Reward = -1000
Episode 228: Reward = -470
Episode 229: Reward = -10
Episode 230: Reward = -10
Episode 231: Reward = 3824


  value_loss = F.mse_loss(current_values, return_batch)


Episode 232: Reward = 143044
Episode 233: Reward = 58638
Episode 234: Reward = -20547
Episode 235: Reward = -560


  value_loss = F.mse_loss(current_values, return_batch)


Episode 236: Reward = -542852
Episode 237: Reward = -247085
Episode 238: Reward = -1000
Episode 239: Reward = 430
Episode 240: Reward = -111353
Episode 241: Reward = 1640
Episode 242: Reward = -10
Episode 243: Reward = -142015
Episode 244: Reward = -601450
Episode 245: Reward = -87248
Episode 246: Reward = -246714
Episode 247: Reward = -10
Episode 248: Reward = -10
Episode 249: Reward = -6480
Episode 250: Reward = -1276848
Episode 251: Reward = -390
Episode 252: Reward = -7522
Episode 253: Reward = -39735
Episode 254: Reward = -10


  value_loss = F.mse_loss(current_values, return_batch)


Episode 255: Reward = -408211
Episode 256: Reward = -1830
Episode 257: Reward = -2200
Episode 258: Reward = -2230
Episode 259: Reward = -164240
Episode 260: Reward = 890
Episode 261: Reward = -163564
Episode 262: Reward = -1208
Episode 263: Reward = -18730
Episode 264: Reward = 6373
Episode 265: Reward = -1342
Episode 266: Reward = -3072
Episode 267: Reward = -470769
Episode 268: Reward = -1330
Episode 269: Reward = -3910


  value_loss = F.mse_loss(current_values, return_batch)


Episode 270: Reward = -68521
Episode 271: Reward = 372866
Episode 272: Reward = -1990
Episode 273: Reward = -33740
Episode 274: Reward = 100997
Episode 275: Reward = -139426
Episode 276: Reward = -10
Episode 277: Reward = -55443
Episode 278: Reward = -3589535
Episode 279: Reward = -93520
Episode 280: Reward = 940
Episode 281: Reward = -169774
Episode 282: Reward = -37
Episode 283: Reward = -19204
Episode 284: Reward = -560
Episode 285: Reward = -379802
Episode 286: Reward = -610
Episode 287: Reward = -1162
Episode 288: Reward = -5460
Episode 289: Reward = -1917318
Episode 290: Reward = -1780
Episode 291: Reward = -44172
Episode 292: Reward = -145892
Episode 293: Reward = -11030
Episode 294: Reward = -10
Episode 295: Reward = -63501
Episode 296: Reward = -2485
Episode 297: Reward = -10
Episode 298: Reward = -1810
Episode 299: Reward = -459246
Episode 300: Reward = -1570
Episode 301: Reward = -57050
Episode 302: Reward = -18930
Episode 303: Reward = -136554
Episode 304: Reward = -38350
E