In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import datetime
import gym

class ReplayMemory():
    
    def __init__(self, capacity, obs_shape, device='cpu'):
        
        self.device=device
        
        self.capacity = capacity # The maximum number of items to be stored in memory
        
        # Initialize (empty) memory tensors
        self.obs_mem    = torch.empty([capacity]+[dim for dim in obs_shape], dtype=torch.float32, device=self.device)
        self.action_mem = torch.empty(capacity, dtype=torch.int64, device=self.device)
        self.reward_mem = torch.empty(capacity, dtype=torch.int8, device=self.device)
        self.done_mem   = torch.empty(capacity, dtype=torch.int8, device=self.device)
        
        self.push_count = 0 # The number of times new data has been pushed to memory
        
    def push(self, obs, action, reward, done):
        
        # Store data to memory
        self.obs_mem[self.position()] = obs 
        self.action_mem[self.position()] = action
        self.reward_mem[self.position()] = reward
        self.done_mem[self.position()] = done
        
        self.push_count += 1
    
    def position(self):
        # Returns the next position (index) to which data is pushed
        return self.push_count % self.capacity
    
    
    def sample(self, obs_indices, action_indices, reward_indices, done_indices, max_n_indices, batch_size):
        # Fine as long as max_n is not greater than the fewest number of time steps an episode can take
        
        # Pick indices at random
        end_indices = np.random.choice(min(self.push_count, self.capacity)-max_n_indices*2, batch_size, replace=False) + max_n_indices
        
        # Correct for sampling near the position where data was last pushed
        for i in range(len(end_indices)):
            if end_indices[i] in range(self.position(), self.position()+max_n_indices):
                end_indices[i] += max_n_indices
        
        # Retrieve the specified indices that come before the end_indices
        obs_batch = self.obs_mem[np.array([index-obs_indices for index in end_indices])]
        action_batch = self.action_mem[np.array([index-action_indices for index in end_indices])]
        reward_batch = self.reward_mem[np.array([index-reward_indices for index in end_indices])]
        done_batch = self.done_mem[np.array([index-done_indices for index in end_indices])]
        
        # Correct for sampling over multiple episodes
        for i in range(len(end_indices)):
            index = end_indices[i]
            for j in range(1, max_n_indices):
                if self.done_mem[index-j]:
                    for k in range(len(obs_indices)):
                        if obs_indices[k] >= j:
                            obs_batch[i, k] = torch.zeros_like(self.obs_mem[0]) 
                    for k in range(len(action_indices)):
                        if action_indices[k] >= j:
                            action_batch[i, k] = torch.zeros_like(self.action_mem[0]) # Assigning action '0' might not be the best solution, perhaps as assigning at random, or adding an action for this specific case would be better
                    for k in range(len(reward_indices)):
                        if reward_indices[k] >= j:
                            reward_batch[i, k] = torch.zeros_like(self.reward_mem[0]) # Reward of 0 will probably not make sense for every environment
                    for k in range(len(done_indices)):
                        if done_indices[k] >= j:
                            done_batch[i, k] = torch.zeros_like(self.done_mem[0]) 
                    break
                
        return obs_batch, action_batch, reward_batch, done_batch
    

class Model(nn.Module):
    """Two-layer fully connected network that carries its own Adam optimizer.

    Architecture: ``Linear(n_inputs, n_hidden) -> ReLU -> Linear(n_hidden,
    n_outputs)``, optionally followed by a clamped softmax over the last
    dimension.
    """

    def __init__(self, n_inputs, n_outputs, n_hidden=64, lr=1e-3, softmax=False, device='cpu'):
        """Build the layers, the optimizer, and move the module to ``device``.

        Args:
            n_inputs: Size of the input vector.
            n_outputs: Size of the output vector.
            n_hidden: Number of hidden units.
            lr: Learning rate handed to Adam.
            softmax: When true, ``forward`` returns a probability vector.
            device: Torch device the module is moved to.
        """
        super().__init__()

        # Remember the layer sizes and the output mode
        self.n_inputs, self.n_hidden, self.n_outputs = n_inputs, n_hidden, n_outputs
        self.softmax = softmax

        # Hidden layer followed by the output layer
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_outputs)

        # Adam over every parameter registered above
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.device = device
        self.to(device)

    def forward(self, x):
        """Return the network output for ``x``.

        With ``softmax`` enabled the output is a softmax over the last
        dimension, clamped to ``[1e-9, 1 - 1e-9]``; otherwise the raw output
        of the last linear layer is returned.
        """
        out = self.fc2(F.relu(self.fc1(x)))
        if not self.softmax:
            return out
        return F.softmax(out, dim=-1).clamp(min=1e-9, max=1-1e-9)
    
class Agent():
    """Agent that learns a transition, a policy and a value network from replay.

    The transition network predicts the next observation from an observation
    and an action, the policy network outputs an action distribution, and the
    value network outputs one expected-free-energy (EFE) estimate per action.
    A target copy of the value network is refreshed every ``freeze_period``
    learning steps. Experience is stored in a ``ReplayMemory`` and sampled as
    windows of three consecutive time steps (t0, t1, t2).
    """

    def __init__(self):
        """Read the parameters, then build the networks and the replay memory."""
        self.set_parameters()  # Set parameters (also creates self.env)

        self.obs_shape = self.env.observation_space.shape  # The shape of observations
        self.obs_size = int(np.prod(self.obs_shape))  # The size of the observation
        self.n_actions = self.env.action_space.n  # The number of actions available to the agent

        self.freeze_cntr = 0  # Keeps track of when to (un)freeze the target network

        # Initialize the networks:
        self.transition_net = Model(self.obs_size+1, self.obs_size, self.n_hidden_trans, lr=self.lr_trans, device=self.device)
        self.policy_net = Model(self.obs_size, self.n_actions, self.n_hidden_pol, lr=self.lr_pol, softmax=True, device=self.device)
        self.value_net = Model(self.obs_size, self.n_actions, self.n_hidden_val, lr=self.lr_val, device=self.device)

        if self.load_network:  # If true: load the networks given paths
            saved_networks = (
                (self.transition_net, "trans"),
                (self.policy_net, "pol"),
                (self.value_net, "val"),
            )
            for net, tag in saved_networks:
                # map_location lets a checkpoint saved on another device
                # (e.g. CUDA) be restored on the device configured here.
                state_dict = torch.load(self.network_load_path.format(tag), map_location=self.device)
                net.load_state_dict(state_dict)
                net.eval()
        self.target_net = Model(self.obs_size, self.n_actions, self.n_hidden_val, lr=self.lr_val, device=self.device)
        self.target_net.load_state_dict(self.value_net.state_dict())

        # Initialize the replay memory
        self.memory = ReplayMemory(self.memory_capacity, self.obs_shape, device=self.device)

        # When sampling from memory at index i, obs_indices indicates that we want observations with indices i-obs_indices, works the same for the others
        self.obs_indices = [2, 1, 0]
        self.action_indices = [2, 1]
        self.reward_indices = [1]
        self.done_indices = [0]
        self.max_n_indices = self._history_length(
            self.obs_indices, self.action_indices,
            self.reward_indices, self.done_indices)

    @staticmethod
    def _history_length(*index_lists):
        """Return the largest offset found in any of the lists, plus one.

        The maximum is taken over the elements of every list; comparing the
        lists themselves would order them lexicographically and could miss
        the largest offset.
        """
        return max(max(indices) for indices in index_lists) + 1

    def set_parameters(self):
        """Populate the agent's configuration attributes and create the environment.

        All values currently come from ``default_parameters``; several entries
        of that dictionary (e.g. the print/log/results settings) are not read
        by this method.
        """
        # The default parameters
        default_parameters = {
            'run_id':"_rX", 
            'device':'cpu',
            'env':'CartPole-v1', 
            'n_episodes':1000, 
            'n_hidden_trans':64, 
            'lr_trans':1e-3, 
            'n_hidden_pol':64, 
            'lr_pol':1e-3, 
            'n_hidden_val':64, 
            'lr_val':1e-4,
            'memory_capacity':65536, 
            'batch_size':64,
            'freeze_period':25,
            'Beta':0.99,
            'gamma':1.00, 
            'print_timer':100,
            'log_save_timer':10,
            'save_results':True,
            'results_path':"results/ai_mdp_results{}.npz",
            'results_save_timer':500,
            'save_network':True, 
            'network_save_path':"networks/ai_mdp_{}net{}.pth",
            'network_save_timer':500,
            'load_network':False,
            'network_load_path':"networks/ai_mdp_{}net_rX.pth",
            'record_video': False,
            'record_statistics': True
            }

        # No overrides are applied here: the custom parameters are a plain
        # copy of the defaults.
        custom_parameters = default_parameters.copy()

        # Set all parameters
        self.run_id = custom_parameters['run_id']  # Is appended to paths to distinguish between runs
        self.device = custom_parameters['device']  # The device used to run the code

        self.env = gym.make(custom_parameters['env'], render_mode='rgb_array')  # The environment in which to train

        if custom_parameters['record_video']:
            self.env = gym.wrappers.RecordVideo(self.env, f'{custom_parameters["env"]}/{self.run_id}')

        if custom_parameters['record_statistics']:
            self.env = gym.wrappers.RecordEpisodeStatistics(self.env)

        self.n_episodes = int(custom_parameters['n_episodes'])  # The number of episodes for which to train

        # Set number of hidden nodes and learning rate for each network
        self.n_hidden_trans = int(custom_parameters['n_hidden_trans'])
        self.lr_trans = float(custom_parameters['lr_trans'])
        self.n_hidden_pol = int(custom_parameters['n_hidden_pol'])
        self.lr_pol = float(custom_parameters['lr_pol'])
        self.n_hidden_val = int(custom_parameters['n_hidden_val'])
        self.lr_val = float(custom_parameters['lr_val'])

        self.memory_capacity = int(custom_parameters['memory_capacity'])  # The maximum number of items to be stored in memory
        self.batch_size = int(custom_parameters['batch_size'])  # The mini-batch size
        self.freeze_period = int(custom_parameters['freeze_period'])  # The number of time-steps the target network is frozen

        self.gamma = float(custom_parameters['gamma'])  # A precision parameter
        self.Beta = float(custom_parameters['Beta'])  # The discount rate

        self.save_network = custom_parameters['save_network']  # If true saves the policy network (state_dict) to a .pth file
        self.network_save_path = custom_parameters['network_save_path'].format("{}", self.run_id)  # The path to which the network is saved
        self.network_save_timer = int(custom_parameters['network_save_timer'])  # The number of episodes after which the network is saved

        self.load_network = custom_parameters['load_network']  # If true loads a (policy) network (state_dict) instead of initializing a new one
        self.network_load_path = custom_parameters['network_load_path']  # The path from which to load the network

    def select_action(self, obs):
        """Sample one action index from the policy network's distribution for ``obs``."""
        with torch.no_grad():
            # Determine the action distribution given the current observation:
            policy = self.policy_net(obs)
            return torch.multinomial(policy, 1)

    def get_mini_batches(self):
        """Sample a batch of three-step windows and compute the transition error.

        Returns:
            Tuple ``(obs_t0, obs_t1, obs_t2, action_t0, action_t1, reward_t1,
            done_t2, pred_error_t0t1)``. The action batches and the prediction
            error have shape ``(batch_size, 1)``.
        """
        # Retrieve transition data in mini batches
        all_obs_batch, all_actions_batch, reward_batch_t1, done_batch_t2 = self.memory.sample(
                self.obs_indices, self.action_indices, self.reward_indices,
                self.done_indices, self.max_n_indices, self.batch_size)

        # Retrieve a batch of observations for 3 consecutive points in time
        batch_obs_shape = [self.batch_size] + [dim for dim in self.obs_shape]
        obs_batch_t0 = all_obs_batch[:, 0].view(batch_obs_shape)
        obs_batch_t1 = all_obs_batch[:, 1].view(batch_obs_shape)
        obs_batch_t2 = all_obs_batch[:, 2].view(batch_obs_shape)

        # Retrieve the agent's action history for time t0 and time t1
        action_batch_t0 = all_actions_batch[:, 0].unsqueeze(1)
        action_batch_t1 = all_actions_batch[:, 1].unsqueeze(1)

        # At time t0 predict the state at time t1:
        # (the concatenation along dim=1 assumes flat, one-dimensional observations)
        X = torch.cat((obs_batch_t0, action_batch_t0.float()), dim=1)
        pred_batch_t0t1 = self.transition_net(X)

        # Determine the prediction error wrt time t0-t1:
        pred_error_batch_t0t1 = torch.mean(F.mse_loss(
                pred_batch_t0t1, obs_batch_t1, reduction='none'), dim=1).unsqueeze(1)

        return (obs_batch_t0, obs_batch_t1, obs_batch_t2, action_batch_t0,
                action_batch_t1, reward_batch_t1, done_batch_t2, pred_error_batch_t0t1)

    def compute_value_net_loss(self, obs_batch_t1, obs_batch_t2,
                               action_batch_t1, reward_batch_t1,
                               done_batch_t2, pred_error_batch_t0t1):
        """Return the MSE between bootstrapped EFE targets and the value network.

        The target is ``-reward_t1 + pred_error_t0t1 + Beta * E_policy[target_net(obs_t2)]``,
        where the expectation term is dropped for windows whose t2 step is
        flagged done. The target is computed without gradient tracking.
        """
        with torch.no_grad():
            # Determine the action distribution for time t2:
            policy_batch_t2 = self.policy_net(obs_batch_t2)

            # Determine the target EFEs for time t2:
            target_EFEs_batch_t2 = self.target_net(obs_batch_t2)

            # Weigh the target EFEs according to the action distribution:
            weighted_targets = ((1-done_batch_t2) * policy_batch_t2 *
                                target_EFEs_batch_t2).sum(-1).unsqueeze(1)

            # Determine the batch of bootstrapped estimates of the EFEs:
            EFE_estimate_batch = -reward_batch_t1 + pred_error_batch_t0t1 + self.Beta * weighted_targets

        # Determine the EFE at time t1 according to the value network:
        EFE_batch_t1 = self.value_net(obs_batch_t1).gather(1, action_batch_t1)

        # Determine the MSE loss between the EFE estimates and the value network output:
        value_net_loss = F.mse_loss(EFE_estimate_batch, EFE_batch_t1)

        return value_net_loss

    def compute_VFE(self, obs_batch_t1, pred_error_batch_t0t1):
        """Return the batch-mean variational free energy (VFE).

        Per sample the VFE is the transition prediction error plus the
        cross-entropy between the policy and a Boltzmann distribution over
        the (detached) EFEs, minus the entropy of the policy.
        """
        # Determine the action distribution for time t1:
        policy_batch_t1 = self.policy_net(obs_batch_t1)

        # Determine the EFEs for time t1 (detached: the value net is not trained through the VFE):
        EFEs_batch_t1 = self.value_net(obs_batch_t1).detach()

        # Take a gamma-weighted Boltzmann distribution over the EFEs:
        boltzmann_EFEs_batch_t1 = torch.softmax(-self.gamma * EFEs_batch_t1, dim=1).clamp(min=1e-9, max=1-1e-9)

        # Weigh them according to the action distribution:
        energy_batch = -(policy_batch_t1 * torch.log(boltzmann_EFEs_batch_t1)).sum(-1).view(self.batch_size, 1)

        # Determine the entropy of the action distribution
        entropy_batch = -(policy_batch_t1 * torch.log(policy_batch_t1)).sum(-1).view(self.batch_size, 1)

        # Determine the VFE, then take the mean over all batch samples:
        VFE_batch = pred_error_batch_t0t1 + (energy_batch - entropy_batch)
        VFE = torch.mean(VFE_batch)

        return VFE

    def learn(self):
        """Run one optimisation step on all three trained networks.

        Does nothing until the memory holds enough steps to draw a full batch.
        """
        # If there are not enough transitions stored in memory, return:
        if self.memory.push_count - self.max_n_indices*2 < self.batch_size:
            return

        # After every freeze_period time steps, update the target network:
        if self.freeze_cntr % self.freeze_period == 0:
            self.target_net.load_state_dict(self.value_net.state_dict())
        self.freeze_cntr += 1

        # Retrieve transition data in mini batches:
        (obs_batch_t0, obs_batch_t1, obs_batch_t2, action_batch_t0,
         action_batch_t1, reward_batch_t1, done_batch_t2,
         pred_error_batch_t0t1) = self.get_mini_batches()

        # Compute the value network loss:
        value_net_loss = self.compute_value_net_loss(obs_batch_t1, obs_batch_t2,
                                         action_batch_t1, reward_batch_t1,
                                         done_batch_t2, pred_error_batch_t0t1)

        # Compute the variational free energy:
        VFE = self.compute_VFE(obs_batch_t1, pred_error_batch_t0t1)

        # Reset the gradients:
        self.transition_net.optimizer.zero_grad()
        self.policy_net.optimizer.zero_grad()
        self.value_net.optimizer.zero_grad()

        # Compute the gradients:
        VFE.backward()
        value_net_loss.backward()

        # Perform gradient descent:
        self.transition_net.optimizer.step()
        self.policy_net.optimizer.step()
        self.value_net.optimizer.step()

    def train(self):
        """Interact with the environment for ``n_episodes`` episodes, learning online.

        Returns:
            List with the total reward collected in each episode.
        """
        results = []

        for ith_episode in range(self.n_episodes):

            total_reward = 0
            obs, _ = self.env.reset()
            obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
            done = False
            reward = 0
            while not done:

                action = self.select_action(obs)
                # Each stored step pairs an observation with the action chosen
                # in it and the reward/done that led to it.
                self.memory.push(obs, action, reward, done)

                obs, reward, terminated, truncated, _ = self.env.step(action[0].item())
                # An episode ends on termination or on truncation (e.g. a time
                # limit); ignoring truncation would keep stepping a finished
                # environment.
                done = bool(terminated or truncated)
                obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
                total_reward += reward

                self.learn()

                if done:
                    # Store the final observation; action and reward are
                    # placeholder values for this last step.
                    self.memory.push(obs, -99, -99, done)

            results.append(total_reward)

            print(f'Episode: {ith_episode} - Reward: {total_reward}')

        self.env.close()

        return results
                
# Script entry point: build the agent (its constructor also creates the
# environment via set_parameters) and run the full training loop. The agent is
# bound to the module-level name `agent`.
if __name__ == "__main__":
    agent = Agent()
    agent.train()
        

  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


Moviepy - Building video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-0.mp4



                                                           

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-0.mp4
Episode: 0 - Reward: 11.0
Moviepy - Building video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-1.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-1.mp4
Episode: 1 - Reward: 30.0
Episode: 2 - Reward: 14.0
Episode: 3 - Reward: 11.0
Episode: 4 - Reward: 16.0
Episode: 5 - Reward: 19.0
Episode: 6 - Reward: 9.0
Episode: 7 - Reward: 14.0
Moviepy - Building video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-8.mp4.
Moviepy - Writing video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-8.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-8.mp4
Episode: 8 - Reward: 17.0




Episode: 9 - Reward: 19.0
Episode: 10 - Reward: 12.0
Episode: 11 - Reward: 28.0
Episode: 12 - Reward: 16.0
Episode: 13 - Reward: 16.0
Episode: 14 - Reward: 18.0
Episode: 15 - Reward: 14.0
Episode: 16 - Reward: 21.0
Episode: 17 - Reward: 14.0
Episode: 18 - Reward: 23.0
Episode: 19 - Reward: 14.0
Episode: 20 - Reward: 14.0
Episode: 21 - Reward: 23.0
Episode: 22 - Reward: 12.0
Episode: 23 - Reward: 40.0
Episode: 24 - Reward: 10.0
Episode: 25 - Reward: 14.0
Episode: 26 - Reward: 12.0
Moviepy - Building video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-27.mp4.
Moviepy - Writing video /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-27.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-27.mp4
Episode: 27 - Reward: 17.0
Episode: 28 - Reward: 19.0
Episode: 29 - Reward: 16.0
Episode: 30 - Reward: 13.0
Episode: 31 - Reward: 13.0
Episode: 32 - Reward: 54.0
Episode: 33 - Reward: 21.0
Episode: 34 - Reward: 16.0
Episode: 35 - Reward: 20.0
Episode: 36 - Reward: 10.0
Episode: 37 - Reward: 15.0
Episode: 38 - Reward: 20.0
Episode: 39 - Reward: 28.0
Episode: 40 - Reward: 17.0
Episode: 41 - Reward: 23.0
Episode: 42 - Reward: 32.0
Episode: 43 - Reward: 17.0
Episode: 44 - Reward: 21.0
Episode: 45 - Reward: 9.0
Episode: 46 - Reward: 10.0
Episode: 47 - Reward: 12.0
Episode: 48 - Reward: 40.0
Episode: 49 - Reward: 20.0
Episode: 50 - Reward: 20.0
Episode: 51 - Reward: 15.0
Episode: 52 - Reward: 11.0
Episode: 53 - Reward: 20.0
Episode: 54 - Reward: 12.0
Episode: 55 - Reward: 13.0
Episode: 56 - Reward: 19.0
Episode: 57 - Reward: 43.0
Episode: 

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-64.mp4
Episode: 64 - Reward: 16.0
Episode: 65 - Reward: 10.0




Episode: 66 - Reward: 20.0
Episode: 67 - Reward: 16.0
Episode: 68 - Reward: 14.0
Episode: 69 - Reward: 9.0
Episode: 70 - Reward: 11.0
Episode: 71 - Reward: 17.0
Episode: 72 - Reward: 20.0
Episode: 73 - Reward: 16.0
Episode: 74 - Reward: 15.0
Episode: 75 - Reward: 18.0
Episode: 76 - Reward: 14.0
Episode: 77 - Reward: 12.0
Episode: 78 - Reward: 9.0
Episode: 79 - Reward: 19.0
Episode: 80 - Reward: 11.0
Episode: 81 - Reward: 16.0
Episode: 82 - Reward: 15.0
Episode: 83 - Reward: 22.0
Episode: 84 - Reward: 20.0
Episode: 85 - Reward: 17.0
Episode: 86 - Reward: 17.0
Episode: 87 - Reward: 9.0
Episode: 88 - Reward: 28.0
Episode: 89 - Reward: 11.0
Episode: 90 - Reward: 19.0
Episode: 91 - Reward: 16.0
Episode: 92 - Reward: 33.0
Episode: 93 - Reward: 18.0
Episode: 94 - Reward: 25.0
Episode: 95 - Reward: 12.0
Episode: 96 - Reward: 21.0
Episode: 97 - Reward: 16.0
Episode: 98 - Reward: 16.0
Episode: 99 - Reward: 12.0
Episode: 100 - Reward: 10.0
Episode: 101 - Reward: 13.0
Episode: 102 - Reward: 14.0
E

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-125.mp4
Episode: 125 - Reward: 26.0
Episode: 126 - Reward: 20.0
Episode: 127 - Reward: 11.0
Episode: 128 - Reward: 17.0
Episode: 129 - Reward: 13.0
Episode: 130 - Reward: 16.0
Episode: 131 - Reward: 11.0
Episode: 132 - Reward: 31.0
Episode: 133 - Reward: 14.0
Episode: 134 - Reward: 15.0
Episode: 135 - Reward: 9.0
Episode: 136 - Reward: 11.0
Episode: 137 - Reward: 14.0
Episode: 138 - Reward: 10.0
Episode: 139 - Reward: 15.0
Episode: 140 - Reward: 9.0
Episode: 141 - Reward: 18.0
Episode: 142 - Reward: 12.0
Episode: 143 - Reward: 13.0
Episode: 144 - Reward: 10.0
Episode: 145 - Reward: 9.0
Episode: 146 - Reward: 8.0
Episode: 147 - Reward: 10.0
Episode: 148 - Reward: 16.0
Episode: 149 - Reward: 11.0
Episode: 150 - Reward: 22.0
Episode: 151 - Reward: 15.0
Episode: 152 - Reward: 13.0
Episode: 153 - Reward: 12.0
Episode: 154 - Reward: 13.0
Episode:

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-216.mp4
Episode: 216 - Reward: 15.0
Episode: 217 - Reward: 9.0
Episode: 218 - Reward: 10.0
Episode: 219 - Reward: 18.0
Episode: 220 - Reward: 10.0
Episode: 221 - Reward: 24.0
Episode: 222 - Reward: 8.0
Episode: 223 - Reward: 12.0
Episode: 224 - Reward: 22.0
Episode: 225 - Reward: 19.0
Episode: 226 - Reward: 13.0
Episode: 227 - Reward: 11.0
Episode: 228 - Reward: 14.0
Episode: 229 - Reward: 16.0
Episode: 230 - Reward: 12.0
Episode: 231 - Reward: 24.0
Episode: 232 - Reward: 12.0
Episode: 233 - Reward: 10.0
Episode: 234 - Reward: 15.0
Episode: 235 - Reward: 26.0
Episode: 236 - Reward: 18.0
Episode: 237 - Reward: 12.0
Episode: 238 - Reward: 10.0
Episode: 239 - Reward: 14.0
Episode: 240 - Reward: 11.0
Episode: 241 - Reward: 13.0
Episode: 242 - Reward: 11.0
Episode: 243 - Reward: 12.0
Episode: 244 - Reward: 23.0
Episode: 245 - Reward: 24.0
Episod

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-343.mp4
Episode: 343 - Reward: 15.0
Episode: 344 - Reward: 19.0
Episode: 345 - Reward: 26.0
Episode: 346 - Reward: 28.0
Episode: 347 - Reward: 14.0
Episode: 348 - Reward: 10.0
Episode: 349 - Reward: 14.0
Episode: 350 - Reward: 18.0
Episode: 351 - Reward: 17.0
Episode: 352 - Reward: 21.0
Episode: 353 - Reward: 15.0
Episode: 354 - Reward: 15.0
Episode: 355 - Reward: 33.0
Episode: 356 - Reward: 18.0
Episode: 357 - Reward: 16.0
Episode: 358 - Reward: 18.0
Episode: 359 - Reward: 18.0
Episode: 360 - Reward: 35.0
Episode: 361 - Reward: 31.0
Episode: 362 - Reward: 13.0
Episode: 363 - Reward: 36.0
Episode: 364 - Reward: 26.0
Episode: 365 - Reward: 34.0
Episode: 366 - Reward: 29.0
Episode: 367 - Reward: 21.0
Episode: 368 - Reward: 27.0
Episode: 369 - Reward: 16.0
Episode: 370 - Reward: 22.0
Episode: 371 - Reward: 30.0
Episode: 372 - Reward: 25.0
Epis

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-512.mp4
Episode: 512 - Reward: 18.0
Episode: 513 - Reward: 36.0
Episode: 514 - Reward: 18.0
Episode: 515 - Reward: 11.0
Episode: 516 - Reward: 13.0
Episode: 517 - Reward: 39.0
Episode: 518 - Reward: 12.0
Episode: 519 - Reward: 25.0
Episode: 520 - Reward: 28.0
Episode: 521 - Reward: 34.0
Episode: 522 - Reward: 14.0
Episode: 523 - Reward: 38.0
Episode: 524 - Reward: 34.0
Episode: 525 - Reward: 40.0
Episode: 526 - Reward: 22.0
Episode: 527 - Reward: 24.0
Episode: 528 - Reward: 45.0
Episode: 529 - Reward: 51.0
Episode: 530 - Reward: 50.0
Episode: 531 - Reward: 14.0
Episode: 532 - Reward: 43.0
Episode: 533 - Reward: 23.0
Episode: 534 - Reward: 58.0
Episode: 535 - Reward: 19.0
Episode: 536 - Reward: 13.0
Episode: 537 - Reward: 34.0
Episode: 538 - Reward: 25.0
Episode: 539 - Reward: 22.0
Episode: 540 - Reward: 25.0
Episode: 541 - Reward: 54.0
Epis

                                                   

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-729.mp4
Episode: 729 - Reward: 23.0
Episode: 730 - Reward: 104.0
Episode: 731 - Reward: 21.0
Episode: 732 - Reward: 28.0
Episode: 733 - Reward: 100.0
Episode: 734 - Reward: 46.0
Episode: 735 - Reward: 33.0
Episode: 736 - Reward: 45.0
Episode: 737 - Reward: 37.0
Episode: 738 - Reward: 44.0
Episode: 739 - Reward: 50.0
Episode: 740 - Reward: 42.0
Episode: 741 - Reward: 16.0
Episode: 742 - Reward: 26.0
Episode: 743 - Reward: 47.0
Episode: 744 - Reward: 56.0
Episode: 745 - Reward: 35.0
Episode: 746 - Reward: 33.0
Episode: 747 - Reward: 38.0
Episode: 748 - Reward: 68.0
Episode: 749 - Reward: 84.0
Episode: 750 - Reward: 27.0
Episode: 751 - Reward: 49.0
Episode: 752 - Reward: 25.0
Episode: 753 - Reward: 51.0
Episode: 754 - Reward: 49.0
Episode: 755 - Reward: 43.0
Episode: 756 - Reward: 26.0
Episode: 757 - Reward: 29.0
Episode: 758 - Reward: 17.0
Ep

                                                  

Moviepy - Done !
Moviepy - video ready /Users/jackmontgomery/Desktop/UCT/Research_Project/honours-project/practice/CartPole-v1/_rX/rl-video-episode-1000.mp4


