In [1]:
!pip install gymnasium
!pip install swig
!pip install gymnasium[box2d]
!pip install gymnasium[classic_control]
!pip install gymnasium[mujoco]

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0
Collecting swig
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.0
Collectin

In [2]:
!pip3 install numpngw


Collecting numpngw
  Downloading numpngw-0.1.4-py3-none-any.whl.metadata (14 kB)
Downloading numpngw-0.1.4-py3-none-any.whl (21 kB)
Installing collected packages: numpngw
Successfully installed numpngw-0.1.4


In [7]:
# Models and computation
import torch # will use pyTorch to handle NN
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from random import sample

# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
from numpngw import write_apng

# IO
import gymnasium as gym
from pathlib import Path
import os
from torch.distributions import MultivariateNormal, Normal,Beta


In [None]:
env = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.95, domain_randomize=False, continuous=True)
# env

In [None]:
env.observation_space.shape,env.action_space

((96, 96, 3), Box([-1.  0.  0.], 1.0, (3,), float32))

In [None]:
images = []
s,_ = env.reset()
# print(s)
img = env.render()
images.append(img)

done = False
steps = 0

while not done:
    action = env.action_space.sample()
    steps += 1
    # print(env.step(action))
    obs, reward, done, _,_ = env.step(action)
    img = env.render()
    images.append(img)
    if steps > 160:
      break
env.close()

In [None]:
s,_ = env.reset()
# print(s)
img = env.render()
images.append(img)
plt.imshow(img)  # if state is a 3D tensor (e.g., (1, H, W, C))
plt.show()

In [None]:
write_apng('anim.png', images, delay=20)
Image(filename='anim.png')

In [4]:
from google.colab import drive
from google.colab import files

def save_checkpoint(model, filename, mode=0):
    """
    Save a model to a file in your colab space

    Parameters
    ----------
    model: your policy network
    filename: the name of the checkpoint file
    mode (int): determines where to store the file
                --> 0: collab space 1: Google Drive
    """
    if mode == 1:
      drive.mount('/content/gdrive')
      path = F"/content/gdrive/My Drive/{filename}"
      torch.save(model.state_dict(), path)
    else:
      torch.save(model.state_dict(), filename)

def export_to_local_drive(filename):
    """
    Download a file to your local machine

    Parameters
    ----------
    filename: the name of the file
    """
    files.download(filename)


In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
from collections import deque
import numpy as np

# class FrameStackWrapper:
#     def __init__(self, env, num_frames):
#         self.env = env
#         self.num_frames = num_frames
#         self.frames = deque(maxlen=num_frames)

#     def reset(self):
#         state, info = self.env.reset()
#         for _ in range(self.num_frames):
#             normalized_state = self.normalize_state(state)
#             self.frames.append(normalized_state)
#             # self.frames.append(state)
#         return np.concatenate(self.frames, axis=2), info

#     def normalize_state(self,state):
#       return (state - state.min()) / (state.max() - state.min() + 1e-8)

#     def step(self, action):
#         state, reward, done, truncated, info = self.env.step(action)
#         normalized_state = self.normalize_state(state)
#         self.frames.append(normalized_state)
#         # self.frames.append(state)
#         return np.concatenate(self.frames, axis=2), reward, done, truncated, info

#     def render(self):
#         return self.env.render()

#     def close(self):
#         self.env.close()


from collections import deque
import numpy as np

class FrameStackWrapper:
    def __init__(self, env, num_frames):
        """
        Environment wrapper for frame stacking and reward adjustments.
        """
        self.env = env
        self.num_frames = num_frames
        self.frames = deque(maxlen=num_frames)
        self.reward_threshold = self.env.spec.reward_threshold

    def reset(self):
        self.counter = 0
        self.av_r = self.reward_memory()

        self.die = False
        img_rgb, _ = self.env.reset()
        img_grey = self.rgb2gray(img_rgb)
        nor_img = self.normalize_state(img_grey)

        self.frames = deque([nor_img] * self.num_frames, maxlen=self.num_frames)
        return np.stack(self.frames), _

    # def step(self, action, action_repeat=1):
    #     total_reward = 0
    #     for i in range(action_repeat):
    #         img_rgb, reward, done, truncated, info = self.env.step(action)

    #         if np.mean(img_rgb[:, :, 1]) > 185.0:
    #             reward -= 0.05

    #         total_reward += reward

    #         if self.av_r(reward) <= -0.1 or done:
    #             done = True
    #             break

    #     # Calculate average reward
    #     avg_reward = total_reward / (i + 1)  # i + 1 because i starts from 0

    #     img_gray = self.rgb2gray(img_rgb)
    #     img_norm = self.normalize_state(img_gray)

    #     self.frames.append(img_norm)

    #     return np.stack(self.frames), avg_reward, done, truncated, info

    def step(self, action):
        total_reward = 0
        for i in range(4):
            img_rgb, reward, die, truncated ,_ = self.env.step(action)
            # don't penalize "die state"
            if die:
                reward += 50
            # green penalty
            if np.mean(img_rgb[:, :, 1]) > 140.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = True if self.av_r(reward) <= -0.1 else False
            if done or die or truncated:
                break
        img_gray = self.rgb2gray(img_rgb)
        img_norm = self.normalize_state(img_gray)
        # avg_reward = total_reward / (i + 1)  # i + 1 because i starts from 0

        self.frames.append(img_norm)
        return np.stack(self.frames), total_reward, done, truncated,_


    @staticmethod
    def rgb2gray(rgb):
        # rgb image -> gray [0, 1]
        gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
        return gray

    def render(self):
        """
        Renders the environment.
        """
        return self.env.render()

    def close(self):
        """
        Closes the environment.
        """
        self.env.close()

    def normalize_state(self,state):
        return state/255

    @staticmethod
    def reward_memory():
        # record reward for last 100 steps
        count = 0
        length = 50
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory


In [12]:
class TD3_PolicyNetwork(nn.Module):
    def __init__(self, input_shape, output_dim):
        super(TD3_PolicyNetwork, self).__init__()
        # Convolutional layers for feature extraction
# Convolutional layers for feature extraction
        self.cnn_base = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=4, stride=2),   # (96, 96, 3) -> (8, 47, 47)
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, stride=2),               # (8, 47, 47) -> (16, 23, 23)
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),              # (16, 23, 23) -> (32, 11, 11)
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),              # (32, 11, 11) -> (64, 5, 5)
            nn.ReLU(),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
            nn.ReLU(),  # activation
            nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
            nn.ReLU(),  # activation
        )
        # Calculate the flattened feature size
        with torch.no_grad():
            in_shape = [4,96,96]
            dummy_input = torch.zeros(1, *in_shape)
            # print(dummy_input.shape)

            flattened_size = self.cnn_base(dummy_input).view(1, -1).shape[1]

        # Fully connected layers for control
        self.fc1 = nn.Sequential(
            nn.Linear(flattened_size, 128),  # First hidden layer
            nn.ReLU(),
            nn.Linear(128, output_dim) ,
            nn.Softplus()# Output layer
        )


    def forward(self, x):
        x = self.cnn_base(x)
        # x = x.view(-1, 256)
        x = x.view(x.size(0), -1)

        action = self.fc1(x)
        return action

    def select_action(self, state):
        # state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # Add batch dimension
        # print(state.shape)
        if(len(state.shape)==3):
          state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # Add batch dimension
        else:
          state = torch.from_numpy(state).float().to(device)  # Add batch dimension
        action = self.forward(state)
        # print(action.shape)
        return action.detach().numpy()[0]

    def evaluate_action(self, state):
        # state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # Add batch dimension
        # print(state.shape)
        if(len(state.shape)==3):
          state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # Add batch dimension
        else:
          state = torch.from_numpy(state).float().to(device)  # Add batch dimension
        action = self.forward(state)
        # print(action.shape)
        return action.detach().numpy()


class TD3_ValueNetwork(nn.Module):
    def __init__(self, input_shape,action_dim):
        super(TD3_ValueNetwork, self).__init__()
        # Convolutional layers for feature extraction
        self.cnn_base = nn.Sequential(
            nn.Conv2d(4, 8, kernel_size=4, stride=2),   # (96, 96, 3) -> (8, 47, 47)
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, stride=2),               # (8, 47, 47) -> (16, 23, 23)
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),              # (16, 23, 23) -> (32, 11, 11)
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),              # (32, 11, 11) -> (64, 5, 5)
            nn.ReLU(),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
            nn.ReLU(),  # activation
            nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
            nn.ReLU(),  # activation
        )
        # Calculate the flattened feature size
        with torch.no_grad():
            in_shape = [4,96,96]

            dummy_input = torch.zeros(1, *in_shape)
            # print(dummy_input.shape)
            flattened_size = self.cnn_base(dummy_input).view(1, -1).shape[1]

        # Fully connected layers for control
        self.fc1 = nn.Sequential(
            nn.Linear(flattened_size+ action_dim, 128),  # First hidden layer
            nn.ReLU(),
            nn.Linear(128, 1)       # Output layer
        )# output shape (256, 1, 1)





    def forward(self, state,action):
        if(len(state.shape)==3):
          state = torch.from_numpy(state).float().unsqueeze(0).to(device)  # Add batch dimension
        else:
          state = torch.from_numpy(state).float().to(device)  # Add batch dimension
        x = self.cnn_base(state)
        # x = x.view(-1, 256)
        x = x.view(x.size(0), -1)
        # print(x.shape,action.shape)
        x = torch.cat([x, action], dim=1)  # Concatenate along feature dimension

        value = self.fc1(x)
        return value


In [14]:
class ReplayBuffer:
    def __init__(self, max_size, state_dim, action_dim):
        self.max_size = max_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        experience = (state, action, reward, next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = zip(*batch)

        return (np.array(states),
                np.array(actions),
                np.array(rewards).reshape(-1, 1),
                np.array(next_states),
                np.array(dones).reshape(-1, 1))

    def __len__(self):
        return len(self.buffer)

######################## Your code ####################################
class TD3_ACAgent():
    def __init__(self, state_size, action_size, pi_lr, vf_lr, tau=0.007, policy_noise=0.1, noise_clip=0.2, policy_freq=2):
        self.state_size = state_size
        self.action_size = action_size
        self.pi_lr = pi_lr
        self.vf_lr = vf_lr
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.tau = tau
        self.total_steps = 0
        self.replay_buffer = ReplayBuffer(10000, state_size, action_size)

        # Actor Network
        self.actor = TD3_PolicyNetwork(state_size, action_size).to(device)
        self.actor_target = TD3_PolicyNetwork(state_size, action_size).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.pi_lr )

        # Critic Networks
        self.critic1 = TD3_ValueNetwork(state_size,action_size).to(device)
        self.critic2 = TD3_ValueNetwork(state_size,action_size).to(device)
        self.critic1_target = TD3_ValueNetwork(state_size,action_size).to(device)
        self.critic2_target = TD3_ValueNetwork(state_size,action_size).to(device)
        self.critic1_target.load_state_dict(self.critic1.state_dict())
        self.critic2_target.load_state_dict(self.critic2.state_dict())
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.vf_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=self.vf_lr)

    def evaluate(self, env, n_rollouts=1):
        # print("In side the evaluation function")
        rewards = []
        for _ in range(n_rollouts):
            state, _ = env.reset()
            done = False
            truncated = False
            rewards.append(0)
            while not done and not truncated:
                # state = self.normalize_state(state)
                action = self.actor.select_action(state)
                # print(action)
                state, reward, done, truncated,_ = env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
                rewards[-1] += reward
        return np.mean(rewards), np.std(rewards)

    def learn(self, batch_size):
            if len(self.replay_buffer) < batch_size:
                return
            # print("inside the learn method")
            # Sample a batch from the replay buffer
            state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

            # Convert to tensors
            # state = torch.FloatTensor(state).to(device)
            action = torch.tensor(action).float().to(device)
            reward = torch.tensor(reward).float().reshape(-1, 1).to(device)
            # next_state = torch.FloatTensor(next_state).to(device)
            # print(action.shape,reward.shape)
            done = torch.tensor(done).float().reshape(-1, 1).to(device)

            # Select next action according to target policy with noise
            noise = torch.tensor(np.random.normal(0, self.policy_noise, size=(batch_size, self.action_size))).float().to(device)
            # noise = noise.clamp(-self.noise_clip, self.noise_clip)
            # print("next_state.shape",next_state.shape)
            temp_action = self.actor_target.evaluate_action(next_state)
            # # print(temp_action.shape,"temp_action.shape")

            temp_action = torch.from_numpy(temp_action).float().to(device)
            # next_action = ( temp_action + noise).clamp(-1, 1)

            temp_action = temp_action + noise

                  # Apply the appropriate clamp to each action for the batch
            first_action = temp_action[:, 0].clamp(-1, 1)  # First action (ranges from -1 to 1)
            second_action = temp_action[:, 1].clamp(0, 1)  # Second action (ranges from 0 to 1)
            third_action = temp_action[:, 2].clamp(0, 1)   # Third action (ranges from 0 to 1)

            # Combine the actions back together into a tensor
            next_action = torch.stack([first_action, second_action, third_action], dim=1)


            # Compute target Q-value
            target_Q1 = self.critic1_target(next_state, next_action)
            target_Q2 = self.critic2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

            # Optimize Critic 1
            current_Q1 = self.critic1(state, action)
            loss_Q1 = F.mse_loss(current_Q1, target_Q.detach())
            self.critic1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic1_optimizer.step()

            # Optimize Critic 2
            current_Q2 = self.critic2(state, action)
            loss_Q2 = F.mse_loss(current_Q2, target_Q.detach())
            self.critic2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic2_optimizer.step()

            # Delayed policy updates
            if self.total_steps % self.policy_freq == 0:
                # Compute actor loss
                # print("state.shape",state.shape)
                t_act = self.actor.evaluate_action(state)
                t_act = torch.from_numpy(t_act).float().to(device)

                fa = t_act[:, 0].clamp(-1, 1)  # First action (ranges from -1 to 1)
                sa = t_act[:, 1].clamp(0, 1)  # Second action (ranges from 0 to 1)
                ta = t_act[:, 2].clamp(0, 1)   # Third action (ranges from 0 to 1)
                 # Combine the actions back together into a tensor
                t_act_u = torch.stack([fa, sa, ta], dim=1)


                actor_loss = -self.critic1(state,t_act_u).mean()

                # Optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Soft update target networks
                self.soft_update(self.actor, self.actor_target)
                self.soft_update(self.critic1, self.critic1_target)
                self.soft_update(self.critic2, self.critic2_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    # def normalize_state(self, state):
    #     return state /255

    def train(self, env1,env2, seed, gamma, max_episodes, max_episode_steps, batch_size, goal_mean_100_reward):
        self.seed = seed
        self.gamma = gamma
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)

        self.episode_reward = []
        self.evaluation_scores = []
        self.total_steps = 0
        result = np.empty((max_episodes, 3))
        result[:] = np.nan
        env1 = FrameStackWrapper(env1, num_frames=4)
        # env2 = FrameStackWrapper(env2, num_frames=4)

        for episode in range(1, max_episodes + 1):
            state1, _ = env1.reset()
            # state2, _ = env2.reset()

            episode_reward = 0
            episode_steps = 0

            while episode_steps < max_episode_steps:
                # if(episode_steps>max_episode_steps):
                # print("I am falling into infinite",episode_steps,max_episode_steps)
                action1 = self.actor.select_action(state1)
                # action2 = self.actor.select_action(state2)

                action1 = action1 + np.random.normal(0, 0.15, size=self.action_size)
                # action2 = action2 + np.random.normal(0, 0.15, size=self.action_size)

                # print(action,state.shape)
                next_state1, reward1, done1, truncated1, _ = env1.step(action1 * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
                # next_state2, reward2, done2, truncated2, _ = env1.step(action1 * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))

                episode_reward += reward1
                episode_steps += 1
                self.total_steps += 1

                self.replay_buffer.add(state1, action1, reward1, next_state1, done1)
                # self.replay_buffer.add(state2, action2, reward2, next_state2, done2)

                state1 = next_state1
                # state2 = next_state2

                if self.total_steps > batch_size:
                    self.learn(batch_size)
                max_out = episode_steps == max_episode_steps
                if done1 or truncated1 or max_out:
                    break

            self.episode_reward.append(episode_reward)

            # Bookkeeping
            evaluation_score, _ = self.evaluate(env1)  # You need to implement this method
            self.evaluation_scores.append(evaluation_score)

            mean_100_reward = np.mean(self.episode_reward[-100:])
            std_100_reward = np.std(self.episode_reward[-100:])
            mean_100_eval_score = np.mean(self.evaluation_scores[-100:])
            std_100_eval_score = np.std(self.evaluation_scores[-100:])

            result[episode-1] = self.total_steps, mean_100_reward, mean_100_eval_score

            print('Episode: {:d}, Total Steps: {:d}, Train_reward: {:.2f}, Eval_reward: {:.2f}'.format(
                episode, self.total_steps, mean_100_reward, mean_100_eval_score))

            # Check if the problem is considered solved
            training_is_over = episode >= max_episodes or \
                               (goal_mean_100_reward is not None and mean_100_eval_score >= goal_mean_100_reward)
            if training_is_over:
                if episode >= max_episodes:
                    print('--> reached_max_episodes')
                if goal_mean_100_reward is not None and mean_100_eval_score >= goal_mean_100_reward:
                    print('Environment solved in {:d} steps!\tAverage Score: {:.2f}'.format(self.total_steps, mean_100_eval_score))
                break

        return np.array(result)


In [None]:
# Main training loop
env1 = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.25, domain_randomize=False, continuous=True)
env2 = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.25, domain_randomize=False, continuous=True)

pi_lr = 7e-4
vf_lr = 1e-3
gamma = .99
max_episodes = 200
max_episode_steps = 150 #env1.spec.max_episode_steps  # you could set your own time limit
goal_mean_100_reward = env1.spec.reward_threshold # that's Gym specific
seed = 12
batch_size = 256
tau = 0.08 # how much new weights would be considered /// current run is with 0.007
policy_noise = 0.1 # policy noise to randomize the actions
noise_clip = 0.1 # clipping the actions
policy_freq = 2
# env1.seed(seed)
td3agent = TD3_ACAgent(env1.observation_space.shape, env1.action_space.shape[0], pi_lr, vf_lr,tau, policy_noise, noise_clip, policy_freq)
result_td3 = td3agent.train(env1,env2, seed, gamma, max_episodes, max_episode_steps,batch_size, goal_mean_100_reward)

env1.close()
del env1

Episode: 1, Total Steps: 28, Train_reward: -2.98, Eval_reward: -46.79
Episode: 2, Total Steps: 178, Train_reward: -17.55, Eval_reward: -43.97
Episode: 3, Total Steps: 218, Train_reward: -14.22, Eval_reward: -41.18
Episode: 4, Total Steps: 244, Train_reward: -11.23, Eval_reward: -40.00
Episode: 5, Total Steps: 394, Train_reward: -15.38, Eval_reward: -41.76
Episode: 6, Total Steps: 544, Train_reward: -17.62, Eval_reward: -39.52
Episode: 7, Total Steps: 653, Train_reward: -18.54, Eval_reward: -39.17
Episode: 8, Total Steps: 714, Train_reward: -17.90, Eval_reward: -39.32
Episode: 9, Total Steps: 753, Train_reward: -16.53, Eval_reward: -38.96
Episode: 10, Total Steps: 835, Train_reward: -16.71, Eval_reward: -38.27
Episode: 11, Total Steps: 963, Train_reward: -17.19, Eval_reward: -38.87
Episode: 12, Total Steps: 1113, Train_reward: -19.31, Eval_reward: -39.27
Episode: 13, Total Steps: 1196, Train_reward: -19.45, Eval_reward: -39.74
Episode: 14, Total Steps: 1271, Train_reward: -19.25, Eval_r

In [22]:
env1 = gym.make("CarRacing-v3", render_mode="rgb_array", lap_complete_percent=0.75, domain_randomize=False, continuous=True)

env2 = FrameStackWrapper(env1, num_frames=4)

images = []
s,_ = env2.reset()
print(s.shape)
img = env2.render()
# print(img.shape)
# plt.imshow(img)  # if state is a 3D tensor (e.g., (1, H, W, C))
# plt.show()
images.append(img)

(4, 96, 96)


In [None]:
def save_results(array, filename, mode=0):
    """
    Save a model to a file in your colab space

    Parameters
    ----------
    model: your Q network
    filename: the name of the checkpoint file
    mode (int): determines where to store the file
                --> 0: collab space 1: Google Drive
    """
    if mode == 1:
      drive.mount('/content/gdrive')
      path = F"/content/gdrive/My Drive/Colab_workspace/{filename}"
      np.save(path, array)

    else:
      np.save(filename, array)

In [None]:
save_results(result_ppo,'result_PPO_race_car2.npy')

In [None]:
save_checkpoint(ppoagent.pnetwork,'ppo_pnetwork_race_car2.pt')
save_checkpoint(ppoagent.vnetwork,'ppo_vnetwork_race_car2.pt')

In [None]:
export_to_local_drive('result_PPO_race_car2.npy')
export_to_local_drive('ppo_pnetwork_race_car2.pt')
export_to_local_drive('ppo_vnetwork_race_car2.pt')