# SWEN 711 - Spring 24 - Term Project

### Code written by: Hemanth Chebiyam, Sanjeev Vijayakumar

### PPO Algorithm on OpenAI Gym Car Racing Environment

# 1. Import Dependencies

In [1]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os

# 2. Test Environment

In [2]:
environment_name = "CarRacing-v2"
env = gym.make(environment_name, render_mode="human")

In [3]:
env.reset()

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {})

In [4]:
env.action_space

Box([-1.  0.  0.], 1.0, (3,), float32)

In [5]:
env.observation_space

Box(0, 255, (96, 96, 3), uint8)

In [6]:
env.close()

# 3. Simulating without Training

In [7]:
environment_name = "CarRacing-v2"
env = gym.make(environment_name, render_mode="human")

episodes = 5
for episode in range(episodes):
    observation, info = env.reset(seed=42)
    score = 0 
    done = False

    while not done:
        action = env.action_space.sample()  # this is where you would insert your policy
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward

        if terminated or truncated:
            break

    print('Episode:{} Score:{}'.format(episode + 1, score))

env.close()

Episode:1 Score:-29.32862190812757
Episode:2 Score:-29.328621908127605
Episode:3 Score:-29.328621908127623
Episode:4 Score:-29.328621908127566
Episode:5 Score:-25.79505300353385


# 4. Simulating after basic training (~500k timesteps)

In [8]:
env = gym.make(environment_name, render_mode="human")
env = DummyVecEnv([lambda: env])

In [9]:
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_500k_model')

In [10]:
model = PPO.load(ppo_path, env)

Exception: code expected at least 16 arguments, got 15
Exception: code expected at least 16 arguments, got 15


Wrapping the env in a VecTransposeImage.


	Missing key(s) in state_dict: "pi_features_extractor.cnn.0.weight", "pi_features_extractor.cnn.0.bias", "pi_features_extractor.cnn.2.weight", "pi_features_extractor.cnn.2.bias", "pi_features_extractor.cnn.4.weight", "pi_features_extractor.cnn.4.bias", "pi_features_extractor.linear.0.weight", "pi_features_extractor.linear.0.bias", "vf_features_extractor.cnn.0.weight", "vf_features_extractor.cnn.0.bias", "vf_features_extractor.cnn.2.weight", "vf_features_extractor.cnn.2.bias", "vf_features_extractor.cnn.4.weight", "vf_features_extractor.cnn.4.bias", "vf_features_extractor.linear.0.weight", "vf_features_extractor.linear.0.bias".  


In [11]:
environment_name = "CarRacing-v2"
env = gym.make(environment_name, render_mode="human")

episodes = 5
for episode in range(episodes):
    observation, info = env.reset(seed=42)
    score = 0 
    done = False

    while not done:
        action, _ = model.predict(observation)  # this is where you would insert your policy
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward

        if terminated or truncated:
            break

    print('Episode:{} Score:{}'.format(episode + 1, score))

env.close()

Episode:1 Score:-4.593639575972384
Episode:2 Score:83.74558303887123
Episode:3 Score:48.409893992936006
Episode:4 Score:-15.194346289753488
Episode:5 Score:94.34628975265443


# 5. Simulating after more training (~2 mil timesteps)

In [12]:
env = gym.make(environment_name, render_mode="human")
env = DummyVecEnv([lambda: env])

In [13]:
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_2mil_model')

In [14]:
model = PPO.load(ppo_path, env)

Exception: code expected at least 16 arguments, got 15
Exception: code expected at least 16 arguments, got 15


Wrapping the env in a VecTransposeImage.


In [15]:
environment_name = "CarRacing-v2"
env = gym.make(environment_name, render_mode="human")
episodes = 5
for episode in range(episodes):
    observation, info = env.reset(seed=42)
    score = 0 
    done = False

    while not done:
        action, _ = model.predict(observation)  # this is where you would insert your policy
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward

        if terminated or truncated:
            break

    print('Episode:{} Score:{}'.format(episode + 1, score))

env.close()

Episode:1 Score:620.8480565370928
Episode:2 Score:525.4416961130704
Episode:3 Score:574.9116607773736
Episode:4 Score:589.0459363957509
Episode:5 Score:571.3780918727856
