In [1]:
import datetime
import os

import cv2
import gymnasium as gym
import numpy as np

In [5]:
import agent_model

In [6]:
GAME = "ALE/MarioBros-v5"

# Build the ALE game in mode 4 and pass it through agent_model.make_env,
# which returns the environment the agent interacts with.
env = agent_model.make_env(gym.make(GAME, mode=4))

# Summarise the action space before the first reset.
print(f"Action space: {env.action_space}")
print(f"Action space size: {env.action_space.n}")

# The first reset yields the initial observation; report its shape and the env spec.
observation, info = env.reset()
print(f"Observation space shape: {observation.shape}")
print(f"Environment spec:  {env.spec}")

Action space: Discrete(18)
Action space size: 18
Observation space shape: (84, 84, 1)
Environment spec:  EnvSpec(id='ALE/MarioBros-v5', entry_point='shimmy.atari_env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'mario_bros', 'obs_type': 'rgb', 'repeat_action_probability': 0.25, 'full_action_space': False, 'frameskip': 4, 'max_num_frames_per_episode': 108000, 'mode': 4}, namespace='ALE', name='MarioBros', version=5, additional_wrappers=(WrapperSpec(name='ProcessFrame84', entry_point='agent_model:ProcessFrame84', kwargs=None), WrapperSpec(name='ScaledFloatFrame', entry_point='agent_model:ScaledFloatFrame', kwargs=None)), vector_entry_point=None)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [None]:
# Train the agent, resuming from a previously saved checkpoint.
# RESUME_EPISODE selects the checkpoint file to load; training continues at the
# following episode. RESUME_STEP is forwarded to train() as start_step
# (presumably the global step count reached when that checkpoint was saved --
# confirm against agent_model).
RESUME_EPISODE = 90
RESUME_STEP = 16276

agent = agent_model.Agent(env, gamma=0.999, batch_size=64, lr=0.0007, max_episodes=1000,
              max_steps_per_episode=2000,
              steps_until_sync=20, choose_action_frequency=1,
              pre_train_steps = 10, final_exploration_step = 200_000)

try:
    agent.load_weights(f"./checkpoints_v5/ep_{RESUME_EPISODE}")
    agent.train(start_ep=RESUME_EPISODE + 1, start_step=RESUME_STEP, eps_until_save=10)
finally:
    # Always release the environment, even when the (long) training run is
    # interrupted from the notebook or raises part-way through.
    env.close()

[84, 84, 1]
[84, 84, 1]
run:  5
*********
Episode 100 (Step 23786) - Moving Avg Reward: -0.490 Loss: 0.00538 Epsilon: 0.884 Avg Steps per Episode: 751.0
*********
Episode 110 (Step 29819) - Moving Avg Reward: -0.690 Loss: 0.01004 Epsilon: 0.856 Avg Steps per Episode: 603.3
*********
Episode 120 (Step 35227) - Moving Avg Reward: -0.790 Loss: 0.01739 Epsilon: 0.825 Avg Steps per Episode: 540.8
*********
Episode 130 (Step 41046) - Moving Avg Reward: -0.740 Loss: 0.01193 Epsilon: 0.798 Avg Steps per Episode: 581.9
*********
Episode 140 (Step 46885) - Moving Avg Reward: -0.710 Loss: 0.01360 Epsilon: 0.769 Avg Steps per Episode: 583.9
*********
Episode 150 (Step 53505) - Moving Avg Reward: -0.673 Loss: 0.01311 Epsilon: 0.734 Avg Steps per Episode: 662.0
***

In [None]:
# Test the agent
# Runs the greedy policy of `agent` (created in the training cell) for a few
# episodes and records every rendered frame to an .avi file.
N_TEST_EPISODES = 2
VIDEO_DIR = "./output_videos"
VIDEO_FPS = 20.0
FRAME_WIDTH, FRAME_HEIGHT = 160, 210  # VideoWriter takes (width, height); frames are (210, 160, 3)
REWARD_SCALE = 800        # raw rewards are divided by this value
LIFE_LOST_PENALTY = 0.33  # subtracted from the reward whenever a life is lost
# NOTE(review): the two reward constants presumably mirror the shaping used
# during training -- confirm against agent_model.

env = agent_model.make_env(gym.make(GAME, mode=4, render_mode="rgb_array"))

# create a VideoWriter object.
# The output directory is created up front because cv2.VideoWriter does not
# create it, and the timestamp avoids spaces/colons so the file name is valid
# on every platform.
os.makedirs(VIDEO_DIR, exist_ok=True)
video_path = os.path.join(
    VIDEO_DIR,
    "test_output_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f") + ".avi",
)
video_fourcc = cv2.VideoWriter_fourcc(*'XVID')
# Pass the XVID code explicitly: a fourcc of -1 opens a codec-selection dialog
# on Windows and typically fails to open the writer elsewhere.
video_writer = cv2.VideoWriter(video_path, video_fourcc, VIDEO_FPS, (FRAME_WIDTH, FRAME_HEIGHT), isColor=True)

try:
    if not video_writer.isOpened():
        # Without this check a failed open would silently drop every frame.
        raise RuntimeError("Could not open video file for writing: " + video_path)

    # Play the episodes, always taking the action with the highest predicted Q-value.
    for episode in range(N_TEST_EPISODES):
        observation, info = env.reset()
        terminated = truncated = False
        rewards = 0
        while not terminated and not truncated:
            # Greedy action: add a batch axis, predict, take the argmax.
            Q_values = agent.predict_q(np.expand_dims(observation, axis=0))
            action = np.argmax(Q_values[0])

            # Remember the life count so a lost life can be detected after the step.
            num_lives = info["lives"]

            observation, reward, terminated, truncated, info = env.step(action)

            reward /= REWARD_SCALE
            if info["lives"] < num_lives:  # penalize agent when life lost
                reward -= LIFE_LOST_PENALTY

            # env.render() returns an RGB frame; OpenCV writes BGR.
            frame = np.uint8(np.reshape(env.render(), (FRAME_HEIGHT, FRAME_WIDTH, 3)))
            video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
            rewards += reward
        print('Total reward is: '+str(rewards))
finally:
    # Close the VideoWriter object and the environment even if an episode fails,
    # so the video file is finalised and the emulator is released.
    video_writer.release()
    env.close()