In [1]:
import gymnasium as gym
import numpy as np
import cv2
import datetime
import pickle

In [2]:
import agent_model

In [3]:
# Gymnasium id of the Atari game used throughout this notebook.
GAME = "ALE/MarioBros-v5"

# Build the raw environment first, then hand it to the project's wrapper factory.
raw_env = gym.make(GAME, mode=4)
env = agent_model.make_env(raw_env)

# Report the action space and how many discrete actions it contains.
action_space = env.action_space
print("Action space: {}".format(action_space))
print("Action space size: {}".format(action_space.n))

# Reset once to obtain a first observation and inspect its shape.
observation, info = env.reset()
observation_shape = observation.shape
print("Observation space shape: {}".format(observation_shape))

# Dump the full spec of the wrapped environment for reference.
print("Environment spec: ", env.spec)

Action space: Discrete(18)
Action space size: 18
Observation space shape: (84, 84, 1)
Environment spec:  EnvSpec(id='ALE/MarioBros-v5', entry_point='shimmy.atari_env:AtariEnv', reward_threshold=None, nondeterministic=False, max_episode_steps=None, order_enforce=True, autoreset=False, disable_env_checker=False, apply_api_compatibility=False, kwargs={'game': 'mario_bros', 'obs_type': 'rgb', 'repeat_action_probability': 0.25, 'full_action_space': False, 'frameskip': 4, 'max_num_frames_per_episode': 108000, 'mode': 4}, namespace='ALE', name='MarioBros', version=5, additional_wrappers=(WrapperSpec(name='ProcessFrame84', entry_point='agent_model:ProcessFrame84', kwargs=None), WrapperSpec(name='ScaledFloatFrame', entry_point='agent_model:ScaledFloatFrame', kwargs=None)), vector_entry_point=None)


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [None]:
# Train the agent
agent = agent_model.Agent(env, gamma=0.99, batch_size=64, lr=0.002, max_episodes=1000,
              max_steps_per_episode=2000,
              steps_until_sync=20, choose_action_frequency=1,
              pre_train_steps = 1, final_exploration_step = 50_000)

# You can configure the agent to resume training from a past run:

#   1. load past weights to the model:
agent.load_weights("./checkpoints_v7/ep_240")

#   2. load past replay buffers (so that the model can train on past moves).
#      SECURITY: pickle.load can execute arbitrary code embedded in the file;
#      only load replay buffers that were written by this notebook.
with open("./output_replay_memory/run7_ep240", 'rb') as f:
    agent.replay_buffer.replay_memory = pickle.load(f)

#   3. set `start_ep` and `start_step` to where the last run left off
#      (affects printing and the epsilon schedule)
try:
    agent.train(start_ep=241, start_step=78023, eps_until_save=10, save_checkpoints=True)
finally:
    # Release the emulator even when training raises or is interrupted from
    # the notebook (KeyboardInterrupt), so the environment is never leaked.
    env.close()

[84, 84, 1]
[84, 84, 1]
run:  7
*********
Episode 250 (Step 85714) - Moving Avg Reward: -0.6900 Loss: 0.01796 Epsilon: 0.100 Avg Steps Per Episode: 769.1
*********
Episode 260 (Step 92767) - Moving Avg Reward: -0.9900 Loss: 0.00000 Epsilon: 0.100 Avg Steps Per Episode: 705.3
*********
Episode 270 (Step 99315) - Moving Avg Reward: -0.9900 Loss: 0.00000 Epsilon: 0.100 Avg Steps Per Episode: 654.8
*********
Episode 280 (Step 105796) - Moving Avg Reward: -0.3900 Loss: 0.01565 Epsilon: 0.100 Avg Steps Per Episode: 648.1
*********
Episode 290 (Step 111563) - Moving Avg Reward: -0.4900 Loss: 0.00000 Epsilon: 0.100 Avg Steps Per Episode: 576.7
*********
Episode 300 (Step 117979) - Moving Avg Reward: -0.5900 Loss: 0.00403 Epsilon: 0.100 Avg Steps Per Episode: 641.6
*********
Episode 310 (Step 125330) - Moving Avg Reward: -0.7900 Loss: 0.00000 Epsilon: 0.100 Avg Steps Per Episode: 735.1
*********
Episode 320 (Step 131597) - Moving Avg Reward: -0.9900 Loss: 0.00000 Epsilon: 0.100 Avg Steps Per Ep

In [None]:
# Test the agent: play two episodes with the greedy policy and record the
# rendered frames to a video file.
# NOTE: relies on `agent` being left in the kernel by the training cell.
env = agent_model.make_env(gym.make(GAME, mode=4, render_mode="rgb_array"))

# Create a VideoWriter object.
# The timestamp is formatted without spaces or colons so that the file name is
# valid on every platform (str(datetime.now()) contains both).
video_fourcc = cv2.VideoWriter_fourcc(*'XVID')
video_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# The codec must be passed explicitly; frameSize is (width, height), while the
# rendered frames written below are shaped (height, width, channels).
video_writer = cv2.VideoWriter('./output_videos/test_output_' + video_timestamp + '.avi',
                               video_fourcc, 20.0, (160, 210), isColor=True)
if not video_writer.isOpened():
    # Opening fails silently (e.g. missing output directory or codec); the
    # rollout below still runs, but tell the user that nothing will be saved.
    print("Warning: could not open the video writer; no video will be saved.")

try:
    # Show the steps the agent takes when it always picks the highest-valued action.
    # Each episode starts with its own reset, so no reset is needed beforehand.
    for episode in range(2):
        observation, info = env.reset()
        terminated = truncated = False
        rewards = 0
        while not (terminated or truncated):
            # Greedy action: argmax over the predicted Q-values of a batch of one.
            Q_values = agent.predict_q(np.expand_dims(observation, axis=0))
            action = np.argmax(Q_values[0])

            # Remember the life count before stepping to detect a lost life.
            num_lives = info["lives"]

            observation, reward, terminated, truncated, info = env.step(action)

            # Scale the raw reward and subtract a penalty when a life is lost.
            # NOTE(review): presumably mirrors the reward shaping used during
            # training - confirm against agent_model.
            reward /= 800
            if info["lives"] < num_lives:  # penalize agent when life lost
                reward -= 0.33

            # The renderer yields RGB frames; OpenCV expects BGR.
            frame_rgb = np.uint8(np.reshape(env.render(), (210, 160, 3)))
            video_writer.write(cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR))
            rewards += reward
        print('Total reward is: '+str(rewards))
finally:
    # Always release the emulator and finalize the video file, even if the
    # rollout raises or is interrupted.
    env.close()

    # Close the VideoWriter object.
    video_writer.release()

In [5]:
import pickle

# Persist the agent's replay memory so that a later run can resume from it.
replay_memory_path = "./output_replay_memory/run7_ep240"
replay_memory = agent.replay_buffer.replay_memory
with open(replay_memory_path, 'wb') as out_file:
    pickle.dump(replay_memory, out_file)

In [None]:
# Restore a previously pickled replay memory into the agent's replay buffer.
# The file must be opened in binary mode ('rb'): pickle.load reads bytes and
# fails on a text-mode file object.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load replay buffers that were written by this notebook.
with open("./output_replay_memory/run7_ep240", 'rb') as f:
    agent.replay_buffer.replay_memory = pickle.load(f)