# Import Dependencies

In [None]:
# %pip install stable-baselines3

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import A2C

from utils import make_env
from utils import rwd_func_1, rwd_func_3

import PIL

import time

import os

# Checking Environment

In [None]:
# Smoke-test the environment: build it, reset it, and exercise both render
# modes once before any training starts.
env = make_env()
env.reset()

for render_mode in ("rgb_array", "print"):
    # The return value is discarded; only a successful call matters here.
    _ = env.render(render_mode)

# Release the environment and remove the name from the namespace.
env.close()
del env

# Reward Function 0

## Prepare Vectorized Environment

In [None]:
def _make_env_rwd0():
    """Build one environment for the Reward Function 0 run.

    No ``env_rwd_func`` is passed, so ``make_env`` falls back to its default.
    """
    return make_env(max_step=1000, num_stack=4)


# 1000 environment copies, each created through the factory above.
vec_env = make_vec_env(_make_env_rwd0, n_envs=1000)

## Train A2C Agent

In [None]:
# Total number of environment steps to train for.
n_steps = 50_000_000

# A2C hyperparameters, forwarded unchanged to the constructor.
# Note: the "n_steps" key here is A2C's own constructor argument and is
# unrelated to the total training budget `n_steps` defined above.
args = {"gamma": 0.80, "learning_rate": 0.0003, "n_steps": 4}

# TensorBoard logs are grouped by training budget, e.g. "logs/50_000_000".
# The format spec must be ":_" rather than ": _": the space is a sign option
# that prefixes positive numbers with a blank, which yielded a directory
# literally named " 50_000_000".
model = A2C(
    "MlpPolicy",
    vec_env,
    **args,
    verbose=0,
    tensorboard_log=f"logs/{n_steps:_}",
)

model.learn(total_timesteps=n_steps, log_interval=1)

model.save("models/Snake A2C (stable-baselines3) rew_func0 (50mil iters).zip")

# Training is done; release the vectorized environment.
vec_env.close()

## Play the Game by the Agent

In [None]:
# Roll out a single episode (at most 1000 steps) with the trained model and
# keep every rendered frame for the GIF export that follows.
env = make_env(num_stack=4)
state = env.reset()

rewards = 0
done = False
frames = []

start = time.time()
for _ in range(1000):
    # deterministic=False is passed, i.e. the non-deterministic action is used.
    action, _states = model.predict(state, deterministic=False)
    state, reward, done, info = env.step(action)

    # Running total and last step reward, rewritten in place on one line.
    rewards += reward
    print("\r{}, {}".format(rewards, reward), end="")

    frames.append(env.render("rgb_array"))

    if done:
        break
elapsed = time.time() - start

# Frames collected per wall-clock second of the rollout.
fps = len(frames) / elapsed
print(f"\nFPS: {fps}")

env.close()

In [None]:
# A bare `import PIL` does not load the Image submodule, so `PIL.Image` only
# resolves if some other module imported it first. Import it explicitly.
import PIL.Image

# Image.save does not create missing folders, so make sure the target exists.
output_dir = "rl videos"
os.makedirs(output_dir, exist_ok=True)
image_path = os.path.join(output_dir, f"snake_a2c_stablebaselines_rew_func0_{n_steps}_steps.gif")

# Convert every rendered frame collected during playback into a PIL image.
frame_images = [PIL.Image.fromarray(frame) for frame in frames]

# Write an animated GIF: the first frame is the base image and the remaining
# ones are appended. `duration` is the per-frame display time in milliseconds
# and `loop=0` makes the animation repeat forever.
frame_images[0].save(
    image_path,
    format="GIF",
    append_images=frame_images[1:],
    save_all=True,
    duration=30,
    loop=0,
)

# Reward Function 1

## Prepare Vectorized Environment

In [None]:
def _make_env_rwd1():
    """Build one environment that uses ``rwd_func_1`` as its reward function."""
    return make_env(env_rwd_func=rwd_func_1, max_step=2000, num_stack=5)


# 1000 environment copies, each created through the factory above.
vec_env = make_vec_env(_make_env_rwd1, n_envs=1000)

## Train A2C Agent

In [None]:
# Total number of environment steps to train for.
n_steps = 80_000_000

# A2C hyperparameters, forwarded unchanged to the constructor.
# Note: the "n_steps" key here is A2C's own constructor argument and is
# unrelated to the total training budget `n_steps` defined above.
args = {
    "gamma": 0.995,
    "learning_rate": 0.0006,
    "n_steps": 10,
    "normalize_advantage": True,
}

# TensorBoard logs are grouped by training budget, e.g. "logs/80_000_000".
# The format spec must be ":_" rather than ": _": the space is a sign option
# that prefixes positive numbers with a blank, which yielded a directory
# literally named " 80_000_000".
model = A2C(
    "MlpPolicy",
    vec_env,
    **args,
    verbose=0,
    tensorboard_log=f"logs/{n_steps:_}",
)

model.learn(total_timesteps=n_steps, log_interval=1)

model.save("models/Snake A2C (stable-baselines3) rew_func1 (80mil iters).zip")

# Training is done; release the vectorized environment.
vec_env.close()

## Play the Game by the Agent

In [None]:
# Play one episode (at most 1000 steps) with the agent trained on rwd_func_1.
#
# The playback environment uses the same reward function as training, so the
# rewards printed below are on the same scale as the training rewards.
# NOTE(review): the original call omitted env_rwd_func, unlike the Reward
# Function 3 playback cell; confirm the omission was not intentional.
env = make_env(env_rwd_func=rwd_func_1, num_stack=5)

state = env.reset()
rewards = 0

frames = []
start = time.time()
for _ in range(1000):
    # deterministic=False is passed, i.e. the non-deterministic action is used.
    action, _states = model.predict(state, deterministic=False)
    state, reward, done, info = env.step(action)

    # Running total and last step reward, rewritten in place on one line.
    rewards += reward
    print(f"\r{rewards}, {reward}", end="")

    # Keep the rendered frame for the GIF export in the next cell.
    frames.append(env.render("rgb_array"))

    if done:
        break

# Frames collected per wall-clock second of the rollout.
fps = len(frames) / (time.time() - start)
print(f"\nFPS: {fps}")

env.close()

In [None]:
# A bare `import PIL` does not load the Image submodule, so `PIL.Image` only
# resolves if some other module imported it first. Import it explicitly
# (this also binds the name `PIL`, as the original import did).
import PIL.Image

import os


# Image.save does not create missing folders, so make sure the target exists.
output_dir = "rl videos"
os.makedirs(output_dir, exist_ok=True)
image_path = os.path.join(output_dir, f"snake_a2c_stablebaselines_rew_func1_{n_steps}_steps.gif")

# Convert every rendered frame collected during playback into a PIL image.
frame_images = [PIL.Image.fromarray(frame) for frame in frames]

# Write an animated GIF: the first frame is the base image and the remaining
# ones are appended. `duration` is the per-frame display time in milliseconds
# and `loop=0` makes the animation repeat forever.
frame_images[0].save(
    image_path,
    format="GIF",
    append_images=frame_images[1:],
    save_all=True,
    duration=30,
    loop=0,
)

# Reward Function 3

## Prepare Vectorized Environment

In [None]:
def _make_env_rwd3():
    """Build one environment that uses ``rwd_func_3`` as its reward function."""
    return make_env(env_rwd_func=rwd_func_3, max_step=1000, num_stack=4)


# 1000 environment copies, each created through the factory above.
vec_env = make_vec_env(_make_env_rwd3, n_envs=1000)

## Train A2C Agent

In [None]:
# Total number of environment steps to train for.
n_steps = 80_000_000

# A2C hyperparameters, forwarded unchanged to the constructor.
# Note: the "n_steps" key here is A2C's own constructor argument and is
# unrelated to the total training budget `n_steps` defined above.
# NOTE(review): these values look like the output of a hyperparameter
# search; confirm where they came from before editing them by hand.
args = {
    "ent_coef": 0.048131320125026966,
    "gae_lambda": 0.9650094161425868,
    "gamma": 0.7910967341909643,
    "max_grad_norm": 7.697800855483016,
    "n_steps": 8,
    "normalize_advantage": True,
    "vf_coef": 0.20991523532588405,
}

# TensorBoard logs are grouped by training budget, e.g. "logs/80_000_000".
# The format spec must be ":_" rather than ": _": the space is a sign option
# that prefixes positive numbers with a blank, which yielded a directory
# literally named " 80_000_000".
model = A2C(
    "MlpPolicy",
    vec_env,
    **args,
    verbose=0,
    tensorboard_log=f"logs/{n_steps:_}",
)

model.learn(total_timesteps=n_steps, log_interval=1)

model.save("models/Snake A2C (stable-baselines3) rew_func3 (80mil iters).zip")

# Training is done; release the vectorized environment.
vec_env.close()

## Play the Game by the Agent

In [None]:
# Play one episode (at most 1000 steps) with the agent trained on rwd_func_3,
# recording each rendered frame for the GIF export that follows.
env = make_env(env_rwd_func=rwd_func_3, num_stack=4)

# Initial observation, running reward total, and episode-finished flag.
state, rewards, done = env.reset(), 0, False

frames = []
start = time.time()

for _ in range(1000):
    # deterministic=True is passed, i.e. the deterministic action is used.
    action, _states = model.predict(state, deterministic=True)
    state, reward, done, info = env.step(action)
    rewards += reward

    # Running total and last step reward, rewritten in place on one line.
    print("\r{}, {}".format(rewards, reward), end="")

    obs = env.render("rgb_array")
    frames += [obs]

    if done:
        break

# Frames collected per wall-clock second of the rollout.
elapsed = time.time() - start
fps = len(frames) / elapsed
print(f"\nFPS: {fps}")

env.close()

In [None]:
# A bare `import PIL` does not load the Image submodule, so `PIL.Image` only
# resolves if some other module imported it first. Import it explicitly.
import PIL.Image

# Image.save does not create missing folders, so make sure the target exists.
output_dir = "rl videos"
os.makedirs(output_dir, exist_ok=True)
image_path = os.path.join(output_dir, f"snake_a2c_stablebaselines_rew_func3_{n_steps}_steps.gif")

# Convert every rendered frame collected during playback into a PIL image.
frame_images = [PIL.Image.fromarray(frame) for frame in frames]

# Write an animated GIF: the first frame is the base image and the remaining
# ones are appended. `duration` is the per-frame display time in milliseconds
# and `loop=0` makes the animation repeat forever.
frame_images[0].save(
    image_path,
    format="GIF",
    append_images=frame_images[1:],
    save_all=True,
    duration=30,
    loop=0,
)