In [3]:
from tqdm import tqdm
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecMonitor, VecNormalize, VecCheckNan
from stable_baselines3.ppo import MlpPolicy

from rlgym.envs import Match
from rlgym.utils.action_parsers import DiscreteAction
from rlgym.utils.obs_builders import AdvancedObs
from rlgym.utils.state_setters import DefaultState, RandomState
from rlgym.utils.terminal_conditions.common_conditions import TimeoutCondition, NoTouchTimeoutCondition, GoalScoredCondition
from rlgym.utils.reward_functions.common_rewards.misc_rewards import EventReward, ConstantReward, VelocityReward, SaveBoostReward
from rlgym.utils.reward_functions.common_rewards.player_ball_rewards import VelocityPlayerToBallReward
from rlgym.utils.reward_functions.common_rewards.ball_goal_rewards import VelocityBallToGoalReward
from rlgym.utils.reward_functions.common_rewards.conditional_rewards import RewardIfBehindBall
from rlgym.utils.reward_functions import CombinedReward
from rlgym_tools.sb3_utils import SB3MultipleInstanceEnv

from egocentric_obs import EgocentricObs
from rewards import RewardIfGoalside, RewardIfShouldShadow1s, PossessionReward, RewardIfPlayerBallY, PlayerBallYDistReward, TimestepReward, MultiplyRewards, RewardIfGrounded

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Timing and reward discounting ------------------------------------------
frame_skip = 8            # ticks for which each chosen action is repeated
half_life_seconds = 5     # seconds after which a reward is discounted to 0.5

# Agent decisions per second (240 is presumably the game's tick rate -- confirm).
fps = 240 / frame_skip
# Per-step discount chosen so that gamma ** (fps * half_life_seconds) == 0.5.
gamma = np.exp(np.log(0.5) / (fps * half_life_seconds))

# --- Rollout / batch sizing -------------------------------------------------
agents_per_match = 2
num_instances = 1         # original note carried the alternative value 18
target_steps = 1_000_000
# Per-agent step count, so that instances * agents * steps adds up to target_steps.
steps = target_steps // (num_instances * agents_per_match)
# One tenth of target_steps (100k here) to keep the batch size manageable.
batch_size = target_steps // 10

# --- Training schedule ------------------------------------------------------
training_interval = 25_000_000
mmr_save_frequency = 50_000_000

In [3]:
model_name = 'test'  # alternative name noted originally: 'Egor_512'


def exit_save(model):
    """Save ``model`` to the fixed ``exit_save`` slot of the current run.

    The destination is ``models/<model_name>/exit_save``, where ``model_name``
    is the module-level run name read at call time.

    Args:
        model: any object exposing ``save(path)``.
    """
    destination = f'models/{model_name}/exit_save'
    model.save(destination)

In [4]:
def get_match():
    """Build a fresh ``Match`` configuration for one game instance.

    This is a factory function (rather than a shared object) so that every
    instance can call it and receive its own reward, observation, state-setter
    and action-parser objects.

    Returns:
        Match: ``team_size=1`` with opponents spawned, using the module-level
        ``frame_skip`` and ``fps`` settings.
    """
    # Reward terms currently enabled; paired position-by-position with the weights.
    # Alternatives that are currently disabled:
    #   RewardIfShouldShadow1s(ConstantReward())
    #   RewardIfGoalside(ConstantReward())
    #   RewardIfPlayerBallY(PlayerBallYDistReward())
    #   VelocityPlayerToBallReward()
    #   VelocityBallToGoalReward()
    #   EventReward(team_goal=100.0, concede=-100.0, shot=5.0, save=30.0, demo=10.0)
    reward_terms = (
        SaveBoostReward(),
        RewardIfGrounded(ConstantReward()),
        VelocityReward(),
    )
    reward_weights = (0.05, 0.01, 0.05)
    reward_fn = CombinedReward(reward_terms, reward_weights)

    # Episodes end on a timeout of fps * 10 (presumably ~10 seconds of agent steps).
    # Disabled alternative:
    #   [TimeoutCondition(fps * 100), NoTouchTimeoutCondition(fps * 20), GoalScoredCondition()]
    episode_end = [TimeoutCondition(fps * 10)]

    return Match(
        team_size=1,
        tick_skip=frame_skip,
        reward_function=reward_fn,
        # Per the original note: `self_play=True` is deprecated in rlgym 1.2. On an
        # earlier version, pass `self_play=True` here instead of `spawn_opponents`.
        spawn_opponents=True,
        terminal_conditions=episode_end,
        obs_builder=EgocentricObs(),             # project-local observation builder
        state_setter=RandomState(),              # random-state setter (DefaultState is also imported)
        action_parser=DiscreteAction(n_bins=9),  # discrete actions with 9 bins
    )

In [5]:
# Build the vectorized environment in one expression, innermost wrapper first:
#   SB3MultipleInstanceEnv -> VecCheckNan -> VecMonitor -> VecNormalize
# `num_instances` game instances are started with wait_time=45 between launches
# (presumably seconds -- confirm against rlgym_tools).
env = VecNormalize(                 # highly recommended: normalizes rewards (norm_obs=False leaves observations alone)
    VecMonitor(                     # recommended: logs mean reward and ep_len to Tensorboard
        VecCheckNan(                # optional NaN check
            SB3MultipleInstanceEnv(get_match, num_instances, wait_time=45)
        )
    ),
    norm_obs=False,
    gamma=gamma,
)

  return np.divide(vec, vecmag(vec))


In [6]:
# Resume from the saved checkpoint `models/<model_name>/<model_id>.zip` when it
# exists; otherwise build a brand-new PPO model.
model_id = 'exit_save'
try:
    model = PPO.load(
        f'models/{model_name}/{model_id}.zip',
        env,
        device="cuda",
        # Overriding n_envs lets the checkpoint adapt when the instance count has
        # changed since it was saved; a shape error may occur otherwise.
        custom_objects={"n_envs": env.num_envs},
        # To adjust hyper-parameters mid-training, use the example below as a guide:
        # custom_objects={"n_envs": env.num_envs, "n_steps": steps, "batch_size": batch_size, "n_epochs": 10, "learning_rate": 5e-5}
    )
    print(f"Loaded: {model_id}.")
except FileNotFoundError:
    # Only a missing checkpoint falls back to a fresh model. Any other failure
    # (corrupt file, shape mismatch, CUDA error, Ctrl-C) propagates, so a fresh
    # model is never silently created and later saved over `exit_save`.
    print(f"model ({model_id}) not found, creating new model.")
    from torch.nn import ELU  # only needed when building a new policy

    policy_kwargs = dict(
        activation_fn=ELU,
        net_arch=[256, 256, 256, dict(pi=[512, 512, 512], vf=[512, 512, 512])],
    )

    model = PPO(
        MlpPolicy,
        env,
        n_epochs=10,                             # PPO calls for multiple epochs
        policy_kwargs=policy_kwargs,
        learning_rate=5e-5,                      # around this is fairly common for PPO
        ent_coef=0.01,                           # from PPO Atari
        vf_coef=1.,                              # from PPO Atari
        gamma=gamma,                             # gamma as calculated using half-life
        verbose=3,                               # print out all the info as we're going
        batch_size=batch_size,                   # batch size as high as possible within reason
        n_steps=steps,                           # number of steps to perform before optimizing network
        tensorboard_log=f'logs/{model_name}',    # view with `tensorboard --logdir logs/<model_name>`
        device="cuda"                            # train on the CUDA device
    )

model ({model_id}) not found, creating new model.
Using cuda device


In [None]:
# Periodic checkpoints, written to models/<model_name>/ with <model_name> as the
# file prefix. The frequency is divided by num_envs (number of agents) because the
# callback only increments once every time all agents have taken a step.
callback = CheckpointCallback(
    save_freq=round(5_000_000 / env.num_envs),
    save_path=f"models/{model_name}",
    name_prefix=model_name,
)

try:
    mmr_model_target_count = model.num_timesteps + mmr_save_frequency
    while True:
        # Timesteps may need resetting when running a different number of instances
        # than when the model was saved. The callback can be ignored if
        # training_interval is below the callback's save target.
        model.learn(
            total_timesteps=training_interval,
            callback=callback,
            reset_num_timesteps=False,
        )
        model.save(f"models/{model_name}/exit_save")

        # Skip the MMR snapshot until the next snapshot threshold is reached.
        if model.num_timesteps < mmr_model_target_count:
            continue
        model.save(f"mmr_models/{model_name}/{model.num_timesteps}")
        mmr_model_target_count += mmr_save_frequency
except KeyboardInterrupt:
    # Ctrl-C / kernel interrupt is the intended way to stop the endless loop.
    print("Exiting training")

print("Saving model")
exit_save(model)
print("Save complete")


Logging to logs/test\PPO_0


In [7]:
# Scratch check of NumPy broadcasting: subtracting a length-3 vector from a
# (5, 3) array subtracts it from every row.
x = np.random.rand(5, 3)
y = np.array([1, 1, 1])
print(x, x - y, sep="\n")

[[0.42024388 0.52496365 0.57216038]
 [0.95141792 0.56389961 0.29002981]
 [0.67104232 0.4345674  0.56028025]
 [0.52341958 0.09264067 0.3512735 ]
 [0.83544003 0.86887197 0.82118781]]
[[-0.57975612 -0.47503635 -0.42783962]
 [-0.04858208 -0.43610039 -0.70997019]
 [-0.32895768 -0.5654326  -0.43971975]
 [-0.47658042 -0.90735933 -0.6487265 ]
 [-0.16455997 -0.13112803 -0.17881219]]


In [None]:
# Manual save: writes the current `model` to models/<model_name>/exit_save via
# the exit_save helper (handy if the training cell stopped before its final save).
exit_save(model)