### Setup

conda create --prefix=D:/L6_SWD/Dissertation/Project/CondaEnvs/diss_test python=3.8

conda activate diss_test

conda install ipykernel

#### Pip installs

pip install gymnasium[atari]
pip install gymnasium[accept-rom-license]


pip install stable-baselines3
pip install ale-py==0.7.4
pip install opencv-python

pip install tensorflow

### Import

In [1]:
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import torch as th

### Data

In [10]:
environment_name = "ALE/BankHeist-v5"
envAmt = 8
trainTimeSteps = 9500000

### Create Env and View Action Space

In [3]:
def CreateEnv():   
    env = gym.make(environment_name,render_mode="human")#render mode can be changed to None
    return env

In [4]:
env = CreateEnv()
env.action_space

Discrete(18)

### Create RL Models

#### Logging

In [5]:
logPath = "./TrainingLogs/A2C_DefaultParams/A2C-"+str(trainTimeSteps)

#### Create Model

In [6]:
kwargs = {
    "policy": "CnnPolicy",   
    "gamma": 0.999,
    "normalize_advantage": False,
    "max_grad_norm": 0.9,
    "use_rms_prop": False,
    "gae_lambda": 0.95,
    "n_steps": 8,
    "learning_rate": 0.00038933533117428,
    "ent_coef": 0.08634329836004319,
    "vf_coef": 0.01320939014246536,   
 
    "policy_kwargs": 
    {
        "net_arch": {"pi": [64, 64], "vf": [64, 64]},
        "activation_fn": th.nn.ReLU,
        "ortho_init": False,
    }
}

In [7]:
env = make_atari_env(environment_name, n_envs=envAmt, seed=0)

In [8]:
#Wrap environment 
env = VecFrameStack(env,n_stack=envAmt)

In [9]:
model = A2C(policy="CnnPolicy",env=env,verbose=0,tensorboard_log=logPath)

In [10]:
#model = A2C(env=env,**kwargs,verbose=0,tensorboard_log=logPath)

#### Train Model

In [11]:
model.learn(total_timesteps=trainTimeSteps)

<stable_baselines3.a2c.a2c.A2C at 0x1da29c1b370>

#### Save Model

In [5]:
savePath = "./SavedModels/A2C_DefaultParams/A2C-"+str(trainTimeSteps)

In [13]:
model.save(savePath+"-Model")



In [14]:
policy = model.policy
policy.save(savePath+"-Policy")

#### Load Model

In [7]:
env = make_atari_env(environment_name, n_envs=envAmt, seed=0)
env = VecFrameStack(env, n_stack=envAmt)

In [8]:
model = A2C.load(savePath+"-Model", env)

#### Test Model

In [9]:
obs = env.reset()

In [22]:
mean_reward, std_reward  = evaluate_policy(model, env, n_eval_episodes=10, render=False)

print(mean_reward)
print(std_reward)

KeyboardInterrupt: 

In [21]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

KeyboardInterrupt: 

In [None]:
env.close()

### Logs

tensorboard --logdir E:/L6_SWD/Dissertation/Project/TestCode/Training/Logs/SpaceInvaders/A2C/A2C_1-1000000