# Imports

In [1]:
# ! conda install -c conda-forge pytorch-model-summary 
# ! conda install -c conda-forge gym 
from torchsummary import summary
import gym
import torch.nn.functional as F
from A2C.a2c_agents import AgentA2C
from A2C.a2c_networks import Network1
from A2C.workers import Worker
import keyboard
import torch.utils.tensorboard


# Create environments and display environment properties

In [2]:
def make_and_describe(env_name):
    """Create the Gym environment `env_name`, print a one-line summary, and return it.

    The summary shows the action space (the number of actions for a Discrete
    space, the space object itself otherwise) and the observation size.

    The observation size is read from `observation_space.shape` instead of
    from the value returned by `reset()`: that does not depend on the return
    format of `reset()` and does not start an episode just to measure a
    dimension. Assumes a 1-D observation space, which holds for the five
    environments created below.
    """
    env = gym.make(env_name)
    action_space = env.action_space
    if isinstance(action_space, gym.spaces.Discrete):
        actions = action_space.n
    else:
        actions = action_space
    observation_size = env.observation_space.shape[0]
    print(env_name + ": ", "actions: ", actions, "observation: ", observation_size)
    return env


# Environment ids (later cells select an environment through these names)
mc_discrete_name = 'MountainCar-v0'
mc_continuous_name = 'MountainCarContinuous-v0'
cp_discrete_name = 'CartPole-v1'
ll_discrete_name = 'LunarLander-v2'
ll_continuous_name = 'LunarLanderContinuous-v2'

# One shared instance per environment, created and summarised in the same order as before
mc_discrete = make_and_describe(mc_discrete_name)
mc_continuous = make_and_describe(mc_continuous_name)
cp_discrete = make_and_describe(cp_discrete_name)
ll_discrete = make_and_describe(ll_discrete_name)
ll_continuous = make_and_describe(ll_continuous_name)

# Lookup table: environment id -> environment instance
envs = {
    mc_discrete_name: mc_discrete,
    mc_continuous_name: mc_continuous,
    cp_discrete_name: cp_discrete,
    ll_discrete_name: ll_discrete,
    ll_continuous_name: ll_continuous
}

MountainCar-v0:  actions:  3 observation:  2
MountainCarContinuous-v0:  actions:  Box([-1.], [1.], (1,), float32) observation:  2
CartPole-v1:  actions:  2 observation:  4
LunarLander-v2:  actions:  4 observation:  8
LunarLanderContinuous-v2:  actions:  Box([-1. -1.], [1. 1.], (2,), float32) observation:  8


# Shared parameters

In [3]:
# Episode budget for one training run; handed to the agent's train() call as total_episodes.
EPISODES = 5_000

# Initialization of A2C

In [4]:
# --- Run identity and rollout settings ---
A2C_NAME = "test1"
A2C_ID = 1
LEN_WORKERS = 20  # number of workers, each wrapping its own environment
ENV_STEPS = 15    # handed to train() as `steps`

# --- Problem dimensions, measured on the shared LunarLander-v2 environment ---
lunar_lander_env = envs[ll_discrete_name]
N_ACTIONS = lunar_lander_env.action_space.n
STATE_DIM = len(lunar_lander_env.reset())

# --- Network and agent ---
a2c_network = Network1(state_dim=STATE_DIM, actions_count=N_ACTIONS)
a2c_agent = AgentA2C(model_name=A2C_NAME, id=A2C_ID, model=a2c_network)


def _spawn_worker(worker_id):
    """Build one Worker around a fresh environment seeded with the worker's id."""
    worker_env = gym.make(ll_discrete_name)
    worker_env.seed(worker_id)
    return Worker(worker_id, worker_env, a2c_agent, print_score=False)


# --- Workers: ids 0 .. LEN_WORKERS - 1, created in ascending order ---
workers = [_spawn_worker(worker_id) for worker_id in range(LEN_WORKERS)]

# --- Load the saved model and progress (for runs that were trained with interruptions) ---
a2c_agent.load_model()
a2c_agent.load_progress()


Device:  cuda:0


# Training

In [5]:
# Train the agent with every worker, then persist the progress and the model.
training_options = {
    "workers": workers,
    "total_episodes": EPISODES,
    "steps": ENV_STEPS,
}
a2c_agent.train(**training_options)

# Save order is kept as-is: progress first, model second.
a2c_agent.save_progress()
a2c_agent.save_model()

In [6]:
# (Re)load the TensorBoard notebook extension, then open TensorBoard on the
# `runs` log directory, bound to localhost on port 8888.
# NOTE(review): 8888 is also the port a Jupyter server commonly listens on —
# confirm the two do not clash in your setup.
%reload_ext tensorboard
%tensorboard --logdir runs --host localhost --port 8888

Reusing TensorBoard on port 8888 (pid 13188), started 1:36:05 ago. (Use '!kill 13188' to kill it.)

# Evaluation

In [7]:
# Watch the agent act in the shared LunarLander-v2 environment until Tab is pressed.
eval_env = envs[ll_discrete_name]
obs = eval_env.reset()
try:
    while True:
        eval_env.render()

        # act() returns a pair; only the first element (the actor output) is used here.
        # Softmax is applied to it below, so it is presumably unnormalised scores
        # rather than probabilities — TODO confirm against AgentA2C.act.
        actor_prob, _ = a2c_agent.act(obs)
        step_probs = F.softmax(actor_prob, dim=-1)

        # Sample one action from the resulting distribution (stochastic policy, not argmax)
        step_actions = step_probs.multinomial(num_samples=1)
        obs, _, term, _ = eval_env.step(step_actions.item())

        # Start a new episode as soon as the current one terminates
        if term:
            obs = eval_env.reset()

        # The key is polled once per step, after the step has been taken
        if keyboard.is_pressed('Tab'):
            break
finally:
    # Close the environment on every exit path: previously close() was only
    # reached through the Tab-key break, so an exception or a kernel interrupt
    # left the environment (and whatever render() had opened) dangling.
    eval_env.close()