In [1]:
import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import numpy as np

In [2]:
from parameters import Tank_params, Disturbance_params, RL_params

In [3]:
from tankEnv import Tank, TankEnv
from utils import unpack_batch,RewardTracker,TBMeanTracker,ExperienceSourceFirstLast
from agent import Agent, A2CNet

In [4]:
# Pool of six separate Tank instances, all built from the same parameter
# dictionaries. The environments created later pick tanks from this pool.
tank_0, tank_1, tank_2, tank_3, tank_4, tank_5 = (
    Tank(Tank_params, Disturbance_params) for _ in range(6)
)


# one tank case

In [5]:
# Number of tanks in the single-tank scenario.
# NOTE(review): no cell visible in this notebook reads n_tanks — confirm it is still needed.
n_tanks = 1

In [6]:
# Single-tank environment: wraps only tank_0; the level limits are read
# from the shared Tank_params dictionary.
env = TankEnv([tank_0], Tank_params["max_lvl"], Tank_params["min_lvl"])

In [7]:
# Reset the environment and keep the returned observation; its length is
# used to size the network input.
obs = env.reset()

In [8]:
# Network dimensions for the one-tank case: the input width follows the
# observation length, the hidden width is a fixed hyper-parameter.
input_size = len(obs)
n_hidden_size = 12

In [9]:
# Build the actor-critic network, then move it to the configured device.
net = A2CNet(input_size, n_hidden_size)
net = net.to(RL_params["device"])
print(net)

A2CNet(
  (policy): Sequential(
    (0): Linear(in_features=4, out_features=12, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=12, out_features=12, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=12, out_features=2, bias=True)
  )
  (value): Sequential(
    (0): Linear(in_features=4, out_features=12, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=12, out_features=12, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=12, out_features=1, bias=True)
  )
)


In [10]:
# Agent that wraps the one-tank network; it is called with observations
# to obtain actions in the evaluation loop.
agent = Agent(net, RL_params["device"])

In [11]:
# Iterable experience source that drives `env` with `agent`; the training
# loop iterates over it and polls it for episode rewards.
exp_source = ExperienceSourceFirstLast(env, agent, RL_params)

In [12]:
# Adam over all network parameters (policy and value heads alike).
# eps is set to 1e-3 instead of the PyTorch default.
optimizer = torch.optim.Adam(
    net.parameters(),
    lr=RL_params["learning_rate"],
    eps=1e-3,
)

In [13]:
# TensorBoard writer for the one-tank run; `comment` is appended to the
# default log directory name so the run can be identified.
writer = SummaryWriter(comment="1 tanks process")

In [14]:
# A2C training loop for the one-tank environment.
#
# Experiences are pulled from `exp_source` one at a time and accumulated in
# `batch`; once the batch holds RL_params["batch_size"] items a single
# optimisation step is performed on `net` and the batch is emptied.
# The loop ends when `tracker.reward(...)` returns a truthy value or when an
# experience reports `epi_step > 1000`.
#
# NOTE(review): the six-tank section of this notebook repeats this loop
# verbatim with net6/optimizer6/exp_source6/writer6 — a candidate for a
# shared function taking those objects and `stop_reward` as parameters.
batch = []

with RewardTracker(writer, stop_reward=100) as tracker:
    with TBMeanTracker(writer, batch_size=RL_params["batch_size"]) as tb_tracker:
        for step_idx, exp in enumerate(exp_source):
            batch.append(exp)

            # Presumably the total rewards of episodes that finished since
            # the previous call — TODO confirm against utils.
            new_rewards = exp_source.pop_total_rewards()
            
            if new_rewards:
                # Only the first popped reward is reported to the tracker;
                # a truthy return value stops training.
                if tracker.reward(new_rewards[0], step_idx):
                    break
            # Second stop criterion: the experience reports an episode step
            # count above the hard-coded limit of 1000.
            if exp.epi_step > 1000:
                print("Solved ! @ epi_step",exp.epi_step)
                break

            # Keep collecting until a full batch is available.
            if len(batch) < RL_params["batch_size"]:
                continue

            # vals_ref_v is used below both as the regression target of the
            # value head and as the return estimate in the advantage.
            states_v, actions_t, vals_ref_v = unpack_batch(batch, net, RL_params)
            batch.clear()
            optimizer.zero_grad()
            logits_v, value_v = net(states_v)
            # Critic loss: MSE between the value head output and the targets.
            loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

            log_prob_v = F.log_softmax(logits_v, dim=1)

            # Advantage; the value estimate is detached so the policy loss
            # does not send gradients through the value output.
            adv_v = vals_ref_v - value_v.squeeze(-1).detach()

            # Boolean-mask indexing picks the log-probabilities where
            # actions_t is non-zero; the result is reshaped to one row per
            # batch item and weighted by that item's advantage.
            # NOTE(review): assumes actions_t is a 0/1 mask shaped like
            # logits_v with the same number of selected entries per row,
            # otherwise the .view() would fail — confirm.
            log_prob_actions_v = adv_v[:, None] * log_prob_v[
                actions_t.to(torch.bool)
            ].view(RL_params["batch_size"], -1)

            loss_policy_v = -log_prob_actions_v.mean()

            # sum(p * log p) is the negative entropy of the softmax
            # distribution, so minimising this term (for a positive
            # entropy_beta) favours higher-entropy policies.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_loss_v = (
                RL_params["entropy_beta"] * (prob_v * log_prob_v).sum(dim=1).mean()
            )
            # calculate policy gradients only
            # retain_graph=True keeps the graph alive for the second
            # backward() below, which shares logits_v with this loss.
            loss_policy_v.backward(retain_graph=True)
            # Snapshot of the gradients produced by the policy loss alone
            # (gradients were zeroed above); used only for the grad_*
            # statistics logged at the end of the step.
            grads = np.concatenate(
                [
                    p.grad.data.cpu().numpy().flatten()
                    for p in net.parameters()
                    if p.grad is not None
                ]
            )

            # apply entropy and value gradients
            # This second backward() accumulates onto the policy gradients
            # already stored in .grad, so the optimizer step uses all three
            # loss terms.
            loss_v = entropy_loss_v + loss_value_v
            loss_v.backward()
            nn_utils.clip_grad_norm_(net.parameters(), RL_params["clip_grad"])
            optimizer.step()
            # get full loss
            # Done after optimizer.step(): the sum is only used for logging.
            loss_v += loss_policy_v

            tb_tracker.track("exp_step", exp.epi_step, step_idx)

            tb_tracker.track("advantage", adv_v, step_idx)
            tb_tracker.track("values", value_v, step_idx)
            tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
            tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
            tb_tracker.track("loss_policy", loss_policy_v, step_idx)
            tb_tracker.track("loss_value", loss_value_v, step_idx)
            tb_tracker.track("loss_total", loss_v, step_idx)
            # Root-mean-square, max-abs and variance of the policy-only
            # gradient snapshot taken above.
            tb_tracker.track("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
            tb_tracker.track("grad_max", np.max(np.abs(grads)), step_idx)
            tb_tracker.track("grad_var", np.var(grads), step_idx)

 done 500 games, mean reward 68.710 
 done 1000 games, mean reward 67.290 
 done 1500 games, mean reward 68.440 
 done 2000 games, mean reward 67.400 
Solved !


In [15]:
# Roll out the trained one-tank agent and report the accumulated reward.
# The rollout spans four episodes ("four lives"): it stops once the
# environment has signalled termination for the fourth time. The first
# component of every observation fed to the agent is logged in `lvl`.
lvl = []
done = 0
cur_steps = 0
total_steps = 0
total_reward = 0.0
agent_states = None
obs = env.reset()
while done <= 3:  # four lives
    obs = torch.FloatTensor([obs])
    lvl.append(obs[0][0].item())

    actions, agent_states = agent(obs, agent_states)
    # cur_steps keeps counting across episode resets.
    obs, reward, is_done, info = env.step(actions, cur_steps)
    cur_steps += 1
    total_reward += sum(reward)
    total_steps += 1
    if is_done:
        done += 1
        obs = env.reset()


print("In %d steps we got %.3f reward" % (total_steps, total_reward))

  states = torch.tensor(states, dtype=torch.float32).to(self.device)


In 327 steps we got 319.000 reward


# six tanks case

In [23]:
# Six-tank environment built from the whole tank pool. Note that tank_0 is
# the same object that the one-tank `env` wraps.
env6 = TankEnv(
    [tank_0, tank_1, tank_2, tank_3, tank_4, tank_5],
    Tank_params["max_lvl"],
    Tank_params["min_lvl"],
)

In [24]:
# Six-tank network. This rebinds `obs`, `input_size` and `n_hidden_size`,
# which the one-tank cells also use, so re-running those cells afterwards
# would pick up the six-tank values.
obs = env6.reset()
input_size = len(obs)
n_hidden_size = 16
net6 = A2CNet(input_size, n_hidden_size)
net6 = net6.to(RL_params["device"])
print(net6)

A2CNet(
  (policy): Sequential(
    (0): Linear(in_features=24, out_features=16, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=16, out_features=12, bias=True)
  )
  (value): Sequential(
    (0): Linear(in_features=24, out_features=16, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)


In [25]:
# Agent that wraps the six-tank network.
agent6 = Agent(net6, RL_params["device"])

In [26]:
# Iterable experience source that drives `env6` with `agent6`.
exp_source6 = ExperienceSourceFirstLast(env6, agent6, RL_params)

In [27]:
# Adam over all parameters of the six-tank network, with the same
# learning rate and eps as the one-tank optimizer.
optimizer6 = torch.optim.Adam(
    net6.parameters(),
    lr=RL_params["learning_rate"],
    eps=1e-3,
)

In [28]:
# TensorBoard writer for the six-tank run. `comment` is appended to the
# default log directory name; it previously read "1 tanks process"
# (copied from the one-tank cell), which made the two runs
# indistinguishable in TensorBoard.
writer6 = SummaryWriter(comment="6 tanks process")

In [29]:
# A2C training loop for the six-tank environment.
#
# Same procedure as the one-tank loop earlier in this notebook, but driven
# by exp_source6 / net6 / optimizer6 / writer6 and with stop_reward=56.
# Experiences are accumulated in `batch`; each time it holds
# RL_params["batch_size"] items one optimisation step is performed.
# The loop ends when `tracker.reward(...)` returns a truthy value or when an
# experience reports `epi_step > 1000`.
#
# NOTE(review): this is a verbatim copy of the one-tank loop with renamed
# objects — a candidate for a shared function.
batch = []

with RewardTracker(writer6, stop_reward=56) as tracker:
    with TBMeanTracker(writer6, batch_size=RL_params["batch_size"]) as tb_tracker:
        for step_idx, exp in enumerate(exp_source6):
            batch.append(exp)

            # Presumably the total rewards of episodes that finished since
            # the previous call — TODO confirm against utils.
            new_rewards = exp_source6.pop_total_rewards()
            
            if new_rewards:
                # Only the first popped reward is reported to the tracker;
                # a truthy return value stops training.
                if tracker.reward(new_rewards[0], step_idx):
                    break
            # Second stop criterion: the experience reports an episode step
            # count above the hard-coded limit of 1000.
            if exp.epi_step > 1000:
                print("Solved ! @ epi_step",exp.epi_step)
                break

            # Keep collecting until a full batch is available.
            if len(batch) < RL_params["batch_size"]:
                continue

            # vals_ref_v is used below both as the regression target of the
            # value head and as the return estimate in the advantage.
            states_v, actions_t, vals_ref_v = unpack_batch(batch, net6, RL_params)
            batch.clear()
            optimizer6.zero_grad()
            logits_v, value_v = net6(states_v)
            # Critic loss: MSE between the value head output and the targets.
            loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)

            # The softmax is taken over all logits of a row at once.
            # NOTE(review): with several tanks this normalises across the
            # tanks' action logits jointly — confirm this matches how the
            # agent samples actions.
            log_prob_v = F.log_softmax(logits_v, dim=1)

            # Advantage; the value estimate is detached so the policy loss
            # does not send gradients through the value output.
            adv_v = vals_ref_v - value_v.squeeze(-1).detach()

            # Boolean-mask indexing picks the log-probabilities where
            # actions_t is non-zero; the result is reshaped to one row per
            # batch item and weighted by that item's advantage.
            # NOTE(review): assumes actions_t is a 0/1 mask shaped like
            # logits_v with the same number of selected entries per row,
            # otherwise the .view() would fail — confirm.
            log_prob_actions_v = adv_v[:, None] * log_prob_v[
                actions_t.to(torch.bool)
            ].view(RL_params["batch_size"], -1)

            loss_policy_v = -log_prob_actions_v.mean()

            # sum(p * log p) is the negative entropy of the softmax
            # distribution, so minimising this term (for a positive
            # entropy_beta) favours higher-entropy policies.
            prob_v = F.softmax(logits_v, dim=1)
            entropy_loss_v = (
                RL_params["entropy_beta"] * (prob_v * log_prob_v).sum(dim=1).mean()
            )
            # calculate policy gradients only
            # retain_graph=True keeps the graph alive for the second
            # backward() below, which shares logits_v with this loss.
            loss_policy_v.backward(retain_graph=True)
            # Snapshot of the gradients produced by the policy loss alone
            # (gradients were zeroed above); used only for the grad_*
            # statistics logged at the end of the step.
            grads = np.concatenate(
                [
                    p.grad.data.cpu().numpy().flatten()
                    for p in net6.parameters()
                    if p.grad is not None
                ]
            )

            # apply entropy and value gradients
            # This second backward() accumulates onto the policy gradients
            # already stored in .grad, so the optimizer step uses all three
            # loss terms.
            loss_v = entropy_loss_v + loss_value_v
            loss_v.backward()
            nn_utils.clip_grad_norm_(net6.parameters(), RL_params["clip_grad"])
            optimizer6.step()
            # get full loss
            # Done after optimizer6.step(): the sum is only used for logging.
            loss_v += loss_policy_v

            tb_tracker.track("exp_step", exp.epi_step, step_idx)

            tb_tracker.track("advantage", adv_v, step_idx)
            tb_tracker.track("values", value_v, step_idx)
            tb_tracker.track("batch_rewards", vals_ref_v, step_idx)
            tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
            tb_tracker.track("loss_policy", loss_policy_v, step_idx)
            tb_tracker.track("loss_value", loss_value_v, step_idx)
            tb_tracker.track("loss_total", loss_v, step_idx)
            # Root-mean-square, max-abs and variance of the policy-only
            # gradient snapshot taken above.
            tb_tracker.track("grad_l2", np.sqrt(np.mean(np.square(grads))), step_idx)
            tb_tracker.track("grad_max", np.max(np.abs(grads)), step_idx)
            tb_tracker.track("grad_var", np.var(grads), step_idx)

 done 500 games, mean reward 42.663 
 done 1000 games, mean reward 42.050 
 done 1500 games, mean reward 44.980 
 done 2000 games, mean reward 42.925 
 done 2500 games, mean reward 43.380 
 done 3000 games, mean reward 43.838 
 done 3500 games, mean reward 43.887 
 done 4000 games, mean reward 43.752 
 done 4500 games, mean reward 43.177 
 done 5000 games, mean reward 43.612 
 done 5500 games, mean reward 42.987 
 done 6000 games, mean reward 44.295 
 done 6500 games, mean reward 45.385 
 done 7000 games, mean reward 44.652 
 done 7500 games, mean reward 43.693 
 done 8000 games, mean reward 44.692 
 done 8500 games, mean reward 44.832 
 done 9000 games, mean reward 45.002 
 done 9500 games, mean reward 46.598 
 done 10000 games, mean reward 44.102 
 done 10500 games, mean reward 48.135 
 done 11000 games, mean reward 45.673 
 done 11500 games, mean reward 47.233 
 done 12000 games, mean reward 45.780 
 done 12500 games, mean reward 47.665 
 done 13000 games, mean reward 44.808 
 done 

In [30]:
# Roll out the trained six-tank agent and report the accumulated reward.
# The rollout spans four episodes ("four lives"): it stops once the
# environment has signalled termination for the fourth time. The first
# component of every observation fed to the agent is logged in `lvl`.
lvl = []
done = 0
cur_steps = 0
total_steps = 0
total_reward = 0.0
agent_states = None
obs = env6.reset()
while done <= 3:  # four lives
    obs = torch.FloatTensor([obs])
    lvl.append(obs[0][0].item())

    actions, agent_states = agent6(obs, agent_states)
    # cur_steps keeps counting across episode resets.
    obs, reward, is_done, info = env6.step(actions, cur_steps)
    cur_steps += 1
    total_reward += sum(reward)
    total_steps += 1
    if is_done:
        done += 1
        obs = env6.reset()


print("In %d steps we got %.3f reward" % (total_steps, total_reward))

In 35 steps we got 205.333 reward
