# **BipedalWalker - Continuous Advantage Actor-Critic**

An implementation of Advantage Actor-Critic (A2C) with a continuous action space for the BipedalWalker-v3 environment.



## Initialization

In [None]:
%matplotlib inline
from IPython import display
import matplotlib.pyplot as plt

In [None]:
from sim.GymTrainer import GymTrainer
from framework.A2C import ContinuousAdvantageActorCriticAgent

import torch
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    trainDevice = torch.device("cuda")
else:
    trainDevice = torch.device("cpu")

In [None]:
# Build the BipedalWalker-v3 trainer. Evaluation is pinned to the CPU while
# training uses `trainDevice`. The remaining keyword arguments are passed
# straight to GymTrainer; see its definition for their exact meaning.
sim = GymTrainer(
    "BipedalWalker-v3",
    evalDevice="cpu",
    trainDevice=trainDevice,
    render_mode="rgb_array",
    envNum=8,
    maxEpisode=140000,
    batchSize=16,
    maxStep=160000,
    # stepLimitPenalty=0.0,  # optional setting, currently left disabled
    seed=543,
)

In [None]:
# Build the continuous-action A2C agent, sized from the action and state
# dimensions reported by the simulation.
agent = ContinuousAdvantageActorCriticAgent(
    actionNum=sim.actionSize(),
    stateNum=sim.stateSize(),
    gamma=0.99,
    policyLR=0.003,
    criticLR=0.01,
)

## Training

In [None]:
import pickle
from torch.distributions import Normal

# --- Imitation pre-training of the policy on a recorded expert demo ---------

# Load the demonstration; the context manager closes the file handle promptly.
# NOTE(security): unpickling can execute arbitrary code, so only load
# demonstration files that come from a trusted source.
with open("bipedalwalker_heuristic.pkl", "rb") as demo_file:
    expert_demo = pickle.load(demo_file)
print(f"Expert demo length: {len(expert_demo['states'])}")

# Move the demonstration arrays onto the training device.
steps = torch.from_numpy(expert_demo['steps']).to(device=trainDevice)
states = torch.from_numpy(expert_demo['states']).to(device=trainDevice)
actions = torch.from_numpy(expert_demo['actions']).to(device=trainDevice)

# Append four periodic features of the step index (sine and cosine at two
# time scales) to every state.
# NOTE(review): the policy input must be wide enough for these 4 extra
# features -- confirm that sim.stateSize() already accounts for them.
wave = torch.stack(
    [
        torch.sin(steps / 10.0),
        torch.sin(steps / 100.0),
        torch.cos(steps / 10.0),
        torch.cos(steps / 100.0),
    ],
    dim=-1,
).to(torch.float32)
states = torch.cat([states, wave], dim=-1)

# Only the policy parameters are optimized in this phase.
optimizer = torch.optim.Adam(agent.policy.parameters(), lr=0.0001)
mse_loss = torch.nn.MSELoss()  # defined but not used by the loop below

# Shuffled mini-batches of (state, expert action) pairs.
dataset = torch.utils.data.TensorDataset(states, actions)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

for epoch in range(100):
    for i, (batch_states, batch_actions) in enumerate(dataloader):
        pred_actions = agent.policy.forward(batch_states.float())

        # The policy output is split into per-action means (first
        # `actionNum` columns) and standard deviations (remaining columns).
        # NOTE(review): Normal needs a strictly positive scale; presumably
        # the policy head guarantees this -- confirm.
        actionMean = pred_actions[:, :agent.actionNum]
        actionStd = pred_actions[:, agent.actionNum:]
        actionDist = Normal(actionMean, actionStd)

        # Negative log-likelihood of the expert actions, plus a small
        # penalty on the mean predicted standard deviation.
        loss = -actionDist.log_prob(batch_actions).mean() + 0.1 * actionStd.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch:{epoch} \t batch:{i} \t Loss: {loss.item():.4f}")

In [None]:
# Create the summary writer (and the run name later used when saving the
# agent), then run training.
name, writer = sim.makeSummaryWriter(agent)
sim.train(agent, writer)

# NOTE: disabled experiment, kept for reference only. It repeated the
# imitation update for 5 epochs and reported the averaged loss. It refers to
# `stage`, which is not defined in this cell, so it cannot be re-enabled
# as-is.
# lossImitation = 0.0
# for epoch in range(5):
#     for batch_states, batch_actions in dataloader:
#         optimizer.zero_grad()
#         pred_actions = agent.policy.forward(batch_states.float())
#
#         actionMean = pred_actions[:, :agent.actionNum]
#         actionStd = pred_actions[:, agent.actionNum:]
#         actionDist = Normal(actionMean, actionStd)
#         loss = -actionDist.log_prob(batch_actions).mean() + 0.1 * actionStd.mean()
#
#         loss.backward()
#         optimizer.step()
#         lossImitation += loss.item()
# print(f"Batch:{stage.totalBatch} \t Episode:{stage.totalEpisode} \t Loss: {lossImitation / 5.0:.4f}")

## Testing

In [None]:
# Run the evaluation episodes, passing the same writer created for training.
sim.test(agent=agent, episode=12, maxStep=1000, renderStep=3, writer=writer)

## Save

In [None]:
# Save the agent under the run name returned by sim.makeSummaryWriter().
agent.save(name)