In [None]:
!pip install gym[box2d] -q
!mkdir models

In [None]:
import numpy as np
import os
import sys
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

import matplotlib.pyplot as plt
plt.switch_backend('agg')

#import ipdb

# Pick the tensor constructors once, based on whether CUDA is available.
use_cuda = torch.cuda.is_available()
#use_cuda = False
print("use_cuda : ", use_cuda)
if use_cuda:
    # GPU-backed legacy tensor types.
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    # CPU tensor types.
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor
# Default tensor alias used by the rest of the file.
Tensor = FloatTensor

import gym
# Create the LunarLander-v2 environment; this module-level `env` is what the
# agent at the bottom of the file is constructed with.
env = gym.make('LunarLander-v2')
# Reset once up front; the returned observation is not used here.
env.reset()

class Actor(nn.Module):
    """Policy network: three 16-unit ReLU layers and a log-softmax output head."""

    def __init__(self, state_size, num_actions):
        super().__init__()
        width = 16
        # The attribute names are also the state_dict keys, so they are kept
        # as fc1..fc4 and created in that order.
        self.fc1 = nn.Linear(state_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, num_actions)

    def forward(self, x):
        """Return log-probabilities over the actions, normalised on the last dim."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        scores = self.fc4(x)
        return F.log_softmax(scores, dim=-1)

class Critic(nn.Module):
    """State-value network: 16-unit and 8-unit ReLU layers with dropout between.

    The layers are named ``fc1``, ``fc2`` and ``fc4``; there is no ``fc3``.
    These names are also the state_dict keys.
    """

    def __init__(self, state_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 16)
        self.dp1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(16, 8)
        self.fc4 = nn.Linear(8, 1)

    def forward(self, x):
        """Return the scalar value estimate with a trailing dimension of size 1."""
        # Dropout is applied to the first hidden activation only, and is
        # active only while the module is in training mode.
        hidden = self.dp1(F.relu(self.fc1(x)))
        hidden = F.relu(self.fc2(hidden))
        return self.fc4(hidden)

class A2CAgent(object):
    """N-step advantage actor-critic agent that learns from whole episodes.

    One episode is rolled out with the current policy, then a single
    gradient step is taken on the actor and on the critic using n-step
    bootstrapped return targets.
    """

    # Raw environment rewards are multiplied by this factor before they are
    # accumulated into the return targets.
    REWARD_SCALE = 1e-2

    def __init__(self, env):
        """Build the networks, optimizers and bookkeeping lists for `env`.

        Args:
            env: environment exposing `observation_space.shape`,
                `action_space.n`, `reset()` and a 4-tuple `step(action)`.
        """
        super(A2CAgent, self).__init__()
        self.env = env
        # Follow the module-level `use_cuda` switch instead of calling
        # `.cuda()` unconditionally, which raises on CPU-only machines.
        self.device = torch.device("cuda" if use_cuda else "cpu")
        state_size = env.observation_space.shape[0]
        self.actor = Actor(state_size, env.action_space.n).to(self.device)
        self.critic = Critic(state_size).to(self.device)
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=5e-4)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=5e-4)
        self.N_steps = 100
        self.num_episodes = 40000
        self.test_episodes = 100
        self.gamma = 0.99
        self.save_path = "models"
        self.test_freq = 500
        self.save_freq = 2000
        self.train_rewards = []
        self.test_rewards = []
        self.train_steps = []
        self.test_steps = []
        self.losses_actor = []
        self.losses_critic = []

    def select_action(self, state):
        """Sample an action for `state` from the current policy.

        Args:
            state: observation as an array-like of floats.

        Returns:
            Tuple `(action, log_prob, value)`: the sampled action as a Python
            int, the log-probability of that action (0-dim tensor), and the
            critic output for the state (unchanged shape).
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        log_probs = self.actor(state)
        value = self.critic(state)
        # The actor already outputs log-probabilities, so hand them over as
        # logits rather than exponentiating them first.
        action = Categorical(logits=log_probs).sample()
        return int(action.item()), log_probs[action], value

    def play_episode(self, e):
        """Roll out one episode with the current policy.

        Args:
            e: episode index (not used by the rollout itself).

        Returns:
            Tuple `(steps, rewards, log_probs, values)` where `rewards` is a
            list of raw rewards and `log_probs` / `values` are the per-step
            tensors stacked along a new leading dimension.
        """
        state = self.env.reset()
        rewards = []
        log_probs = []
        values = []
        while True:
            action, log_prob, value = self.select_action(state)
            state, reward, is_terminal, _ = self.env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            values.append(value)
            if is_terminal:
                break
        return len(rewards), rewards, torch.stack(log_probs), torch.stack(values)

    def _n_step_returns(self, rewards, values):
        """Compute scaled n-step bootstrapped returns for one episode.

        Args:
            rewards: sequence of raw per-step rewards (length T).
            values: 1-D array of critic estimates, one per step.

        Returns:
            float32 NumPy array of length T. Entry t is the discounted sum of
            up to `N_steps` scaled rewards starting at t, plus
            `gamma**N_steps * values[t + N_steps]` when that step exists in
            the episode (otherwise the bootstrap term is 0).
        """
        T = len(rewards)
        N = self.N_steps
        scaled = np.asarray(rewards, dtype=np.float64) * self.REWARD_SCALE
        discounts = self.gamma ** np.arange(N)
        returns = np.zeros(T, dtype=np.float32)
        for t in range(T):
            horizon = min(N, T - t)
            bootstrap = values[t + N] if t + N < T else 0.0
            returns[t] = (self.gamma ** N * bootstrap
                          + np.dot(discounts[:horizon], scaled[t:t + horizon]))
        return returns

    def optimize(self, rewards, log_probs, values):
        """Take one gradient step on the actor and the critic.

        Args:
            rewards: list of raw rewards for the episode (length T).
            log_probs: tensor with T log-probabilities of the taken actions.
            values: tensor with T critic estimates; a trailing singleton
                dimension is accepted.
        """
        # play_episode stacks per-step critic outputs, which leaves `values`
        # with a trailing singleton dimension. Subtracting that from a (T,)
        # target would broadcast to (T, T) and average over every
        # return/value pair, so flatten everything to (T,) first.
        log_probs = log_probs.reshape(-1)
        values = values.reshape(-1)

        # Targets are constants: computed from detached values on the CPU.
        returns = self._n_step_returns(rewards, values.detach().cpu().numpy())
        R = torch.as_tensor(returns, dtype=values.dtype, device=values.device)

        # `values` is detached for the actor so that the policy loss does not
        # push gradients into the critic.
        advantage = R - values.detach()
        loss_actor = (advantage * -log_probs).mean()
        loss_critic = ((R - values) ** 2).mean()

        self.optimizer_actor.zero_grad()
        self.optimizer_critic.zero_grad()
        loss_actor.backward()
        loss_critic.backward()
        self.optimizer_actor.step()
        self.optimizer_critic.step()

        # Record as floats; casting to int would truncate the fraction.
        self.losses_actor.append(float(loss_actor.item()))
        self.losses_critic.append(float(loss_critic.item()))

    def train(self, num_episodes):
        """Train for `num_episodes` episodes with periodic testing and saving.

        Every `test_freq` episodes the policy is evaluated with `test`, and
        every `save_freq` episodes both state dicts are written under
        `save_path`.
        """
        print("Going to be training for a total of {} episodes".format(num_episodes))
        for e in range(num_episodes):
            steps, rewards, log_probs, values = self.play_episode(e)
            episode_reward = sum(rewards)
            self.train_rewards.append(episode_reward)
            self.train_steps.append(steps)
            self.optimize(rewards, log_probs, values)

            if (e+1) % 100 == 0:
                print("Episode: {}, reward: {}, steps: {}".format(e+1, episode_reward, steps))

            # Evaluate the current policy without updating it
            if (e+1) % self.test_freq == 0:
                print("-"*10 + " testing now " + "-"*10)
                self.test(self.test_episodes, e)

            # Save the current policy model
            if (e+1) % self.save_freq == 0:
                # Make sure the target directory exists before writing.
                os.makedirs(self.save_path, exist_ok=True)
                torch.save(self.actor.state_dict(),  os.path.join(self.save_path, "train_actor_ep_{}.pth".format(e+1)))
                torch.save(self.critic.state_dict(), os.path.join(self.save_path, "train_critic_ep_{}.pth".format(e+1)))

    def test(self, num_episodes, e_train):
        """Play `num_episodes` episodes without learning and report the mean reward.

        Args:
            num_episodes: number of evaluation episodes to play.
            e_train: index of the training episode that triggered the test
                (not used by the evaluation itself).
        """
        testing_rewards = []
        testing_steps = []
        # No optimisation happens here, so skip building autograd graphs.
        with torch.no_grad():
            for e in range(num_episodes):
                steps, rewards, _, _ = self.play_episode(e)
                episode_reward = sum(rewards)
                self.test_rewards.append(episode_reward)
                self.test_steps.append(steps)
                testing_rewards.append(episode_reward)
                testing_steps.append(steps)
        mean_reward = np.mean(testing_rewards)
        print("Mean reward achieved : {} ".format(mean_reward))
        print("-"*50)
        if mean_reward >= 200:
            print("-"*10 + " Solved! " + "-"*10)
            print("Mean reward achieved : {} in {} steps".format(mean_reward, np.mean(testing_steps)))
            print("-"*50)


# Build the agent around the module-level environment.
agent = A2CAgent(env)


agent.actor.train()  # make sure the networks are in training mode
agent.critic.train()

# Start training for 40000 episodes.
agent.train(40000)