In [1]:
import copy
import glob
import os
import time
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import algo
from arguments import get_args
from envs import make_vec_envs_ViZDoom, make_vec_envs
from model import Policy
from storage import RolloutStorage
from utils import get_vec_normalize
from visualize import visdom_plot

import os
import json

In [12]:
# Every artefact of this run (histories and JSON snapshots) lives in one folder.
result_dir = "../random_record/"
os.makedirs(result_dir, exist_ok=True)

# Derive the four artefact paths from their file names in a single pass.
reward_history, loss_history, parameter_save, env_parameter_save = (
    os.path.join(result_dir, file_name)
    for file_name in ("reward_history", "loss_history", "parameter.json", "env.json")
)

# Kept as a list so the cleanup cell can iterate over all record files.
fileL = [reward_history, loss_history, parameter_save, env_parameter_save]

In [3]:
# Remove record files left over from a previous run. The history files are
# opened in append mode during training, so a stale file that survives this
# step would end up mixing results from two different runs.
for stale_path in fileL:
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        # Nothing to clean up: the file was never written (e.g. first run).
        pass
    except OSError as err:
        # Stay best-effort, but make the failure visible instead of hiding it.
        print("warning: could not remove {}: {}".format(stale_path, err))

In [4]:
# Shared argument namespace for the whole notebook. get_args is imported from
# the project's `arguments` module; the preset functions below overwrite
# selected fields of this object in place.
args = get_args()

def ppo_hyper():
    """Apply the PPO preset to the module-level ``args`` namespace.

    Takes no arguments and returns nothing; the only effect is that the
    fields listed below are overwritten on ``args`` in place.
    """
    preset = {
        "algo": "ppo",
        "use_gae": False,
        "lr": 1e-5,
        "value_loss_coef": 1.0,
        "num_processes": 8,
        "num_steps": 512,
        "num_mini_batch": 4,
        "entropy_coef": 0.01,
        "gamma": 1.0,
        "ppo_epoch": 10,
        "clip_param": 0.1,
    }
    for field, setting in preset.items():
        setattr(args, field, setting)

def a2c_hyper():
    """Apply the A2C preset to the module-level ``args`` namespace.

    Takes no arguments and returns nothing. Only the fields listed below are
    overwritten; every other field of ``args`` keeps its current value.
    """
    preset = {
        "algo": "a2c",
        "gamma": 1.0,
        "num_steps": 512,
        "num_processes": 8,
        "entropy_coef": 0.0,
        "use_adam": True,
    }
    for field, setting in preset.items():
        setattr(args, field, setting)

In [5]:
# Select the A2C preset for this run, then echo the resulting namespace as the
# cell output so the effective settings are visible in the notebook.
a2c_hyper()
args

Namespace(add_timestep=False, algo='a2c', alpha=0.99, clip_param=0.2, cuda=False, entropy_coef=0.0, env_name='PongNoFrameskip-v4', eps=1e-05, eval_interval=None, gamma=1.0, log_dir='/tmp/gym/', log_interval=10, lr=0.0007, max_grad_norm=0.5, no_cuda=False, num_frames=10000000.0, num_mini_batch=32, num_processes=8, num_steps=512, port=8097, ppo_epoch=4, recurrent_policy=False, save_dir='./trained_models/', save_interval=100, seed=1, tau=0.95, use_adam=True, use_gae=False, value_loss_coef=0.5, vis=False, vis_interval=100)

In [6]:
# Snapshot of the settings that get written to the parameter JSON file.
# Key order is kept stable because it determines the order in the saved file.
parameters = {
    key: getattr(args, key)
    for key in (
        "algo",
        "gamma",
        "num_steps",
        "num_processes",
        "value_loss_coef",
        "eps",
        "entropy_coef",
        "lr",
        "use_gae",
        "max_grad_norm",
        "seed",
    )
}

# Settings that only apply to one of the two algorithms.
if args.algo == "a2c":
    parameters.update(alpha=args.alpha, use_adam=args.use_adam)
elif args.algo == "ppo":
    parameters.update(
        clip_param=args.clip_param,
        ppo_epoch=args.ppo_epoch,
        num_mini_batch=args.num_mini_batch,
    )

# tau is only recorded when GAE is switched on.
if args.use_gae:
    parameters.update(tau=args.tau)
    

In [9]:
# Environment options: forwarded as keyword arguments when the ViZDoom
# environments are built, and saved to the environment JSON file.
env_arg = dict(
    reward_scale=0.01,
    use_rgb=True,
    use_depth=False,
)

In [13]:
# Persist the run configuration alongside the history files. The context
# managers close each file as soon as it is written, so the JSON is fully
# flushed to disk before training starts and no handle is left open.
with open(parameter_save, "w") as parameter_file:
    json.dump(parameters, parameter_file)
with open(env_parameter_save, "w") as env_file:
    json.dump(env_arg, env_file)

In [8]:
# Each update consumes num_steps transitions from each of the num_processes
# environments, so the frame budget is divided by both to get the update count.
num_updates = int(args.num_frames) // args.num_steps // args.num_processes
# Seed PyTorch for reproducibility; the CUDA generator is only seeded when
# CUDA is in use.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [22]:
# Reject unsupported algorithms before any environment is created. Without
# this check every value other than 'a2c' would silently build a PPO agent,
# while the parameter snapshot only records PPO settings for algo == 'ppo'.
if args.algo not in ('a2c', 'ppo'):
    raise ValueError(
        "Unsupported algo {!r}: expected 'a2c' or 'ppo'".format(args.algo))

torch.set_num_threads(1)
device = torch.device("cuda:0" if args.cuda else "cpu")

# Vectorised ViZDoom environments, configured with the options in env_arg.
envs = make_vec_envs_ViZDoom(args.seed, args.num_processes, device, **env_arg)

# Policy network sized from the environments' observation and action spaces.
actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                      base_kwargs={'recurrent': args.recurrent_policy})
actor_critic.to(device)

if args.algo == 'a2c':
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                           args.entropy_coef, lr=args.lr,
                           eps=args.eps, alpha=args.alpha,
                           max_grad_norm=args.max_grad_norm,
                           use_adam=args.use_adam)
else:
    # Only 'ppo' can reach this branch thanks to the validation above.
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                     args.num_mini_batch, args.value_loss_coef,
                     args.entropy_coef, lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

# Buffer holding num_steps transitions for each of the num_processes environments.
rollouts = RolloutStorage(args.num_steps, args.num_processes,
                          envs.observation_space.shape, envs.action_space,
                          actor_critic.recurrent_hidden_state_size)

In [15]:
# Rolling windows over the most recently reported episodes; the training loop
# averages them for progress reporting.
recent_count = 50
episode_rewards, episode_lengths = (deque(maxlen=recent_count) for _ in range(2))

# Store the initial observation in slot 0 of the rollout buffer, then move the
# buffer to the training device.
obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

In [16]:
# Main training loop: collect a rollout, update the agent, then log progress.
for j in range(num_updates):
    # --- Rollout collection: num_steps transitions from every environment ---
    for step in range(args.num_steps):
        # Sample actions without tracking gradients.
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step],
                rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])

        # Observe reward and next observation.
        obs, reward, done, infos = envs.step(action)

        # Collect the episode statistics whenever an environment reports them.
        for info in infos:
            if 'Episode_Total_Reward' in info:
                episode_rewards.append(info['Episode_Total_Reward'])
            if 'Episode_Total_Len' in info:
                episode_lengths.append(info['Episode_Total_Len'])

        # Mask is 0.0 for environments whose episode just finished, 1.0 otherwise.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob,
                        value, reward, masks)

    # --- Agent update ---
    # Value estimate for the last stored observation, used to compute returns.
    with torch.no_grad():
        next_value = actor_critic.get_value(rollouts.obs[-1],
                                            rollouts.recurrent_hidden_states[-1],
                                            rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # --- Logging ---
    # Environment steps consumed so far across all processes (not update count).
    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    with open(loss_history, 'a') as the_file:
        the_file.write("{} {} {} {} \n".format(total_num_steps, value_loss, action_loss, dist_entropy))

    if len(episode_rewards) > 0:
        # Compute the running averages once and reuse them for both outputs.
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)

        # The counter is a step count, so it is labelled as steps.
        print("{} steps: avg reward = {}, avg length = {}".format(
            total_num_steps, avg_reward, avg_length))

        with open(reward_history, 'a') as the_file:
            the_file.write('{} {} {} \n'.format(total_num_steps, avg_reward, avg_length))
        
    

4096 updates: avg reward = -6.510000000000032, avg length = 349.0


KeyboardInterrupt: 

In [13]:
# Save only the network weights (the state dict) into the results folder.
MODEL_SAVE_PATH = os.path.join(result_dir, "model.save")
torch.save(obj=actor_critic.state_dict(), f=MODEL_SAVE_PATH)