In [5]:
import numpy as np
import argparse
import torch
from copy import deepcopy

from option_critic import OptionCriticFeatures, OptionCriticConv
from option_critic import critic_loss as critic_loss_fn
from option_critic import actor_loss as actor_loss_fn

from experience_replay import ReplayBuffer
from utils import to_tensor
from logger import Logger


%run oc_agent
%run ../game
%run ../preprocessor

import time

def _str2bool(value):
    """Convert a command-line token to a bool for use as an argparse ``type``.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off from the command line.

    Args:
        value: The raw token (or an already-converted bool).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean
            spelling; argparse turns this into a normal usage error.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays False, exactly as bool('') was before.
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line definition of the option-critic hyper-parameters.
# NOTE(review): in the cells visible here nothing calls parser.parse_args();
# run() builds its own settings instead of reading this parser.
parser = argparse.ArgumentParser(description="Option Critic PyTorch")

# Environment and exploration.
parser.add_argument('--env', default='CartPole-v0', help='ROM to run')
parser.add_argument('--optimal-eps', type=float, default=0.05, help='Epsilon when playing optimally')
parser.add_argument('--frame-skip', default=4, type=int, help='Every how many frames to process')
parser.add_argument('--learning-rate',type=float, default=.0005, help='Learning rate')
parser.add_argument('--gamma', type=float, default=.99, help='Discount rate')
parser.add_argument('--epsilon-start',  type=float, default=1.0, help=('Starting value for epsilon.'))
parser.add_argument('--epsilon-min', type=float, default=.1, help='Minimum epsilon.')
parser.add_argument('--epsilon-decay', type=float, default=20000, help=('Number of steps to minimum epsilon.'))

# Replay buffer and optimisation schedule.
parser.add_argument('--max-history', type=int, default=10000, help=('Maximum number of steps stored in replay'))
parser.add_argument('--batch-size', type=int, default=32, help='Batch size.')
parser.add_argument('--freeze-interval', type=int, default=200, help=('Interval between target freezes.'))
parser.add_argument('--update-frequency', type=int, default=4, help=('Number of actions before each SGD update.'))

# Option-critic specific terms.
parser.add_argument('--termination-reg', type=float, default=0.01, help=('Regularization to decrease termination prob.'))
parser.add_argument('--entropy-reg', type=float, default=0.01, help=('Regularization to increase policy entropy.'))
parser.add_argument('--num-options', type=int, default=2, help=('Number of options to create.'))
parser.add_argument('--temp', type=float, default=1, help='Action distribution softmax tempurature param.')

# Run length, hardware, bookkeeping.
parser.add_argument('--max_steps_ep', type=int, default=18000, help='number of maximum steps per episode.')
parser.add_argument('--max_steps_total', type=int, default=int(4e6), help='number of maximum steps to take.')  # about 4 million
# Boolean switches go through _str2bool so that "--cuda False" really disables the flag.
parser.add_argument('--cuda', type=_str2bool, default=True, help='Enable CUDA training (recommended if possible).')
parser.add_argument('--seed', type=int, default=0, help='Random seed for numpy, torch, random.')
parser.add_argument('--logdir', type=str, default='runs', help='Directory for logging statistics')
parser.add_argument('--exp', type=str, default=None, help='optional experiment name')
parser.add_argument('--switch-goal', type=_str2bool, default=False, help='switch goal after 2k eps')

class Args:
    """Plain attribute holder that ``run`` instantiates and passes to ``agent.train``.

    The three values equal the defaults of the ``--gamma``, ``--termination-reg``
    and ``--entropy-reg`` options declared on ``parser``.
    NOTE(review): how ``train`` consumes them is not visible in this notebook.
    """

    # Regularisation weights (parser help: "increase policy entropy" /
    # "decrease termination prob.").
    entropy_reg = 0.01
    termination_reg = 0.01

    # Discount rate.
    gamma = 0.99

def run(level_path="/levels/custom/quick_victory_3.txt", num_episodes=50,
        game_visible=True, seed=0, args=None):
    """Build the game, the option-critic network and the agent, then train.

    Calling ``run()`` with no arguments uses exactly the settings that were
    previously hard-coded in the body; the keyword parameters only expose the
    values most likely to change between experiments.

    Args:
        level_path: Level file handed to ``Game``.
        num_episodes: Episode count handed to ``OCAgent``.
        game_visible: Flag handed to ``Game`` as its first argument.
        seed: Seed handed to ``OCAgent``.
        args: Object passed to ``agent.train``. When ``None`` a fresh
            ``Args()`` is used.

    Returns:
        None. Training happens as a side effect of ``agent.train``.
    """
    # ``Preprocessor``, ``Game`` and ``OCAgent`` are not imported by name;
    # presumably they are defined by the ``%run`` lines at the top of this cell.

    # --- Frame preprocessing: original frame size -> scaled frame size ---
    n_frames = 1
    n_channels = 3
    original_width = 256
    original_height = 240
    scaled_width = 84
    scaled_height = 84

    # --- Game settings (meaning of each value is defined by ``Game``) ---
    mario_scale = 2.0
    mario_state = 0
    mario_timer = 200
    mario_fps = 30
    # Note the (height, width) argument order for both the original and scaled sizes.
    preprocess = Preprocessor(n_frames, n_channels, original_height, original_width, scaled_height, scaled_width)
    game = Game(game_visible, mario_scale, mario_state, mario_timer, mario_fps, level_path, preprocess)

    # --- Network / exploration hyper-parameters ---
    # NOTE(review): in_features ignores n_frames; fine while n_frames == 1,
    # confirm against Preprocessor before stacking more frames.
    in_features = n_channels
    num_actions = len(game.all_actions)
    num_options = 2
    temp = 1
    eps_start = 1.0
    eps_min = 0.1
    eps_decay = 20000
    eps_test = 0.05
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # --- Training schedule ---
    max_history = 10000
    max_steps_total = int(4e6)
    max_steps_ep = 18000
    learning_rate = 0.0005
    batch_size = 32
    update_frequency = 4
    freeze_interval = 200

    option_critic = OptionCriticConv(
        in_features=in_features,
        num_actions=num_actions,
        num_options=num_options,
        temperature=temp,
        eps_start=eps_start,
        eps_min=eps_min,
        eps_decay=eps_decay,
        eps_test=eps_test,
        device=device
    )
    agent = OCAgent(
        env=game,
        option_critic=option_critic,
        num_episodes=num_episodes,
        max_history=max_history,
        max_steps_total=max_steps_total,
        max_steps_ep=max_steps_ep,
        num_options=num_options,
        seed=seed,
        learning_rate=learning_rate,
        batch_size=batch_size,
        update_frequency=update_frequency,
        freeze_interval=freeze_interval
    )
    # Fall back to the notebook's Args holder when the caller supplies nothing.
    if args is None:
        args = Args()
    agent.train(args)
    

# Build the game and agent and start training with run()'s built-in settings.
run()


torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 84, 84])
torch.Size([3, 8

In [23]:
import numpy as np
# Negative indices count from the end of the array, so -1 selects the last element.
a = np.array([1, 2, 3, 4])
a[-1]

4