In [3]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager, ScaleObsEnv
from deep_rl.network import *
from deep_rl.utils import *
from train import _exp_parser, get_visual_body, get_network, get_env_config, PickGridWorldTask
import os
import random
import argparse
import dill
import json
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer

def set_seed(s):
    """Seed the Python `random`, NumPy and PyTorch global RNGs with `s`.

    Args:
        s: seed value handed unchanged to each library's seeding function.
    """
    # Same order as before: stdlib first, then NumPy, then torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(s)

set_seed(0) # seed the random, numpy and torch RNGs with 0 before any environment or network is built

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html



You appear to be missing a License Key for mujoco.  We expected to find the
file here: /home/liyuc/.mujoco/mjkey.txt

You can get licenses at this page:

    https://www.roboti.us/license.html

If python tries to activate an invalid license, the process will exit.



# Try Fitted Q Iteration

In [23]:
# Module-level settings read by get_expert() when it rebuilds the expert network.
n_objs = 4  # get_expert passes 2 + n_objs as the first argument of TSAMiniConvBody (presumably the input channel count -- TODO confirm)
action_dim = 5  # first argument of VanillaNet; rollout() takes the argmax over the network output to pick an action
feat_dim = 512  # feature_dim of the visual body; also the number of rows of the linear weight created for fitted Q
scale = 2  # forwarded as TSAMiniConvBody(scale=...); NOTE(review): get_env hard-codes 2 for ScaleObsEnv -- confirm they are meant to match

def get_expert(weight_path):
    """Build the expert Q-network and initialise it from a saved checkpoint.

    The network is a `VanillaNet` with `action_dim` outputs on top of a
    `TSAMiniConvBody`; its sizes come from the module-level `n_objs`,
    `feat_dim`, `scale` and `action_dim`.

    Only entries of the checkpoint's 'network' dict whose key also exists in
    the freshly built network are copied over. Extra checkpoint entries are
    ignored, and parameters missing from the checkpoint keep their initial
    values.

    Args:
        weight_path: path of a checkpoint readable by `torch.load` that
            contains a 'network' state dict.

    Returns:
        The expert network with the matching weights loaded.
    """
    body = TSAMiniConvBody(2 + n_objs, feature_dim=feat_dim, scale=scale)
    expert = VanillaNet(action_dim, body)

    # Start from the network's own state so keys absent from the checkpoint survive.
    merged_state = expert.state_dict()

    # The identity map_location keeps every storage where it was deserialised
    # (i.e. on CPU). NOTE(review): torch.load unpickles the file, so only
    # trusted checkpoints should be passed here.
    checkpoint = torch.load(weight_path, map_location=lambda storage, loc: storage)
    for key, value in checkpoint['network'].items():
        if key in merged_state:
            merged_state[key] = value

    expert.load_state_dict(merged_state)
    return expert

def get_env(env_config):
    """Create the pick grid-world and query it at every candidate position.

    Args:
        env_config: path to a dill-serialised dict of keyword arguments for
            `PickGridWorld`. NOTE(review): dill.load executes pickled code,
            so only trusted files should be passed here.

    Returns:
        A tuple `(env, states, positions, qs)` where `env` is the
        `ScaleObsEnv`-wrapped grid world, `positions` is
        `env.unwrapped.pos_candidates`, `states[i]` is the observation
        returned by teleporting to `positions[i]`, and `qs[i]` is
        `env.get_q(0.99)` evaluated right after that teleport.
    """
    reward_config = {'wall_penalty': -0.01, 'time_penalty': -0.01, 'complete_sub_task': 0.1, 'complete_all': 1, 'fail': -1}

    with open(env_config, 'rb') as config_file:
        world_kwargs = dill.load(config_file)

    grid_world = PickGridWorld(
        **world_kwargs,
        min_dis=1,
        window=1,
        task_length=1,
        reward_config=reward_config,
        seed=0,
    )
    env = ScaleObsEnv(grid_world, 2)

    # Reset without resampling object positions before enumerating candidates.
    env.reset(sample_obj_pos=False)
    positions = env.unwrapped.pos_candidates

    states, qs = [], []
    for candidate in positions:
        # get_q must be called right after the teleport so it refers to this position.
        obs, _reward, _done, _info = env.teleport(*candidate)
        states.append(obs)
        qs.append(env.get_q(0.99))  # 0.99 is presumably the discount factor -- TODO confirm
    return env, states, positions, qs

def rollout(env, q, horizon=100, epsilon=0.0, feat_state=False):
    """Run one epsilon-greedy episode of `q` in `env` and record the transitions.

    Args:
        env: environment exposing `reset(sample_obj_pos=...)`, `step(action)`
            and `action_space.sample()`.
        q: callable mapping a one-element list of observations to action
            values; must also expose `q.body` when `feat_state` is True.
        horizon: maximum number of environment steps.
        epsilon: probability of taking a random action instead of the greedy one.
        feat_state: when True, store `q.body` features of each observation
            instead of the raw observation.

    Returns:
        `(states, actions, next_states, rewards, terminals, returns)`: five
        lists with one entry per step taken, plus the undiscounted sum of
        rewards as a float.
    """
    def _featurize(obs):
        # Raw observation, or its encoding by the network body.
        if not feat_state:
            return obs
        return q.body(tensor([obs])).cpu().detach().numpy()[0]

    states, actions, next_states, rewards, terminals = [], [], [], [], []
    returns = 0.0

    # very important: keep the object layout fixed across episodes
    obs = env.reset(sample_obj_pos=False)
    for _step in range(horizon):
        states.append(_featurize(obs))

        explore = np.random.rand() < epsilon
        if explore:
            action = env.action_space.sample()
        else:
            q_values = q([obs]).cpu().detach().numpy()
            action = q_values.flatten().argmax()

        obs, reward, done, _info = env.step(action)  # info is intentionally ignored
        actions.append(action)
        next_states.append(_featurize(obs))
        rewards.append(reward)
        terminals.append(done)
        returns += reward

        if done:
            break
    return states, actions, next_states, rewards, terminals, returns




In [39]:
# Number of trajectories rolled out with the expert policy (exploration rate `epsilon`).
n_expert_trajs = 10
# Number of trajectories rolled out with epsilon=1.0, i.e. uniformly random actions.
n_random_trajs = 10
# Exploration rate used for the expert rollouts only; 0.0 means purely greedy.
epsilon = 0.0
# When True, rollout() stores the expert body's features instead of raw observations.
feat_state = True

# Checkpoint handed to get_expert(); NOTE(review): hard-coded path to one specific training run.
weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
# dill-serialised PickGridWorld keyword arguments handed to get_env().
env_config_path = '../data/env_configs/pick/fourroom-16.0'


expert = get_expert(weight_path)
env, states, positions, optimal_q = get_env(env_config_path)

# Kept for reference: Q-values of the expert at every candidate position.
#expert_q = expert(states).detach().cpu().numpy()

# From here on `states` no longer holds the per-position observations from
# get_env; it is rebound to collect one list of states per trajectory.
states, actions, next_states, rewards, terminals = [], [], [], [], []

# (message printed per trajectory, number of trajectories, exploration rate).
# The expert is used in both phases; epsilon=1.0 makes every action random.
rollout_plan = (
    ('expert returns:', n_expert_trajs, epsilon),
    ('random returns:', n_random_trajs, 1.0),
)
for message, n_trajs, explore_rate in rollout_plan:
    for _ in range(n_trajs):
        states_, actions_, next_states_, rewards_, terminals_, returns = rollout(
            env, expert, epsilon=explore_rate, feat_state=feat_state)
        print(message, returns)
        states.append(states_)
        actions.append(actions_)
        next_states.append(next_states_)
        rewards.append(rewards_)
        terminals.append(terminals_)

# Only the states are flattened across trajectories here; the other buffers
# remain lists of per-trajectory lists.
states = np.concatenate(states)
    
# Fitted Q iteration
# Linear Q-function over the state features: one column of weights per action.
weight = np.random.randn(feat_dim, action_dim)
for _ in range(5000):
    # TODO: the fitted-Q update was never written. The original loop header had
    # no body, which is a SyntaxError and stops the cell from running at all.
    # `pass` keeps the notebook executable until the update is implemented.
    pass



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]
expert returns: 0.91
expert returns: 1.0
expert returns: 0.9800000000000001
expert returns: 0.88
expert returns: 0.9
expert returns: 0.9400000000000001
expert returns: 0.9600000000000001
expert returns: 1.0
expert returns: 0.9700000000000001
expert returns: 0.9400000000000001
random returns: -1.5400000000000005
random returns: -1.1800000000000008
random returns: -2.0200000000000005
random returns: -1.0600000000000007
random returns: -1.04
random returns: -1.1100000000000008
random returns: -1.0800000000000007
random returns: -1.1200000000000008
random returns: -1.1400000000000008
random returns: -1.0400000000000007
