In [1]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager, ScaleObsEnv
from deep_rl.network import *
from deep_rl.utils import *
from train import _exp_parser, get_visual_body, get_network, get_env_config, PickGridWorldTask
import os
import random
import argparse
import dill
import json
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer

def set_seed(s):
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)

set_seed(0) # set seed

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html



You appear to be missing a License Key for mujoco.  We expected to find the
file here: /home/liyuc/.mujoco/mjkey.txt

You can get licenses at this page:

    https://www.roboti.us/license.html

If python tries to activate an invalid license, the process will exit.



# Try Fitted Q

In [2]:
n_objs = 4
action_dim = 5
feat_dim = 512
scale = 2
discount = 0.99

def get_expert(weight_path):
    visual_body = TSAMiniConvBody(
        2 + n_objs, 
        feature_dim=feat_dim,
        scale=scale,
    )
    expert = VanillaNet(action_dim, visual_body)
    # load weight
    weight_dict = expert.state_dict()
    loaded_weight_dict = {k: v for k, v in torch.load(
        weight_path,
        map_location=lambda storage, loc: storage)['network'].items()
        if k in weight_dict}
    weight_dict.update(loaded_weight_dict)
    expert.load_state_dict(weight_dict)
    return expert

def get_env(env_config):
    states = []
    positions = []
    qs = []
    reward_config = {'wall_penalty': -0.01, 'time_penalty': -0.01, 'complete_sub_task': 0.1, 'complete_all': 1, 'fail': -1}
    with open(env_config, 'rb') as f:
        env_config = dill.load(f)
    env = ScaleObsEnv(
        PickGridWorld(
                **env_config,
                min_dis=1,
                window=1,
                task_length=1,
                reward_config=reward_config,
                seed=0,
        ),
        2,
    )
    env.reset(sample_obj_pos=False)
    positions = env.unwrapped.pos_candidates
    for pos in positions:
        o, _, _, _ = env.teleport(*pos)
        states.append(o)
        qs.append(env.get_q(discount))
    return env, states, positions, qs

def rollout(env, q, horizon=100, epsilon=0.0, feat_state=False):
    states = []
    actions = []
    rewards = []
    next_states = []
    terminals = []
    qs = []
    returns = 0.0
    done = False
    state = env.reset(sample_obj_pos=False) # very important!
    for _ in range(horizon):
        if feat_state:
            states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            states.append(state)
        qval = q([state]).cpu().detach().numpy().flatten()
        qs.append(qval)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = qval.argmax()
        state, r, done, _ = env.step(action) # note that info is not used
        actions.append(action)
        if feat_state:
            next_states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            next_states.append(state)
        rewards.append(r)
        terminals.append(done)
        returns += r
        if done: break
    return states, actions, next_states, rewards, terminals, qs, returns



In [12]:
%pdb on
n_expert_trajs = 10
n_random_trajs = 10
epsilon = 0.0
feat_state = True

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'


expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

for _ in range(n_expert_trajs):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon, feat_state=feat_state)
    print('expert returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)
    
for _ in range(n_random_trajs):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=1.0, feat_state=feat_state)
    print('random returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)


data = dict(
    states=np.concatenate(states),
    actions=np.concatenate(actions),
    next_states=np.concatenate(next_states),
    rewards=np.concatenate(rewards),
    terminals=np.concatenate(terminals),
    expert_q=np.concatenate(qs),
)

# Fitted Q iteration
weight = np.zeros((feat_dim, action_dim))
for _ in range(50):
    new_weight = np.zeros((feat_dim, action_dim))
    for a in range(action_dim):
        indices = data['actions'] == a
        states = data['states'][indices]
        actions = data['actions'][indices]
        next_states = data['next_states'][indices]
        rewards = data['rewards'][indices]
        terminals = data['terminals'][indices]
        qs = (states @ weight).argmax(1)
        targets = rewards + discount * (1 - terminals) * qs
        new_weight[:, a] = np.linalg.lstsq(states, targets)[0]
        #new_weight[:, a] = np.linalg.lstsq(states.T @ states / states.shape[0], states.T.dot(targets) / states.shape[0])[0]
    weight = new_weight
    #print(data['states'][:5] @ weight)
    print(((data['expert_q'] - data['states'] @ weight) ** 2).mean())
    
    


Automatic pdb calling has been turned ON
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]
expert returns: 0.91
expert returns: 1.0
expert returns: 0.9800000000000001
expert returns: 0.88
expert returns: 0.9
expert returns: 0.9400000000000001
expert returns: 0.9600000000000001
expert returns: 1.0
expert returns: 0.9700000000000001
expert returns: 0.9400000000000001
random returns: -1.1200000000000008
random returns: -1.0700000000000007
random returns: -1.0500000000000007
random returns: -1.1700000000000008
random returns: -1.2300000000000009
random returns: -1.1600000000000008
random returns: -1.0800000000000007
random returns: -1.0700000000000007
random returns: -1.0900000000000007
random returns: -2.1200000000000006




0.7530391236797941
6.604199421511358
4.154672512829174
2.4240169852112743
2731844996070853.0
2.898442468664994
3.534794307722824
5.090476179537211
4.90384586440879
3.437958506738988
3.158840799500725
3.171617646016419
3.520866007552742
5.362635730216085
2731844977714423.5
2.526371566359914
3.668366791723194
3.55749571545002e+16
2.5725146662220584
4.2764989804098406
682961249382715.8
2.7808678450840327
3.910702749640933
4.0794259431713575
2.8042517285729103
3.7363403122466616
4.060435356313744
1.6759303515066033
5.112886566453829
4.416063274029171
3.927258479465625
4.465514169661016
1.998799884100855
4.06469224927175
3.706488226212251
2.4920296374496087
3.7462679213716115
1.2465396885408199e+19
2.7850605864564946
2.684912384570506
3.710083741226179
2.804782595772397e+19
2.3784140635245805
2.8746454971102966
2.562785060324813
3.430447977489571
3.557495703561783e+16
2731844964232539.0
3.9575808113045
3.688430402707513
