In [34]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager, ScaleObsEnv
from deep_rl.network import *
from deep_rl.utils import *
from train import _exp_parser, get_visual_body, get_network, get_env_config, PickGridWorldTask
import os
import random
import argparse
import dill
import json
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer
from tqdm import tqdm

def set_seed(s):
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)

set_seed(0) # set seed

# Try Fitted Q

In [2]:
n_objs = 4
action_dim = 5
feat_dim = 512
scale = 2
discount = 0.99

def get_expert(weight_path):
    visual_body = TSAMiniConvBody(
        2 + n_objs, 
        feature_dim=feat_dim,
        scale=scale,
    )
    expert = VanillaNet(action_dim, visual_body)
    # load weight
    weight_dict = expert.state_dict()
    loaded_weight_dict = {k: v for k, v in torch.load(
        weight_path,
        map_location=lambda storage, loc: storage)['network'].items()
        if k in weight_dict}
    weight_dict.update(loaded_weight_dict)
    expert.load_state_dict(weight_dict)
    return expert

def get_env(env_config):
    states = []
    positions = []
    qs = []
    reward_config = {'wall_penalty': -0.01, 'time_penalty': -0.01, 'complete_sub_task': 0.1, 'complete_all': 1, 'fail': -1}
    with open(env_config, 'rb') as f:
        env_config = dill.load(f)
    env = ScaleObsEnv(
        PickGridWorld(
                **env_config,
                min_dis=1,
                window=1,
                task_length=1,
                reward_config=reward_config,
                seed=0,
        ),
        2,
    )
    env.reset(sample_obj_pos=False)
    positions = env.unwrapped.pos_candidates
    for pos in positions:
        o, _, _, _ = env.teleport(*pos)
        states.append(o)
        qs.append(env.get_q(discount))
    return env, states, positions, qs

def rollout(env, q, horizon=100, epsilon=0.0, feat_state=False):
    states = []
    actions = []
    rewards = []
    next_states = []
    terminals = []
    qs = []
    returns = 0.0
    done = False
    state = env.reset(sample_obj_pos=False) # very important!
    for _ in range(horizon):
        if feat_state:
            states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            states.append(state)
        qval = q([state]).cpu().detach().numpy().flatten()
        qs.append(qval)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = qval.argmax()
        state, r, done, _ = env.step(action) # note that info is not used
        actions.append(action)
        if feat_state:
            next_states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            next_states.append(state)
        rewards.append(r)
        terminals.append(done)
        returns += r
        if done: break
    return states, actions, next_states, rewards, terminals, qs, returns



In [57]:
%pdb on
n_expert_trajs = 1
epsilon = 0.0
feat_state = True

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'

expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

for _ in range(n_expert_trajs):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon, feat_state=feat_state)
    print('expert returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)

data = dict(
    states=np.concatenate(states),
    actions=np.concatenate(actions),
    next_states=np.concatenate(next_states),
    rewards=np.concatenate(rewards),
    terminals=np.concatenate(terminals),
    expert_q=np.concatenate(qs),
)

# input: experiences, feature extractor
# output: linear layer
def fitted_q(data, body, feat_dim, action_dim):
    A = np.zeros((feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim))
    b = np.zeros(feat_dim * action_dim + action_dim)
    N = len(data['states'])

# Fitted Q iteration
A = np.zeros((feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim))
b = np.zeros(feat_dim * action_dim + action_dim)
N = len(data['states'])

#weight = expert.fc_head.weight.cpu().detach().numpy()
#bias = expert.fc_head.bias.cpu().detach().numpy()
#print(weight.shape, bias.shape)
#w = np.concatenate([weight.flatten(), bias])

pbar = tqdm(total=N)
for i, transition in enumerate(zip(data['states'], data['actions'], data['next_states'], data['rewards'], data['terminals'], data['expert_q'])):
    state, action, next_state, reward, terminal, expert_q = transition
    phi = np.zeros(feat_dim * action_dim + action_dim)
    phi[feat_dim * action: feat_dim * (action + 1)] = state
    phi[feat_dim * action_dim + action] = 1
    #print(phi.dot(w) - expert_q[action])
    #print(phi.dot(w) - (weight.dot(state) + bias)[action])
    b += reward * phi / N
    if terminal:
        A += np.outer(phi, phi) / N
        #print(phi.dot(w) - expert_q[action])
    else:
        next_phi = np.zeros(feat_dim * action_dim + action_dim)
        s_idx = feat_dim * data['actions'][i+1]
        next_phi[s_idx: s_idx + feat_dim] = next_state
        next_phi[feat_dim * action_dim + data['actions'][i+1]] = 1
        A += np.outer(phi, phi - discount * next_phi)
        #print(phi.dot(w) - reward - discount * next_phi.dot(w), expert_q[action] - reward - discount * data['expert_q'][i+1][data['actions'][i+1]])
        #print(next_phi.dot(w) - data['expert_q'][i+1][data['actions'][i+1]])
    #if i % 10 == 0: print('at {}'.format(i))
    pbar.update(1)
pbar.close() 

# update weight
total_weight = np.linalg.lstsq(A, b)[0]
weight = total_weight[:-action_dim].reshape(-1, feat_dim).T
bias = total_weight[-action_dim:]

estimate_q = data['states'] @ weight + bias
print(((estimate_q - data['expert_q']) ** 2).mean())
print('difference between argmax:', (estimate_q.argmax() != expert_q.argmax()).sum())

Automatic pdb calling has been turned ON
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]


 16%|█▌        | 3/19 [00:00<00:00, 27.99it/s]

expert returns: 0.91


100%|██████████| 19/19 [00:00<00:00, 31.27it/s]


0.3271358519712087


# Meta Linear Q

In [None]:
# D_1, ..., D_n
# D, \phi -> A, b -> w(\phi)
# Q(\phi) - Q_E as loss

