In [2]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager, ScaleObsEnv
from deep_rl.network import *
from deep_rl.utils import *
from train import _exp_parser, get_visual_body, get_network, get_env_config, PickGridWorldTask
import os
import random
import argparse
import dill
import json
import copy
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer
from tqdm import tqdm

def set_seed(s):
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)

set_seed(0) # set seed

# Try Fitted Q

In [3]:
n_objs = 4
action_dim = 5
feat_dim = 512
scale = 2
discount = 0.99

def get_expert(weight_path):
    visual_body = TSAMiniConvBody(
        2 + n_objs, 
        feature_dim=feat_dim,
        scale=scale,
    )
    expert = VanillaNet(action_dim, visual_body)
    # load weight
    weight_dict = expert.state_dict()
    loaded_weight_dict = {k: v for k, v in torch.load(
        weight_path,
        map_location=lambda storage, loc: storage)['network'].items()
        if k in weight_dict}
    weight_dict.update(loaded_weight_dict)
    expert.load_state_dict(weight_dict)
    return expert

def get_env(env_config):
    states = []
    positions = []
    qs = []
    reward_config = {'wall_penalty': -0.01, 'time_penalty': -0.01, 'complete_sub_task': 0.1, 'complete_all': 1, 'fail': -1}
    with open(env_config, 'rb') as f:
        env_config = dill.load(f)
    env = ScaleObsEnv(
        PickGridWorld(
                **env_config,
                min_dis=1,
                window=1,
                task_length=1,
                reward_config=reward_config,
                seed=0,
        ),
        2,
    )
    env.reset(sample_obj_pos=False)
    positions = env.unwrapped.pos_candidates
    for pos in positions:
        o, _, _, _ = env.teleport(*pos)
        states.append(o)
        qs.append(env.get_q(discount))
    return env, states, positions, qs

def rollout(env, q, horizon=100, epsilon=0.0, feat_state=False):
    states = []
    actions = []
    rewards = []
    next_states = []
    terminals = []
    qs = []
    returns = 0.0
    done = False
    state = env.reset(sample_obj_pos=False) # very important!
    for _ in range(horizon):
        if feat_state:
            states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            states.append(state)
        qval = q([state]).cpu().detach().numpy().flatten()
        qs.append(qval)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = qval.argmax()
        state, r, done, _ = env.step(action) # note that info is not used
        actions.append(action)
        if feat_state:
            next_states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            next_states.append(state)
        rewards.append(r)
        terminals.append(done)
        returns += r
        if done: break
    return states, actions, next_states, rewards, terminals, qs, returns



In [33]:
%pdb on
n_expert_trajs = 2000
epsilon = 0.0
feat_state = False

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'

expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

for _ in tqdm(range(n_expert_trajs)):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon, feat_state=feat_state)
    #print('expert returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)

data = dict(
    states=np.concatenate(states),
    actions=np.concatenate(actions),
    next_states=np.concatenate(next_states),
    rewards=np.concatenate(rewards),
    terminals=np.concatenate(terminals),
    expert_q=np.concatenate(qs),
)
print('num of transitions:', len(data['states']))

def fitted_q(data, body, feat_dim, action_dim):
    N = len(data['states'])

    feats = body(tensor(data['states'])).repeat(1, action_dim).detach().cpu().numpy()
    a_vec = np.eye(action_dim)[data['actions']]
    phis = np.concatenate([feats * a_vec.repeat(feat_dim, 1), a_vec], 1)
    
    A = phis.T @ (phis - discount * np.expand_dims(1 - data['terminals'], 1) * np.roll(phis, -1, 0)) / N
    b = phis.T @ data['rewards'] / N
    
    # update weight
    #print(torch.isnan(A).any())
    #total_weight = np.linalg.inv(A + 1e-4 * np.eye(A.shape[0])) @ b
    total_weight = np.linalg.lstsq(A, b)[0]
    weight = total_weight[:-action_dim].reshape(-1, feat_dim).T
    bias = total_weight[-action_dim:]
    return weight, bias

# input: experiences, feature extractor
# output: linear layer
# def fitted_q(data, body, feat_dim, action_dim):
#     A = np.zeros((feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim))
#     b = np.zeros(feat_dim * action_dim + action_dim)
#     N = len(data['states'])

#     pbar = tqdm(total=N)
#     for i, transition in enumerate(zip(data['states'], data['actions'], data['next_states'], data['rewards'], data['terminals'])):
#         state, action, next_state, reward, terminal = transition
#         phi = np.zeros(feat_dim * action_dim + action_dim)
#         phi[feat_dim * action: feat_dim * (action + 1)] = body(tensor([state])).detach().cpu().numpy()[0]
#         phi[feat_dim * action_dim + action] = 1
#         b += reward * phi / N
#         if terminal:
#             A += np.outer(phi, phi) / N
#         else:
#             next_phi = np.zeros(feat_dim * action_dim + action_dim)
#             s_idx = feat_dim * data['actions'][i+1] # assume trajectories is contiguous
#             next_phi[s_idx: s_idx + feat_dim] = body(tensor([next_state])).detach().cpu().numpy()[0]
#             next_phi[feat_dim * action_dim + data['actions'][i+1]] = 1
#             A += np.outer(phi, phi - discount * next_phi) / N
#         pbar.update(1)
#     pbar.close() 

#     # update weight
#     total_weight = np.linalg.lstsq(A, b)[0]
#     weight = total_weight[:-action_dim].reshape(-1, feat_dim).T
#     bias = total_weight[-action_dim:]
#     return weight, bias
    

weight, bias = fitted_q(data, expert.body, feat_dim, action_dim)

# model = copy.deepcopy(expert)
# model.fc_head.weight.data.copy_(tensor(weight).t())
# model.fc_head.bias.data.copy_(tensor(bias))
# for _ in range(10):
#     res = rollout(env, model)
#     print(res[-1])

estimate_q = expert.body(tensor(data['states'])).detach().cpu().numpy() @ weight + bias
print(((estimate_q - data['expert_q']) ** 2).mean())
print('difference between argmax:', (estimate_q.argmax(1) != data['expert_q'].argmax(1)).sum())

print(estimate_q)
print()
print(data['expert_q'])



  0%|          | 0/2000 [00:00<?, ?it/s]

Automatic pdb calling has been turned ON
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]


100%|██████████| 2000/2000 [01:12<00:00, 27.48it/s]


num of transitions: 29454




0.01659769967412279
difference between argmax: 17129
[[0.74836036 0.60369045 0.74413376 0.6654658  0.76228272]
 [0.76175127 0.63501297 0.76053703 0.68347097 0.79641732]
 [0.77954674 0.6798445  0.78007591 0.66827122 0.79370532]
 ...
 [1.0210867  0.82932089 1.048409   0.757595   0.92456787]
 [1.04761135 0.7969205  1.0691     0.7887416  1.02937634]
 [1.05806591 0.76661029 1.07518061 0.8062914  1.09      ]]

[[0.73559886 0.7194368  0.7483418  0.7340713  0.72777945]
 [0.7664412  0.7394179  0.76616466 0.72755235 0.7490153 ]
 [0.7852332  0.7568129  0.7838565  0.7514845  0.7688418 ]
 ...
 [1.0151633  1.0040917  1.0483844  1.001756   1.0287615 ]
 [1.0374562  1.027259   1.0691859  1.0235871  1.0467873 ]
 [1.0582035  1.0501345  1.0585701  1.0527699  1.0902131 ]]


# Meta Linear Q

In [None]:
# D_1, ..., D_n
# D, body / \phi -> A, b -> w(\phi)
# Q(\phi) - Q_E as loss

n_expert_trajs = 2000
epsilon = 0.0
mkdir('log/meta_linear_q')

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'

expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

# for _ in tqdm(range(n_expert_trajs)):
#     states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon)
#     #print('expert returns:', returns)
#     states.append(states_)
#     actions.append(actions_)
#     next_states.append(next_states_)
#     rewards.append(rewards_)
#     terminals.append(terminals_)
#     qs.append(qs_)

# data = dict(
#     states=np.concatenate(states),
#     actions=np.concatenate(actions),
#     next_states=np.concatenate(next_states),
#     rewards=np.concatenate(rewards),
#     terminals=np.concatenate(terminals),
#     expert_q=np.concatenate(qs),
# )
print('num of transitions:', len(data['states']))

def fitted_q(data, body, feat_dim, action_dim):
    A = torch.zeros(feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim)
    b = torch.zeros(feat_dim * action_dim + action_dim)
    N = len(data['states'])

    feats = body(tensor(data['states'])).repeat(1, action_dim)
    a_vec = one_hot.encode(tensor(data['actions'], torch.long), action_dim)
    phis = torch.cat([feats * a_vec.repeat_interleave(feat_dim, 1), a_vec], 1)
    
    A = torch.matmul(phis.t(), phis - discount * tensor(1 - data['terminals']).unsqueeze(1) * phis.roll(-1, 0)) / N
    b = torch.matmul(phis.t(), tensor(data['rewards'])) / N
    
    # update weight
    #print(torch.isnan(A).any())
    total_weight = torch.matmul(torch.inverse(A + 1e-4 * torch.eye(A.shape[0])), b)
    #total_weight = torch.matmul(torch.pinverse(A), b)
    weight = total_weight[:-action_dim].view(-1, feat_dim).t()
    bias = total_weight[-action_dim:]
    return weight, bias


model = copy.deepcopy(expert)
optim = torch.optim.RMSprop(
        filter(lambda p: p.requires_grad, model.parameters()), lr=0.00025, alpha=0.95, eps=0.01, centered=True)
#weight, bias = torch.randn(feat_dim, action_dim), torch.randn(action_dim)
for i in range(2000):
    weight, bias = fitted_q(data, model.body, feat_dim, action_dim)
    model.fc_head.weight.data.copy_(weight.t())
    model.fc_head.bias.data.copy_(bias)
    estimate_q = model(data['states'])
    #estimate_q = torch.matmul(model.body(tensor(data['states'])), weight) + bias
    loss = F.mse_loss(estimate_q, tensor(data['expert_q']))
    print('{}-th loss:'.format(i), loss.detach().cpu().numpy())
    if i > 0 and i % 5 == 0: # save model
        weight_dict = dict(network=model.state_dict())
        torch.save(weight_dict, 'log/meta_linear_q/step-{}-loss-{:.4f}'.format(i, loss.detach().cpu().numpy()))
    optim.zero_grad()
    loss.backward()
    optim.step()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]
num of transitions: 29454
0-th loss: 0.026722785


In [10]:
# import torch
# print(torch.__version__)

# class DummyModule(torch.nn.Module):
#     def forward(self, x):
#         V = torch.Tensor(2, 2)
#         V[0, 0] = x
#         return torch.sum(V * 3)


# x = torch.tensor([1], requires_grad=True)
# r = DummyModule()(x)
# r.backward()
# print(x.grad)


print(torch.__version__)

1.1.0
