In [2]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager, ScaleObsEnv
from deep_rl.network import *
from deep_rl.utils import *
from train import _exp_parser, get_visual_body, get_network, get_env_config, PickGridWorldTask
import os
import random
import argparse
import dill
import json
import copy
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer
from tqdm import tqdm

def set_seed(s):
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)

set_seed(0) # set seed

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


# Try Fitted Q

In [3]:
n_objs = 4
action_dim = 5
feat_dim = 512
scale = 2
discount = 0.99

def get_expert(weight_path):
    visual_body = TSAMiniConvBody(
        2 + n_objs, 
        feature_dim=feat_dim,
        scale=scale,
    )
    expert = VanillaNet(action_dim, visual_body)
    # load weight
    weight_dict = expert.state_dict()
    loaded_weight_dict = {k: v for k, v in torch.load(
        weight_path,
        map_location=lambda storage, loc: storage)['network'].items()
        if k in weight_dict}
    weight_dict.update(loaded_weight_dict)
    expert.load_state_dict(weight_dict)
    return expert

def get_env(env_config):
    states = []
    positions = []
    qs = []
    reward_config = {'wall_penalty': -0.01, 'time_penalty': -0.01, 'complete_sub_task': 0.1, 'complete_all': 1, 'fail': -1}
    with open(env_config, 'rb') as f:
        env_config = dill.load(f)
    env = ScaleObsEnv(
        PickGridWorld(
                **env_config,
                min_dis=1,
                window=1,
                task_length=1,
                reward_config=reward_config,
                seed=0,
        ),
        2,
    )
    env.reset(sample_obj_pos=False)
    positions = env.unwrapped.pos_candidates
    for pos in positions:
        o, _, _, _ = env.teleport(*pos)
        states.append(o)
        qs.append(env.get_q(discount))
    return env, states, positions, qs

def rollout(env, q, horizon=100, epsilon=0.0, feat_state=False):
    states = []
    actions = []
    rewards = []
    next_states = []
    terminals = []
    qs = []
    returns = 0.0
    done = False
    state = env.reset(sample_obj_pos=False) # very important!
    for _ in range(horizon):
        if feat_state:
            states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            states.append(state)
        qval = q([state]).cpu().detach().numpy().flatten()
        qs.append(qval)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = qval.argmax()
        state, r, done, _ = env.step(action) # note that info is not used
        actions.append(action)
        if feat_state:
            next_states.append(q.body(tensor([state])).cpu().detach().numpy()[0])
        else:
            next_states.append(state)
        rewards.append(r)
        terminals.append(done)
        returns += r
        if done: break
    return states, actions, next_states, rewards, terminals, qs, returns



In [4]:
%pdb on
n_expert_trajs = 20
epsilon = 0.0
feat_state = False

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'

expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

for _ in tqdm(range(n_expert_trajs)):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon, feat_state=feat_state)
    #print('expert returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)

data = dict(
    states=np.concatenate(states),
    actions=np.concatenate(actions),
    next_states=np.concatenate(next_states),
    rewards=np.concatenate(rewards),
    terminals=np.concatenate(terminals),
    expert_q=np.concatenate(qs),
)
print('num of transitions:', len(data['states']))

# input: experiences, feature extractor
# output: linear layer
def fitted_q(data, body, feat_dim, action_dim):
    A = np.zeros((feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim))
    b = np.zeros(feat_dim * action_dim + action_dim)
    N = len(data['states'])

    pbar = tqdm(total=N)
    for i, transition in enumerate(zip(data['states'], data['actions'], data['next_states'], data['rewards'], data['terminals'])):
        state, action, next_state, reward, terminal = transition
        phi = np.zeros(feat_dim * action_dim + action_dim)
        phi[feat_dim * action: feat_dim * (action + 1)] = body(tensor([state])).detach().cpu().numpy()[0]
        phi[feat_dim * action_dim + action] = 1
        b += reward * phi / N
        if terminal:
            A += np.outer(phi, phi) / N
        else:
            next_phi = np.zeros(feat_dim * action_dim + action_dim)
            s_idx = feat_dim * data['actions'][i+1] # assume trajectories is contiguous
            next_phi[s_idx: s_idx + feat_dim] = body(tensor([next_state])).detach().cpu().numpy()[0]
            next_phi[feat_dim * action_dim + data['actions'][i+1]] = 1
            A += np.outer(phi, phi - discount * next_phi)
        pbar.update(1)
    pbar.close() 

    # update weight
    total_weight = np.linalg.lstsq(A, b)[0]
    weight = total_weight[:-action_dim].reshape(-1, feat_dim).T
    bias = total_weight[-action_dim:]
    return weight, bias
    

# # Fitted Q iteration
# A = np.zeros((feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim))
# b = np.zeros(feat_dim * action_dim + action_dim)
# N = len(data['states'])

# #weight = expert.fc_head.weight.cpu().detach().numpy()
# #bias = expert.fc_head.bias.cpu().detach().numpy()
# #print(weight.shape, bias.shape)
# #w = np.concatenate([weight.flatten(), bias])

# pbar = tqdm(total=N)
# for i, transition in enumerate(zip(data['states'], data['actions'], data['next_states'], data['rewards'], data['terminals'], data['expert_q'])):
#     state, action, next_state, reward, terminal, expert_q = transition
#     phi = np.zeros(feat_dim * action_dim + action_dim)
#     phi[feat_dim * action: feat_dim * (action + 1)] = state
#     phi[feat_dim * action_dim + action] = 1
#     #print(phi.dot(w) - expert_q[action])
#     #print(phi.dot(w) - (weight.dot(state) + bias)[action])
#     b += reward * phi / N
#     if terminal:
#         A += np.outer(phi, phi) / N
#         #print(phi.dot(w) - expert_q[action])
#     else:
#         next_phi = np.zeros(feat_dim * action_dim + action_dim)
#         s_idx = feat_dim * data['actions'][i+1]
#         next_phi[s_idx: s_idx + feat_dim] = next_state
#         next_phi[feat_dim * action_dim + data['actions'][i+1]] = 1
#         A += np.outer(phi, phi - discount * next_phi)
#         #print(phi.dot(w) - reward - discount * next_phi.dot(w), expert_q[action] - reward - discount * data['expert_q'][i+1][data['actions'][i+1]])
#         #print(next_phi.dot(w) - data['expert_q'][i+1][data['actions'][i+1]])
#     #if i % 10 == 0: print('at {}'.format(i))
#     pbar.update(1)
# pbar.close() 

# # update weight
# total_weight = np.linalg.lstsq(A, b)[0]
# weight = total_weight[:-action_dim].reshape(-1, feat_dim).T
# bias = total_weight[-action_dim:]

weight, bias = fitted_q(data, expert.body, feat_dim, action_dim)
#weight, bias = np.random.randn(feat_dim, action_dim), np.random.randn(action_dim)

estimate_q = expert.body(tensor(data['states'])).detach().cpu().numpy() @ weight + bias
print(((estimate_q - data['expert_q']) ** 2).mean())
print('difference between argmax:', (estimate_q.argmax(1) != data['expert_q'].argmax(1)).sum())

  0%|          | 0/20 [00:00<?, ?it/s]

Automatic pdb calling has been turned ON
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]


100%|██████████| 20/20 [00:00<00:00, 28.64it/s]
  1%|          | 3/291 [00:00<00:10, 26.99it/s]

num of transitions: 291


100%|██████████| 291/291 [00:09<00:00, 31.35it/s]


0.03546138380832832
difference between argmax: 114


# Meta Linear Q

In [53]:
# D_1, ..., D_n
# D, body / \phi -> A, b -> w(\phi)
# Q(\phi) - Q_E as loss

n_expert_trajs = 30
epsilon = 0.0

weight_path = '../log/pick.mask.fourroom-16.0.min_dis-1/dqn/double_q/0.190425-220424/models/step-3000000-mean-0.96'
env_config_path = '../data/env_configs/pick/fourroom-16.0'

expert = get_expert(weight_path)
env, all_states, positions, optimal_q = get_env(env_config_path)

states = []
actions = []
next_states = []
rewards = []
terminals = []
qs = []

for _ in tqdm(range(n_expert_trajs)):
    states_, actions_, next_states_, rewards_, terminals_, qs_, returns = rollout(env, expert, epsilon=epsilon)
    #print('expert returns:', returns)
    states.append(states_)
    actions.append(actions_)
    next_states.append(next_states_)
    rewards.append(rewards_)
    terminals.append(terminals_)
    qs.append(qs_)

data = dict(
    states=np.concatenate(states),
    actions=np.concatenate(actions),
    next_states=np.concatenate(next_states),
    rewards=np.concatenate(rewards),
    terminals=np.concatenate(terminals),
    expert_q=np.concatenate(qs),
)
#data = {k: v[:2] for k, v in data.items()}
print('num of transitions:', len(data['states']))

sub_idx = 1

def fitted_q(data, body, feat_dim, action_dim):
    A = torch.zeros(feat_dim * action_dim + action_dim, feat_dim * action_dim + action_dim)
    b = torch.zeros(feat_dim * action_dim + action_dim)
    N = len(data['states'])

    feats = body(tensor(data['states'])).repeat(1, action_dim)
    a_vec = one_hot.encode(tensor(data['actions'], torch.long), action_dim)
    phis = torch.cat([feats * a_vec.repeat_interleave(feat_dim, 1), a_vec], 1)
    tot_A = torch.matmul(phis[:sub_idx].t(), phis[:sub_idx] - discount * tensor(1 - data['terminals'])[:sub_idx].unsqueeze(1) * phis.roll(-1, 0)[:sub_idx]) / N
    tot_b = torch.matmul(phis[:sub_idx].t(), tensor(data['rewards'])[:sub_idx]) / N
    
    pbar = tqdm(total=N)
    for i, transition in enumerate(zip(data['states'], data['actions'], data['next_states'], data['rewards'], data['terminals'])):
        if i == sub_idx: break
        state, action, next_state, reward, terminal = transition
        phi = torch.zeros(feat_dim * action_dim + action_dim)
        phi[feat_dim * action: feat_dim * (action + 1)] = body(tensor([state]))[0]
        phi[feat_dim * action_dim + action] = 1
        b += float(reward) * phi / N
        if terminal:
            A += torch.ger(phi, phi) / N
        else:
            next_phi = torch.zeros(feat_dim * action_dim + action_dim)
            s_idx = feat_dim * data['actions'][i+1] # assume trajectories is contiguous
            next_phi[s_idx: s_idx + feat_dim] = body(tensor([next_state]))[0]
            next_phi[feat_dim * action_dim + data['actions'][i+1]] = 1
            #print(torch.abs(phis[i+1]-next_phi).max())
            #print(torch.abs(next_phi - next_phis[i]).max())
            A += torch.ger(phi, phi - discount * next_phi) / N
        pbar.update(1)
    pbar.close()

    print(torch.abs(tot_A - A).max())
    print(torch.abs(tot_b - b).max())
    
    # update weight
    #print(torch.isnan(A).any())
    total_weight = torch.matmul(torch.inverse(A + 0.1 * torch.eye(A.shape[0])), b)
    weight = total_weight[:-action_dim].view(-1, feat_dim).t()
    bias = total_weight[-action_dim:]
    return weight, bias


model = copy.deepcopy(expert)
optim = torch.optim.RMSprop(
        filter(lambda p: p.requires_grad, model.parameters()), lr=0.00025, alpha=0.95, eps=0.01, centered=True)
#weight, bias = torch.randn(feat_dim, action_dim), torch.randn(action_dim)
for i in range(1):
    weight, bias = fitted_q(data, model.body, feat_dim, action_dim)
    estimate_q = torch.matmul(model.body(tensor(data['states'])), weight) + bias
    loss = F.mse_loss(estimate_q, tensor(data['expert_q']))
    print('{}-th loss:'.format(i), loss.detach().cpu().numpy())
    optim.zero_grad()
    loss.backward()
    optim.step()





  0%|          | 0/30 [00:00<?, ?it/s][A[A[A[A

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
maps: [(0, 'fourroom-16')]
tasks: [(0, ('A',)), (1, ('B',)), (2, ('C',)), (3, ('D',))]
train: [(0, 0)]
test: [(0, 0)]






 10%|█         | 3/30 [00:00<00:00, 27.80it/s][A[A[A[A



 17%|█▋        | 5/30 [00:00<00:01, 24.00it/s][A[A[A[A



 27%|██▋       | 8/30 [00:00<00:00, 25.12it/s][A[A[A[A



 37%|███▋      | 11/30 [00:00<00:00, 25.82it/s][A[A[A[A



 50%|█████     | 15/30 [00:00<00:00, 27.28it/s][A[A[A[A



 60%|██████    | 18/30 [00:00<00:00, 25.26it/s][A[A[A[A



 70%|███████   | 21/30 [00:00<00:00, 25.58it/s][A[A[A[A



 83%|████████▎ | 25/30 [00:00<00:00, 28.34it/s][A[A[A[A



 97%|█████████▋| 29/30 [00:01<00:00, 28.95it/s][A[A[A[A



100%|██████████| 30/30 [00:01<00:00, 26.90it/s][A[A[A[A



  0%|          | 0/420 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/420 [00:00<00:04, 90.35it/s][A[A[A[A

num of transitions: 420
tot_A: tensor([[-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0004,  0.0000,  0.0000,  ...,  0.0024,  0.0000,  0.0000],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<DivBackward0>)
tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]], grad_fn=<TBackward>)
tensor([[-0.1490,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.0000]],
       grad_fn=<SubBackward0>)
tensor([0., 0., 0.,  ..., 1., 0., 0.], grad_fn=<CopySlices>)
tensor([-0.1490,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.0000],
       grad_fn=<SubBackward0>)
tensor(3.4925e-10, grad_fn=<MaxBackward1>)
tensor(3.6380e-12, grad_fn=<MaxBackward1>)
0-th loss: 0.85890085


In [48]:
# import torch
# print(torch.__version__)

# class DummyModule(torch.nn.Module):
#     def forward(self, x):
#         V = torch.Tensor(2, 2)
#         V[0, 0] = x
#         return torch.sum(V * 3)


# x = torch.tensor([1], requires_grad=True)
# r = DummyModule()(x)
# r.backward()
# print(x.grad)


a = tensor([[0, 1, 2]])
b = tensor([[3, 4, 5]])
print(torch.matmul(a.t(), b))

tensor([[ 0.,  0.,  0.],
        [ 3.,  4.,  5.],
        [ 6.,  8., 10.]])
