In [3]:
import sys
sys.path.append('../')
from deep_rl.gridworld import ReachGridWorld, PickGridWorld, PORGBEnv, GoalManager
from deep_rl.network import *
from deep_rl.utils import *
import os
import random
import argparse
import dill
import json
import itertools
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
from collections import Counter, namedtuple
from IPython.display import display
from PIL import Image
from pathlib import Path
from IPython.core.debugger import Tracer

def seed(s):
    """Seed Python's ``random`` module and NumPy's global RNG with ``s``.

    Args:
        s: seed value handed unchanged to ``random.seed`` and then to
            ``np.random.seed``.
    """
    # Same order as before: the stdlib generator first, then NumPy.
    for reseed in (random.seed, np.random.seed):
        reseed(s)

seed(0)  # seed both `random` and `np.random` with 0 before anything below draws random numbers

def fload(fn, ftype):
    """Read the file ``fn`` and return its decoded content.

    Args:
        fn: path of the file to read.
        ftype: format selector. ``'json'`` is parsed with ``json.load`` (text
            mode); ``'pkl'`` is deserialised with ``dill.load`` (binary mode).

    Returns:
        Whatever the selected loader returns.

    Raises:
        NotImplementedError: for ``ftype == 'png'`` (reading images is not
            implemented).
        Exception: for any other ``ftype``.
    """
    # Reject the formats that cannot be read before touching the file system.
    if ftype == 'png':
        raise NotImplementedError
    if ftype not in ('json', 'pkl'):
        raise Exception('cannot read this data type: {}'.format(ftype))

    # NOTE: dill.load can execute arbitrary code; only use it on trusted files.
    mode, reader = ('r', json.load) if ftype == 'json' else ('rb', dill.load)
    with open(fn, mode) as handle:
        return reader(handle)
    
def fsave(data, fn, ftype):
    """Write ``data`` to the file ``fn`` in the format named by ``ftype``.

    Missing parent directories of ``fn`` are created first.

    Args:
        data: object to store. It is passed to ``json.dump`` for ``'json'``,
            to ``dill.dump`` for ``'pkl'`` and to ``Image.fromarray`` for
            ``'png'``.
        fn: destination path. A bare file name (no directory part) is written
            to the current working directory.
        ftype: one of ``'json'``, ``'pkl'`` or ``'png'``.

    Raises:
        Exception: if ``ftype`` is not one of the supported formats.
    """
    dirname = os.path.dirname(fn)
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises, so only create a directory when there is one to create.
    # exist_ok=True also removes the check-then-create race.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    if ftype == 'json':
        with open(fn, 'w') as f:
            json.dump(data, f)
    elif ftype == 'pkl':
        with open(fn, 'wb') as f:
            dill.dump(data, f)
    elif ftype == 'png':
        Image.fromarray(data).save(fn)
    else:
        raise Exception('unsupported file type: {}'.format(ftype))
        
# Settings consumed by get_pick_config:
#   map_name - map handed to GoalManager
#   n_goal   - number of object types (and of object positions kept)
#   min_dis  - forwarded to GoalManager.gen_goals as `min_dis`
GoalConfig = namedtuple('GoalConfig', 'map_name n_goal min_dis')

# multitask NMF from: https://ieeexplore.ieee.org/document/6939673
class MTNMF:
    """Multitask matrix factorisation with one factor shared by all tasks.

    A stack of K matrices ``X_k`` (each N x M) is approximated as
    ``X_k ~= A_k @ S``: ``S`` (n_components x M) is shared, every task has its
    own ``A_k`` (N x n_components). Both factors start from uniform random
    values in [0, 1) and are refined with multiplicative updates, so they stay
    non-negative as long as ``X`` is non-negative.
    """

    def __init__(self, n_components, l1_ratio=0.0, max_iter=200, tol=0.0001):
        """Store the hyperparameters.

        Args:
            n_components: inner dimension of the factorisation.
            l1_ratio: weight of the ``S.sum()`` penalty.
            max_iter: maximum number of update sweeps run by ``fit``.
            tol: ``fit`` stops once the loss changes by less than this between
                two consecutive sweeps.
        """
        self.n_components = n_components
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.tol = tol

    def loss(self, X, A, S):
        """Return ``0.5 * ||X - A @ S||^2`` (summed over all tasks) plus
        ``l1_ratio * S.sum()``.

        NOTE(review): the S update in ``fit`` penalises with ``K * l1_ratio``
        whereas this objective counts ``l1_ratio`` once; the two only agree
        when ``l1_ratio == 0`` or ``K == 1`` - confirm which one is intended.
        """
        return 0.5 * ((X - np.matmul(A, S)) ** 2).sum() + self.l1_ratio * S.sum()

    # input: a stack of observed data X_1, ..., X_K
    # output: S, A_1, ..., A_K
    def fit(self, X):
        """Factorise the stack ``X`` with alternating multiplicative updates.

        Args:
            X: array-like of shape (K, N, M), one N x M matrix per task.

        Returns:
            ``(A, S, info)`` where ``A`` has shape (K, N, n_components), ``S``
            has shape (n_components, M) and ``info`` is a dict with

            * ``'loss'``: the loss after the last sweep (``None`` if no sweep
              was run, i.e. ``max_iter <= 0``);
            * ``'iter'``: zero-based index of the last sweep (``-1`` if no
              sweep was run).

        Raises:
            ValueError: if ``X`` is not 3-dimensional.
        """
        X = np.asarray(X)
        if X.ndim != 3:
            raise ValueError('X must have shape (K, N, M), got {}'.format(X.shape))
        K, N, M = X.shape
        # Lower bound for the update denominators. They become exactly 0 when
        # a component of S (or all of X) is zero; the original then computed
        # 0/0 = NaN, which spread to every entry of A and S. Denominators
        # above eps are left untouched, so regular runs are unchanged.
        eps = np.finfo(float).eps
        A = np.random.rand(K, N, self.n_components)
        S = np.random.rand(self.n_components, M)
        prev_loss = np.inf
        cur_loss = None
        i = -1  # stays -1 when max_iter <= 0 (previously a NameError on return)
        for i in range(self.max_iter):
            A_T = A.transpose(0, 2, 1)
            # Shared factor: numerator and denominator are summed over tasks.
            S_denom = np.matmul(np.matmul(A_T, A), S).sum(0) + K * self.l1_ratio
            S = S * (np.matmul(A_T, X).sum(0)) / np.maximum(S_denom, eps)
            # Per-task factors, computed with the freshly updated S.
            A_denom = np.matmul(np.matmul(A, S), S.T)
            A = A * np.matmul(X, S.T) / np.maximum(A_denom, eps)
            cur_loss = self.loss(X, A, S)
            if i % 100 == 0: print('NMF loss:', cur_loss)
            if abs(cur_loss - prev_loss) < self.tol: break
            prev_loss = cur_loss # update loss
        return A, S, {'loss': cur_loss, 'iter': i}
    
def rollout(env, policy=None, horizon=100):
    """Step ``env`` for ``horizon`` steps and return the states seen.

    The state returned by ``env.reset()`` comes first; the state produced by
    the final step is not included. The reward, done flag and info returned
    by ``env.step`` are ignored, so the environment is never reset mid-way.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple) and ``action_space.sample()``.
        policy: optional callable invoked as ``policy([state], info)`` with
            ``info == {'task_id': [0]}``; the action is read from
            ``result['a'][0]`` and converted with ``.cpu().detach().numpy()``.
            When ``None``, actions are drawn from ``env.action_space``.
        horizon: number of steps to take.

    Returns:
        List with ``horizon`` states.
    """
    visited = []
    obs = env.reset()
    task_info = {'task_id': [0]}  # one dict, reused for every policy call
    for _step in range(horizon):
        visited.append(obs)
        if policy is not None:
            action = policy([obs], task_info)['a'][0].cpu().detach().numpy()
        else:
            action = env.action_space.sample()
        obs, _reward, _done, _step_info = env.step(action)
    return visited

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


# MTNMF

In [None]:
# Settings for the MTNMF data-generation cell.
n_abs = 100  # number of NMF components (first argument of MTNMF); also part of the output file name
l1_ratio = 0.0 # L1 weight passed to MTNMF; original note: "this is currently not working... since alpha is not set" (TODO: clarify what "alpha" refers to)
feat_dim = 512  # feature_dim of the TSAMiniConvBody built in get_expert
action_dim = 5  # passed to get_expert
horizon = 100  # steps per rollout
n_trajs = 10  # rollouts per goal: n_trajs // 2 with the expert, the rest with random actions
scale=2  # `scale` argument of the TSAMiniConvBody built in get_expert

def get_expert(weight_path, state_dim, action_dim):
    """Build the expert network and load the matching weights of a checkpoint.

    Args:
        weight_path: checkpoint readable by ``torch.load``; its ``'network'``
            entry must map parameter names to values.
        state_dim: accepted but not used - the network is built with a
            literal 0 in that position.
        action_dim: accepted but not used - the network is built with a
            literal 5.

    Returns:
        The ``CategoricalActorCriticNet`` after ``load_state_dict``. Only
        checkpoint entries whose key already exists in the network's state
        dict are copied; all other parameters keep their initial values.
    """
    # Reads the module-level `feat_dim` and `scale`.
    conv_body = TSAMiniConvBody(7, feature_dim=feat_dim, scale=scale)
    expert = CategoricalActorCriticNet(2, 0, 5, conv_body)

    merged_weights = expert.state_dict()
    # map_location returns the storage unchanged (no device remapping).
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(weight_path, map_location=lambda storage, loc: storage)
    for key, value in checkpoint['network'].items():
        if key in merged_weights:
            merged_weights[key] = value
    expert.load_state_dict(merged_weights)
    return expert

# Collect states from expert and random rollouts, evaluate every expert's
# action distribution on all of them, factorise the stack with MTNMF and save
# the result.
# `set_seed` is not defined in this notebook - presumably provided by the
# `deep_rl.utils` star import; confirm.
set_seed(0)

# goal index (1-based) -> checkpoint of the expert trained for that goal
expert_dict = {
    1: '../log/reacher.1_corner/fc_discrete.baseline/split/0.190315-202731/models/step-704000-mean--6.36',
    2: '../log/reacher.2_corner/fc_discrete.baseline/split/0.190315-203310/models/step-704000-mean--17.16',
    3: '../log/reacher.3_corner/fc_discrete.baseline/split/0.190315-203532/models/step-704000-mean--11.25',
}

# One environment per goal; envs[i] is built with sample_indices=[i].
# NOTE(review): the hard-coded 3 equals len(expert_dict); only three of the
# four listed goal positions get an environment.
envs = [DiscretizeActionEnv(
    MultiGoalReacherEnv(
        [
            [0.15, 0.0],
            [-0.15, 0.0],
            [0.0, 0.15],
            [0.0, -0.15],
        ],
        sample_indices=[i],
        with_goal_pos=True,
    ),
    n_bins=[5, 5],
) for i in range(3)]
# NOTE(review): `decomposer` is never used below; the factorisation is done by
# a second MTNMF instance created at the `fit` call.
decomposer = MTNMF(n_abs, max_iter=5000, tol=0.0001)

states = []
experts = dict()

for goal_idx, weight_path in expert_dict.items():
    # NOTE(review): `state_dim` is not defined anywhere in the visible
    # notebook, so this line raises NameError on a fresh kernel unless a star
    # import provides it. get_expert does not use the value.
    experts[goal_idx] = get_expert(weight_path, state_dim, action_dim)
    # First n_trajs // 2 rollouts follow the expert ...
    for _ in range(n_trajs // 2):
        states.append(rollout(envs[goal_idx-1], experts[goal_idx], horizon=horizon))
    # ... the remaining ones take random actions (policy=None).
    for _ in range(n_trajs - (n_trajs // 2)):
        states.append(rollout(envs[goal_idx-1], None, horizon=horizon))
# Merge the per-trajectory state lists into one array along the first axis.
states = np.concatenate(states)
print('states shape:', states.shape)
    
pvs = []
    
# Evaluate every expert on the full set of collected states.
for goal_idx in expert_dict:
    infos = {'task_id': [goal_idx-1] * len(states)}
    pv = F.softmax(experts[goal_idx].get_logits(states, infos), dim=-1).cpu().detach().numpy()
    print(pv.shape)
    # Flatten everything after the first axis (presumably one row per state - confirm).
    pv = pv.reshape(pv.shape[0], -1)
    pvs.append(pv)

# Stack one matrix per expert; fit receives them with the last two axes swapped.
pvs = np.stack(pvs, 0)
A, S, info = MTNMF(n_abs, max_iter=5000, l1_ratio=l1_ratio).fit(pvs.transpose(0, 2, 1))
print(pvs.shape)

# Saved content: the transposed shared factor, the per-expert probabilities
# reshaped to (..., 2, -1), and one copy of the states and of the task-id
# infos per task. A and info from fit are not saved.
# NOTE(review): `range(3)` again hard-codes the number of tasks.
fsave(
    dict(
        abs=S.T,
        policies=list(pvs.reshape(pvs.shape[0], pvs.shape[1], 2, -1)),
        states=[states for _ in range(len(pvs))],
        infos=list([[{'task_id': i} for _ in range(len(states))] for i in range(3)]),
    ),
    '../data/nmf_sample/reacher/split.mix.{}'.format(n_abs),
    'pkl',
)

# Generate EnvConfig

In [11]:
seed(0)  # reseed `random` and `np.random` with 0 before the env config below is generated

def get_pick_config(goal_config, train_combos=None):
    """Build the environment-config dict for one map.

    ``MAX_OBJ_NUM + 1`` candidate positions are requested from
    ``GoalManager(goal_config.map_name).gen_goals``. The last candidate is
    placed at index 0 and followed by the first ``n_goal - 1`` candidates, so
    index 0 does not depend on ``n_goal``.

    Args:
        goal_config: object with ``map_name``, ``n_goal`` and ``min_dis``
            attributes (e.g. a ``GoalConfig``).
        train_combos: list of combos used for training; defaults to
            ``[(0, 1), ..., (0, n_goal - 1)]``.

    Returns:
        Dict with keys ``map_names``, ``train_combos``, ``test_combos``
        (always ``[(0, 0)]``), ``num_obj_types`` and ``obj_pos`` (a list
        holding the single list of positions).
    """
    MAX_OBJ_NUM = 15
    n_goal = goal_config.n_goal
    manager = GoalManager(goal_config.map_name)
    candidates = manager.gen_goals(MAX_OBJ_NUM + 1, min_dis=goal_config.min_dis)
    # always the same test
    positions = candidates[-1:] + candidates[:n_goal - 1]
    if train_combos is None:
        train_combos = [(0, obj_idx) for obj_idx in range(1, n_goal)]
    return {
        'map_names': [goal_config.map_name],
        'train_combos': train_combos,
        'test_combos': [(0, 0)],
        'num_obj_types': n_goal,
        'obj_pos': [positions],
    }

# Generate and save the "f3" config: map49, 5 object types, trained on the
# three combos (0, 1), (0, 2), (0, 3).
map_name, n_goal = 'map49', 5
train_idx = 4  # only used by the single-combo variant kept in the comments below

env_config = get_pick_config(
    GoalConfig(map_name, n_goal, 4),  # fields: map_name, n_goal, min_dis
    # single-combo variant: train_combos=[(0, train_idx)],
    train_combos=[(0, 1), (0, 2), (0, 3)],
)

print(env_config)

fsave(
    env_config,
    # single-combo variant: '../data/env_configs/pick/nmf/{}.{}-{}'.format(map_name, n_goal, train_idx),
    '../data/env_configs/pick/nmf/{}.{}-f3'.format(map_name, n_goal),
    'pkl',
)


{'map_names': ['map49'], 'train_combos': [(0, 1), (0, 2), (0, 3)], 'test_combos': [(0, 0)], 'num_obj_types': 5, 'obj_pos': [[(13, 6), (4, 13), (12, 2), (7, 4), (10, 14)]]}
