In [1]:
from __future__ import print_function
import numpy as np
import gym
from pickle import load, dump

import torch
from torch.autograd import Variable, Function
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import train_test_split

from minimax_entropy import MinimaxEntropyEstimator

In [2]:
class ExpertDataset(Dataset):

    def __init__(self, X, Y, transform=None):
        self.transform = transform
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = {'observations': self.X[idx], 'actions': self.Y[idx]}

        if self.transform:
            sample = self.transform(sample)

        return sample['observations'], sample['actions']

In [3]:
class Discretizer:
    
    def __init__(self, bins):
        self._bins = bins
        
    def fit(self, X):
        m = 2
        self._edges = []
        for j in range(m):
            _, edges = np.histogram(X[:,j], bins=self._bins)
            self._edges.append(edges)
    
    def transform(self, X):
        n = X.shape[0]
        m = 2
        Xd = np.zeros((n, self._bins, self._bins))
        for i in range(n):
            coord = []
            for j in range(m):
                c = X[i][j]
                if c <= self._edges[j][0]:
                    coord.append(0)
                elif c >= self._edges[j][-1]:
                    coord.append(-1)
                else:
                    for k in range(self._bins):
                        if self._edges[j][k] < c <= self._edges[j][k+1]:
                            coord.append(k)
                            break
            Xd[i][coord[0], coord[1]] = 1
        return Xd.reshape(n, self._bins ** m)
    
    def inverse_transform(self, X):
        n = X.shape[0]
        m = 2
        Xc = np.zeros((n, m))
        X = X.reshape(n, self._bins, self._bins)
        
        for i in range(n):
            j, k = np.argwhere(X[i] == 1)[0]
            Xc[i] = np.array([self._edges[0][j] + self._edges[0][j + 1], self._edges[1][k] + self._edges[1][k + 1]])/2
        return Xc

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

In [4]:
def get_data_loaders(filename, test_size=0.3, batch_size=64, num_workers=4, pin_memory=True, bins=0):
    with file(filename, 'r') as f:
        data = load(f)
    n = len(data['actions'])
    m = data['actions'].shape[-1]
    actions = data['actions'].reshape(n, m)
    
    if bins > 0:
        d = Discretizer(bins)
        actions = d.fit_transform(actions)
    else:
        d = None
    
    indices = np.arange(n)
    np.random.shuffle(indices)
    split = int(n*test_size)
    train_idx, test_idx = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)
    
    X, Y = data['observations'], actions
    ds_tr = ExpertDataset(X, Y)
    ds_t = ExpertDataset(X, Y)
    
    train_loader = DataLoader(ds_tr, 
                    batch_size=batch_size, sampler=train_sampler, 
                    num_workers=num_workers, pin_memory=pin_memory)
    test_loader = DataLoader(ds_t,
                    batch_size=len(test_idx), sampler=test_sampler, 
                    num_workers=num_workers, pin_memory=pin_memory)
    
    return train_loader, test_loader, data, d

In [5]:
def predict(model, X):
    X = Variable(torch.from_numpy(X)).cuda()
    Y = model(X.float())
    return Y.cpu().data.numpy()

def predictD_gen(d):
    def predictD(model, X):
        Yd = predict(model, X)
        Y = np.zeros((1, Yd.shape[1]))
        Y[0, Yd.argmax()] = 1            
        return d.inverse_transform(Y) 
    return predictD

# eval model
def eval_model(model, f, n):
    env = gym.make('Reacher-v1')
    returns = []
    max_steps = env.spec.timestep_limit
    for i in range(n):
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            action = f(model, (obs[None,:])).reshape(1, -1)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if steps >= max_steps:
                break
        returns.append(totalr)

    return np.mean(returns), np.std(returns)

In [6]:
def metric_loss_gen(L, convert_onehot=False):
    def metric_loss(model, X, Y):
        output = model(X.float())
        
        batch_size, n_classes = output.size()
        if convert_onehot:
            target = Variable(torch.DoubleTensor(batch_size, n_classes)).cuda()
            for i in range(batch_size):
                target[i, Y.data[i]] = 1.
            pred = output
        else:
            target = Y
            pred = output
            
        pred = pred.double()
        losses = [L(pred[i], target[i]).data.cpu().numpy() for i in range(batch_size)]
        return np.mean(losses)
        
    return metric_loss     
            
def train(model, opt, L, train_loader, test_loader, n_classes, epoch, f_eval,
          n_samples=1, convert_onehot=False, log_interval=100, n_evals=50):
    model.train()
    
    evals = {'tr_loss':[], 't_loss':[], 'mean_r': [], 'ts': [], 'es': []}
    for e in range(epoch):
        for t, (X, Y) in enumerate(train_loader):
            X, Y = X.cuda(), Y.cuda()
            X, Y = Variable(X), Variable(Y)
            opt.zero_grad()
            
            batch_size = Y.size()[0]

            if convert_onehot:
                target_onehot = Variable(torch.DoubleTensor(batch_size, n_classes)).cuda()
                target_onehot.data.zero_()
                for i in range(batch_size):
                    target_onehot[i, Y.data[i]] = 1.
                target = target_onehot
            else:
                target = Y

            mean_output = Variable(torch.DoubleTensor(batch_size, n_classes)).cuda()
            outputs = [model(X.float()).double() for _ in range(n_samples)]
            for i in range(batch_size):
                for output in outputs:
                    mean_output[i] = mean_output[i] + output[i]
            mean_output /= n_samples

            loss = Variable(torch.zeros(1)).double().cuda()
            for i in range(batch_size):
                loss += L(output[i], target[i])
            loss.backward()

            opt.step()
            if t % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]'.format(e, t * len(X), len(train_loader.dataset),
                                                                    100. * t / len(train_loader)))
                
                model.eval()
                tr_loss = loss.data[0] / batch_size                    
                mean_r, std_r = eval_model(model, f_eval, n_evals)
                
                print('Tr Loss: {:.6f} | MeanR: {:.2f} | VarR: {:.2f}'.format(tr_loss, mean_r, std_r))
                evals['tr_loss'].append(tr_loss)
                evals['mean_r'].append(mean_r)
                
                evals['ts'].append(t)
                evals['es'].append(e)
                model.train()
            
    return evals

In [7]:
class ReacherDisModel(nn.Module):
    def __init__(self, dropout):
        super(ReacherDisModel, self).__init__()
        self.fc1 = nn.Linear(11, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 225)
        
        self._dropout = dropout
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.dropout(F.relu(self.fc2(x)), p=self._dropout, training=self.training)
        x = F.dropout(F.relu(self.fc3(x)), p=self._dropout, training=self.training)
        return x

In [8]:
batch_size = 64
bins = 15

train_loader, test_loader, reacher_data, d = get_data_loaders('expert_data/Reacher-v1.pkl',
                                                              batch_size=batch_size, bins=bins)

In [9]:
epochs = 4
log_interval = 10
dropout = 0.1
n_samples = 50
n_classes = bins**2

entro = MinimaxEntropyEstimator('poly_coeff_entro.mat', n_samples, gpu=True)

In [None]:
modelD_mm = ReacherDisModel(dropout)
modelD_mm.cuda()
opt = optim.Adam(modelD_mm.parameters())

mm_evals = train(modelD_mm, opt, entro.minimax_cross_entro_loss, train_loader, test_loader, n_classes, epochs, predictD_gen(d), 
      n_samples=n_samples, log_interval=log_interval)

Tr Loss: 0.291897 | MeanR: -18.16 | VarR: 3.70

In [34]:
modelD = ReacherDisModel(dropout)
modelD.cuda()
opt = optim.Adam(modelD.parameters())

mle_evals = train(modelD, opt, entro.cross_entro_loss, train_loader, test_loader, n_classes, epochs, predictD_gen(d), 
      n_samples=n_samples, log_interval=log_interval)

Tr Loss: 11.569041 | MeanR: -19.65 | VarR: 2.12
Tr Loss: 3.576886 | MeanR: -12.87 | VarR: 2.43
Tr Loss: 1.890307 | MeanR: -11.86 | VarR: 4.75
Tr Loss: 1.368103 | MeanR: -12.65 | VarR: 4.80
Tr Loss: 1.343412 | MeanR: -10.89 | VarR: 4.23
Tr Loss: 1.094646 | MeanR: -11.31 | VarR: 4.81
Tr Loss: 1.281374 | MeanR: -12.36 | VarR: 4.40
Tr Loss: 1.110748 | MeanR: -12.38 | VarR: 5.54
Tr Loss: 1.193671 | MeanR: -12.12 | VarR: 4.07
Tr Loss: 0.978873 | MeanR: -12.45 | VarR: 4.29
Tr Loss: 1.126318 | MeanR: -12.71 | VarR: 3.92
Tr Loss: 1.041594 | MeanR: -10.96 | VarR: 5.04


In [37]:
with open('results/reacher_mle_evals.pkl', 'w') as f:
    dump(mle_evals, f)

In [None]:
# define model
class ReacherModel(nn.Module):
    def __init__(self):
        super(ReacherModel, self).__init__()
        self.fc1 = nn.Linear(11, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 10)
        self.fc4 = nn.Linear(10, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [None]:
model = ReacherModel()
model.cuda()
optimizer = optim.Adam(model.parameters())

train(model, 1)

In [10]:
# save model
torch.save(model.state_dict(), './models/reacher_cont.pt')

In [16]:
# load model
model2 = ReacherModel()
model2.load_state_dict(torch.load('./models/reacher_cont.pt'))
model2.cuda()
model2.eval()

ReacherModel (
  (fc1): Linear (11 -> 100)
  (fc2): Linear (100 -> 50)
  (fc3): Linear (50 -> 10)
  (fc4): Linear (10 -> 2)
)

In [144]:
eval_model(modelD, predictD_gen(d), 50)

iter 0
(1, 40)


ValueError: cannot reshape array of size 40 into shape (1,1,30)

In [16]:
reacher_data['mean_return']

-4.0726575280658706