In [1]:
from __future__ import print_function
import numpy as np
import gym
from pickle import load

import torch
from torch.autograd import Variable, Function
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import train_test_split

from minimax_entropy import MinimaxEntropyEstimator

In [2]:
# Entropy estimator built from the coefficient file 'poly_coeff_entro.mat' with gpu=True.
# Its `cross_entro_loss` attribute is later passed to train() as the loss for the
# discretized model.
entro = MinimaxEntropyEstimator('poly_coeff_entro.mat', gpu=True)
# Mini-batch size forwarded to get_data_loaders() and train().
batch_size = 64
# Bin count forwarded to get_data_loaders(); a value > 0 makes it discretize the actions.
bins = 20

In [3]:
class ExpertDataset(Dataset):
    """Index-aligned pairs of observations (``X``) and actions (``Y``).

    Item ``idx`` is assembled as a dict with the keys ``'observations'`` and
    ``'actions'``, optionally passed through ``transform`` (which must return
    a dict with the same two keys), and handed back as an
    ``(observations, actions)`` tuple.
    """

    def __init__(self, X, Y, transform=None):
        self.X, self.Y = X, Y
        self.transform = transform

    def __len__(self):
        # The dataset length is defined by the observations container.
        return len(self.X)

    def __getitem__(self, idx):
        raw = dict(observations=self.X[idx], actions=self.Y[idx])
        # A falsy transform (e.g. None) leaves the sample untouched.
        sample = self.transform(raw) if self.transform else raw
        return sample['observations'], sample['actions']

In [111]:
class Discretizer:
    """One-hot discretizer for the columns of a 2-D array.

    ``fit`` stores, per column, the ``bins + 1`` equal-width bin edges that
    ``np.histogram`` computes for that column. ``transform`` then encodes every
    value as a one-hot group of ``bins`` entries (groups are laid out column
    after column), and ``inverse_transform`` maps each one-hot group back to
    the midpoint of the selected bin.

    Bin ``k`` of a column covers the half-open interval
    ``(edges[k], edges[k + 1]]``; values at or below the first edge fall into
    bin 0 and values at or above the last edge fall into the last bin.
    """

    def __init__(self, bins):
        # Number of bins per column (an int; it is also used as the group width).
        self._bins = bins

    def fit(self, X):
        """Compute and store the bin edges of every column of ``X``.

        Returns ``self`` so that calls can be chained.
        """
        self._edges = [np.histogram(X[:, j], bins=self._bins)[1]
                       for j in range(X.shape[1])]
        return self

    def transform(self, X):
        """One-hot encode ``X`` of shape ``(n, m)`` into shape ``(n, m * bins)``.

        NaN entries match no bin, so their group is left all-zero.
        """
        n, m = X.shape
        Xd = np.zeros((n, m * self._bins))
        rows = np.arange(n)
        for j in range(m):
            col = X[:, j]
            # searchsorted(side='left') yields i with edges[i-1] < c <= edges[i],
            # hence bin i - 1; clipping folds out-of-range values into the
            # first / last bin.
            k = np.searchsorted(self._edges[j], col, side='left') - 1
            k = np.clip(k, 0, self._bins - 1)
            valid = ~np.isnan(col)
            Xd[rows[valid], j * self._bins + k[valid]] = 1
        return Xd

    def inverse_transform(self, X):
        """Map a one-hot array of shape ``(n, m * bins)`` to bin midpoints ``(n, m)``.

        For every group the first entry equal to 1 selects the bin.

        Raises:
            IndexError: if some group contains no entry equal to 1.
        """
        n, T = X.shape
        m = T // self._bins
        hot = X.reshape(n, m, self._bins) == 1
        if not hot.any(axis=2).all():
            raise IndexError('inverse_transform: a one-hot group has no entry equal to 1')
        # argmax on a boolean array returns the position of the first True.
        k = hot.argmax(axis=2)
        Xc = np.zeros((n, m))
        for j in range(m):
            edges = self._edges[j]
            Xc[:, j] = (edges[k[:, j]] + edges[k[:, j] + 1]) / 2.
        return Xc

    def fit_transform(self, X):
        """Fit the bin edges on ``X`` and return its one-hot encoding."""
        return self.fit(X).transform(X)

In [112]:
def get_data_loaders(filename, test_size=0.3, batch_size=64, num_workers=4, pin_memory=True, bins=0):
    """Load pickled expert data and build shuffled train / test data loaders.

    The pickle is expected to hold a mapping with at least the keys
    ``'observations'`` and ``'actions'``; the actions are reshaped to
    ``(n, m)`` where ``m`` is the size of their last axis.

    Args:
        filename: path of the pickle file to read.
        test_size: fraction of the samples assigned to the test loader.
        batch_size: forwarded to both ``DataLoader`` instances.
        num_workers: forwarded to both ``DataLoader`` instances.
        pin_memory: forwarded to both ``DataLoader`` instances.
        bins: when > 0 the actions are one-hot discretized with
            ``Discretizer(bins)``; otherwise they are used as stored.

    Returns:
        ``(train_loader, test_loader, data, d)`` where ``data`` is the
        unpickled object and ``d`` is the fitted ``Discretizer`` (``None``
        when ``bins`` is not positive).
    """
    # Pickle data is binary: open with the portable builtin `open` in 'rb'
    # mode (`file(...)` exists only in Python 2, and text mode corrupts the
    # stream on some platforms).
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    with open(filename, 'rb') as f:
        data = load(f)
    n = len(data['actions'])
    m = data['actions'].shape[-1]
    actions = data['actions'].reshape(n, m)

    if bins > 0:
        d = Discretizer(bins)
        actions = d.fit_transform(actions)
    else:
        d = None

    # Random split by index: the first `split` shuffled indices form the test
    # set, the remainder the training set.
    indices = np.arange(n)
    np.random.shuffle(indices)
    split = int(n*test_size)
    train_idx, test_idx = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    # Both datasets wrap the same arrays; the samplers restrict each loader to
    # its own subset. They are kept as separate objects so that a transform
    # attached to one of them later does not affect the other.
    X, Y = data['observations'], actions
    ds_tr = ExpertDataset(X, Y)
    ds_t = ExpertDataset(X, Y)

    train_loader = DataLoader(ds_tr,
                    batch_size=batch_size, sampler=train_sampler,
                    num_workers=num_workers, pin_memory=pin_memory)
    test_loader = DataLoader(ds_t,
                    batch_size=batch_size, sampler=test_sampler,
                    num_workers=num_workers, pin_memory=pin_memory)

    return train_loader, test_loader, data, d

In [113]:
# Build the Reacher loaders with the notebook-level batch_size and bins; because bins > 0,
# `d` is the fitted Discretizer and the loaders yield one-hot encoded actions.
# `reacher_data` is the raw unpickled expert data.
train_loader, test_loader, reacher_data, d = get_data_loaders('expert_data/Reacher-v1.pkl',
                                                              batch_size=batch_size, bins=bins)

In [7]:
def train(model, opt, L, train_loader, batch_size, epoch, log_interval=100):
    """Train ``model`` on the GPU for ``epoch`` passes over ``train_loader``.

    Args:
        model: network called as ``model(X.float())``; expected to live on the GPU.
        opt: optimizer whose ``zero_grad()`` / ``step()`` drive the updates.
        L: per-sample loss, called as ``L(prediction_i, target_i)`` on
            double-precision values; the batch loss is the sum over samples.
        train_loader: iterable of ``(X, Y)`` batches; it must also support
            ``len()`` and expose ``.dataset`` (both used for progress logging).
        batch_size: not used by this function; kept for interface compatibility.
        epoch: number of passes over ``train_loader``.
        log_interval: print a progress line every ``log_interval`` batches.
    """
    for e in range(epoch):
        for batch_idx, (X, Y) in enumerate(train_loader):
            X, Y = X.cuda().double(), Y.cuda().double()
            X, Y = Variable(X), Variable(Y)
            # Use the optimizer that was passed in rather than a notebook-level
            # global, so the function works with any optimizer/model pair.
            opt.zero_grad()

            # The model runs in single precision; the loss is evaluated in double.
            Y_pred = model(X.float()).double()
            loss = Variable(torch.zeros(1)).double().cuda()
            for i in range(Y_pred.size()[0]):
                loss += L(Y_pred[i], Y[i])

            loss.backward()
            opt.step()

            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    e, batch_idx * len(X), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.data[0]))

In [8]:
class ReacherDisModel(nn.Module):
    """Three-layer MLP mapping 11 inputs to 40 softmax-normalised outputs.

    Layer sizes are 11 -> 100 -> 50 -> 40 with ReLU after the first two layers.
    The 40 outputs presumably correspond to 2 action dimensions x 20 bins
    (verify against the discretizer settings used for training).
    """

    def __init__(self):
        super(ReacherDisModel, self).__init__()
        self.fc1 = nn.Linear(11, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 40)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        # softmax is called without an explicit dimension, exactly as before,
        # so the library picks the axis.
        return F.softmax(logits)

In [10]:
# Discretized-action policy: build the network, move it to the GPU and create an Adam
# optimizer (library default hyper-parameters) over its parameters.
modelD = ReacherDisModel()
modelD.cuda()
optimizer = optim.Adam(modelD.parameters())

# Train for 10 epochs with the entropy estimator's cross-entropy loss, logging every 10 batches.
train(modelD, optimizer, entro.cross_entro_loss, train_loader, batch_size, 10, log_interval=10)



In [15]:
# define model
class ReacherModel(nn.Module):
    """Four-layer MLP mapping 11 inputs to 2 unbounded outputs.

    Layer sizes are 11 -> 100 -> 50 -> 10 -> 2; ReLU follows every layer
    except the last, whose raw linear output is returned.
    """

    def __init__(self):
        super(ReacherModel, self).__init__()
        self.fc1 = nn.Linear(11, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 10)
        self.fc4 = nn.Linear(10, 2)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        # No activation on the output layer.
        return self.fc4(hidden)

In [21]:
# Continuous-action policy: build the network, move it to the GPU and rebind the
# notebook-level `optimizer` to an Adam optimizer over this model's parameters.
model = ReacherModel()
model.cuda()
optimizer = optim.Adam(model.parameters())

# NOTE(review): train() as defined in this notebook requires
# (model, opt, L, train_loader, batch_size, epoch); this two-argument call does not match
# that signature and would raise a TypeError on a fresh run. It is presumably left over
# from an earlier version of train(); the intended loss and data loader need confirming.
train(model, 1)



In [10]:
# Save only the continuous model's parameters (its state_dict), not the module object;
# loading therefore requires constructing a ReacherModel first.
torch.save(model.state_dict(), './models/reacher_cont.pt')

In [16]:
# Load the saved parameters into a fresh ReacherModel, move it to the GPU and switch it
# to evaluation mode.
model2 = ReacherModel()
model2.load_state_dict(torch.load('./models/reacher_cont.pt'))
model2.cuda()
model2.eval()

ReacherModel (
  (fc1): Linear (11 -> 100)
  (fc2): Linear (100 -> 50)
  (fc3): Linear (50 -> 10)
  (fc4): Linear (10 -> 2)
)

In [130]:
def predict(model, X):
    """Run ``model`` on the numpy array ``X`` and return its output as a numpy array.

    ``X`` is wrapped in a tensor, moved to the GPU and cast to single precision
    before the forward pass; the result is brought back to the CPU.
    """
    batch = Variable(torch.from_numpy(X))
    batch = batch.cuda()
    prediction = model(batch.float())
    host_values = prediction.cpu().data
    return host_values.numpy()

def predictD_gen(d):
    """Build a prediction function that decodes discretized model outputs.

    Args:
        d: fitted discretizer providing ``_bins`` (bins per action dimension)
            and ``inverse_transform``.

    Returns:
        ``predictD(model, X)``, which runs ``predict(model, X)``, treats every
        consecutive group of ``d._bins`` outputs as the scores of one action
        dimension, keeps only the highest-scoring bin of each group as a
        one-hot vector and returns ``d.inverse_transform`` of that encoding.
        Any number of input rows is supported, not only a single observation.
    """
    def predictD(model, X):
        scores = predict(model, X)
        n_rows, width = scores.shape
        n_action = width // d._bins

        # Index of the best bin for every (row, action dimension) pair.
        winners = scores.reshape(n_rows, n_action, d._bins).argmax(axis=2)

        one_hot = np.zeros((n_rows, n_action, d._bins))
        row_idx, act_idx = np.indices(winners.shape)
        one_hot[row_idx, act_idx, winners] = 1

        return d.inverse_transform(one_hot.reshape(n_rows, width))
    return predictD

In [131]:
# eval model
def eval_model(model, f, n, env_name='Reacher-v1'):
    """Roll out a policy for ``n`` episodes and report the episode returns.

    Args:
        model: object handed to ``f`` unchanged.
        f: policy function called as ``f(model, obs[None, :])``; its result is
            reshaped to ``(1, -1)`` and passed to ``env.step``.
        n: number of episodes to run.
        env_name: gym environment id to create (defaults to ``'Reacher-v1'``).

    Returns:
        The list of total rewards, one entry per episode. The running list,
        its mean and its standard deviation are also printed after every
        episode.
    """
    env = gym.make(env_name)
    returns = []
    try:
        max_steps = env.spec.timestep_limit
        for i in range(n):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = f(model, (obs[None,:])).reshape(1, -1)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
                # Stop at the environment's step limit even if `done` was not signalled.
                if steps >= max_steps:
                    break
            returns.append(totalr)

            print('returns', returns)
            print('mean return', np.mean(returns))
            print('std of return', np.std(returns))
    finally:
        # Release the environment even when a rollout raises.
        env.close()
    return returns

In [132]:
# Roll out the discretized policy for 50 episodes; predictD_gen(d) decodes the network's
# outputs back to continuous actions through the fitted discretizer `d`.
eval_model(modelD, predictD_gen(d), 50)

iter 0
returns [-12.351645367422396]
mean return -12.3516453674
std of return 0.0
iter 1
returns [-12.351645367422396, -10.651055372593909]
mean return -11.50135037
std of return 0.850294997414
iter 2
returns [-12.351645367422396, -10.651055372593909, -17.388492386169961]
mean return -13.4637310421
std of return 2.86074760494
iter 3
returns [-12.351645367422396, -10.651055372593909, -17.388492386169961, -12.62903867883858]
mean return -13.2550579513
std of return 2.50370545809
iter 4
returns [-12.351645367422396, -10.651055372593909, -17.388492386169961, -12.62903867883858, -6.3921229981177481]
mean return -11.8824709606
std of return 3.5427126618
iter 5
returns [-12.351645367422396, -10.651055372593909, -17.388492386169961, -12.62903867883858, -6.3921229981177481, -17.355901571048051]
mean return -12.7947093957
std of return 3.82359852087
iter 6
returns [-12.351645367422396, -10.651055372593909, -17.388492386169961, -12.62903867883858, -6.3921229981177481, -17.355901571048051, -7.6527

In [16]:
# Value stored under 'mean_return' in the loaded data file, presumably the expert
# policy's mean return, shown for comparison with the rollouts above (verify).
reacher_data['mean_return']

-4.0726575280658706