In [32]:
from __future__ import print_function
import numpy as np
import gym
from pickle import load

import torch
from torch.autograd import Variable, Function
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import train_test_split

In [40]:
class ExpertDataset(Dataset):

    def __init__(self, X, Y, transform=None):
        self.transform = transform
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = {'observations': self.X[idx], 'actions': self.Y[idx]}

        if self.transform:
            sample = self.transform(sample)

        return sample['observations'], sample['actions']

In [41]:
def get_data_loaders(filename, test_size=0.3, batch_size=64, num_workers=4, pin_memory=True):
    with file(filename, 'r') as f:
        data = load(f)
    n = len(data['actions'])
    
    indices = np.arange(n)
    np.random.shuffle(indices)
    split = int(n*test_size)
    train_idx, test_idx = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)
    
    X, Y = data['observations'], data['actions']
    ds_tr = ExpertDataset(X, Y)
    ds_t = ExpertDataset(X, Y)
    
    train_loader = DataLoader(ds_tr, 
                    batch_size=batch_size, sampler=train_sampler, 
                    num_workers=num_workers, pin_memory=pin_memory)
    test_loader = DataLoader(ds_t,
                    batch_size=batch_size, sampler=test_sampler, 
                    num_workers=num_workers, pin_memory=pin_memory)
    
    return train_loader, test_loader

In [42]:
train_loader, test_loader = get_data_loaders('expert_data/Hopper-v1.pkl')

In [43]:
# define model
class HopperModel(nn.Module):
    def __init__(self):
        super(HopperModel, self).__init__()
        self.fc1 = nn.Linear(11, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 10)
        self.fc4 = nn.Linear(10, 3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [93]:
def train(model, epoch, log_interval=100):
    model.train()
    for e in range(epoch):
        for batch_idx, (X, Y) in enumerate(train_loader):
            X, Y = X.cuda(), Y.cuda()
            X, Y = Variable(X), Variable(Y)
            optimizer.zero_grad()

            Y_pred = model(X.float())
            loss = F.mse_loss(Y_pred, Y)

            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    e, batch_idx * len(X), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.data[0]))

model = HopperModel()
model.cuda()
optimizer = optim.Adam(model.parameters())

train(model, 50)



In [59]:
# save model
torch.save(model.state_dict(), './models/hopper_cont.pt')

In [65]:
# load model
model2 = HopperModel()
model2.load_state_dict(torch.load('./models/hopper_cont.pt'))
model2.cuda()
model2.eval()

HopperModel (
  (fc1): Linear (11 -> 100)
  (fc2): Linear (100 -> 50)
  (fc3): Linear (50 -> 10)
  (fc4): Linear (10 -> 3)
)

In [84]:
def predict(model, X):
    X = Variable(torch.from_numpy(X)).cuda()
    Y = model(X.float())
    return Y.cpu().data.numpy()

In [94]:
# eval model
env = gym.make('Hopper-v1')
returns = []
max_steps = env.spec.timestep_limit
for i in range(50):
    print('iter', i)
    obs = env.reset()
    done = False
    totalr = 0.
    steps = 0
    while not done:
        action = predict(model, (obs[None,:])).reshape(1, -1)
        obs, r, done, _ = env.step(action)
        totalr += r
        steps += 1
        if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
        if steps >= max_steps:
            break
    returns.append(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))

iter 0
100/1000
200/1000
300/1000
400/1000
returns [1229.0965094559349]
mean return 1229.09650946
std of return 0.0
iter 1
100/1000
200/1000
300/1000
400/1000
returns [1229.0965094559349, 1240.9278306312158]
mean return 1235.01217004
std of return 5.91566058764
iter 2
100/1000
200/1000
300/1000
400/1000
returns [1229.0965094559349, 1240.9278306312158, 1170.3661409859071]
mean return 1213.46349369
std of return 30.8548364424
iter 3
100/1000
200/1000
300/1000
400/1000
500/1000
returns [1229.0965094559349, 1240.9278306312158, 1170.3661409859071, 1938.0577446554753]
mean return 1394.61205643
std of return 314.894301391
iter 4
100/1000
200/1000
300/1000
400/1000
returns [1229.0965094559349, 1240.9278306312158, 1170.3661409859071, 1938.0577446554753, 1638.4615671464708]
mean return 1443.38195858
std of return 298.06165513
iter 5
100/1000
200/1000
returns [1229.0965094559349, 1240.9278306312158, 1170.3661409859071, 1938.0577446554753, 1638.4615671464708, 582.41585398660311]
mean return 1299.8

In [91]:
hopper_data['mean_return']

3778.5251823066046