## Using a neural network, create a predictor that, given an observation and an action, outputs the collected reward.

In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from copy import deepcopy

import numpy as np
from tqdm import trange
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay
import pickle

In [2]:
# True when a CUDA device is usable; the helpers below use it to decide
# whether tensors are moved to the GPU.
CUDA = torch.cuda.is_available()


def to_np(x):
    """Return tensor `x` as a NumPy array, detached from autograd and on the CPU."""
    detached = x.detach()
    return detached.cpu().numpy()


def to_tensor(x, requires_grad=False):
    """Turn NumPy array `x` into an independent torch tensor.

    The result is a detached, contiguous copy (moved to the GPU when `CUDA`
    is set), so writing to it never modifies `x`. With `requires_grad=True`
    the returned tensor tracks gradients.
    """
    tensor = torch.from_numpy(x)
    if CUDA:
        tensor = tensor.cuda()

    result = tensor.clone().contiguous().detach()
    if requires_grad:
        result = result.requires_grad_(True)
    return result

    
class Network(nn.Module):
    """Fully connected reward predictor.

    Maps a flattened 119-feature input (elsewhere in this notebook: an
    observation concatenated with an action) to a single scalar through
    three 500-unit tanh hidden layers. `self.loss` holds the MSE criterion
    used for training and evaluation.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(119, 500),
            nn.Tanh(),
            nn.Linear(500, 500),
            nn.Tanh(),
            nn.Linear(500, 500),
            nn.Tanh(),
            nn.Linear(500, 1)
        )
        self.loss = nn.MSELoss()

    def forward(self, X):
        """Return predictions of shape (batch, 1) for input batch `X`.

        Everything after the first (batch) dimension is flattened before
        being fed to the layers.
        """
        X = X.view(X.size(0), -1)
        # Call the module itself rather than `.forward` so that any hooks
        # registered on `self.layers` are executed.
        return self.layers(X)

In [None]:
def train(net=None, epochs=1000, batch_size=200):
    """Train a reward predictor on the pickled batches and return it.

    Args:
        net: network to train; a fresh `Network()` is created when omitted.
            (A `Network()` default in the signature would be built once at
            definition time and silently shared by every call.)
        epochs: number of passes over all ten batch files.
        batch_size: number of samples per SGD step.

    Returns:
        The trained network (on the GPU when `CUDA` is set).

    Each file `b0.pkl` .. `b9.pkl` is expected to hold a dict with tensors
    under 'X' (inputs) and 'Y' (targets). Files are re-read every epoch
    rather than cached in memory.
    """
    if net is None:
        net = Network()
    if CUDA:
        net = net.cuda()

    optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

    for _ in trange(epochs, position=0, leave=True):
        for bch in range(10):
            # Close the file as soon as the batch is loaded.
            with open(f'drive/My Drive/project_evo/data/b{bch}.pkl', 'rb') as f:
                d = pickle.load(f)
            X, Y = d['X'], d['Y']
            if CUDA:
                X, Y = X.cuda(), Y.cuda()

            for i in range(0, len(X), batch_size):
                optimizer.zero_grad()
                preds = net(X[i : i + batch_size])
                # MSELoss expects (input, target).
                loss = net.loss(preds, Y[i : i + batch_size])
                loss.backward()
                optimizer.step()
    return net

In [None]:
# Fit the reward predictor, letting train() use its default network argument.
net = train()

100%|██████████| 1000/1000 [4:32:59<00:00, 16.38s/it]


In [None]:
# Mean MSE of `net` over the first 500 samples of each batch file. These are
# the same files train() reads, so this measures fit on the training data.
L = []
with torch.no_grad():
    for bch in range(10):
        # Close the file as soon as the batch is loaded.
        with open(f'drive/My Drive/project_evo/data/b{bch}.pkl', 'rb') as f:
            d = pickle.load(f)
        X, Y = d['X'][:500], d['Y'][:500]
        if CUDA:
            X, Y = X.cuda(), Y.cuda()

        preds = net(X)
        # MSELoss expects (input, target).
        loss = net.loss(preds, Y)
        L.append(loss.item())
print(np.mean(L))

0.00546825596319721


In [None]:
# Save the trained network on whatever device it currently lives on.
# NOTE: pickling the whole module ties these files to the class definitions
# in this notebook.
with open('drive/My Drive/project_evo/reward_pred/net.pkl', 'w+b') as f:
    pickle.dump(net, f, pickle.HIGHEST_PROTOCOL)

# nn.Module.to() moves the module in place, so export a deep copy: saving the
# CPU variant must not relocate the live `net` as a side effect.
with open('drive/My Drive/project_evo/reward_pred/net_cpu.pkl', 'w+b') as f:
    pickle.dump(deepcopy(net).to(torch.device('cpu')), f, pickle.HIGHEST_PROTOCOL)



In [3]:
# Restore the trained reward predictor. pickle executes arbitrary code on
# load, so only open files produced by this notebook.
with open('drive/My Drive/project_evo/reward_pred/net.pkl', 'rb') as f:
    net = pickle.load(f)

## Testing predictor

### Random agent

In [10]:
!pip install pybullet==2.5.5

%cd /usr/local/lib/python3.6/dist-packages
!git clone https://github.com/benelot/pybullet-gym.git
%cd pybullet-gym
!pip install -e .

Collecting pybullet==2.5.5
[?25l  Downloading https://files.pythonhosted.org/packages/d4/6c/6b14ae6d1d8f10f16ea82c2c194394564b02c80b88b6e391470046968c7b/pybullet-2.5.5.tar.gz (60.4MB)
[K     |████████████████████████████████| 60.4MB 48kB/s 
[?25hBuilding wheels for collected packages: pybullet
  Building wheel for pybullet (setup.py) ... [?25l[?25hdone
  Created wheel for pybullet: filename=pybullet-2.5.5-cp36-cp36m-linux_x86_64.whl size=71822554 sha256=e2fe7d2a614d370bfbffbba3eb76fe9a0b8c3ca415d14154bc0a34287eae5ab3
  Stored in directory: /root/.cache/pip/wheels/1d/e4/cc/7b50d6689e1bc6ba07d2df04946a0eabc89deca7caed5f52d1
Successfully built pybullet
Installing collected packages: pybullet
Successfully installed pybullet-2.5.5
/usr/local/lib/python3.6/dist-packages
Cloning into 'pybullet-gym'...
remote: Enumerating objects: 735, done.[K
remote: Total 735 (delta 0), reused 0 (delta 0), pack-reused 735[K
Receiving objects: 100% (735/735), 19.29 MiB | 16.09 MiB/s, done.
Resolving de

In [4]:
import gym
import pybulletgym
from gym import logger as gymlogger
gymlogger.set_level(40)  # 40 is gym's ERROR level: log errors only

In [15]:
# Roll out a random policy for up to 5000 steps and record, for every step,
# the reward predicted by `net` (before stepping) and the reward the
# environment actually returned.
env = gym.make("AntMuJoCoEnv-v0")
env._max_episode_steps = 5000
observation = env.reset()
observation = observation.astype(np.float32)

preds, reals = [], []

try:
    for _ in trange(5000, position=0, leave=True):
        action = env.action_space.sample()
        # The predictor takes one row holding the observation followed by the action.
        obs_action = to_tensor(np.append(observation, action).reshape(1, -1).astype(np.float32))

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            pred = net(obs_action).item()
        preds.append(pred)

        observation, reward, done, info = env.step(action)
        observation = observation.astype(np.float32)
        reals.append(reward)

        if done:
            break
finally:
    # Release the simulator even if a step raises.
    env.close()

preds, reals = np.array(preds), np.array(reals)

  1%|          | 40/5000 [00:00<00:12, 399.93it/s]

WalkerBase::__init__
options= 


 99%|█████████▉| 4961/5000 [00:12<00:00, 382.03it/s]

In [16]:
# Mean absolute error between predicted and observed per-step rewards.
np.mean(np.abs(preds - reals))

0.07642148231714527

In [17]:
# (min, mean, max) of the predicted rewards.
np.min(preds), np.mean(preds), np.max(preds)

(0.22161123156547546, 0.5788271162390709, 0.9424511194229126)

In [18]:
# (min, mean, max) of the rewards returned by the environment.
np.min(reals), np.mean(reals), np.max(reals)

(0.20103016860230127, 0.6085227696980585, 1.0996811828343198)

### Trained model

In [9]:
class AgentNetwork(nn.Module):
    """Small tanh policy network: 111 inputs -> 100 hidden units -> 8 outputs.

    The output passes through a final tanh, so every component lies in
    [-1, 1]. Weights can be read and written as a single flat NumPy vector
    via `get_params` / `set_params`.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(111, 100),
            nn.Tanh(),
            nn.Linear(100, 8),
            nn.Tanh()
        )

    def forward(self, X):
        """Return outputs of shape (batch, 8) for input batch `X`.

        Everything after the first (batch) dimension is flattened first.
        """
        X = X.view(X.size(0), -1)
        # Call the module itself rather than `.forward` so that any hooks
        # registered on `self.layers` are executed.
        return self.layers(X)

    def set_params(self, params):
        """Load all weights from the flat vector `params`.

        Consecutive slices of `params` are consumed in the order of
        `self.parameters()` and reshaped to each parameter's shape.
        Elements beyond the total parameter count are ignored.
        """
        offset = 0
        with torch.no_grad():
            for param in self.parameters():
                # numel() replaces np.product, which NumPy 2.0 removed.
                count = param.numel()
                chunk = torch.as_tensor(params[offset:offset + count])
                # copy_ converts dtype and moves across devices as needed, so
                # one code path serves both CPU and GPU modules.
                param.copy_(chunk.reshape(param.size()))
                offset += count

    def get_params(self):
        """Return every weight concatenated into one new flat NumPy array.

        The order matches `self.parameters()`, so the result can be passed
        straight back to `set_params`. The array owns its memory; editing it
        does not change the network.
        """
        return np.hstack([p.detach().cpu().numpy().ravel()
                          for p in self.parameters()])

In [10]:
# Load the 'best' flat parameter vector stored in the history file and
# install it in a fresh policy network.
with open('drive/My Drive/project_evo/history/1000.pkl', 'rb') as f:
    ind = pickle.load(f)['best']

actor = AgentNetwork()
# Honour the CUDA flag instead of calling .cuda() unconditionally, which
# raises on machines without a GPU.
if CUDA:
    actor = actor.cuda()
actor.set_params(ind)

In [11]:
# Roll out the loaded policy `actor` for up to 5000 steps and record, for
# every step, the reward predicted by `net` (before stepping) and the reward
# the environment actually returned.
env = gym.make("AntMuJoCoEnv-v0")
env._max_episode_steps = 5000
observation = env.reset()
observation = to_tensor(observation.reshape(1, -1).astype(np.float32))

preds, reals = [], []

try:
    for _ in trange(5000, position=0, leave=True):
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            action = actor(observation).view(1, -1)
            # The predictor takes one row holding the observation followed by the action.
            obs_action = torch.cat((observation, action), 1)
            pred = net(obs_action).item()
        preds.append(pred)

        # to_np already detaches and moves the tensor to the CPU.
        observation, reward, done, info = env.step(to_np(action.view(-1)))
        observation = to_tensor(observation.reshape(1, -1).astype(np.float32))

        reals.append(reward)
        if done:
            break
finally:
    # Release the simulator even if a step raises.
    env.close()

preds, reals = np.array(preds), np.array(reals)

  1%|          | 37/5000 [00:00<00:13, 367.30it/s]

WalkerBase::__init__
options= 


100%|█████████▉| 4991/5000 [00:13<00:00, 366.13it/s]

In [12]:
# Mean absolute error between predicted and observed per-step rewards.
np.mean(np.abs(preds - reals))

0.08532460161497292

In [13]:
# (min, mean, max) of the predicted rewards.
np.min(preds), np.mean(preds), np.max(preds)

(-1.2780717611312866, 0.9191121489018201, 3.3176541328430176)

In [14]:
# (min, mean, max) of the rewards returned by the environment.
np.min(reals), np.mean(reals), np.max(reals)

(-1.2710890481976094, 0.916044231693701, 3.2882286245570866)