In [1]:
import gym
import d4rl # Import required to register environments

# Create the environment
env = gym.make('halfcheetah-medium-expert-v2')

# d4rl abides by the OpenAI gym interface
env.reset()
env.step(env.action_space.sample())

# Each task is associated with a dataset
# dataset contains observations, actions, rewards, terminals, and infos
dataset = env.get_dataset()
print(dataset['observations']) # An N x dim_observation Numpy array of observations

# Alternatively, use d4rl.qlearning_dataset which
# also adds next_observations.
dataset = d4rl.qlearning_dataset(env)

No module named 'flow'
/home/lms/anaconda3/envs/mujoco/lib/python3.7/site-packages/glfw/__init__.py:834: GLFWError: (65544) b'X11: The DISPLAY environment variable is missing'
No module named 'carla'


Downloading dataset: http://rail.eecs.berkeley.edu/datasets/offline_rl/gym_mujoco_v2/halfcheetah_medium_expert-v2.hdf5 to /home/lms/.d4rl/datasets/halfcheetah_medium_expert-v2.hdf5
[[ 1.9831914e-02 -8.9501314e-02 -3.1969063e-03 ...  1.1365079e-01
   6.8424918e-02 -1.3811582e-01]
 [-3.8486063e-03 -5.2394319e-02  8.3050327e-03 ...  4.5068407e+00
  -9.2885571e+00  4.7328596e+00]
 [-5.5298433e-02 -7.7850236e-05 -2.3952831e-01 ... -7.0811687e+00
  -1.4037068e+00  7.5524049e+00]
 ...
 [-3.8276739e-02 -5.9685200e-03 -5.3859454e-01 ...  9.6563587e+00
  -9.2510633e+00 -2.3956337e+00]
 [-3.5350587e-02 -1.3052115e-01 -1.6677204e-01 ... -3.3741906e+00
  -4.8845510e+00 -5.1081996e+00]
 [-9.0780985e-03 -1.5547317e-01  6.0090959e-01 ... -2.2751564e+01
  -3.7737691e+00 -3.9162564e+00]]


In [4]:
dataset.keys()

dict_keys(['observations', 'actions', 'next_observations', 'rewards', 'terminals'])

In [6]:
dataset['observations'].shape

(1998000, 17)

In [7]:
import torch.nn as nn

In [77]:
class BigEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BigEncoder, self).__init__()
         
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )
    
    def forward(self, x):
        return self.encoder(x)

In [78]:
class StochasticTransitionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, minimum_std = 0.001):
        super(StochasticTransitionModel, self).__init__()

        self.make_prob_parameters = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2 * output_size)
        )

        self.minimum_std = minimum_std
        self.output_size = output_size

    def forward(self, x):
        flatted_mu_std = self.make_prob_parameters(x)
        reshaped_mu_std = flatted_mu_std.reshape(-1, self.output_size, 2)
        mu = reshaped_mu_std[:, :, 0]
        std = reshaped_mu_std[:, :, 1]

        std += self.minimum_std
        epsilon = torch.randn((x.shape[0], self.output_size))
        next_state_prediction = epsilon * std + mu
        return next_state_prediction

In [108]:
import random
import torch
import os

class EntireEnsembleModel(nn.Module):
    def __init__(self, input_size, encoder_hidden_size, encoder_output_size, transition_model_hidden_size, transition_model_output_size, ensemble_size, learning_rate):
        super(EntireEnsembleModel, self).__init__()
        
        self.ensemble_size = ensemble_size
        self.big_encoder = BigEncoder(input_size, encoder_hidden_size, encoder_output_size)
        self.stochastic_models = list()
        for _ in range(ensemble_size):
            self.stochastic_models.append(StochasticTransitionModel(encoder_hidden_size, transition_model_hidden_size, transition_model_output_size))
        
        self.all_parameters = list(self.big_encoder.parameters())
        for idx in range(ensemble_size):
            self.all_parameters += list(self.stochastic_models[idx].parameters())
        
        self.optimizer = torch.optim.Adam(self.all_parameters, lr=learning_rate)
        self.ensemble_size = ensemble_size

    def forward(self, state, action):
        state_action = torch.cat((state, action), dim=1)
        latent = self.big_encoder(state_action)
        selected_model_idx = random.choice(range(len(self.stochastic_models)))
        selected_model =self.stochastic_models[selected_model_idx]
        next_state_prediction = selected_model(latent)
        return next_state_prediction

    def save_model(self, path):
        torch.save(self.big_encoder.state_dict(), os.path.join("model_pt", path + "_big_encoder.pt"))
        for idx in range(self.ensemble_size):
            torch.save(self.stochastic_models[idx].state_dict(), os.path.join("model_pt", path + "_ensemble_{}.pt".format(idx)))

    def load_model(self, path):
        self.big_encoder.load_state_dict(torch.load(os.path.join("model_pt", path + "_big_encoder.pt")))
        for idx in range(self.ensemble_size):        
            self.stochastic_models[idx].load_state_dict(torch.load(os.path.join("model_pt", path + "_ensemble_{}.pt".format(idx))))

In [104]:
path = "0804"
torch.save(model.big_encoder.state_dict(), os.path.join("model_pt", path + "_big_encoder.pt"))
for idx in range(model.ensemble_size):
    torch.save(model.stochastic_models[idx].state_dict(),os.path.join("model_pt", path + "_ensemble_{}.pt".format(idx)))

In [73]:
from torch.utils.data import Dataset, DataLoader

class D4rlDataset(nn.Module):
    def __init__(self, d4rl_dataset):
        super(D4rlDataset, self).__init__()
        self.d4rl_dataset = d4rl_dataset
        self.state_array = d4rl_dataset['observations']
        self.next_state_array = d4rl_dataset['next_observations']
        self.action_array = d4rl_dataset['actions']
        self.reward_array = d4rl_dataset['rewards']
        self.terminal_array = d4rl_dataset['terminals']        

    def __getitem__(self, idx):
        return {'state': self.state_array[idx], 'action': self.action_array[idx], 'next_state': self.next_state_array[idx]}

    def __len__(self):
        return self.state_array.shape[0]


In [111]:
model2.load_model(path)

In [88]:
from sklearn.model_selection import train_test_split
import numpy as np

whole_ind = np.arange(dataset['observations'].shape[0])
train_ind, val_ind = train_test_split(whole_ind, test_size=0.1, random_state = 42)
train_ind, test_ind = train_test_split(train_ind, test_size=0.1, random_state = 42)

In [89]:
train_dataset = {}
val_dataset = {}
test_dataset = {}

for key in dataset.keys():
    train_dataset[key] = dataset[key][train_ind]
    val_dataset[key] = dataset[key][val_ind]
    test_dataset[key] = dataset[key][test_ind]

In [92]:
d4rl_train_dataset = D4rlDataset(dataset)
d4rl_train_dataloader = DataLoader(d4rl_train_dataset, batch_size=256, shuffle=True, drop_last=False)

d4rl_val_dataset = D4rlDataset(dataset)
d4rl_val_dataloader = DataLoader(d4rl_val_dataset, batch_size=256, shuffle=True, drop_last=False)

d4rl_test_dataset = D4rlDataset(dataset)
d4rl_test_dataloader = DataLoader(d4rl_test_dataset, batch_size=256, shuffle=True, drop_last=False)

In [96]:
model = EntireEnsembleModel(input_size=17+6, encoder_hidden_size=64, encoder_output_size=64, transition_model_hidden_size=64, transition_model_output_size=17, ensemble_size=5, learning_rate=0.0005)

In [95]:
model1 = model

In [97]:
loss_fn = nn.MSELoss()
epochs = 100

for epoch in range(epochs):
    loss_sum = 0
    for row in d4rl_train_dataloader:
        state = row['state']
        action = row['action']
        next_state = row['next_state']

        next_state_prediction = model(state, action)
        loss = loss_fn(next_state, next_state_prediction)

        loss.backward()
        model.optimizer.step()
        model.optimizer.zero_grad()

        loss_sum += loss.item()

    print('Train Mean Loss at {} epoch: {}'.format(loss_sum/len(d4rl_train_dataloader), epoch))

    loss_sum = 0
    with torch.no_grad():
        for row in d4rl_val_dataloader:
            state = row['state']
            action = row['action']
            next_state = row['next_state']

            next_state_prediction = model(state, action)
            loss = loss_fn(next_state, next_state_prediction)

            loss_sum += loss.item()

    print('Validation Mean Loss at {} epoch: {}'.format(loss_sum/len(d4rl_train_dataloader), epoch))

Train Mean Loss at 3.160057742247438 epoch: 0
Validation Mean Loss at 1.3023283789587052 epoch: 0
Train Mean Loss at 1.1191813480999133 epoch: 1
Validation Mean Loss at 0.9346889226655025 epoch: 1
Train Mean Loss at 0.9138090314581027 epoch: 2
Validation Mean Loss at 1.0948217041365693 epoch: 2
Train Mean Loss at 0.7841161046442048 epoch: 3
Validation Mean Loss at 0.7120429783544046 epoch: 3
Train Mean Loss at 0.6987513205510236 epoch: 4
Validation Mean Loss at 0.640969680128458 epoch: 4
Train Mean Loss at 0.6271592181054236 epoch: 5
Validation Mean Loss at 0.5726932004046394 epoch: 5
Train Mean Loss at 0.5807639912219478 epoch: 6
Validation Mean Loss at 0.5678961675988795 epoch: 6
Train Mean Loss at 0.5305012987868134 epoch: 7
Validation Mean Loss at 0.4834953626786036 epoch: 7
Train Mean Loss at 0.49389000348293344 epoch: 8
Validation Mean Loss at 0.4554047950409224 epoch: 8
Train Mean Loss at 0.46626118687312623 epoch: 9
Validation Mean Loss at 0.404511248380003 epoch: 9
Train Mean 

KeyboardInterrupt: 

In [21]:
print('observation size:', dataset['observations'].shape)
print('action size:', dataset['actions'].shape)
print('next observation size:', dataset['next_observations'].shape)
print('rewards size:', dataset['rewards'].shape)
print('terminal size:', dataset['terminals'].shape)


observation size: (1998000, 17)
action size: (1998000, 6)
next observation size: (1998000, 17)
rewards size: (1998000,)
terminal size: (1998000,)


In [None]:
env