In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import cv2
import gym
import torch
from IPython.display import Image
from IPython.core.display import Image, display

import torchvision
from torchvision.utils import save_image
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from torch.optim import Adam
from torchsummary import summary

# CartPole-v1

In [2]:
env = gym.make('CartPole-v1')

states = []
actions = []
rewards = []
next_states = []
for i in tqdm(range(100)):

    obs = env.reset()
    done = False

    while not done:
        states.append(obs)
        
        action = env.action_space.sample()
        actions.append(action)
        
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        next_states.append(obs)
        
env.close()

  0%|          | 0/100 [00:00<?, ?it/s]

In [3]:
s_t = torch.from_numpy(np.array(states).astype(np.float32))
# a_t = torch.from_numpy(np.array(actions).astype(np.float32))  # for LunarLander-v0
a_t = torch.unsqueeze(torch.from_numpy(np.array(actions).astype(np.float32)), 1)
r_t = torch.unsqueeze(torch.from_numpy(np.array(rewards).astype(np.float32)), 1)
s_t1 = torch.from_numpy(np.array(next_states).astype(np.float32))

s_t.shape, a_t.shape, r_t.shape, s_t1.shape

(torch.Size([2311, 4]),
 torch.Size([2311, 1]),
 torch.Size([2311, 1]),
 torch.Size([2311, 4]))

# Models

* Forward: $f(s_t,a_t)=s_{t+1}$
* Backward: $f(s_{t+1}, a_t)=s_t$
* Inverse: $f(s_t,s_{t+1})=a_t$
* Reward: $f(s_{t+1})=r_{t+1}$, i.e. $f(s_t)=r_t$

In [4]:
class Reward(nn.Module):
    """Reward model predicts r_t1 from s_t1."""
    def __init__(self, s_dim):
        super(Reward, self).__init__()
        self.fc1 = nn.Linear(s_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
    def forward(self, s_t1):
        x = F.relu(self.fc1(s_t))
        x = F.relu(self.fc2(x))
        r_hat_t1 = self.fc3(x)
        return r_hat_t1

In [5]:
class Forward(nn.Module):
    """Forward model predicts s_{t+1} from (s_t, a_t)."""
    def __init__(self, s_dim, a_dim):
        super(Forward, self).__init__()
        self.fc1 = nn.Linear(s_dim + a_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, s_dim)
    def forward(self, s_t, a_t):
        x = torch.cat((s_t, a_t), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        s_hat_t1 = self.fc3(x)
        return s_hat_t1

In [6]:
class Backward(nn.Module):
    """Backward model predicts s_t from (s_{t+1}, a_t)."""
    def __init__(self, s_dim, a_dim):
        super(Backward, self).__init__()
        self.fc1 = nn.Linear(s_dim + a_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, s_dim)
    def forward(self, s_t1, a_t):
        x = torch.cat((s_t1, a_t), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        s_hat_t = self.fc3(x)
        return s_hat_t

In [7]:
class Inverse(nn.Module):
    """Inverse model predicts a_t from (s_t, s_{t+1})."""
    def __init__(self, s_dim, a_dim):
        super(Inverse, self).__init__()
        self.fc1 = nn.Linear(2 * s_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, a_dim)
    def forward(self, s_t, s_t1):
        x = torch.cat((s_t, s_t1), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        a_hat_t = self.fc3(x)
        return a_hat_t

In [8]:
def f_loss(s_t1, s_hat_t1):
    return F.mse_loss(s_hat_t1, s_t1)

def b_loss(s_t, s_hat_t):
    return F.mse_loss(s_hat_t, s_t)

# i_loss = nn.CrossEntropyLoss()

def i_loss(a_t, a_hat_t):
    return F.mse_loss(a_hat_t, a_t)

def r_loss(r_t, r_hat_t):
    return F.mse_loss(r_hat_t, r_t)

def fbi_loss(s_t, s_hat_t, a_t, a_hat_t, s_t1, s_hat_t1):
    return f_loss(s_t1, s_hat_t1) + b_loss(s_t, s_hat_t) + i_loss(a_t, a_hat_t)

In [9]:
# f_f = Forward(s_dim=4, a_dim=1)
# f_b = Backward(s_dim=4, a_dim=1)
# f_i = Inverse(s_dim=4, a_dim=1)
# f_r = Reward(s_dim=4)

# f_opt = Adam(f_f.parameters())
# b_opt = Adam(f_b.parameters())
# i_opt = Adam(f_i.parameters())
# r_opt = Adam(f_r.parameters())


# fbi_losses = []
# for i in range(100):
#     f_opt.zero_grad()
#     b_opt.zero_grad()
#     i_opt.zero_grad()
#     r_opt.zero_grad()
    
#     s_hat_t1 = f_f(s_t, a_t)
#     s_hat_t = f_b(s_t1, a_t)
#     a_hat_t = f_i(s_t, s_t1)
#     r_hat_t = f_r(s_t1)
    
    
#     loss_f = f_loss(s_t1, s_hat_t1)
#     loss_b = b_loss(s_t, s_hat_t)
#     loss_i = i_loss(a_t, a_hat_t)
#     loss_r = r_loss(r_t, r_hat_t)
    
#     loss_f.backward()
#     loss_b.backward()
#     loss_i.backward()
#     loss_r.backward()
    
#     f_opt.step()
#     b_opt.step()
#     i_opt.step()
#     r_opt.step()
    
#     print(f"Epoch {i} - F: {loss_f:.2f}, B: {loss_b:.2f}, I: {loss_i:.2f}, Reward: {loss_r:.2f}")
#     fbi_losses.append([loss_f, loss_b, loss_i, loss_r])

# Model-Based Policy Optimization (MBPO)

* Based on the Dyna algorithm
1. Collect environment trajectories; add to $D_{env}$
2. Train model ensemble on environment data $D_{env}$
3. Perform k-step model rollouts branched from $D_{env}$; add to $D_{model}$
4. Update policy parameters on model data $D_{model}$

Source: [BAIR](https://bair.berkeley.edu/blog/2019/12/12/mbpo/)

In [10]:
# https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def extend(self, transition_batch):
        self.buffer.extend(transition_batch)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_t_lst, a_t_lst, r_t_lst, s_t1_lst, done_mask_lst = [], [], [], [], []
        
        for s_t, a_t, r_t, s_t1, done_mask in mini_batch:
            s_t_lst.append(s_t)
            a_t_lst.append(a_t)
            r_t_lst.append(r_t)
            s_t1_lst.append(s_t1)
            done_mask_lst.append(done_mask)
        
        return s_t_lst, a_t_lst, r_t_lst, s_t1_lst, done_mask_lst

    def to_tensors(self, s_t, a_t, r_t, s_t1, done_mask):
        s_t_tensor = torch.tensor(s_t, dtype=torch.float)
        a_t_tensor = torch.tensor([[a_t_i] for a_t_i in a_t], dtype=torch.float)
        r_t_tensor = torch.tensor([[r_t_i] for r_t_i in r_t])
        s_t1_tensor = torch.tensor(s_t1, dtype=torch.float)
        done_mask_tensor = torch.tensor(done_mask)
        
        return s_t_tensor, a_t_tensor, r_t_tensor, s_t1_tensor, done_mask_tensor
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,1)
        else : 
            return out.argmax().item()
            
def train(q, q_target, dreams, optimizer):
    for i in range(10):
        s_t, a_t, r_t ,s_t1, done_mask = dreams.to_tensors(*dreams.sample(batch_size))
        q_out = q(s_t)
        q_a = q(s_t).gather(1, a_t.type(torch.int64))
        max_q_prime = q_target(s_t1).max(1)[0].unsqueeze(1)
        target = r_t + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [16]:
def dream(memory, dreams, f_f, f_opt, f_b=None, b_opt=None, f_i=None, i_opt=None, steps=1):
    # train directional model
    for i in range(20):
        s_t, a_t, r_t, s_t1, done_mask = memory.to_tensors(*memory.sample(batch_size))
        
        # train forward model
        f_opt.zero_grad()
        s_hat_t1 = f_f(s_t, a_t)
        loss_f = f_loss(s_t1, s_hat_t1)
        loss_f.backward()
        f_opt.step()
        
        # train backward model
        if f_b is not None:
            b_opt.zero_grad()
            s_hat_t = f_b(s_t1, a_t)
            loss_b = b_loss(s_t, s_hat_t)
            loss_b.backward()
            b_opt.step()
        
        # train inverse model
        if f_i is not None:
            i_opt.zero_grad()
            a_hat_t = f_i(s_t, s_t1)
            loss_i = i_loss(a_t, a_hat_t)
            loss_i.backward()
            i_opt.step()
    
    # generate model data
    if f_b is not None and f_i is not None:
        real_world_samples = 10
    elif f_b is not None and f_i is None:
        real_world_samples = 15
    elif f_b is None and f_i is None:
        real_world_samples = 30
        
    for i in range(10):
        s_t, a_t, r_t, s_t1, done_mask = memory.sample(real_world_samples)        
        s_t_tensor, a_t_tensor, r_t_tensor, s_t1_tensor, _ = memory.to_tensors(s_t, a_t, r_t, s_t1, done_mask)
        
        # add forward model prediction to dreams
        s_hat_t1 = f_f(s_t_tensor, a_t_tensor).detach().numpy()
        dreams.extend(zip(s_t, a_t, r_t, s_hat_t1, done_mask))
        
        # add backward model prediction to dreams
        if f_b is not None:
            s_hat_t = f_b(s_t1_tensor, a_t_tensor).detach().numpy()
            dreams.extend(zip(s_hat_t, a_t, r_t, s_t1, done_mask))
            
        # add inverse model prediction to dreams
        if f_i is not None:
            a_hat_t = f_i(s_t_tensor, s_t1_tensor).detach().numpy()[0]
            dreams.extend(zip(s_t, a_hat_t, r_t, s_t1, done_mask))

In [19]:
# logging
from torch.utils.tensorboard import SummaryWriter

logdir = "runs/fb"
writer = SummaryWriter(logdir)

In [20]:
# config
episodes = 10_000
print_interval = 100

# model
env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()
score = 0.0  
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

# dream
dreams = ReplayBuffer()
s_dim = env.observation_space.shape[0]
a_dim = 1
f_f = Forward(s_dim, a_dim)
f_b = Backward(s_dim, a_dim)
f_i = Inverse(s_dim, a_dim)
f_opt = Adam(f_f.parameters())
b_opt = Adam(f_b.parameters())
i_opt = Adam(f_i.parameters())


# training loop
for e in tqdm(range(episodes)):
    epsilon = max(0.01, 0.08 - 0.01*(e/200)) #Linear annealing from 8% to 1%
    s_t = env.reset()
    done = False

    while not done:
        a_t = q.sample_action(torch.from_numpy(s_t).float(), epsilon)      
        s_t1, r_t, done, info = env.step(a_t)
        done_mask = 0.0 if done else 1.0
        memory.put((s_t, a_t, r_t/100.0, s_t1, done_mask))
        s_t = s_t1

        score += r_t
        if done:
            break

    # train in dream
    if memory.size() > 2000:
#         dream(memory, dreams, f_f, f_opt, steps=1)
        dream(memory, dreams, f_f, f_opt, f_b, b_opt, steps=1)
#         dream(memory, dreams, f_f, f_opt, f_b, b_opt, f_i, i_opt, steps=1)
        train(q, q_target, dreams, optimizer)

    if e % print_interval == 0 and e != 0:
        q_target.load_state_dict(q.state_dict())
        print(f"Episode: {e}, Score: {score/print_interval:.1f}, Buffer size: {memory.size()}, Epsilon : {epsilon*100:.1f}%")
        
        writer.add_scalar('Performance/Score', score/print_interval, e)
        writer.add_scalar('Internals/Memory size', memory.size(), e)
        writer.add_scalar('Internals/Epsilon', epsilon*100, e)
        writer.add_scalar('Internals/Dreams size', dreams.size(), e)
        
        score = 0.0
        
env.close()

  0%|          | 0/10000 [00:00<?, ?it/s]

Episode: 100, Score: 12.1, Buffer size: 1206, Epsilon : 7.5%




Episode: 200, Score: 20.0, Buffer size: 3206, Epsilon : 7.0%
Episode: 300, Score: 137.7, Buffer size: 16976, Epsilon : 6.5%
Episode: 400, Score: 87.6, Buffer size: 25740, Epsilon : 6.0%
Episode: 500, Score: 87.7, Buffer size: 34507, Epsilon : 5.5%
Episode: 600, Score: 38.5, Buffer size: 38354, Epsilon : 5.0%
Episode: 700, Score: 38.8, Buffer size: 42237, Epsilon : 4.5%
Episode: 800, Score: 18.3, Buffer size: 44068, Epsilon : 4.0%
Episode: 900, Score: 18.3, Buffer size: 45901, Epsilon : 3.5%
Episode: 1000, Score: 9.5, Buffer size: 46852, Epsilon : 3.0%
Episode: 1100, Score: 9.5, Buffer size: 47806, Epsilon : 2.5%
Episode: 1200, Score: 9.5, Buffer size: 48756, Epsilon : 2.0%
Episode: 1300, Score: 9.4, Buffer size: 49700, Epsilon : 1.5%
Episode: 1400, Score: 9.4, Buffer size: 50000, Epsilon : 1.0%
Episode: 1500, Score: 9.4, Buffer size: 50000, Epsilon : 1.0%
Episode: 1600, Score: 10.0, Buffer size: 50000, Epsilon : 1.0%
Episode: 1700, Score: 20.1, Buffer size: 50000, Epsilon : 1.0%
Episod