In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import cv2
import gym
import torch
from IPython.display import Image
from IPython.core.display import Image, display

import torchvision
from torchvision.utils import save_image
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from torch.optim import Adam
from torchsummary import summary

# CartPole-v1

In [2]:
# Gather (s_t, a_t, r_t, s_{t+1}) transitions by running a uniformly random
# policy for 100 episodes of CartPole-v1.
env = gym.make('CartPole-v1')

states, actions, rewards, next_states = [], [], [], []

for i in tqdm(range(100)):
    obs, done = env.reset(), False

    while not done:
        # Sample a random action and advance the environment by one step.
        action = env.action_space.sample()
        next_obs, reward, done, _ = env.step(action)

        # Record the transition; the four lists stay index-aligned.
        states.append(obs)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_obs)

        obs = next_obs

env.close()

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [3]:
# Convert the collected transition lists into float32 tensors.
# States keep their natural (N, s_dim) layout; actions and rewards are scalars
# per step, so they get a trailing axis of size 1 to become (N, 1) columns.
s_t = torch.from_numpy(np.asarray(states, dtype=np.float32))
s_t1 = torch.from_numpy(np.asarray(next_states, dtype=np.float32))
# For LunarLander-v0 the action tensor was built without the extra axis:
#   a_t = torch.from_numpy(np.array(actions).astype(np.float32))
a_t = torch.from_numpy(np.asarray(actions, dtype=np.float32)).unsqueeze(1)
r_t = torch.from_numpy(np.asarray(rewards, dtype=np.float32)).unsqueeze(1)

s_t.shape, a_t.shape, r_t.shape, s_t1.shape

(torch.Size([2201, 4]),
 torch.Size([2201, 1]),
 torch.Size([2201, 1]),
 torch.Size([2201, 4]))

# Models

* Forward: $f(s_t,a_t)=s_{t+1}$
* Backward: $f(s_{t+1}, a_t)=s_t$
* Inverse: $f(s_t,s_{t+1})=a_t$
* Reward: $f(s_{t+1})=r_{t+1}$, i.e. $f(s_t)=r_t$

In [4]:
class Reward(nn.Module):
    """Reward model: predicts the scalar reward r_{t+1} from the state s_{t+1}.

    Architecture: s_dim -> 32 -> 16 -> 1 with ReLU after the two hidden layers.

    Args:
        s_dim: Number of features in a state vector.
    """
    def __init__(self, s_dim):
        super(Reward, self).__init__()
        self.fc1 = nn.Linear(s_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)

    def forward(self, s_t1):
        """Return the predicted reward for each state in ``s_t1``.

        Args:
            s_t1: Tensor whose last dimension is ``s_dim``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 1.
        """
        # Use the argument itself; the previous version read the notebook-level
        # `s_t`, so the model ignored its input (and failed on a fresh kernel).
        x = F.relu(self.fc1(s_t1))
        x = F.relu(self.fc2(x))
        r_hat_t1 = self.fc3(x)
        return r_hat_t1

In [5]:
class Forward(nn.Module):
    """Forward dynamics model: maps (s_t, a_t) to a prediction of s_{t+1}.

    Layers: (s_dim + a_dim) -> 32 -> 16 -> s_dim, ReLU on the hidden layers.
    """

    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.fc1 = nn.Linear(s_dim + a_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, s_dim)

    def forward(self, s_t, a_t):
        """Concatenate state and action along dim 1 and predict the next state."""
        state_action = torch.cat((s_t, a_t), dim=1)
        hidden = F.relu(self.fc1(state_action))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [6]:
class Backward(nn.Module):
    """Backward dynamics model: maps (s_{t+1}, a_t) to a prediction of s_t.

    Layers: (s_dim + a_dim) -> 32 -> 16 -> s_dim, ReLU on the hidden layers.
    """

    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.fc1 = nn.Linear(s_dim + a_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, s_dim)

    def forward(self, s_t1, a_t):
        """Concatenate next state and action along dim 1 and predict the previous state."""
        next_state_action = torch.cat((s_t1, a_t), dim=1)
        hidden = F.relu(self.fc1(next_state_action))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [7]:
class Inverse(nn.Module):
    """Inverse dynamics model: maps (s_t, s_{t+1}) to a prediction of a_t.

    Layers: (2 * s_dim) -> 32 -> 16 -> a_dim, ReLU on the hidden layers.
    """

    def __init__(self, s_dim, a_dim):
        super().__init__()
        self.fc1 = nn.Linear(2 * s_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, a_dim)

    def forward(self, s_t, s_t1):
        """Concatenate the two states along dim 1 and predict the action taken between them."""
        state_pair = torch.cat((s_t, s_t1), dim=1)
        hidden = F.relu(self.fc1(state_pair))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [8]:
def f_loss(s_t1, s_hat_t1):
    """Forward-model loss: mean squared error of the predicted next state."""
    return F.mse_loss(input=s_hat_t1, target=s_t1)

def b_loss(s_t, s_hat_t):
    """Backward-model loss: mean squared error of the predicted previous state."""
    return F.mse_loss(input=s_hat_t, target=s_t)

def i_loss(a_t, a_hat_t):
    """Inverse-model loss: mean squared error of the predicted action."""
    return F.mse_loss(input=a_hat_t, target=a_t)

def r_loss(r_t, r_hat_t):
    """Reward-model loss: mean squared error of the predicted reward."""
    return F.mse_loss(input=r_hat_t, target=r_t)

def fbi_loss(s_t, s_hat_t, a_t, a_hat_t, s_t1, s_hat_t1):
    """Combined loss: unweighted sum of the forward, backward and inverse losses."""
    forward_term = f_loss(s_t1, s_hat_t1)
    backward_term = b_loss(s_t, s_hat_t)
    inverse_term = i_loss(a_t, a_hat_t)
    return forward_term + backward_term + inverse_term

In [9]:
# One network per auxiliary task, each with its own Adam optimizer
# (default hyperparameters). The four models share no parameters.
f_f = Forward(s_dim=4, a_dim=1)
f_b = Backward(s_dim=4, a_dim=1)
f_i = Inverse(s_dim=4, a_dim=1)
f_r = Reward(s_dim=4)

f_opt = Adam(f_f.parameters())
b_opt = Adam(f_b.parameters())
i_opt = Adam(f_i.parameters())
r_opt = Adam(f_r.parameters())


# Per-epoch history of [forward, backward, inverse, reward] losses, stored as
# plain Python floats.
fbi_losses = []
for i in range(100):
    # Full-batch training: every epoch uses all collected transitions at once.
    f_opt.zero_grad()
    b_opt.zero_grad()
    i_opt.zero_grad()
    r_opt.zero_grad()

    s_hat_t1 = f_f(s_t, a_t)
    s_hat_t = f_b(s_t1, a_t)
    a_hat_t = f_i(s_t, s_t1)
    r_hat_t = f_r(s_t1)

    loss_f = f_loss(s_t1, s_hat_t1)
    loss_b = b_loss(s_t, s_hat_t)
    loss_i = i_loss(a_t, a_hat_t)
    loss_r = r_loss(r_t, r_hat_t)

    # The losses belong to disjoint graphs, so each backward() only touches
    # the parameters of its own model.
    loss_f.backward()
    loss_b.backward()
    loss_i.backward()
    loss_r.backward()

    f_opt.step()
    b_opt.step()
    i_opt.step()
    r_opt.step()

    print(f"Epoch {i} - F: {loss_f:.2f}, B: {loss_b:.2f}, I: {loss_i:.2f}, Reward: {loss_r:.2f}")
    # Store floats rather than the loss tensors: keeping the tensors would
    # retain every epoch's autograd graph and the history could not be
    # converted to NumPy / plotted without detaching first.
    fbi_losses.append([loss_f.item(), loss_b.item(), loss_i.item(), loss_r.item()])

Epoch 0 - F: 0.27, B: 0.24, I: 0.72, Reward: 0.87
Epoch 1 - F: 0.27, B: 0.24, I: 0.71, Reward: 0.86
Epoch 2 - F: 0.27, B: 0.24, I: 0.70, Reward: 0.84
Epoch 3 - F: 0.27, B: 0.23, I: 0.70, Reward: 0.82
Epoch 4 - F: 0.27, B: 0.23, I: 0.69, Reward: 0.80
Epoch 5 - F: 0.26, B: 0.23, I: 0.68, Reward: 0.78
Epoch 6 - F: 0.26, B: 0.23, I: 0.67, Reward: 0.76
Epoch 7 - F: 0.26, B: 0.22, I: 0.66, Reward: 0.74
Epoch 8 - F: 0.26, B: 0.22, I: 0.66, Reward: 0.72
Epoch 9 - F: 0.26, B: 0.22, I: 0.65, Reward: 0.70
Epoch 10 - F: 0.25, B: 0.22, I: 0.64, Reward: 0.68
Epoch 11 - F: 0.25, B: 0.21, I: 0.63, Reward: 0.66
Epoch 12 - F: 0.25, B: 0.21, I: 0.62, Reward: 0.64
Epoch 13 - F: 0.25, B: 0.21, I: 0.61, Reward: 0.62
Epoch 14 - F: 0.24, B: 0.21, I: 0.61, Reward: 0.60
Epoch 15 - F: 0.24, B: 0.21, I: 0.60, Reward: 0.58
Epoch 16 - F: 0.24, B: 0.20, I: 0.59, Reward: 0.56
Epoch 17 - F: 0.24, B: 0.20, I: 0.58, Reward: 0.54
Epoch 18 - F: 0.23, B: 0.20, I: 0.57, Reward: 0.52
Epoch 19 - F: 0.23, B: 0.20, I: 0.56, Rew

# Model-Based Policy Optimization (MBPO)

* Based on the Dyna algorithm
1. Collect environment trajectories; add to $D_{env}$
2. Train model ensemble on environment data $D_{env}$
3. Perform k-step model rollouts branched from $D_{env}$; add to $D_{model}$
4. Update policy parameters on model data $D_{model}$

Source: [BAIR](https://bair.berkeley.edu/blog/2019/12/12/mbpo/)

In [10]:
# https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Hyperparameters for the DQN agent
learning_rate = 5e-4     # step size passed to the Adam optimizer
gamma = 0.98             # discount factor used in the TD target
buffer_limit = 50_000    # maximum number of transitions kept in the replay buffer
batch_size = 32          # transitions drawn per gradient update

class ReplayBuffer():
    """Fixed-capacity FIFO store of (s, a, r, s_prime, done_mask) transitions.

    Args:
        capacity: Maximum number of stored transitions; the oldest entries are
            dropped once it is exceeded. When omitted, the module-level
            ``buffer_limit`` is used, as before.
    """
    def __init__(self, capacity=None):
        if capacity is None:
            capacity = buffer_limit
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done_mask)`` tuple to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)`` of tensors. ``s`` and
            ``s_prime`` are float32 with one row per transition; ``a`` is an
            int64 column (usable as a ``gather`` index); ``r`` and
            ``done_mask`` are float32 columns.

        Raises:
            ValueError: If ``n`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = (
            [transition[k] for transition in mini_batch] for k in range(5)
        )

        # Stack the states in NumPy first: building a tensor straight from a
        # list of ndarrays goes through a slow per-element path in PyTorch.
        s = torch.from_numpy(np.asarray(s_lst, dtype=np.float32))
        s_prime = torch.from_numpy(np.asarray(s_prime_lst, dtype=np.float32))

        # Explicit dtypes keep the batch consistent even if the environment
        # hands back NumPy scalars instead of Python numbers.
        a = torch.tensor(a_lst, dtype=torch.int64).unsqueeze(1)
        r = torch.tensor(r_lst, dtype=torch.float).unsqueeze(1)
        done_mask = torch.tensor(done_mask_lst, dtype=torch.float).unsqueeze(1)

        return s, a, r, s_prime, done_mask

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class Qnet(nn.Module):
    """Q-network for CartPole: 4 state features -> 128 -> 128 -> 2 action values."""
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        """Return one Q-value per action for the state(s) in ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            obs: State tensor for a single observation (``argmax`` is taken
                over the whole network output).
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python int.
        """
        coin = random.random()
        if coin < epsilon:
            # Explore: no forward pass needed. The action count is read from
            # the output layer instead of being hard-coded.
            return random.randint(0, self.fc3.out_features - 1)
        # Exploit: action selection never needs gradients, so skip graph
        # construction for this forward pass.
        with torch.no_grad():
            out = self.forward(obs)
        return out.argmax().item()
            
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run several DQN gradient updates on mini-batches from the replay buffer.

    Reads the module-level ``batch_size`` and ``gamma``.

    Args:
        q: Online Q-network being optimized.
        q_target: Target network used to build the TD target; not updated here.
        memory: Replay buffer exposing ``sample(n)`` that returns
            ``(s, a, r, s_prime, done_mask)`` tensors.
        optimizer: Optimizer over ``q``'s parameters.
        n_updates: Number of mini-batch updates to perform (default 10, the
            previously hard-coded value).
    """
    for _ in range(n_updates):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)

        # Q(s, a) for the actions that were actually taken.
        q_a = q(s).gather(1, a)

        # The TD target is a constant with respect to the optimization, so build
        # it without autograd; otherwise each call accumulates unused gradients
        # on q_target's parameters.
        with torch.no_grad():
            max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
            # done_mask is 0 on terminal transitions, which removes the bootstrap term.
            target = r + gamma * max_q_prime * done_mask

        loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [13]:
# config
episodes = 10_000
print_interval = 20

# model
env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

score = 0.0
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

# training loop
# NOTE(review): uses the gym API where reset() returns the observation and
# step() returns a 4-tuple, as elsewhere in this notebook.
try:
    for e in range(episodes):
        # Linear annealing from 8% to 1%. This previously referenced the
        # undefined name `n_epi`; the episode counter is `e`.
        epsilon = max(0.01, 0.08 - 0.01*(e/200))
        s = env.reset()
        done = False

        while not done:
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)
            s_prime, r, done, info = env.step(a)
            # Mask is 0 on the terminal step so the TD target does not bootstrap.
            done_mask = 0.0 if done else 1.0
            # Rewards are scaled down by 100 before being stored.
            memory.put((s,a,r/100.0,s_prime, done_mask))
            s = s_prime

            score += r

        # Start learning only once the buffer holds more than 2000 transitions.
        if memory.size()>2000:
            train(q, q_target, memory, optimizer)

        if e % print_interval == 0 and e != 0:
            # Sync the target network and report the mean score over the interval.
            q_target.load_state_dict(q.state_dict())
            print(f"Episode: {e}, Score: {score/print_interval:.1f}, Buffer size: {memory.size()}, Epsilon : {epsilon*100:.1f}%")
            score = 0.0
finally:
    # Release the environment even if the run is interrupted.
    env.close()

n_episode :20, score : 10.2, n_buffer : 205, eps : 7.9%
n_episode :40, score : 9.5, n_buffer : 395, eps : 7.8%
n_episode :60, score : 9.8, n_buffer : 592, eps : 7.7%
n_episode :80, score : 9.8, n_buffer : 787, eps : 7.6%
n_episode :100, score : 9.7, n_buffer : 980, eps : 7.5%
n_episode :120, score : 9.7, n_buffer : 1174, eps : 7.4%
n_episode :140, score : 9.9, n_buffer : 1372, eps : 7.3%
n_episode :160, score : 10.0, n_buffer : 1572, eps : 7.2%
n_episode :180, score : 9.5, n_buffer : 1762, eps : 7.1%
n_episode :200, score : 9.9, n_buffer : 1960, eps : 7.0%
n_episode :220, score : 12.6, n_buffer : 2212, eps : 6.9%
n_episode :240, score : 9.7, n_buffer : 2406, eps : 6.8%
n_episode :260, score : 11.2, n_buffer : 2631, eps : 6.7%
n_episode :280, score : 12.0, n_buffer : 2871, eps : 6.6%
n_episode :300, score : 14.6, n_buffer : 3162, eps : 6.5%
n_episode :320, score : 15.5, n_buffer : 3472, eps : 6.4%
n_episode :340, score : 24.4, n_buffer : 3961, eps : 6.3%
n_episode :360, score : 105.5, n

KeyboardInterrupt: 