In [1]:

import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [2]:
replay_size = 7000

In [3]:
# MLP UDPRL stolen from https://github.com/BY571/Upside-Down-Reinforcement-Learning/blob/master/Upside-Down.ipynb

class BF(nn.Module):
    def __init__(self, state_space, action_space, hidden_size, seed):
        super(BF, self).__init__()
        torch.manual_seed(seed)
        self.actions = np.arange(action_space)
        self.action_space = action_space
        self.fc1 = nn.Linear(state_space, hidden_size)
        self.commands = nn.Linear(2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, action_space)

        self.decoder_layer = nn.TransformerDecoderLayer(d_model=hidden_size, nhead=8)
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=6)

        self.sigmoid = nn.Sigmoid()
        
    def forward(self, state, command):       
               
        out = self.sigmoid(self.fc1(state))
        command_out = self.sigmoid(self.commands(command))
        out = out * command_out
        out = torch.relu(self.fc2(out))
        out_ad = out.expand(1,-1,-1)
        out = self.transformer_decoder(out_ad, out_ad)[0, :,:]

        out = torch.relu(self.fc3(out))
        out = torch.relu(self.fc4(out))
        out = self.fc5(out)
        
        return out
    
    def action(self, state, desire, horizon):
        """
        Samples the action based on their probability
        """
        command = torch.cat((desire,horizon), dim=-1)
        action_prob = self.forward(state.expand(1, -1), command.expand(1, -1))[0,:]
        probs = torch.softmax(action_prob, dim=-1)
        action = torch.distributions.categorical.Categorical(probs=probs).sample()
        return action
    def greedy_action(self, state, desire, horizon):
        """
        Returns the greedy action 
        """
        command = torch.cat((desire,horizon), dim=-1)
        action_prob = self.forward(state, command)
        probs = torch.softmax(action_prob, dim=-1)
        action = torch.argmax(probs).item()
        return action

In [4]:
import random


class ReplayBuffer():
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []
        
        
    def add_sample(self, states, actions, rewards):
        episode = {"states": states, "actions":actions, "rewards": rewards, "summed_rewards":sum(rewards)}
        self.buffer.append(episode)
        
    
    def sort(self):
        #sort buffer
        random.shuffle(self.buffer)
        # keep the max buffer size
        self.buffer = self.buffer[:self.max_size]
    
    def get_random_samples(self, batch_size):
        self.sort()
        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]
        return batch
    
    def __len__(self):
        return len(self.buffer)

In [5]:
class SimpleGame():
    def __init__(self):
        self.possibleStates = [0, 1, 2]
        self.possibleActions = [0, 1]
        self.state = 0
        self.isTerminal = False
        self.isLoss = False

    def getState(self):
        return np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]][self.state])
    
    def getReward(self):
        if self.isTerminal:
            if self.isLoss:
                return -1
            return 1
        return 0
    
    def getActions(self):
        if self.isTerminal:
            return []
        return self.possibleActions

    def action(self, a):
        if a != 1 and a != 0:
            a = 0
        if a == 1:
            if self.state == 0:
                self.state = 1
            elif self.state == 1:
                self.state = 2
                self.isTerminal = True
        if a == 0:
            if self.state == 0:
                self.state = 2
                self.isTerminal = True
                self.isLoss = True
            elif self.state == 1:
                self.state = 0

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

bf = BF(3, 2, 64, 1).to(device)
optimizer = optim.Adam(params=bf.parameters(), lr=1e-3)

In [7]:
init_desired_reward = 1
init_time_horizon = 2

def generate_episode(desired_return = torch.FloatTensor([init_desired_reward]), desired_time_horizon = torch.FloatTensor([init_time_horizon])):    
    """
    Generates more samples for the replay buffer.
    """
    g = SimpleGame()
    states = []
    actions = []
    rewards = []
    while not g.isTerminal:
        # random action
        s = torch.from_numpy(g.getState()).float().to(device)
        states.append(s)
        a = bf.action(s, desired_return, desired_time_horizon)
        g.action(a)
        reward = g.getReward()
        desired_return -= reward
        desired_time_horizon -= 1
        desired_time_horizon = torch.FloatTensor([np.maximum(desired_time_horizon, 1).item()])
        actions.append(a)
        rewards.append(reward)
    return states, actions, rewards

In [8]:
# create replay buffer:
buffer = ReplayBuffer(replay_size)

# init replay buffer with random trajectories:
for i in range(0, replay_size):
    states, actions, rewards = generate_episode()
    buffer.add_sample(states, actions, rewards)

In [9]:


# FUNCTIONS FOR TRAINING
def select_time_steps(saved_episode):
    """
    Given a saved episode from the replay buffer this function samples random time steps (t1 and t2) in that episode:
    T = max time horizon in that episode
    Returns t1, t2 and T 
    """
    # Select times in the episode:
    T = len(saved_episode["states"]) # episode max horizon 
    if T != 1:
        t1 = np.random.randint(0,T)
        t2 = np.random.randint(t1+1,T+1)
    else:
        t1 = 0
        t2 = T

    return t1, t2, T

def create_training_input(episode, t1, t2):
    """
    Based on the selected episode and the given time steps this function returns 4 values:
    1. state at t1
    2. the desired reward: sum over all rewards from t1 to t2
    3. the time horizont: t2 -t1
    
    4. the target action taken at t1
    
    buffer episodes are build like [cumulative episode reward, states, actions, rewards]
    """
    state = episode["states"][t1] 
    desired_reward = sum(episode["rewards"][t1:t2])
    time_horizont = t2-t1
    action = episode["actions"][t1]
    return state, desired_reward, time_horizont, action

def create_training_examples(batch_size):
    """
    Creates a data set of training examples that can be used to create a data loader for training.
    ============================================================
    1. for the given batch_size episode idx are randomly selected
    2. based on these episodes t1 and t2 are samples for each selected episode 
    3. for the selected episode and sampled t1 and t2 trainings values are gathered
    ______________________________________________________________
    Output are two numpy arrays in the length of batch size:
    Input Array for the Behavior function - consisting of (state, desired_reward, time_horizon)
    Output Array with the taken actions 
    """
    input_array = []
    output_array = []
    # select randomly episodes from the buffer
    episodes = buffer.get_random_samples(batch_size)
    for ep in episodes:
        #select time stamps
        t1, t2, T = select_time_steps(ep)
        # For episodic tasks they set t2 to T:
        t2 = T
        state, desired_reward, time_horizont, action = create_training_input(ep, t1, t2)
        input_array.append(torch.cat([state, torch.FloatTensor([desired_reward]), torch.FloatTensor([time_horizont])]))
        output_array.append(action)
    return input_array, output_array

def train_behavior_function(batch_size):
    """
    Trains the BF with on a cross entropy loss were the inputs are the action probabilities based on the state and command.
    The targets are the actions appropriate to the states from the replay buffer.
    """
    X, y = create_training_examples(batch_size)

    X = torch.stack(X)
    state = X[:,0:3]
    d = X[:,3:3+1]
    h = X[:,3+1:3+2]
    command = torch.cat((d,h), dim=-1)
    y = torch.stack(y).long()
    y_ = bf(state.to(device), command.to(device)).float()
    optimizer.zero_grad()
    pred_loss = F.cross_entropy(y_, y)   
    pred_loss.backward()
    optimizer.step()
    return pred_loss.detach().cpu().numpy()



In [10]:
def run_loop():
    i = 0
    while True:
        i += 1
        for _ in range(0, 10):
            states, actions, rewards = generate_episode(torch.FloatTensor([1]), torch.FloatTensor([2]))
            buffer.add_sample(states, actions, rewards)
        
        loss = train_behavior_function(replay_size*1)
        print(i, loss)

In [11]:
run_loop()

1 0.6918533
2 0.6655031
3 0.56628126
4 0.6288855
5 0.48809955
6 0.49119264
7 0.39312568
8 0.37387088
9 0.38383916
10 0.3514984
11 0.33018622
12 0.3113203
13 0.30211848
14 0.30035645
15 0.28957143
16 0.28109035
17 0.18923163
18 0.255644
19 0.23380806
20 0.13778752
21 0.26519132
22 0.2870759
23 0.27952635
24 0.26584065
25 0.2524954
26 0.1256933
27 0.121497735
28 0.12586735
29 0.12653805
30 0.13082182
31 0.12007605
32 0.087930456
33 0.056494035
34 0.026977703
35 0.019940365
36 0.025105666
37 0.017280852
38 0.015297164
39 0.0136148
40 0.011188785
41 0.009167238
42 0.008833972
43 0.0064627524
44 0.005508576
45 0.0050378735
46 0.004147242
47 0.0035999746
48 0.0031798927
49 0.008500742
50 0.0025124135
51 0.0022334082


KeyboardInterrupt: 

In [12]:
lol = 0
for x in buffer.get_random_samples(100):
    lol += x['summed_rewards']
lol

-26

In [110]:
a, b, c = generate_episode(torch.FloatTensor([1]), torch.FloatTensor([25]))
c[len(c)-1], len(b)

(-1, 25)

In [56]:
X, y = create_training_examples(7000)

In [57]:
torch.stack(y).long()

RuntimeError: stack expects each tensor to be equal size, but got [1, 1] at entry 0 and [1] at entry 412