In [126]:

import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
import gym_2048
from tqdm import tqdm

In [127]:
class GambleEnvironment():
    """
    One-step gambling environment with three possible actions.

    Every episode starts in `state0` and terminates after a single step in
    `state1`. The reward depends only on the chosen action:

    * action 0 -> always -1
    * action 1 -> -1 or +1, decided by one draw from `np.random.random()`
    * any other action -> always +1
    """
    # Observation returned by reset() (start state).
    state0 = np.array([0])
    # Observation returned by every step() (terminal state).
    state1 = np.array([1])

    def reset(self):
        """Return the start observation."""
        return self.state0

    def step(self, action):
        """
        Play one round.

        Returns the tuple (next_state, reward, done, info), where `done` is
        always True and `info` is always None.
        """
        if action == 0:
            # guaranteed loss
            reward = -1
        elif action == 1:
            # coin flip: a draw <= 0.5 loses, anything above wins
            reward = -1 if np.random.random() <= 0.5 else 1
        else:
            # every remaining action is a guaranteed win
            reward = 1
        return self.state1, reward, True, None

In [128]:
# Capacity handed to the replay buffer; the warm-up episode count and the
# training batch size used further down are derived from this value as well.
replay_size = 10000

In [129]:
# Single environment instance shared by episode generation and the inspection cells below.
env = GambleEnvironment()

In [130]:
# MLP behavior function for Upside-Down RL (UDRL), adapted from https://github.com/BY571/Upside-Down-Reinforcement-Learning/blob/master/Upside-Down.ipynb

class BF(nn.Module):
    """
    Behavior function: maps a (state, command) pair to one score per action.

    The state and the scalar command are embedded separately, squashed with a
    sigmoid, multiplied element-wise and passed through two ReLU layers before
    the final linear layer produces `action_space` unnormalized scores.

    Args:
        state_space: number of input features of the state.
        action_space: number of discrete actions (size of the output).
        hidden_size: width of all hidden layers.
        seed: value passed to `torch.manual_seed` before the layers are created.
    """
    def __init__(self, state_space, action_space, hidden_size, seed):
        super(BF, self).__init__()
        # NOTE: this seeds torch's *global* RNG, not just this module.
        torch.manual_seed(seed)
        self.actions = np.arange(action_space)
        self.action_space = action_space
        self.fc1 = nn.Linear(state_space, hidden_size)
        # The command embedding takes exactly one input feature.
        self.commands = nn.Linear(1, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        # fc4..fc8 are not used by forward(). They are kept because removing
        # them would change the seeded initialization of fc9 and the
        # state_dict keys of the module.
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.fc7 = nn.Linear(hidden_size, hidden_size)
        self.fc8 = nn.Linear(hidden_size, hidden_size)
        self.fc9 = nn.Linear(hidden_size, action_space)

        self.sigmoid = nn.Sigmoid()

    def forward(self, state, command):
        """
        Return unnormalized action scores (logits) for `state` under `command`.

        `state` must have `state_space` features in its last dimension and
        `command` exactly one; both must live on the same device as the module.
        """
        out = self.sigmoid(self.fc1(state))
        command_out = self.sigmoid(self.commands(command))
        # Multiplicative gating of the state embedding by the command embedding.
        out = out * command_out
        out = torch.relu(self.fc2(out))
        out = torch.relu(self.fc3(out))
        out = self.fc9(out)

        return out

    def action(self, state, desire, horizon):
        """
        Sample one action from the softmax distribution over the action scores.

        Args:
            state: 1-D state tensor.
            desire: 1-D tensor holding the desired return; used as the command.
            horizon: accepted for interface compatibility; not used.

        Returns:
            A scalar integer tensor with the sampled action index.
        """
        # Callers frequently build the command on the CPU while the state has
        # already been moved to the model's device; align them here so the
        # forward pass does not fail with a device mismatch.
        command = desire.to(state.device)
        # Sampling is not differentiable, so no autograd graph is needed.
        with torch.no_grad():
            logits = self.forward(state.expand(1, -1), command.expand(1, -1))[0,:]
            probs = torch.softmax(logits, dim=-1)
        action = torch.distributions.categorical.Categorical(probs=probs).sample()
        return action

In [131]:
import random


class ReplayBuffer():
    """
    Episode store ranked by total episode reward.

    Episodes are appended without any limit; the size cap `max_size` is only
    enforced when the buffer is sorted, at which point the lowest-return
    episodes beyond the cap are dropped.
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        """Append one episode together with its summed reward."""
        self.buffer.append(
            {
                "states": states,
                "actions": actions,
                "rewards": rewards,
                "summed_rewards": sum(rewards),
            }
        )

    def get_nbest(self, n):
        """Sort the buffer and return the `n` episodes with the highest return."""
        self.sort()
        return self.buffer[:n]

    def sort(self):
        """Order episodes by descending return and truncate to `max_size`."""
        def episode_return(episode):
            return episode["summed_rewards"]

        ranked = sorted(self.buffer, key=episode_return, reverse=True)
        # enforce the capacity by keeping only the best episodes
        self.buffer = ranked[:self.max_size]

    def get_random_samples(self, batch_size):
        """Sort the buffer, then draw `batch_size` episodes uniformly with replacement."""
        self.sort()
        chosen = np.random.randint(0, len(self.buffer), batch_size)
        return [self.buffer[position] for position in chosen]

    def __len__(self):
        return len(self.buffer)

In [132]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Behavior function with 1 state feature, 3 actions, hidden width 16 and seed 1,
# trained with Adam at a learning rate of 1e-3.
bf = BF(1, 3, 16, 1).to(device)
optimizer = optim.Adam(params=bf.parameters(), lr=1e-3)

In [133]:
init_desired_reward = 1
init_time_horizon = 2

def generate_episode(desired_return = torch.FloatTensor([init_desired_reward]), desired_time_horizon = torch.FloatTensor([init_time_horizon])):    
    """
    Roll out one episode with the behavior function and return its trajectory.

    Args:
        desired_return: 1-element float tensor with the return the policy is
            asked to achieve. It is not modified by this function.
        desired_time_horizon: 1-element float tensor with the number of steps
            in which the return should be achieved. It is not modified by this
            function.

    Returns:
        (states, actions, rewards): three lists with one entry per step;
        states are the observations seen before acting, actions are the
        sampled action tensors and rewards the values returned by the
        environment.
    """
    # Work on private copies. The default tensors are created once, at
    # definition time, and the updates below are in place; without copying,
    # the default command would drift from call to call and tensors passed in
    # by a caller would be silently overwritten. The copies are also placed on
    # the model's device so they can be fed to the behavior function.
    desired_return = desired_return.clone().to(device)
    desired_time_horizon = desired_time_horizon.clone().to(device)

    next_state = env.reset()
    states = []
    actions = []
    rewards = []
    done = False
    while not done:
        states.append(next_state)
        state_tensor = torch.from_numpy(next_state).float().to(device)
        action = bf.action(state_tensor, desired_return, desired_time_horizon)
        next_state, reward, done, info = env.step(action)
        # Update the command: what is still wanted, and how long is left for it.
        desired_return -= reward
        # The remaining horizon never drops below one step.
        desired_time_horizon = torch.clamp(desired_time_horizon - 1, min=1)
        actions.append(action)
        rewards.append(reward)
    return states, actions, rewards

In [134]:
# Create the replay buffer with capacity replay_size.
buffer = ReplayBuffer(replay_size)
# Warm-up: store replay_size/10 episodes generated with the still-untrained
# behavior function and the default command of generate_episode().
for i in tqdm(range(0, int(replay_size/10))):
    states, actions, rewards = generate_episode()
    buffer.add_sample(states, actions, rewards)

100%|██████████| 1000/1000 [00:00<00:00, 1432.16it/s]


In [135]:
# Sanity check: look at one randomly drawn episode (this call also sorts and truncates the buffer).
buffer.get_random_samples(1)

[{'states': [array([0])],
  'actions': [tensor(1)],
  'rewards': [1],
  'summed_rewards': 1}]

In [136]:
# FUNCTIONS FOR Sampling exploration commands
last_few = 1000
def sampling_exploration( top_X_eps = last_few):
    """
    Compute a new exploratory command (desired reward, desired horizon) from the replay buffer.

    The desired horizon is the mean episode length of the `top_X_eps` best
    episodes in the buffer. The desired reward is drawn uniformly from
    [mean, mean + std], where mean and std are computed over the summed
    rewards of those same episodes.

    Args:
        top_X_eps: number of best episodes to base the command on
            (defaults to the module-level `last_few`).

    Returns:
        (desired_reward, desired_horizon) as two 1-element float tensors.
    """
    # Honor the argument: the selection size comes from the parameter, not
    # from the module-level default it was initialized with.
    top_X = buffer.get_nbest(top_X_eps)
    # The exploratory desired horizon is the mean length of the selected episodes.
    new_desired_horizon = np.mean([len(i["states"]) for i in top_X])
    # Cumulative returns of the selected episodes.
    returns = [i["summed_rewards"] for i in top_X]
    mean_returns = np.mean(returns)
    std_returns = np.std(returns)
    # Sample the desired reward at or above the mean, up to one std higher.
    new_desired_reward = np.random.uniform(mean_returns, mean_returns+std_returns)

    return torch.FloatTensor([new_desired_reward])  , torch.FloatTensor([new_desired_horizon]) 

In [137]:


# FUNCTIONS FOR TRAINING
def select_time_steps(saved_episode):
    """
    Pick a random sub-interval [t1, t2) of a stored episode.

    T is the number of states in the episode. For a one-step episode the
    interval is fixed to (0, T) and no random numbers are drawn; otherwise t1
    is drawn from [0, T) and t2 from [t1 + 1, T].

    Returns the tuple (t1, t2, T).
    """
    T = len(saved_episode["states"])  # episode length
    if T == 1:
        # only one possible interval
        return 0, T, T

    t1 = np.random.randint(0, T)
    t2 = np.random.randint(t1 + 1, T + 1)
    return t1, t2, T

def create_training_input(episode, t1, t2):
    """
    Build one supervised training sample from a stored episode.

    `episode` is a dict with the keys "states", "actions" and "rewards",
    each holding one entry per time step.

    Returns a 4-tuple:
    1. the state at t1
    2. the desired reward: sum of the rewards from t1 up to (excluding) t2
    3. the time horizon: t2 - t1
    4. the action that was taken at t1 (the training target)
    """
    start_state = episode["states"][t1]
    window_rewards = episode["rewards"][t1:t2]
    taken_action = episode["actions"][t1]
    return start_state, sum(window_rewards), t2 - t1, taken_action

def create_training_examples(batch_size):
    """
    Assemble `batch_size` supervised examples from the replay buffer.

    For every randomly drawn episode a start step t1 is sampled; the end step
    is always the end of the episode. Each example consists of

    * an input tensor: the state at t1 followed by the desired reward and
      the time horizon, concatenated into one float tensor, and
    * a target: the action stored for step t1.

    Returns two Python lists of equal length: (input tensors, target actions).
    """
    inputs, targets = [], []
    for episode in buffer.get_random_samples(batch_size):
        # The sampled end step is discarded: for episodic tasks the interval
        # always runs to the end of the episode.
        start, _, episode_end = select_time_steps(episode)
        state, desired_reward, time_horizont, action = create_training_input(episode, start, episode_end)
        features = torch.cat([
            torch.FloatTensor(state),
            torch.FloatTensor([desired_reward]),
            torch.FloatTensor([time_horizont]),
        ])
        inputs.append(features)
        targets.append(action)
    return inputs, targets

def train_behavior_function(batch_size):
    """
    Run one optimizer step on the behavior function.

    A batch of `batch_size` examples is drawn from the replay buffer. The
    network receives the state and the desired reward as command; the time
    horizon column is part of each example but is not fed to the network.
    The loss is the cross entropy between the predicted action scores and the
    actions stored in the buffer.

    Returns:
        The loss of this step as a 0-d numpy array.
    """
    X, y = create_training_examples(batch_size)

    # Keep inputs and targets on the same device as the model.
    X = torch.stack(X).to(device)
    y = torch.stack(y).long().to(device)

    # Each row is [state features..., desired reward, time horizon]; slicing
    # from the end keeps this correct for any number of state features.
    state = X[:, :-2]
    command = X[:, -2:-1]

    optimizer.zero_grad()
    y_ = bf(state, command).float()
    pred_loss = F.cross_entropy(y_, y)   
    pred_loss.backward()
    optimizer.step()
    return pred_loss.detach().cpu().numpy()



In [138]:
def run_loop(max_iterations=None):
    """
    Alternate between collecting episodes and training the behavior function.

    Every iteration generates int(replay_size/10000) new episodes with the
    fixed command (desired reward 1, horizon 1), adds them to the replay
    buffer, performs one training step on a batch of replay_size examples and
    prints the iteration number and the loss.

    Args:
        max_iterations: number of iterations to run. The default, None, keeps
            the loop running until it is interrupted.

    Returns:
        The loss of the last training step, or None if no iteration ran.
    """
    episodes_per_iteration = int(replay_size/10000)
    batch_size = int(replay_size*1)

    i = 0
    loss = None
    while max_iterations is None or i < max_iterations:
        i += 1
        for _ in range(0, episodes_per_iteration):
            # The exploration command is fixed here rather than sampled from the buffer.
            rew, hor = torch.FloatTensor([1]), torch.FloatTensor([1])
            states, actions, rewards = generate_episode(rew, hor)
            buffer.add_sample(states, actions, rewards)

        loss = train_behavior_function(batch_size)
        print(i, loss)
    return loss

In [139]:
# Start training. Called like this the loop does not stop on its own; interrupt the kernel to end it.
run_loop()

In [102]:
# Inspect the action probabilities of the behavior function in the start state
# when it is asked for a return of -1.
state = env.reset()
goal = torch.FloatTensor([-1])
# dim is given explicitly: calling softmax without it is deprecated and emits a warning.
# Inputs are moved to the model's device so the cell also works on a GPU.
F.softmax(bf(torch.FloatTensor(state).to(device), goal.to(device)), dim=-1)

  F.softmax(bf(torch.FloatTensor(state), goal))


tensor([0.2874, 0.3409, 0.3717], grad_fn=<SoftmaxBackward0>)

In [125]:
# Inspect two training examples: a list of input tensors and a list of target actions.
create_training_examples(2)

([tensor([ 0., -1.,  1.]), tensor([ 0., -1.,  1.])], [tensor(0), tensor(1)])