### This file is for testing radical changes to the codebase. Upside Down Reinforcement Learning.py is the working version.

In [1]:
import gym
env = gym.make('LunarLander-v2')

In [2]:
# A command takes the form [desired reward, desired horizon]
import numpy as np
def random_policy(obs, command):
    """Return a random action index, ignoring both arguments.

    `obs` and `command` are accepted only so that this function can be
    called with the same arguments as the learned policies.
    """
    n_actions = env.action_space.n
    return np.random.randint(n_actions)

In [3]:
import time
from copy import deepcopy
#Visualise agent function
def visualise_agent(policy, command, n=5):
    """Render up to `n` episodes of `policy` acting in the global `env`.

    Args:
        policy: callable taking (observation tensor, command tensor), both
            built here as double tensors with a leading batch dimension of
            one, and returning an action accepted by `env.step`.
        command: initial [desired reward, desired horizon]. It is copied for
            every episode, so the caller's sequence is never modified.
        n: number of episodes to render.

    A KeyboardInterrupt stops the rendering early; the environment is closed
    in every case.
    """
    try:
        for trial_i in range(n):
            # Fresh, mutable copy per episode (also allows a tuple command).
            current_command = list(command)
            observation = env.reset()
            done = False
            t = 0
            episode_return = 0
            while not done:
                env.render()
                # Condition on the *remaining* command, not the initial one:
                # it is reduced by the reward obtained at every step below.
                action = policy(
                    torch.tensor([observation]).double(),
                    torch.tensor([current_command]).double(),
                )
                observation, reward, done, info = env.step(action)
                episode_return += reward
                current_command[0] -= reward
                # The remaining horizon never drops below one step.
                current_command[1] = max(1, current_command[1] - 1)
                t += 1
            env.render()
            time.sleep(1.5)  # keep the final frame visible for a moment
            print("Episode {} finished after {} timesteps. Return = {}".format(trial_i, t, episode_return))
    except KeyboardInterrupt:
        # Deliberate: let the user stop the visualisation without a traceback.
        pass
    finally:
        env.close()

In [169]:
#visualise_agent(random_policy, command=[500, 500], 1)

In [5]:
import torch
import torch.nn.functional as F

class FCNN_AGENT(torch.nn.Module):
    """Behaviour function mapping (observation, command) to action logits.

    The observation and the scaled command are embedded separately, the two
    embeddings are multiplied element-wise, and the product is passed through
    a small MLP that emits one logit per action of the global `env`.
    """

    def __init__(self, command_scale):
        """Build the network; `command_scale` multiplies every command before embedding."""
        super().__init__()
        self.command_scale = command_scale
        n_hidden = 64
        n_observation = np.prod(env.observation_space.shape)
        n_actions = env.action_space.n

        # Sub-modules are created in the original order so that seeded
        # parameter initialisation is unaffected.
        self.observation_embedding = torch.nn.Sequential(
            torch.nn.Linear(n_observation, n_hidden),
            torch.nn.Sigmoid(),
        )
        self.command_embedding = torch.nn.Sequential(
            torch.nn.Linear(2, n_hidden),
            torch.nn.Sigmoid(),
        )
        self.to_output = torch.nn.Sequential(
            torch.nn.Linear(n_hidden, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_actions),
        )

    def forward(self, observation, command):
        """Return un-normalised action logits for `observation` under `command`."""
        observation_features = self.observation_embedding(observation)
        command_features = self.command_embedding(command * self.command_scale)
        # Element-wise product of the two embeddings.
        gated = observation_features * command_features
        return self.to_output(gated)

    def create_optimizer(self, lr):
        """Attach an Adam optimizer over all parameters as `self.optimizer`."""
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

In [6]:
from copy import deepcopy
def collect_experience(policy, replay_buffer, replay_size, last_few, n_episodes=100, log_to_tensorboard=True):
    """Run `policy` in the global `env` and merge the new episodes into the buffer.

    Args:
        policy: callable taking (observation tensor, command tensor) and
            returning an action accepted by `env.step`.
        replay_buffer: list of episode dicts; new episodes are appended to
            this list in place before the sorted result is built.
        replay_size: maximum number of episodes kept in the returned buffer.
        last_few: forwarded to `sample_command`.
        n_episodes: number of episodes to collect.
        log_to_tensorboard: when true, commands and returns are written to
            the global `writer`; when false `writer` is never touched.

    Returns:
        A new list holding the `replay_size` highest-return episodes, sorted
        by ascending return.

    Every episode dict has the keys 'observation', 'action', 'reward',
    'return' and 'episode_len'. The global `i_episode` counter is incremented
    once per finished episode. A KeyboardInterrupt stops collection early
    (the unfinished episode is dropped); the environment is always closed.
    """
    global i_episode
    # Commands for this whole round are sampled from the buffer as it was on
    # entry. A shallow snapshot is sufficient because stored episodes are
    # never modified, and it avoids deep-copying every stored observation.
    init_replay_buffer = list(replay_buffer)
    try:
        for _ in range(n_episodes):
            command = sample_command(init_replay_buffer, last_few)
            if log_to_tensorboard:
                writer.add_scalar('Command desired reward/Episode', command[0], i_episode)
                writer.add_scalar('Command horizon/Episode', command[1], i_episode)
            observation = env.reset()
            episode_mem = {'observation': [],
                           'action': [],
                           'reward': []}
            done = False
            while not done:
                action = policy(torch.tensor([observation]).double(), torch.tensor([command]).double())
                new_observation, reward, done, info = env.step(action)

                episode_mem['observation'].append(observation)
                episode_mem['action'].append(action)
                episode_mem['reward'].append(reward)

                observation = new_observation
                # Reduce the command by what has already been obtained/used;
                # the remaining horizon never drops below one step.
                command[0] -= reward
                command[1] = max(1, command[1] - 1)
            episode_return = sum(episode_mem['reward'])
            episode_len = len(episode_mem['observation'])
            episode_mem['return'] = episode_return
            episode_mem['episode_len'] = episode_len
            replay_buffer.append(episode_mem)
            i_episode += 1
            if log_to_tensorboard:
                writer.add_scalar('Return/Episode', episode_return, i_episode)
            print("Episode {} finished after {} timesteps. Return = {}".format(i_episode, episode_len, episode_return))
    except KeyboardInterrupt:
        # Deliberate: keep the episodes collected so far and carry on.
        pass
    finally:
        env.close()
    # Keep only the best episodes, ordered by ascending return.
    replay_buffer = sorted(replay_buffer, key=lambda x: x['return'])[-replay_size:]
    return replay_buffer

def sample_command(replay_buffer, last_few):
    """Draw an exploratory [desired reward, desired horizon] command.

    With an empty buffer the command is [1, 1]. Otherwise the last `last_few`
    episodes of `replay_buffer` are used (the caller is expected to keep the
    buffer sorted by ascending return, which makes these the best ones): the
    horizon is their mean length and the desired reward is drawn uniformly
    between their mean return and one standard deviation above it.
    """
    if len(replay_buffer) == 0:
        return [1, 1]

    reference_episodes = replay_buffer[-last_few:]
    episode_lengths = []
    episode_returns = []
    for episode in reference_episodes:
        episode_lengths.append(episode['episode_len'])
        episode_returns.append(episode['return'])

    return_mean = np.mean(episode_returns)
    return_std = np.std(episode_returns)
    target_reward = np.random.uniform(return_mean, return_mean + return_std)
    target_horizon = np.mean(episode_lengths)
    return [target_reward, target_horizon]

In [7]:
def train_net(policy_net, replay_buffer, n_updates=100, batch_size=64, log_to_tensorboard=True):
    """Fit `policy_net` to the actions stored in `replay_buffer`.

    Each training sample is a random time step t1 of a random episode. Its
    command is [sum of rewards from t1 to the end of the episode, number of
    steps from t1 to the end of the episode] and its label is the action that
    was taken at t1. The network is trained with cross-entropy on these
    (observation, command) -> action samples.

    Args:
        policy_net: module called as `policy_net(observations, commands)` that
            returns action logits and exposes an `optimizer` attribute.
        replay_buffer: non-empty list of episode dicts with 'observation',
            'action' and 'reward' lists.
        n_updates: number of gradient steps.
        batch_size: number of samples per gradient step.
        log_to_tensorboard: when true, every cost is written to the global
            `writer`.

    Returns:
        The mean cost over all updates. The global `i_updates` counter is
        incremented once per update.
    """
    global i_updates
    # Loop-invariant: flattened observation size of the global environment.
    obs_dim = int(np.prod(env.observation_space.shape))
    all_costs = []
    for _ in range(n_updates):
        batch_observations = np.zeros((batch_size, obs_dim))
        batch_commands = np.zeros((batch_size, 2))
        batch_label = np.zeros((batch_size))
        for b in range(batch_size):
            episode = replay_buffer[np.random.randint(0, len(replay_buffer))]
            episode_len = len(episode['observation'])
            sample_t1 = np.random.randint(0, episode_len)
            # The segment always runs to the end of the episode.
            sample_t2 = episode_len
            batch_observations[b] = episode['observation'][sample_t1]
            batch_commands[b] = [sum(episode['reward'][sample_t1:sample_t2]), sample_t2 - sample_t1]
            batch_label[b] = episode['action'][sample_t1]
        batch_observations = torch.tensor(batch_observations).double()
        batch_commands = torch.tensor(batch_commands).double()
        batch_label = torch.tensor(batch_label).long()
        pred = policy_net(batch_observations, batch_commands)
        cost = F.cross_entropy(pred, batch_label)
        if log_to_tensorboard:
            writer.add_scalar('Cost/NN update', cost.item(), i_updates)
        all_costs.append(cost.item())
        cost.backward()
        policy_net.optimizer.step()
        policy_net.optimizer.zero_grad()
        i_updates += 1
    return np.mean(all_costs)

In [8]:
def create_greedy_policy(policy_network):
    """Wrap `policy_network` in a policy that picks the most probable action."""
    def policy(obs, command):
        # Softmax over the last dimension, then the index of the largest
        # probability in the flattened array.
        probabilities = F.softmax(policy_network(obs, command), dim=-1)
        return np.argmax(probabilities.detach().numpy())
    return policy

def create_stochastic_policy(policy_network):
    """Wrap `policy_network` in a policy that samples from its action distribution."""
    def policy(obs, command):
        probabilities = F.softmax(policy_network(obs, command), dim=-1)
        distribution = torch.distributions.Categorical(probabilities)
        # `.item()` turns the single sampled index into a plain Python int.
        return distribution.sample().item()
    return policy

In [9]:
# Global counters and state shared with collect_experience / train_net.
i_episode=0 #number of episodes collected so far
i_updates=0 #number of parameter updates to the neural network
replay_buffer = []
log_to_tensorboard = True # passed to collect_experience / train_net to enable logging

# Hyper-parameters
replay_size = 700 # maximum number of (highest-return) episodes kept in the replay buffer
last_few = 50 # number of episodes from the end of the buffer used to sample a command
batch_size = 256 # samples per gradient step
n_warm_up_episodes = 50 # episodes collected with the random policy before training
n_episodes_per_iter = 25 # episodes collected per training iteration
n_updates_per_iter = 150 # gradient steps per training iteration
command_scale = 0.02 # factor applied to the command inside FCNN_AGENT.forward
lr = 0.001 # Adam learning rate

# The agent is converted to double precision; the policies and train_net feed it double tensors.
agent = FCNN_AGENT(command_scale).double()
agent.create_optimizer(lr)

# Two action-selection wrappers around the same network.
stochastic_policy = create_stochastic_policy(agent)
greedy_policy = create_greedy_policy(agent)

In [10]:
# SET UP TRAINING VISUALISATION
# `writer` is the global that collect_experience and train_net log to; it only exists when logging is enabled.
if log_to_tensorboard: from torch.utils.tensorboard import SummaryWriter
if log_to_tensorboard: writer = SummaryWriter() # we will use this to show our models performance on a graph using tensorboard

In [11]:
#Collect warm up episodes with the random policy, then run a first round of updates on them
replay_buffer = collect_experience(random_policy, replay_buffer, replay_size, last_few, n_warm_up_episodes, log_to_tensorboard)
# train_net returns the mean cost over its updates (this is the value the cell displays)
train_net(agent, replay_buffer, n_updates_per_iter, batch_size, log_to_tensorboard)

Episode 1 finished after 87 timesteps. Return = -95.47832481120317
Episode 2 finished after 90 timesteps. Return = -308.23957663490256
Episode 3 finished after 125 timesteps. Return = -273.56608353239596
Episode 4 finished after 143 timesteps. Return = -122.86663265739914
Episode 5 finished after 109 timesteps. Return = -132.6634608656605
Episode 6 finished after 65 timesteps. Return = -92.29343003244614
Episode 7 finished after 101 timesteps. Return = -103.48110808986972
Episode 8 finished after 81 timesteps. Return = -305.7735218283285
Episode 9 finished after 113 timesteps. Return = -296.6817198263583
Episode 10 finished after 116 timesteps. Return = -113.86090220688554
Episode 11 finished after 144 timesteps. Return = -207.7920988171057
Episode 12 finished after 112 timesteps. Return = -125.24642596146685
Episode 13 finished after 70 timesteps. Return = -99.61956186253272
Episode 14 finished after 64 timesteps. Return = -64.96988286530956
Episode 15 finished after 84 timesteps. Ret

1.386259598678298

In [None]:
# Main loop: alternate between collecting episodes with the stochastic policy
# and training the network on the resulting replay buffer.
n_iters = 1000
for i in range(n_iters):
    replay_buffer = collect_experience(stochastic_policy, replay_buffer, replay_size, last_few, n_episodes_per_iter, log_to_tensorboard)
    train_net(agent, replay_buffer, n_updates_per_iter, batch_size, log_to_tensorboard)

Episode 51 finished after 82 timesteps. Return = -88.50429894304365
Episode 52 finished after 99 timesteps. Return = -31.286976850922926
Episode 53 finished after 60 timesteps. Return = -106.35597449901124
Episode 54 finished after 121 timesteps. Return = -140.90086400630838
Episode 55 finished after 83 timesteps. Return = -257.39099441858366
Episode 56 finished after 122 timesteps. Return = -68.64573689433801
Episode 57 finished after 114 timesteps. Return = -222.94524424918393
Episode 58 finished after 111 timesteps. Return = -400.5598424644172
Episode 59 finished after 61 timesteps. Return = -98.51600008178612
Episode 60 finished after 125 timesteps. Return = -122.94159869312136
Episode 61 finished after 66 timesteps. Return = -105.29468327867619
Episode 62 finished after 100 timesteps. Return = -485.4342377708204
Episode 63 finished after 86 timesteps. Return = -98.73458116870694
Episode 64 finished after 86 timesteps. Return = -145.48005166040318
Episode 65 finished after 93 times

Episode 171 finished after 96 timesteps. Return = -122.90533551388233
Episode 172 finished after 92 timesteps. Return = -109.05428713969451
Episode 173 finished after 89 timesteps. Return = -89.78004538567173
Episode 174 finished after 81 timesteps. Return = -253.22509801292728
Episode 175 finished after 98 timesteps. Return = -96.02678130481621
Episode 176 finished after 89 timesteps. Return = -249.26062406162904
Episode 177 finished after 122 timesteps. Return = -233.5291243935175
Episode 178 finished after 103 timesteps. Return = -318.2945691992227
Episode 179 finished after 69 timesteps. Return = -80.03359124678876
Episode 180 finished after 84 timesteps. Return = -92.18732160777816
Episode 181 finished after 117 timesteps. Return = -257.7732291949321
Episode 182 finished after 79 timesteps. Return = -134.51195262160724
Episode 183 finished after 63 timesteps. Return = -83.54952571407415
Episode 184 finished after 106 timesteps. Return = -159.09405809497804
Episode 185 finished aft

Episode 295 finished after 73 timesteps. Return = 27.76311832448603
Episode 296 finished after 114 timesteps. Return = -230.20626827005844
Episode 297 finished after 73 timesteps. Return = -131.32228304311974
Episode 298 finished after 111 timesteps. Return = -200.32606374557756
Episode 299 finished after 123 timesteps. Return = -112.84515176928736
Episode 300 finished after 128 timesteps. Return = -393.04631925175516
Episode 301 finished after 84 timesteps. Return = -233.68582909925703
Episode 302 finished after 93 timesteps. Return = -127.23728443783573
Episode 303 finished after 109 timesteps. Return = -349.6832399973654
Episode 304 finished after 68 timesteps. Return = -109.18632531529221
Episode 305 finished after 78 timesteps. Return = -243.44523087336688
Episode 306 finished after 101 timesteps. Return = -279.52461562215603
Episode 307 finished after 84 timesteps. Return = -68.7949216172403
Episode 308 finished after 77 timesteps. Return = -202.1257851391876
Episode 309 finished

Episode 414 finished after 105 timesteps. Return = -151.99457695462803
Episode 415 finished after 76 timesteps. Return = -106.25244697232887
Episode 416 finished after 145 timesteps. Return = -113.05781858131893
Episode 417 finished after 128 timesteps. Return = -119.34912835434582
Episode 418 finished after 85 timesteps. Return = -88.28367932697964
Episode 419 finished after 68 timesteps. Return = -96.15438728091526
Episode 420 finished after 96 timesteps. Return = -89.44083523172363
Episode 421 finished after 124 timesteps. Return = -224.83247172256063
Episode 422 finished after 107 timesteps. Return = -137.82523572752984
Episode 423 finished after 146 timesteps. Return = -300.4324796036036
Episode 424 finished after 65 timesteps. Return = -82.16372147028201
Episode 425 finished after 127 timesteps. Return = -128.6986768325242
Episode 426 finished after 75 timesteps. Return = -94.71366666494853
Episode 427 finished after 130 timesteps. Return = -137.3552830325895
Episode 428 finished

Episode 538 finished after 91 timesteps. Return = -189.525057140691
Episode 539 finished after 82 timesteps. Return = -131.2452231913116
Episode 540 finished after 115 timesteps. Return = -128.15092422916277
Episode 541 finished after 65 timesteps. Return = -84.82986449779435
Episode 542 finished after 59 timesteps. Return = -100.80065354439678
Episode 543 finished after 87 timesteps. Return = -115.62451413522399
Episode 544 finished after 90 timesteps. Return = -224.50738097013908
Episode 545 finished after 136 timesteps. Return = -170.03072775256896
Episode 546 finished after 86 timesteps. Return = -117.82303255624005
Episode 547 finished after 76 timesteps. Return = -101.49954226029566
Episode 548 finished after 121 timesteps. Return = -351.3109949477501
Episode 549 finished after 115 timesteps. Return = -288.35421335617366
Episode 550 finished after 154 timesteps. Return = -90.24164911156849
Episode 551 finished after 109 timesteps. Return = -136.11889821337752
Episode 552 finished

Episode 661 finished after 87 timesteps. Return = -130.6173668827214
Episode 662 finished after 85 timesteps. Return = -93.85414018039091
Episode 663 finished after 69 timesteps. Return = -74.74660038738372
Episode 664 finished after 120 timesteps. Return = -107.96117804753933
Episode 665 finished after 106 timesteps. Return = -154.43750635472455
Episode 666 finished after 84 timesteps. Return = -117.11207266445787
Episode 667 finished after 91 timesteps. Return = -73.8618263075322
Episode 668 finished after 73 timesteps. Return = -79.65439858586
Episode 669 finished after 117 timesteps. Return = -80.93030519506138
Episode 670 finished after 102 timesteps. Return = -85.23760695344275
Episode 671 finished after 80 timesteps. Return = -83.89798939049018
Episode 672 finished after 80 timesteps. Return = -166.71425123772326
Episode 673 finished after 82 timesteps. Return = -59.66581891297703
Episode 674 finished after 98 timesteps. Return = -150.42571363186573
Episode 675 finished after 12

Episode 782 finished after 79 timesteps. Return = -117.37999456370474
Episode 783 finished after 95 timesteps. Return = -130.38593666741738
Episode 784 finished after 103 timesteps. Return = -96.09237317316898
Episode 785 finished after 74 timesteps. Return = -142.23620246397783
Episode 786 finished after 115 timesteps. Return = -92.03309801849298
Episode 787 finished after 88 timesteps. Return = -107.41643200495591
Episode 788 finished after 86 timesteps. Return = -139.69744376489908
Episode 789 finished after 97 timesteps. Return = -123.72157641736109
Episode 790 finished after 93 timesteps. Return = -120.79892463030427
Episode 791 finished after 87 timesteps. Return = -130.27420911303028
Episode 792 finished after 88 timesteps. Return = -137.91168479570555
Episode 793 finished after 101 timesteps. Return = -120.71740516619309
Episode 794 finished after 96 timesteps. Return = -80.82493692206626
Episode 795 finished after 109 timesteps. Return = -149.11092313915734
Episode 796 finishe

Episode 900 finished after 126 timesteps. Return = -108.30144338222992
Episode 901 finished after 99 timesteps. Return = -106.63342537632076
Episode 902 finished after 87 timesteps. Return = -128.97894228085664
Episode 903 finished after 113 timesteps. Return = -124.14824353594943
Episode 904 finished after 72 timesteps. Return = -64.38189966695268
Episode 905 finished after 121 timesteps. Return = -92.82604242318571
Episode 906 finished after 113 timesteps. Return = -156.54102744497257
Episode 907 finished after 105 timesteps. Return = -99.33723608418241
Episode 908 finished after 97 timesteps. Return = -112.0128312615972
Episode 909 finished after 74 timesteps. Return = -132.88954082693078
Episode 910 finished after 75 timesteps. Return = -36.918146940405684
Episode 911 finished after 71 timesteps. Return = -90.27002102387226
Episode 912 finished after 77 timesteps. Return = -81.29284951649905
Episode 913 finished after 100 timesteps. Return = -107.32477913633728
Episode 914 finished

Episode 1021 finished after 94 timesteps. Return = -148.1697358608411
Episode 1022 finished after 120 timesteps. Return = -320.4204073437603
Episode 1023 finished after 81 timesteps. Return = -177.33526763895946
Episode 1024 finished after 106 timesteps. Return = -100.20366602293937
Episode 1025 finished after 111 timesteps. Return = -87.17057016521096
Episode 1026 finished after 111 timesteps. Return = -157.57666083493586
Episode 1027 finished after 92 timesteps. Return = -46.759221996052446
Episode 1028 finished after 98 timesteps. Return = -116.93996711414306
Episode 1029 finished after 80 timesteps. Return = -93.13513219540724
Episode 1030 finished after 80 timesteps. Return = -110.67894062859587
Episode 1031 finished after 75 timesteps. Return = -87.30072585057997
Episode 1032 finished after 99 timesteps. Return = -124.11892877770366
Episode 1033 finished after 71 timesteps. Return = -65.898185367949
Episode 1034 finished after 99 timesteps. Return = -100.36596116838372
Episode 10

Episode 1137 finished after 65 timesteps. Return = -70.93526387082228
Episode 1138 finished after 73 timesteps. Return = -76.90519224239151
Episode 1139 finished after 86 timesteps. Return = -138.71409182004297
Episode 1140 finished after 94 timesteps. Return = -75.94267767346311
Episode 1141 finished after 71 timesteps. Return = -45.98552311842312
Episode 1142 finished after 74 timesteps. Return = -82.17209203435809
Episode 1143 finished after 85 timesteps. Return = -183.32424429702266
Episode 1144 finished after 79 timesteps. Return = -54.53327770619048
Episode 1145 finished after 67 timesteps. Return = -114.6068010838897
Episode 1146 finished after 94 timesteps. Return = -72.8840981670509
Episode 1147 finished after 98 timesteps. Return = -107.00700619326832
Episode 1148 finished after 101 timesteps. Return = -115.38352622368863
Episode 1149 finished after 82 timesteps. Return = -103.25008742771097
Episode 1150 finished after 86 timesteps. Return = -50.6668225024172
Episode 1151 fin

Episode 1258 finished after 65 timesteps. Return = -167.7356964707923
Episode 1259 finished after 88 timesteps. Return = -130.73658385057405
Episode 1260 finished after 113 timesteps. Return = -108.54111374991408
Episode 1261 finished after 112 timesteps. Return = -248.27608509841087
Episode 1262 finished after 94 timesteps. Return = -178.45475142051487
Episode 1263 finished after 120 timesteps. Return = -141.89066209358657
Episode 1264 finished after 112 timesteps. Return = -112.26808965026969
Episode 1265 finished after 94 timesteps. Return = -52.86797669594867
Episode 1266 finished after 115 timesteps. Return = -335.8527348761071
Episode 1267 finished after 79 timesteps. Return = -106.81377140643919
Episode 1268 finished after 113 timesteps. Return = -161.89595672794405
Episode 1269 finished after 81 timesteps. Return = -95.65815804723154
Episode 1270 finished after 88 timesteps. Return = -73.21008864794658
Episode 1271 finished after 90 timesteps. Return = -97.5641696567173
Episode

Episode 1374 finished after 104 timesteps. Return = -127.94215198431039
Episode 1375 finished after 69 timesteps. Return = -88.72617522255109
Episode 1376 finished after 85 timesteps. Return = -88.03940758340664
Episode 1377 finished after 73 timesteps. Return = -26.450869100492454
Episode 1378 finished after 89 timesteps. Return = -146.21552286076312
Episode 1379 finished after 93 timesteps. Return = -126.36690469337128
Episode 1380 finished after 63 timesteps. Return = -25.675022023084395
Episode 1381 finished after 79 timesteps. Return = -54.4277516791889
Episode 1382 finished after 82 timesteps. Return = -1.4824028945792378
Episode 1383 finished after 104 timesteps. Return = -121.95081241044485
Episode 1384 finished after 82 timesteps. Return = -157.21416664760423
Episode 1385 finished after 96 timesteps. Return = -107.85132004856965
Episode 1386 finished after 91 timesteps. Return = -132.92378422561978
Episode 1387 finished after 93 timesteps. Return = -137.80964221797893
Episode 

Episode 1492 finished after 93 timesteps. Return = -107.11161668241849
Episode 1493 finished after 88 timesteps. Return = -108.66800976450689
Episode 1494 finished after 83 timesteps. Return = -111.21387411588312
Episode 1495 finished after 82 timesteps. Return = -104.53991167241912
Episode 1496 finished after 107 timesteps. Return = -365.7714839161973
Episode 1497 finished after 85 timesteps. Return = -110.77974304896233
Episode 1498 finished after 91 timesteps. Return = -103.20931802239178
Episode 1499 finished after 91 timesteps. Return = -116.07509508678928
Episode 1500 finished after 70 timesteps. Return = -49.45383215492371
Episode 1501 finished after 104 timesteps. Return = -96.06422631407328
Episode 1502 finished after 79 timesteps. Return = -79.55093853813581
Episode 1503 finished after 101 timesteps. Return = -79.16352720902586
Episode 1504 finished after 63 timesteps. Return = -41.46983049484005
Episode 1505 finished after 80 timesteps. Return = -109.62457974202938
Episode 1

Episode 1613 finished after 84 timesteps. Return = -211.29846119349193
Episode 1614 finished after 77 timesteps. Return = -32.94418259612321
Episode 1615 finished after 100 timesteps. Return = -71.83381226581257
Episode 1616 finished after 84 timesteps. Return = -120.9663329089205
Episode 1617 finished after 77 timesteps. Return = -74.28271260029808
Episode 1618 finished after 77 timesteps. Return = -83.0175763394177
Episode 1619 finished after 85 timesteps. Return = -88.06714654193914
Episode 1620 finished after 84 timesteps. Return = -162.93787911467166
Episode 1621 finished after 80 timesteps. Return = -81.70377072028107
Episode 1622 finished after 105 timesteps. Return = 3.097057767985987
Episode 1623 finished after 78 timesteps. Return = -111.31217012797461
Episode 1624 finished after 94 timesteps. Return = -211.13948917500167
Episode 1625 finished after 89 timesteps. Return = -122.97240879711653
Episode 1626 finished after 94 timesteps. Return = -106.22528929521127
Episode 1627 f

Episode 1732 finished after 84 timesteps. Return = -78.11158600397269
Episode 1733 finished after 79 timesteps. Return = -144.84975002214435
Episode 1734 finished after 107 timesteps. Return = -127.3232218181309
Episode 1735 finished after 89 timesteps. Return = -62.79456921096202
Episode 1736 finished after 72 timesteps. Return = -111.14542208274011
Episode 1737 finished after 79 timesteps. Return = -54.62664268834686
Episode 1738 finished after 86 timesteps. Return = -150.41161611430306
Episode 1739 finished after 94 timesteps. Return = -139.39445262770633
Episode 1740 finished after 78 timesteps. Return = 6.738651382351449
Episode 1741 finished after 86 timesteps. Return = -146.15339724218296
Episode 1742 finished after 89 timesteps. Return = -110.4805119372252
Episode 1743 finished after 93 timesteps. Return = -132.7657756714523
Episode 1744 finished after 1000 timesteps. Return = 30.987893120418523
Episode 1745 finished after 98 timesteps. Return = -100.54981454002439
Episode 1746

Episode 1848 finished after 1000 timesteps. Return = 136.11512464106445
Episode 1849 finished after 135 timesteps. Return = -229.95569111526152
Episode 1850 finished after 126 timesteps. Return = -164.52212430966367
Episode 1851 finished after 136 timesteps. Return = -65.9732796068325
Episode 1852 finished after 124 timesteps. Return = -140.04861459458465
Episode 1853 finished after 126 timesteps. Return = -28.13271181454843
Episode 1854 finished after 99 timesteps. Return = -153.89201587386788
Episode 1855 finished after 77 timesteps. Return = -152.87737499900635
Episode 1856 finished after 81 timesteps. Return = -44.043658325072784
Episode 1857 finished after 75 timesteps. Return = -72.27639464737882
Episode 1858 finished after 83 timesteps. Return = -197.64694842894284
Episode 1859 finished after 78 timesteps. Return = -60.51221649973678
Episode 1860 finished after 83 timesteps. Return = -37.583768588556495
Episode 1861 finished after 77 timesteps. Return = -41.91808731907081
Episod

Episode 1968 finished after 89 timesteps. Return = -35.12474721163788
Episode 1969 finished after 106 timesteps. Return = -187.17244960436506
Episode 1970 finished after 115 timesteps. Return = -251.96833662070193
Episode 1971 finished after 95 timesteps. Return = -56.695087978048676
Episode 1972 finished after 81 timesteps. Return = -95.1098894039616
Episode 1973 finished after 107 timesteps. Return = -178.9891311724874
Episode 1974 finished after 123 timesteps. Return = -223.30274533870747
Episode 1975 finished after 70 timesteps. Return = -48.96222883648506
Episode 1976 finished after 80 timesteps. Return = -123.53415040959284
Episode 1977 finished after 75 timesteps. Return = -4.8288500032496415
Episode 1978 finished after 85 timesteps. Return = -155.30530456034592
Episode 1979 finished after 88 timesteps. Return = -140.1121290879525
Episode 1980 finished after 111 timesteps. Return = -143.0361883212949
Episode 1981 finished after 76 timesteps. Return = -99.67249564037765
Episode 1

Episode 2086 finished after 121 timesteps. Return = -237.26226323361652
Episode 2087 finished after 88 timesteps. Return = -295.53249398916984
Episode 2088 finished after 108 timesteps. Return = -212.084833074075
Episode 2089 finished after 112 timesteps. Return = -104.72452356164585
Episode 2090 finished after 121 timesteps. Return = -231.5455355014792


In [165]:
#torch.save(agent.state_dict(), 'checkpoints/lunar_lander_64x64_checkpoint_0.pt')
#agent.load_state_dict(torch.load('checkpoints/lunar_lander_32x32_checkpoint_0.pt'))

In [179]:
# Watch the greedy policy, commanding a return of 150 within a horizon of 400 steps.
visualise_agent(greedy_policy, command=[150, 400], n=5)

Episode 0 finished after 833 timesteps. Return = -360.375805006002


In [181]:
# Same command as above, but sampling actions instead of taking the most probable one.
visualise_agent(stochastic_policy, command=[150, 400], n=5)

Episode 0 finished after 213 timesteps. Return = -27.11786348432311
Episode 1 finished after 271 timesteps. Return = -226.95918413035955
Episode 2 finished after 325 timesteps. Return = -26.1805768711521


In [130]:
# Inspect the returns of the episodes currently held in the replay buffer.
print([mem['return'] for mem in replay_buffer])

# Previous Code

In [None]:
def train_net(policy_net, replay_buffer, n_updates=100, batch_size=64):
    """Earlier batched trainer for a network that takes one concatenated input.

    Every sample is a random segment (random start, random horizon) of a
    random episode. The network input is the observation at the segment start
    followed by [sum of rewards over the segment, horizon]; the label is the
    action taken at the segment start. Returns the mean cross-entropy cost
    over all updates.
    """
    costs = []
    for _ in range(n_updates):
        inputs = np.zeros((batch_size, np.prod(env.observation_space.shape) + 2))
        labels = np.zeros((batch_size))
        for row in range(batch_size):
            # Draw order (episode, horizon, start) is kept as it was.
            episode = replay_buffer[np.random.randint(0, len(replay_buffer))]
            n_steps = len(episode['observation'])
            horizon = np.random.randint(1, n_steps + 1)
            start = np.random.randint(0, n_steps + 1 - horizon)
            segment_return = sum(episode['reward'][start:start + horizon])
            inputs[row] = np.append(episode['observation'][start], [segment_return, horizon])
            labels[row] = episode['action'][start]
        input_tensor = torch.tensor(inputs).double()
        label_tensor = torch.tensor(labels).long()
        cost = F.cross_entropy(policy_net(input_tensor), label_tensor)
        costs.append(cost.item())
        cost.backward()
        policy_net.optimizer.step()
        policy_net.optimizer.zero_grad()
    return np.mean(costs)

In [8]:
def train_net(policy_net, episode_mem, n_samples = 5): #stochastic gradient descent
    """Take `n_samples` single-sample gradient steps on one episode.

    Each step draws a random segment (random horizon, then random start) of
    the episode, feeds the start observation followed by [segment reward sum,
    horizon] to `policy_net`, and minimises the binary cross-entropy against
    the action taken at the segment start. Returns the mean cost.
    """
    observations = episode_mem['observation']
    rewards = episode_mem['reward']
    actions = episode_mem['action']
    n_steps = len(observations)

    costs = []
    for _ in range(n_samples):
        horizon = np.random.randint(1, n_steps + 1)
        start = np.random.randint(0, n_steps + 1 - horizon)
        segment_return = sum(rewards[start:start + horizon])
        features = np.append(observations[start], [segment_return, horizon])
        network_input = torch.tensor(features).double()
        target = torch.tensor([actions[start]]).double()

        cost = F.binary_cross_entropy(policy_net(network_input), target)
        costs.append(cost.item())
        cost.backward()
        policy_net.optimizer.step()
        policy_net.optimizer.zero_grad()
    return np.mean(costs)
    

In [33]:
def train(policy_net, n_episodes=100):
    """Earlier online loop: play an episode, then train on it straight away.

    Relies on the globals `env`, `desired_reward`, `command_horizon`,
    `epsilon` and `i_episode`. Actions are binary: sampled from the network's
    output probability and replaced by a uniformly random action with
    probability `epsilon`, which is multiplied by 0.999 after every step.
    """
    global i_episode
    global epsilon
    try:
        for _ in range(n_episodes):
            observation = env.reset()
            episode_mem = {'observation': [], 'action': [], 'reward': [], 'done': []}
            done = False
            while not done:
                features = np.append(observation, [desired_reward, command_horizon])
                action_prob = policy_net(torch.tensor(features).double())
                # Draw order (binomial, rand, optional randint) is kept as it was.
                action = np.random.binomial(1, action_prob.item())
                #action = int(action_prob.item()>0.5)
                if np.random.rand() < epsilon:
                    action = np.random.randint(0, 2)
                next_observation, reward, done, info = env.step(action)

                # The stored observation is the one the action was chosen from.
                episode_mem['observation'].append(observation)
                episode_mem['action'].append(action)
                episode_mem['reward'].append(reward)
                episode_mem['done'].append(done)

                observation = next_observation
                epsilon *= 0.999
            episode_mem['return'] = sum(episode_mem['reward'])
            episode_mem['episode_len'] = len(episode_mem['observation'])
            mean_cost = train_net(policy_net, episode_mem)

            i_episode += 1
            print("Episode {} finished after {} timesteps. Epsilon={} Mean Cost={}".format(i_episode, episode_mem['episode_len'], epsilon, mean_cost))
        env.close()
    except KeyboardInterrupt:
        env.close()

## Previous version

In [None]:
import gym
import time
import torch
import numpy as np
from copy import deepcopy
import torch.nn.functional as F

# Module-level environment for this earlier version; the definitions below read it as a global.
env = gym.make('CartPole-v1')

def random_policy(obs):
    """Return a uniformly random action index; `obs` is accepted but not used."""
    n_actions = env.action_space.n
    return np.random.randint(n_actions)

#Render a policy acting in the environment
def visualise_agent(policy, command, n=5):
    """Render `n` episodes of `policy` acting in the global `env`.

    `command` is a two-element list. A per-episode copy of it is appended to
    each observation before calling `policy`; after every step the first entry
    is reduced by the reward received and the second is decremented by one,
    never dropping below 1. The caller's `command` is left untouched.

    Prints the length and return of every episode. A KeyboardInterrupt stops
    the rendering quietly; the environment is closed in either case.
    """
    try:
        for trial_i in range(n):
            remaining = deepcopy(command)
            state = env.reset()
            steps = 0
            total_reward = 0
            finished = False
            while not finished:
                env.render()
                policy_input = torch.tensor(np.append(state, remaining)).double()
                chosen_action = policy(policy_input)
                state, reward, finished, _info = env.step(chosen_action)
                total_reward += reward

                # Update the command to reflect what is still left to achieve.
                remaining[0] -= reward
                remaining[1] = max(1, remaining[1] - 1)

                steps += 1
            # Show the terminal frame for a moment before the next trial.
            env.render()
            time.sleep(1.5)
            print("Episode {} finished after {} timesteps. Return = {}".format(trial_i, steps, total_reward))
        env.close()
    except KeyboardInterrupt:
        env.close()
        
#Behaviour function - Neural Network
class FCNN_AGENT(torch.nn.Module):
    """Fully connected behaviour function.

    The input is a flattened observation followed by two command values; the
    output is a probability distribution over the environment's actions.
    """

    def __init__(self):
        super().__init__()
        # Observation features plus the two command entries.
        n_inputs = np.prod(env.observation_space.shape) + 2
        n_actions = env.action_space.n
        hidden = 10
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(n_inputs, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, n_actions),
        )

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        logits = self.fc(x)
        return F.softmax(logits, dim=-1)

    def create_optimizer(self, lr):
        """Attach an Adam optimizer over this module's parameters as `self.optimizer`."""
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

#Fill the replay buffer with more experience
def collect_experience(policy, replay_buffer, replay_size, last_few, n_episodes=100, log_to_tensorboard=True):
    """Run `policy` for `n_episodes` episodes and add them to the replay buffer.

    A fresh command is drawn with `sample_command` for every episode. Commands
    are sampled from the buffer as it was when this function was called, so
    episodes gathered during this call do not influence them. During an
    episode both command entries are updated after each step and clamped so
    they never fall below 1.

    Args:
        policy: callable mapping the network input tensor (observation
            followed by the two command values, as doubles) to an action.
        replay_buffer: list of episode dicts. New episodes are appended to
            this list in place.
        replay_size: number of highest-return episodes to keep in the result.
        last_few: forwarded to `sample_command`.
        n_episodes: number of episodes to run.
        log_to_tensorboard: when true, scalars are written to the global
            `writer`.

    Returns:
        A new list holding the `replay_size` highest-return episodes, sorted
        by ascending return.

    A KeyboardInterrupt stops collection early; the episodes completed so far
    are still returned. The environment is closed on every exit path.
    """
    global i_episode
    # Only read by sample_command, so a shallow snapshot is enough; deep-copying
    # every stored episode on each call is needless work.
    command_source = list(replay_buffer)
    try:
        for _ in range(n_episodes):
            command = sample_command(command_source, last_few)
            if log_to_tensorboard:
                writer.add_scalar('Command desired reward', command[0], i_episode)
                writer.add_scalar('Command horizon', command[1], i_episode)
            observation = env.reset()
            episode_mem = {'observation': [],
                           'action': [],
                           'reward': []}
            done = False
            while not done:
                network_input = torch.tensor(np.append(observation, command)).double()
                action = policy(network_input)
                new_observation, reward, done, info = env.step(action)

                # Store the observation the action was chosen from.
                episode_mem['observation'].append(observation)
                episode_mem['action'].append(action)
                episode_mem['reward'].append(reward)

                observation = new_observation
                command[0] = max(1, command[0] - reward)
                command[1] = max(1, command[1] - 1)

            episode_return = sum(episode_mem['reward'])
            episode_len = len(episode_mem['observation'])
            episode_mem['return'] = episode_return
            episode_mem['episode_len'] = episode_len
            replay_buffer.append(episode_mem)
            i_episode += 1
            if log_to_tensorboard:
                writer.add_scalar('Return', episode_return, i_episode)
            print("Episode {} finished after {} timesteps. Return = {}".format(i_episode, episode_len, episode_return))
    except KeyboardInterrupt:
        # Deliberate: allow the user to cut collection short and keep what was gathered.
        pass
    finally:
        # Close the environment even when an unexpected error propagates.
        env.close()
    return sorted(replay_buffer, key=lambda x: x['return'])[-replay_size:]

#Sample exploratory command
def sample_command(replay_buffer, last_few):
    """Draw a `[desired_reward, horizon]` command from the tail of the buffer.

    With an empty buffer the command is `[1, 1]`. Otherwise the last
    `last_few` episodes are used: the horizon is their mean length and the
    desired reward is drawn uniformly between their mean return and that mean
    plus one standard deviation.
    """
    if len(replay_buffer) == 0:
        return [1, 1]

    recent = replay_buffer[-last_few:]
    episode_lengths = [episode['episode_len'] for episode in recent]
    episode_returns = [episode['return'] for episode in recent]

    horizon = np.mean(episode_lengths)
    lower = np.mean(episode_returns)
    spread = np.std(episode_returns)
    target = np.random.uniform(lower, lower + spread)
    return [target, horizon]

#Improve behaviour function by training on replay buffer
def train_net(policy_net, replay_buffer, n_updates=100, batch_size=64):
    """Fit `policy_net` to the actions stored in `replay_buffer`.

    Each training example is built from a random episode and a random start
    step within it: the observation at that step, the sum of rewards from that
    step to the end of the episode, and the number of steps remaining form the
    input; the action taken at that step is the label.

    Args:
        policy_net: module whose forward pass returns action probabilities
            (as `FCNN_AGENT.forward` does via softmax) and which exposes an
            `optimizer` attribute.
        replay_buffer: non-empty sequence of episode dicts with
            'observation', 'action' and 'reward' lists.
        n_updates: number of gradient steps.
        batch_size: examples per gradient step; must be at least 1.

    Returns:
        Mean loss over all updates.

    Raises:
        ValueError: if `replay_buffer` is empty.
    """
    if len(replay_buffer) == 0:
        raise ValueError("train_net needs at least one episode in replay_buffer")

    all_costs = []
    for _ in range(n_updates):
        batch_rows = []
        batch_actions = []
        for _ in range(batch_size):
            episode = replay_buffer[np.random.randint(0, len(replay_buffer))]
            episode_len = len(episode['observation'])
            t_start = np.random.randint(0, episode_len)
            horizon = episode_len - t_start
            reward_to_go = sum(episode['reward'][t_start:episode_len])
            # np.append flattens the observation, so every row has the same width
            # without consulting the environment's observation space.
            batch_rows.append(np.append(episode['observation'][t_start], [reward_to_go, horizon]))
            batch_actions.append(episode['action'][t_start])
        batch_input = torch.tensor(np.stack(batch_rows)).double()
        batch_label = torch.tensor(np.array(batch_actions)).long()

        pred = policy_net(batch_input)
        # `pred` already holds probabilities. F.cross_entropy would apply
        # log_softmax on top of them (a second softmax), which flattens the
        # loss and its gradients, so take the negative log-likelihood of the
        # probabilities directly. The clamp keeps log() finite if a
        # probability underflows to exactly 0.
        cost = F.nll_loss(torch.log(pred.clamp_min(1e-12)), batch_label)
        all_costs.append(cost.item())
        cost.backward()
        policy_net.optimizer.step()
        policy_net.optimizer.zero_grad()
    return np.mean(all_costs)

#Return a greedy policy from a given network
def create_greedy_policy(policy_network):
    """Wrap `policy_network` in a policy that picks its highest-scoring action."""
    def policy(obs):
        scores = policy_network(obs).detach().numpy()
        return np.argmax(scores)
    return policy

#Return a stochastic policy from a given network
def create_stochastic_policy(policy_network):
    """Wrap `policy_network` in a policy that samples an action from its output.

    The network output is used as the probability vector of a single
    multinomial draw; the index of the drawn category is the action.
    """
    def policy(obs):
        probabilities = policy_network(obs).detach().numpy()
        one_hot_draw = np.random.multinomial(1, probabilities)
        return np.argmax(one_hot_draw)
    return policy


#Define hyperparameters, initialize behaviour function
# i_episode is the global episode counter that collect_experience increments.
i_episode=0
replay_buffer = []
# replay_size: how many of the highest-return episodes collect_experience keeps.
replay_size = 600
# last_few: how many episodes from the end of the buffer sample_command draws on.
last_few = 50
log_to_tensorboard = False 

batch_size = 64
n_warm_up_episodes = 50
n_episodes_per_iter = 50
n_updates_per_iter = 100

lr = 0.001
# The network is cast to double because every input tensor is built with .double().
agent = FCNN_AGENT().double()
agent.create_optimizer(lr)

# Both policies wrap the same network, so training `agent` updates them too.
stochastic_policy = create_stochastic_policy(agent)
greedy_policy = create_greedy_policy(agent)

# SET UP TRAINING VISUALISATION
# The import and the writer are only created when logging is enabled;
# collect_experience reads `writer` as a global under the same flag.
if log_to_tensorboard: from torch.utils.tensorboard import SummaryWriter
if log_to_tensorboard: writer = SummaryWriter() # TensorBoard writer used for the scalar plots

#Collect warm up episodes
# The warm-up uses the random policy, then the network gets its first round of updates.
replay_buffer = collect_experience(random_policy, replay_buffer, replay_size, last_few, n_warm_up_episodes, log_to_tensorboard)
train_net(agent, replay_buffer, n_updates=n_updates_per_iter, batch_size=batch_size)

#Collect experience and train behaviour function for given number of iterations
n_iters = 1000
for i in range(n_iters):
    # Alternate between gathering episodes with the current stochastic policy and fitting the network.
    replay_buffer = collect_experience(stochastic_policy, replay_buffer, replay_size, last_few, n_episodes_per_iter, log_to_tensorboard)
    train_net(agent, replay_buffer, n_updates=n_updates_per_iter, batch_size=batch_size)

#Visualise final trained agent
# command is [desired reward, desired horizon].
visualise_agent(greedy_policy, command=[250, 200], n=5)
