In [1]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,os,time,datetime,sys

import torch
import pandas as pd

import gym

warnings.filterwarnings('ignore')

In [2]:
class Net(torch.nn.Module):
    def __init__(self,input_shape,output_shape):
        super(Net,self).__init__()
        self.l_in = torch.nn.Linear(input_shape,16)
        self.l0 = torch.nn.Linear(16,8)
        self.l_out = torch.nn.Linear(8,output_shape)
        self.optimizer = torch.optim.Adam(self.parameters(),lr=0.05)
    def forward(self,x):
        x = torch.nn.functional.relu(self.l_in(x))
        x = torch.nn.functional.relu(self.l0(x))
        x = torch.nn.functional.softmax(self.l_out(x))
        return x
    def discount_rewards(self,rewards,gamma=0.99):
        r = np.array([gamma**k*rewards[k] for k in range(len(rewards))])
        r = r[::-1].cumsum()[::-1]
        return r-r.mean()
    def action_probs_as_numpy(self,s):
        action_p = self(torch.FloatTensor(s)).detach().numpy()
        return action_p
    def train(self,states,rewards,actions):
        self.optimizer.zero_grad()
        state_t = torch.FloatTensor(states)
        reward_t = torch.FloatTensor(rewards)
        action_t = torch.LongTensor(actions)
        logprob = torch.log(net(state_t))
        selected_logprobs = reward_t*torch.gather(logprob,1,action_t.reshape(action_t.shape[0],1)).squeeze()
        loss = -selected_logprobs.mean()
        loss.backward()
        self.optimizer.step()
        return

In [218]:
NUM_EPISODES = 500
BATCH_SIZE = 3

In [219]:
net = Net(2,3)

In [220]:
env = gym.make('MountainCar-v0')

total_rewards = []
batch_rewards = []
batch_actions = []
batch_states = []

viz_step = int(5**np.floor(np.log10(NUM_EPISODES)))
batch_counter = 1

episodes = 0
while episodes<NUM_EPISODES:

    s0 = env.reset()

    states = []
    rewards = []
    actions = []

    done = False
    while not done:

        ap = net.action_probs_as_numpy(s0)
        #if rnd.rand()<0.25: ap[1] = 0
        #if ap.sum()==0: ap = np.ones(shape=[3])
        action_probs = ap
        action_c = rnd.choice(np.arange(env.action_space.n),p=action_probs)

        s1,r,done,_ = env.step(action_c)
        r = r + 0.5/(s1[0]-0.5)

        states.append(s0)
        rewards.append(r)
        actions.append(action_c)

        s0 = s1

        if done:

            total_rewards.append(sum(rewards))

            batch_states.extend(states)
            batch_rewards.extend(net.discount_rewards(rewards,gamma=1.2))
            batch_actions.extend(actions)

            batch_counter += 1

            if batch_counter==BATCH_SIZE:
                net.train(batch_states,batch_rewards,batch_actions)
                batch_states = []
                batch_rewards = []
                batch_actions = []
                batch_counter = 1
            episodes += 1
    if episodes%viz_step==0: print('{}/{} -> Avg rewards: {}'.format(episodes,NUM_EPISODES,np.mean(total_rewards[-viz_step:])))
env.close()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
25/500 -> Avg rewards: -290.44587135423734
50/500 -> Avg rewards: -285.979468977219
75/500 -> Avg rewards: -285.92203946718257
100/500 -> Avg rewards: -286.033462418021
125/500 -> Avg rewards: -285.99775238216466
150/500 -> Avg rewards: -285.95232146930397
175/500 -> Avg rewards: -286.0035075297968
200/500 -> Avg rewards: -285.9481456513953
225/500 -> Avg rewards: -286.01512232418315
250/500 -> Avg rewards: -285.96784960743685
275/500 -> Avg rewards: -286.02444232330237
300/500 -> Avg rewards: -285.98639611725105
325/500 -> Avg rewards: -285.9247707969297
350/500 -> Avg rewards: -285.9277821539909
375/500 -> Avg rewards: -285.9086911858145
400/500 -> Avg rewards: -285.96288333758434
425/500 -> Avg rewards: -285.9602195138696
450/500 -> Avg rewards: -285.931076544184
475/500 -> Avg rewards: -285.96052784724947
500/500 -> Avg rewards: -285.9900399811576


In [222]:
env = gym.make('MountainCar-v0')

s0 = env.reset()

states = []
rewards = []
actions = []
actionprobs = []

done = False
while not done:

    ap = net.action_probs_as_numpy(s0)
    action_probs = ap/ap.sum()
    action_c = rnd.choice(np.arange(env.action_space.n),p=action_probs)

    s1,r,done,_ = env.step(action_c)
    env.render()

    states.append(s0)
    rewards.append(r)
    actions.append(action_c)
    actionprobs.append(action_probs)

    s0 = s1

    if done: 
        net.train(states,rewards,actions)
env.close()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [223]:
rewards

[-1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,
 -1.0,