In [2]:
import numpy as np
from numpy import random as rnd
from matplotlib import pyplot as plt
import warnings,os,time,datetime,sys

import torch
import pandas as pd

import gym

In [3]:
class Net(torch.nn.Module):
    """Small policy network mapping an observation to action probabilities.

    Architecture: Linear(obs_dim -> 16) + ReLU, then Linear(16 -> n_actions)
    followed by a softmax over the action dimension.

    Parameters
    ----------
    env : object
        Only ``env.observation_space.shape[0]`` (input width) and
        ``env.action_space.n`` (number of discrete actions) are read.
    """
    def __init__(self,env):
        super(Net,self).__init__()
        self.l0 = torch.nn.Linear(env.observation_space.shape[0],16)
        self.l1 = torch.nn.Linear(16,env.action_space.n)
    def forward(self,x):
        """Return action probabilities for ``x``.

        ``x`` may be a single observation of shape ``(obs_dim,)`` or any
        batch of shape ``(..., obs_dim)``; the output has the same leading
        dimensions with the last one replaced by ``n_actions`` and sums to 1
        along that last dimension.
        """
        x = torch.nn.functional.relu(self.l0(x))
        # Normalise explicitly over the action (last) axis. Omitting `dim`
        # relies on a deprecated implicit choice that warns and, for 3-D
        # inputs, normalises over dim 0 instead of over the actions.
        x = torch.nn.functional.softmax(self.l1(x),dim=-1)
        return x

In [4]:
def discount_rewards(rewards,gamma=0.99):
    """Return mean-centred, discounted reward-to-go values for one episode.

    Each reward is first scaled by ``gamma**k`` where ``k`` is its absolute
    step index in the episode. Entry ``t`` of the result is then the sum of
    those scaled rewards from step ``t`` to the end, minus the mean of all
    such sums. Note the exponent is the absolute index ``k`` and not
    ``k - t``.

    Parameters
    ----------
    rewards : sequence of numbers
        Per-step rewards of a single episode, in time order.
    gamma : float, optional
        Discount factor (default 0.99).

    Returns
    -------
    numpy.ndarray
        One value per step, same length as ``rewards``.
    """
    weighted = np.array([gamma**step*reward for step,reward in enumerate(rewards)])
    # Suffix sums: accumulate from the last step backwards, then restore order.
    tail_sums = np.cumsum(weighted[::-1])[::-1]
    baseline = tail_sums.mean()
    return tail_sums-baseline

In [20]:
# Threshold the training loop compares its per-batch episode counter against
# to decide when to run one optimizer step on the accumulated transitions.
batch_size = 25

In [16]:
# Net sizes its layers from the environment's observation/action spaces, so
# the environment has to exist before the network is built. Creating it here
# keeps this cell runnable on a fresh kernel in top-to-bottom order instead
# of relying on an `env` left over from a later cell.
env = gym.make('CartPole-v0')

# Policy network and its optimizer (Adam, learning rate 0.01).
net = Net(env)
optimizer = torch.optim.Adam(net.parameters(),lr=0.01)

In [45]:
# REINFORCE training loop: roll out episodes with the current policy `net`,
# accumulate their transitions, and every `batch_size` episodes take one
# optimizer step on the reward-weighted log-probabilities of the actions taken.
env = gym.make('CartPole-v0')
action_space = np.arange(env.action_space.n)

# Undiscounted return of every finished episode, in order.
total_rewards = []

# Transitions accumulated since the last optimizer step.
batch_rewards = []
batch_actions = []
batch_states = []

# Number of episodes currently held in the batch. Starting at 0 (and testing
# with >= below) makes each update use exactly `batch_size` episodes; starting
# at 1 would trigger the update one episode early.
batch_counter = 0

episodes = 0
try:
    while episodes<500:

        s0 = env.reset()

        states = []
        rewards = []
        actions = []

        done = False

        while not done:

            # Acting needs no autograd graph; the forward pass is repeated
            # with gradients enabled at update time.
            with torch.no_grad():
                action_probs = net(torch.FloatTensor(s0)).numpy()
            action_c = rnd.choice(action_space,p=action_probs)

            s1,r,done,_ = env.step(action_c)

            states.append(s0)
            rewards.append(r)
            actions.append(action_c)

            s0 = s1

        # Episode finished: record it and add its transitions to the batch.
        total_rewards.append(sum(rewards))

        batch_states.extend(states)
        batch_rewards.extend(discount_rewards(rewards))
        batch_actions.extend(actions)

        batch_counter += 1

        if batch_counter>=batch_size:

            optimizer.zero_grad()

            # Go through contiguous NumPy arrays: building a tensor straight
            # from a Python list of ndarrays is slow and warns on newer torch.
            state_t = torch.from_numpy(np.asarray(batch_states,dtype=np.float32))
            reward_t = torch.from_numpy(np.asarray(batch_rewards,dtype=np.float32))
            action_t = torch.from_numpy(np.asarray(batch_actions,dtype=np.int64))

            # Pick the probability of each taken action *before* taking the
            # log. Logging the full matrix first lets a non-selected
            # probability that underflowed to 0 turn the backward pass into
            # 0 * inf = NaN even though it does not contribute to the loss.
            selected_probs = torch.gather(net(state_t),1,action_t.reshape(-1,1)).squeeze(1)
            selected_logprobs = reward_t*torch.log(selected_probs)
            loss = -selected_logprobs.mean()

            loss.backward()
            optimizer.step()

            batch_states = []
            batch_rewards = []
            batch_actions = []
            batch_counter = 0

        print('Avg rewards over previous 100 episodes: {}'.format(np.mean(total_rewards[-100:])))
        episodes += 1
finally:
    # Release the environment even if a rollout or update raises.
    env.close()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


  


Avg rewards over previous 100 episodes: 96.0
Avg rewards over previous 100 episodes: 128.5
Avg rewards over previous 100 episodes: 119.66666666666667
Avg rewards over previous 100 episodes: 139.75
Avg rewards over previous 100 episodes: 138.8
Avg rewards over previous 100 episodes: 132.16666666666666
Avg rewards over previous 100 episodes: 125.42857142857143
Avg rewards over previous 100 episodes: 122.75
Avg rewards over previous 100 episodes: 123.44444444444444
Avg rewards over previous 100 episodes: 131.1
Avg rewards over previous 100 episodes: 137.36363636363637
Avg rewards over previous 100 episodes: 142.58333333333334
Avg rewards over previous 100 episodes: 147.0
Avg rewards over previous 100 episodes: 148.42857142857142
Avg rewards over previous 100 episodes: 151.46666666666667
Avg rewards over previous 100 episodes: 147.375
Avg rewards over previous 100 episodes: 149.41176470588235
Avg rewards over previous 100 episodes: 147.55555555555554
Avg rewards over previous 100 episodes:

Avg rewards over previous 100 episodes: 187.97
Avg rewards over previous 100 episodes: 187.97
Avg rewards over previous 100 episodes: 188.63
Avg rewards over previous 100 episodes: 188.63
Avg rewards over previous 100 episodes: 188.5
Avg rewards over previous 100 episodes: 188.5
Avg rewards over previous 100 episodes: 188.5
Avg rewards over previous 100 episodes: 188.5
Avg rewards over previous 100 episodes: 188.92
Avg rewards over previous 100 episodes: 188.92
Avg rewards over previous 100 episodes: 189.15
Avg rewards over previous 100 episodes: 189.58
Avg rewards over previous 100 episodes: 189.58
Avg rewards over previous 100 episodes: 190.26
Avg rewards over previous 100 episodes: 190.51
Avg rewards over previous 100 episodes: 190.51
Avg rewards over previous 100 episodes: 190.51
Avg rewards over previous 100 episodes: 190.88
Avg rewards over previous 100 episodes: 190.88
Avg rewards over previous 100 episodes: 191.51
Avg rewards over previous 100 episodes: 192.14
Avg rewards over 

Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 197.9
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 100 episodes: 198.29
Avg rewards over previous 10

In [48]:
# Roll out and render a single episode, sampling actions from the policy `net`.
env = gym.make('CartPole-v0')
# Derive the action indices from this environment so the cell does not depend
# on a variable left behind by the training cell.
action_space = np.arange(env.action_space.n)

try:
    s0 = env.reset()
    env.render()

    done = False
    while not done:

        # Pure inference: no autograd graph required.
        with torch.no_grad():
            action_probs = net(torch.FloatTensor(s0)).numpy()
        action_c = rnd.choice(action_space,p=action_probs)

        s0,r,done,_ = env.step(action_c)
        env.render()
finally:
    # Always close the environment so the render window is released even if
    # the rollout raises part-way through.
    env.close()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


  
