In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from IPython.display import clear_output
from time import sleep


#Hyperparameters
learning_rate = 0.0002
gamma         = 0.98


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        
        
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x
      

        


env = gym.make('CartPole-v1')
pi = Policy().to(device)
optimizer = optim.Adam(pi.parameters(), lr=learning_rate)

score = 0.0
print_interval = 20
    

scores = []
for n_epi in range(10000):
    s = env.reset()
    done = False
    data = []
    while not done: # CartPole-v1 forced to terminates at 500 step.
        prob = pi(torch.from_numpy(s).float().to(device))
        m = Categorical(prob)
        a = m.sample()
        s_prime, r, done, info = env.step(a.item())
        data.append((r,prob[a]))
        s = s_prime
        score += r
        
            
    G = 0
    optimizer.zero_grad()
    for r, prob in data[::-1]:
        G = r + gamma * G
        loss = -torch.log(prob).to(device) * G
        loss.backward()
    optimizer.step()
    data = []
    
    if n_epi%print_interval==0 and n_epi!=0:
        clear_output(wait=True)
        print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
        scores.append(score)
        score = 0.0
env.close()





# of episode :9980, avg score : 491.65


In [None]:
import matplotlib.pyplot as plt
plt.plot(range(len(scores)), np.array(scores), 'b', linewidth = 2, label = 'REINFORCE')
plt.legend(prop={'size':12})
plt.xlabel('Episode')
plt.ylabel('Total rewards')
#plt.xlim(0, no_of_episodes)
#plt.ylim(0, 20000)
#plt.legend(['Double DQN', 'Dueling DQN', 'D3QN'], loc=4)
plt.grid(True)

In [3]:

# TEST   
episode = 0
state = env.reset()    
score = 0
while episode < 10:  # episode loop
    env.render()       
    prob = pi(torch.from_numpy(state).float().to(device))
    m = Categorical(prob)
    action = m.sample().item()
    next_state, reward, done, info = env.step(action)  # take a random action
    state = next_state
    sleep(0.01)
    score = score + reward
    if done:
        episode = episode + 1
        clear_output(wait=True)
        print('Episode: {} Score: {}'.format(episode, score))
        state = env.reset()
        score = 0
env.close()       


Episode: 10 Score: 500.0
