In [13]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from IPython.display import clear_output
from time import sleep



#Hyperparameters
learning_rate = 0.0002
gamma         = 0.98
n_rollout     = 200





class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        
        self.fc1 = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v = nn.Linear(256,1)
        
    def pi(self, x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
    

def make_batch(data):
    s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
    for transition in data:
        s,a,r,s_prime,done = transition
        s_lst.append(s)
        a_lst.append([a])
        r_lst.append([r/100.0])
        s_prime_lst.append(s_prime)
        done_mask = 0.0 if done else 1.0
        done_lst.append([done_mask])
        
    s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                                           torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \
                                                           torch.tensor(done_lst, dtype=torch.float)

    return s_batch, a_batch, r_batch, s_prime_batch, done_batch
  
      
env = gym.make('CartPole-v1')
model = ActorCritic()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


print_interval = 20
score = 0.0
scores = []
data = []
for n_epi in range(3000):
    done = False
    s = env.reset()
    while not done:
        for t in range(n_rollout):
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)

            data.append((s,a,r,s_prime,done))
                
            s = s_prime
            score += r
                
            if done:
                break                     
            
            
    s, a, r, s_prime, done = make_batch(data)
    data = []
    td_target = r + gamma * model.v(s_prime) * done
    delta = td_target - model.v(s)
        
    pi = model.pi(s, softmax_dim=1)
    pi_a = pi.gather(1,a)
    loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(model.v(s), td_target.detach())

    optimizer.zero_grad()
    loss.mean().backward()
    optimizer.step()   
        
            
    if n_epi%print_interval==0 and n_epi!=0:
        clear_output(wait=True)
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
        scores.append(score)
        score = 0.0
env.close()


# of episode :2980, avg score : 461.5


In [None]:
import matplotlib.pyplot as plt
plt.plot(range(len(scores)), np.array(scores), 'b', linewidth = 2, label = 'AC')
plt.legend(prop={'size':12})
plt.xlabel('Episode')
plt.ylabel('Total rewards')
#plt.xlim(0, no_of_episodes)
#plt.ylim(0, 20000)
#plt.legend(['Double DQN', 'Dueling DQN', 'D3QN'], loc=4)
plt.grid(True)

In [None]:
# TEST   
episode = 0
state = env.reset()    
score = 0
while episode < 100:  # episode loop
    env.render()       
    prob = model.pi(torch.from_numpy(state).float())
    m = Categorical(prob)
    action = m.sample().item()
    next_state, reward, done, info = env.step(action)  # take a random action
    state = next_state
    score = score + reward
    if done:
        episode = episode + 1
        print('Episode: {} Score: {}'.format(episode, score))
        state = env.reset()
        score = 0
env.close() 