In [156]:
import gym
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical 
import torch.optim as optim

In [157]:
env = gym.make('CartPole-v1')

In [158]:
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

In [159]:
state_size

4

In [160]:
state = env.reset()

In [161]:
state

array([-0.0453813 , -0.024898  ,  0.02862003,  0.02088778])

In [162]:
class Agent(nn.Module):
    def __init__(self, state_size, action_size):
        super(Agent, self).__init__()
        self.fc1 = nn.Linear(state_size, 8)
        self.fc2 = nn.Linear(8, action_size)
    
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        return F.softmax(x, dim=1)
    
    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        action_probs = agent(state)
        m = Categorical(action_probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        return action.item(), log_prob

In [200]:
agent = Agent(state_size, action_size)
optimizer = optim.Adam(agent.parameters(), lr=1e-2)

def reinforce(n_episodes=1000, gamma=0.9, max_t=1000):
    scores_window = deque(maxlen=100)
    scores = []
    for episode in range(1, n_episodes+1):
        state = env.reset()
        log_probs = []
        rewards = []
        for i in range(max_t):
            action, prob = agent.get_action(state)
            log_probs.append(prob)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            rewards.append(reward)
            if done:
                break
        
        scores_window.append(sum(rewards))
        scores.append(sum(rewards))
        discounts = [gamma * i for i in range(len(rewards)+1)]
        R = sum([x*y for x, y in zip(discounts, rewards)])
        
        loss = []
        for prob in log_probs:
            loss.append(-prob * R)
        loss = torch.cat(loss).sum()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print("\rEpisode : {}\tAverage Score: {:2f}".format(episode, np.mean(scores_window)), end="")
        
        if episode % 100==0:
            print("\rEpisode : {}\tAverage Score:{:2f}".format(episode, np.mean(scores_window)))
        
    

In [201]:
reinforce(n_episodes=2000)

Episode : 100	Average Score:25.5000000
Episode : 200	Average Score:38.1300000
Episode : 300	Average Score:52.4300000
Episode : 400	Average Score:92.0700000
Episode : 500	Average Score:57.1500000
Episode : 600	Average Score:35.1000000
Episode : 700	Average Score:254.5700000
Episode : 800	Average Score:492.7900000
Episode : 900	Average Score:473.4100000
Episode : 1000	Average Score:500.0000000
Episode : 1100	Average Score:497.2900000
Episode : 1200	Average Score:500.0000000
Episode : 1300	Average Score:427.0300000
Episode : 1400	Average Score:460.7900000
Episode : 1500	Average Score:487.4800000
Episode : 1600	Average Score:490.3500000
Episode : 1700	Average Score:471.2100000
Episode : 1800	Average Score:470.8500000
Episode : 1900	Average Score:500.0000000
Episode : 2000	Average Score:500.0000000


In [203]:
state = env.reset()
score = 0
for i in range(1000):
    env.render()
    action, prob = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
    score += reward
    if done:
        break
print("Score : {}".format(score))
env.close()

Score : 500.0
