# Welcome!
Below, we will learn to implement and train a policy to play atari-pong, using only the pixels as input. We will use convolutional neural nets, multiprocessing, and pytorch to implement and train our policy. Let's get started!

(I strongly recommend you to try this notebook on the Udacity workspace first before running it locally on your desktop/laptop, as performance might suffer in different environments)

In [9]:
import math
import time
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import progressbar as pb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from apex import amp

%matplotlib inline

In [2]:
# check which device is being used.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using device: ",device)

using device:  cuda:0


In [3]:
env = gym.make('CartPole-v0')
print(env.action_space.n)
print(env.observation_space.shape)

2
(4,)


# Preprocessing
To speed up training, we can simplify the input by cropping the images and use every other pixel

In [4]:
state = env.reset()
while True:
    env.render()
    state, reward, done, _ = env.step(np.random.choice(env.env.action_space.n))
    if done:
        break
time.sleep(1)

env.close()

# Policy

## Exercise 1: Implement your policy
 
Here, we define our policy. The input is the stack of two different frames (which captures the movement), and the output is a number $P_{\rm right}$, the probability of moving left. Note that $P_{\rm left}= 1-P_{\rm right}$

In [10]:
class Actor(nn.Module):

    def __init__(self, num_input=4, num_output=2):
        super(Actor, self).__init__()
        
        num_hidden1 = 300
        num_hidden2 = 200

        self.fc1 = nn.Linear(num_input, num_hidden1)
        self.fc2 = nn.Linear(num_hidden1, num_hidden2)
        self.fc3 = nn.Linear(num_hidden2, num_output)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


In [112]:
class Critic(nn.Module):

    def __init__(self, num_input=4, num_output=1):
        super(Critic, self).__init__()
        
        num_hidden1 = 300
        num_hidden2 = 200

        self.fc1 = nn.Linear(num_input, num_hidden1)
        self.fc2 = nn.Linear(num_hidden1 + 1, num_hidden2)
        self.fc3 = nn.Linear(num_hidden2, num_output)

    def forward(self, state, action):
        xs = F.relu(self.fc1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x) 


# Function Definitions



# Training
We are now ready to train our policy!
WARNING: make sure to turn on GPU, which also enables multicore processing. It may take up to 45 minutes even with GPU enabled, otherwise it will take much longer!

In [128]:
episode = 5000

state = env.reset()

gamma = .99
actor_lr = 1e-4
critic_lr = 1e-4

actor = Actor().to(device)
actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)

critic = Critic().to(device)
critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

# keep track of progress
total_rewards = deque(maxlen=100)

critic_loss_fn = nn.MSELoss()


for i in range(1, episode+1):
    state = env.reset()
    total_reward = 0
    
    states = []
    actions = []
    next_actions = []
    next_states = []
    rewards = []
    dones = []
    log_probs = []
    done = False
    while not done:
        state_tensor = torch.tensor(state).float().to(device)

        probs_tensor = actor(state_tensor)
        m = Categorical(logits=probs_tensor)
        action_tensor = m.sample()
        action = action_tensor.detach().cpu().numpy()
        log_prob = m.log_prob(action_tensor).detach().cpu().item()
        
        # Step
        next_state, reward, done, _ = env.step(action)
        
        states.append(state)
        actions.append(action)
        next_states.append(next_state)
        dones.append(done)
        log_probs.append(log_prob)
        
        total_rewards.append(total_reward)
        state = next_state
        
        if done:
            reward = 1
        else:
            reward = 1
        rewards.append(reward)
        total_reward += reward

#     discounts = gamma ** np.arange(len(rewards))
#     rewards = np.asarray(rewards) * discounts
#     rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    rewards_future = np.array(rewards)


#     print(rewards_future)
#     break
    actions = np.array(actions)

    states_tensor = torch.tensor(states).float().to(device)
    actions_tensor = torch.tensor(actions).float().to(device).unsqueeze(1)
    next_states_tensor = torch.tensor(next_states).float().to(device)
    rewards_tensor = torch.tensor(rewards_future.tolist()).float().to(device).unsqueeze(1)
    dones_tensor = torch.tensor(dones).float().to(device).unsqueeze(1)

    # Critic target
    probs_tensor = actor(next_states_tensor)
    m = Categorical(logits=probs_tensor)
    next_actions_tensor = m.sample().unsqueeze(1)
        
    vs_next = critic(next_states_tensor, next_actions_tensor)
    vs_target = rewards_tensor + gamma * vs_next * (1 - dones_tensor)

    # Critic expected
    vs = critic(states_tensor, actions_tensor)

    critic_loss = critic_loss_fn(vs, vs_target)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    states_tensor = torch.tensor(states).float().to(device)
    next_states_tensor = torch.tensor(next_states).float().to(device)
    rewards_tensor = torch.tensor(rewards_future.tolist()).float().to(device).unsqueeze(1)
    dones_tensor = torch.tensor(dones).float().to(device).unsqueeze(1)
    log_probs_tensor = torch.tensor(log_probs).float().to(device).unsqueeze(1)
    
    # A(a, s)
    probs_tensor = actor(next_states_tensor)
    m = Categorical(logits=probs_tensor)
    next_actions_tensor = m.sample().unsqueeze(1)
#     advantage = rewards_tensor + gamma * critic(next_states_tensor, next_actions_tensor) * (1 - dones_tensor) - critic(states_tensor, actions_tensor)

#     actor_loss = (-log_probs_tensor * advantage)
    actor_loss = -log_probs_tensor * critic(states_tensor, actions_tensor)
    actor_loss = actor_loss.sum()
#     print(actor_loss)

    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()


    # display some progress every 20 iterations
    print("\rEpisode: {0:d}, score: {1:f}".format(i, np.mean(total_rewards)), end="")
    if i % 100 == 0:
        print("\rEpisode: {0:d}, score: {1:f}".format(i, np.mean(total_rewards)))


Episode: 100, score: 16.530000
Episode: 200, score: 21.900000
Episode: 300, score: 15.350000
Episode: 400, score: 12.170000
Episode: 500, score: 12.440000
Episode: 600, score: 32.340000
Episode: 700, score: 10.610000
Episode: 763, score: 17.290000

KeyboardInterrupt: 

In [127]:

state = env.reset()
while True:
    env.render()
    with torch.no_grad():
        state_tensor = torch.tensor(state, dtype=torch.float).to(device)

        # Decide action
        probs_tensor = actor(state_tensor)
        m = Categorical(logits=probs_tensor)
        action_tensor = m.sample()
        action = action_tensor.detach().cpu().numpy()

        # Step
        next_state, reward, done, _ = env.step(action)
        
        if done:
            break

        state = next_state
        
time.sleep(1)
env.close()

In [35]:
a = deque(maxlen=3)
for i in range(5):
    a.append(i)
    print(a)

deque([0], maxlen=3)
deque([0, 1], maxlen=3)
deque([0, 1, 2], maxlen=3)
deque([1, 2, 3], maxlen=3)
deque([2, 3, 4], maxlen=3)
