In [1]:
# Implement Neural Fitted Q Iteration
"""
Reinforcement learning with function approximators poses two critical challenges. 
Not only do we need to fit the function approximator to iteratively observed new data, 
we also need to make sure that the function approximator does not un-learn or forget 
in other parts of the state and actions space. 
This is particularly difficult for (large) neural network function approximators and 
for a long time these issues have limited reinforcement learning to small function 
approximator models with few parameters. 
For temporal-difference learning, a common and important approach to address these problems 
is storing and re-using transitions and this technique will be in the focus here.
In this task, you are asked to re-implement one of the early and seminal papers about 
using neural networks for temporal-difference learning. If you have not done so already, 
read the paper

Riedmiller, Martin. 
"Neural fitted Q iteration–first experiences with a data efficient neural reinforcement learning method." 
In European Conference on Machine Learning, pp. 317-328. Springer, Berlin, Heidelberg, 2005.

Solution of Open AI gym environment "Cartpole-v0" 
(https://gym.openai.com/envs/CartPole-v0) using NFQ and Pytorch.:

1. Set up a gym environment with (discrete and) finite action space and continuous state space such as 
CartPole-v0 or MountainCar-v0 and make sure that you can control the apparatus in the gym correctly.

2. Implement the NFQ algorithm and make the necessary adaptions. 
(You might want to take inspiration from the paper and make some adaptions to the reward function.) 
As the function approximator use a neural network model that you construct in PyTorch and 
be prepared to try different variants.

3. Train NFQ with the gym using the (low-dimensional) state representation provided by the gym. 
Conduct a thorough investigation with trying out different parameter settings and 
collecting and recording the necessary information from the learning process needed to determine what works, what does not work, and why. Share and discuss your findings in the discussion board if you have problems or if you find something interesting.

4. Optional - This step is optional but very helpful for the next homework assignment. Repeat step 3 but this time use the (high-dimensional) rendered visual representation as the state space. You will have to find out how to access the images for the next task anyway.

5. Submit your implementation for this assignment. Take notes and records of your results and bring them to the second session. Be prepared to present and explain your implementation, your findings, and your conclusions. Keep your notes and records for preparing the seminar presentation at the final session where you will be asked to compare to another algorithm.
"""
# Imports here
import gym
from gym import wrappers
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt


In [2]:
# hyper parameters
EPISODES = 200  # number of episodes
EPS_START = 0.9  # e-greedy threshold start value
EPS_END = 0.05  # e-greedy threshold end value
EPS_DECAY = 200  # e-greedy threshold decay
GAMMA = 0.8  # Q-learning discount factor
LR = 0.001  # NN optimizer learning rate
HIDDEN_LAYER = 256  # NN hidden layer size
BATCH_SIZE = 64  # Q-learning batch size

In [3]:
# if gpu is to be used
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
#ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
#Tensor = FloatTensor


In [4]:
class ReplayMemory:
    """
    This class if for storing transitions. 
    
        
    Attributes
    ----------
    says_str : str
        a formatted string to print out what the animal says
    name : str
        the name of the animal
    sound : str
        the sound that the animal makes
    num_legs : int
        the number of legs the animal has (default 4)

    Methods
    -------
    says(sound=None)
        Prints the animals name and what sound it makes
    """
    def __init__(self, capacity=0):
        """
        Parameters
        ----------
        capacity : int, optional
            The upper bound or the capacity of the Memory
        """
        self.capacity = capacity
        self.memory = []

    def push(self, transition):
        """
        Add a transition to the memory and 
        pop the oldest transition if the capacity limit is reached. 
        
        Parameters
        ----------
        transition : transitionObject
            The transition to add to the Memory
        """
        self.memory.append(transition)
        if 0 < capacity:
            if len(self.memory) > self.capacity:
                del self.memory[0]
        # Capacity = 0 - no upper bound. The memory will keep increasing

    def random_draw(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


In [5]:
class Network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.l1 = nn.Linear(4, HIDDEN_LAYER)
        self.l2 = nn.Linear(HIDDEN_LAYER, 2)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x

class Net_CNN_MNIST(nn.Module):
    def __init__(self):
        super().__init__()
        # First 2D convolutional layer, taking in 1 input channel (image),
        # outputting 6 convolutional features, with a square kernel size of 3
        self.conv1 = nn.Conv2d(1, 6, 3)
        # Second 2D convolutional layer, taking in the 6 input layers,
        # outputting 16 convolutional features, with a square kernel size of 3
        self.conv2 = nn.Conv2d(6, 16, 3)
        
        # First fully connected layer
        # 28*28 -> 13*13 -> 5*5...  
        # Why? https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  
        self.fc2 = nn.Linear(120, 84)
        # Second fully connected layer that outputs our 10 labels
        self.fc3 = nn.Linear(84, 10)
        self.pool = nn.MaxPool2d(2, 2)
        #self.flatten = nn.Flatten()
        
    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = self.pool(F.relu(self.conv1(x)))
        # If the size is a square, you can specify with a single number
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [6]:
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './tmp/cartpole-v0-1', force=True) # Delete video_callalbe=False to render while training

model = Network()
if use_cuda:
    model.cuda()
memory = ReplayMemory(10000)
# If you need to move a model to GPU via .cuda(), please do so before constructing optimizers for it. 
# Parameters of a model after .cuda() will be different objects with those before the call.
optimizer = optim.Adam(model.parameters(), LR)
#optimizer = optim.Rprop(model.parameters(), lr=LR)
#optimizer = optim.SGD(model.parameters(), lr=LR)
steps_done = 0
episode_durations = []



## How to adjust learning rate
torch.optim.lr_scheduler provides several methods to adjust the learning rate based on the number of epochs. torch.optim.lr_scheduler.ReduceLROnPlateau allows dynamic learning rate reducing based on some validation measurements.

Learning rate scheduling should be applied after optimizer’s update; e.g., you should write your code this way:

>>> scheduler = ...
>>> for epoch in range(100):
>>>     train(...)
>>>     validate(...)
>>>     scheduler.step()

In [7]:
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        return model(Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
    else:
        return LongTensor([[random.randrange(2)]])

In [8]:
def run_episode(e, environment):
    state = environment.reset()
    steps = 0
    while True:
        environment.render()
        action = select_action(FloatTensor([state]))
        next_state, reward, done, _ = environment.step(action[0, 0])

        # negative reward when attempt ends
        if done:
            reward = -1

        memory.push((FloatTensor([state]),
                     action,  # action is already a tensor
                     FloatTensor([next_state]),
                     FloatTensor([reward])))

        learn()

        state = next_state
        steps += 1

        if done:
            print("{2} Episode {0} finished after {1} steps"
                  .format(e, steps, '\033[92m' if steps >= 195 else '\033[99m'))
            episode_durations.append(steps)
            plot_durations()
            break

In [9]:
def learn():
    if len(memory) < BATCH_SIZE:
        return

    # random transition batch is taken from experience replay memory
    transitions = memory.sample(BATCH_SIZE)
    batch_state, batch_action, batch_next_state, batch_reward = zip(*transitions)

    batch_state = Variable(torch.cat(batch_state))
    batch_action = Variable(torch.cat(batch_action))
    batch_reward = Variable(torch.cat(batch_reward))
    batch_next_state = Variable(torch.cat(batch_next_state))

    # current Q values are estimated by NN for all actions
    current_q_values = model(batch_state).gather(1, batch_action)
    # expected Q values are estimated from actions which gives maximum Q value
    max_next_q_values = model(batch_next_state).detach().max(1)[0]
    expected_q_values = batch_reward + (GAMMA * max_next_q_values)

    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(current_q_values, expected_q_values)

    # backpropagation of loss to NN
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [10]:
def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.FloatTensor(episode_durations)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated

In [11]:
for e in range(EPISODES):
    print('Episode: ', e)
    run_episode(e, env)

print('Complete')
env.render(close=True)
env.close()
plt.ioff()
plt.show()

AssertionError: tensor(0, device='cuda:0') (<class 'torch.Tensor'>) invalid