In [1]:
# Solution of Open AI gym environment "Cartpole-v0" (https://gym.openai.com/envs/CartPole-v0) using DQN and Pytorch.
# It is a slightly modified version of the PyTorch DQN tutorial from
# http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html.
# The main difference is that it does not take rendered screen as input but it simply uses observation values from the \
# environment.

import gym
from gym import wrappers
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
# Hyper-parameters for the training script below.
EPISODES = 400  # number of episodes run by the main loop
EPS_START = 0.9  # e-greedy threshold start value (before any action has been selected)
EPS_END = 0.05  # e-greedy threshold end value (the value the threshold decays towards)
EPS_DECAY = 100  # e-greedy threshold decay scale, measured in action-selection steps
GAMMA = 0.9  # Q-learning discount factor
LR = 1  # NN optimizer learning rate (passed to optim.SGD). NOTE(review): no optimizer.step() call is visible in this file, so this appears to have no effect -- confirm
HIDDEN_LAYER = 256  # NN hidden layer size
BATCH_SIZE = 64  # Q-learning batch size. NOTE(review): not read by any active code visible here -- confirm before relying on it

# if gpu is to be used
# The aliases below resolve to the torch.cuda tensor types when CUDA is
# available and to the CPU tensor types otherwise, so the rest of the file can
# build tensors without checking the device itself.
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
# NOTE(review): ByteTensor and Tensor are not referenced by the code visible here.
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor


class ReplayMemory:
    """Bounded first-in-first-out store of transitions.

    Items are kept in insertion order in ``self.memory``; as soon as the list
    grows beyond ``capacity`` the oldest item is discarded.
    """

    def __init__(self, capacity):
        self.memory = []
        self.capacity = capacity

    def push(self, transition):
        """Append ``transition`` and evict the oldest entry on overflow."""
        buffer = self.memory
        buffer.append(transition)
        if len(buffer) > self.capacity:
            buffer.pop(0)

    def sample(self, batch_size):
        """Return ``batch_size`` stored items drawn at random without replacement."""
        return random.sample(self.memory, k=batch_size)

    def latest(self):
        """Return the most recently pushed item (IndexError when empty)."""
        return self.memory[-1]

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.memory)


class Network(nn.Module):
    """Fully connected Q-network with a single ReLU hidden layer.

    Args:
        n_observations: Size of the input vector. Defaults to 4, the value
            that used to be hard-coded.
        n_actions: Number of outputs (one Q-value per action). Defaults to 2,
            the value that used to be hard-coded.
        hidden_size: Width of the hidden layer. ``None`` (the default) means
            "use the module-level ``HIDDEN_LAYER`` constant", which keeps
            ``Network()`` identical to the original behaviour.
    """

    def __init__(self, n_observations=4, n_actions=2, hidden_size=None):
        super().__init__()
        # Resolved at call time (not as a default argument) so the class can
        # be defined before, or without, the HIDDEN_LAYER constant.
        if hidden_size is None:
            hidden_size = HIDDEN_LAYER
        self.l1 = nn.Linear(n_observations, hidden_size)
        self.l2 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the second layer for ``x``."""
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

"""
State space model:
theta_t+1 = theta_t + Q
r_t = Q(s_t,a_t;thetaPred_t) -gamma*max_a Q(s_t+1,a;thetaPred_t) + R
"""
class Kalman:
    """Container for the matrices used by the Kalman-style weight update.

    Attributes set by the constructor:
        Q: ``Q0`` times the ``params`` x ``params`` identity matrix.
        R: ``R0``, stored unchanged.
        P: ``P0`` times the ``params`` x ``params`` identity matrix.
        K: zero vector of length ``params``.
    """

    def __init__(self, params, Q0, R0, P0=1):
        identity = np.eye(params)
        self.Q = Q0 * identity
        self.R = R0
        self.P = P0 * identity
        self.K = np.zeros(params)

    def updateP(self, newP):
        """Subtract ``newP`` from ``self.P`` in place (the array object is reused)."""
        np.subtract(self.P, newP, out=self.P)
    
# Build the environment and wrap it in gym's Monitor, pointed at
# ./tmp/cartpole-v0-1 (presumably this records episode results there and
# force=True overwrites earlier ones -- see the gym documentation).
env = gym.make('CartPole-v0')
env = wrappers.Monitor(env, './tmp/cartpole-v0-1', force=True)

model = Network()
# params = total number of scalar entries over all trainable parameter tensors;
# it fixes the dimension of the Kalman matrices created below.
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])

if use_cuda:
    model.cuda()
memory = ReplayMemory(10000)
# NOTE(review): in the code visible here the optimizer is only used for
# zero_grad(); the weights are changed directly inside learn().
optimizer = optim.SGD(model.parameters(), LR)
steps_done = 0  # number of select_action() calls so far; drives the epsilon decay
episode_durations = []  # step count of every finished episode, in order
# Positional arguments are (params, Q0, R0, P0).
kalman = Kalman(params,0.01,0.001,1)

def select_action(state):
    """Choose an action for ``state`` with an exponentially decaying e-greedy rule.

    The exploration threshold starts at ``EPS_START`` and decays towards
    ``EPS_END`` as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    Returns a 1x1 tensor holding the action index: a uniformly random action
    when the random draw does not exceed the threshold, otherwise the index of
    the largest model output for ``state``.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore.
    if draw <= eps_threshold:
        return LongTensor([[random.randrange(2)]])

    # Exploit: arg-max over the model outputs along dimension 1.
    q_values = model(Variable(state, volatile=True).type(FloatTensor))
    return q_values.data.max(1)[1].view(1, 1)


def run_episode(e, environment):
    """Run one episode in ``environment``, calling ``learn()`` after every step.

    Each step renders the environment, selects an action, stores the resulting
    (state, action, next_state, reward) tuple in ``memory`` and triggers one
    learning update. When the environment reports the episode as done, the
    step count is printed, appended to ``episode_durations`` and plotted.

    Args:
        e: Episode index, used only in the printed message.
        environment: Object providing ``reset()``, ``render()`` and ``step()``.
    """
    observation = environment.reset()
    step_count = 0
    finished = False

    while not finished:
        environment.render()
        action = select_action(FloatTensor([observation]))
        next_observation, reward, finished, _ = environment.step(action[0, 0])

        # The step that ends the attempt is given a negative reward.
        if finished:
            reward = -1

        transition = (
            FloatTensor([observation]),
            action,  # select_action already returns a tensor
            FloatTensor([next_observation]),
            FloatTensor([reward]),
        )
        memory.push(transition)

        learn()

        observation = next_observation
        step_count += 1

    colour = '\033[92m' if step_count >= 195 else '\033[99m'
    print("{2} Episode {0} finished after {1} steps"
          .format(e, step_count, colour))
    episode_durations.append(step_count)
    plot_durations()

def learn():
    """Apply one extended-Kalman-filter style update to the model weights.

    Only the most recent transition in ``memory`` is used. The flattened
    network weights play the role of the filter state:

    1. The observation row vector ``H`` is the gradient of the Q-value of the
       taken action with respect to every weight, obtained from autograd.
    2. The innovation is ``reward + GAMMA * max_a Q(next_state, a)`` minus that
       Q-value.
    3. With ``Ppred = P + Q`` the gain is ``K = Ppred H^T / S`` where
       ``S = H Ppred H^T + R``; the weights are shifted by ``K * innovation``.
    4. The stored covariance becomes ``Ppred - K S K^T``.

    The weights are modified directly through ``parameter.data``; the
    optimizer is used only to clear stale gradients. Returns ``None``.
    """
    curr_state, curr_action, curr_next_state, curr_reward = memory.latest()
    # The stored tensors carry a leading dimension of size 1; flatten it away.
    curr_state = Variable(curr_state.view(-1))
    curr_action = Variable(curr_action.view(-1))
    curr_reward = Variable(curr_reward)
    curr_next_state = Variable(curr_next_state.view(-1))

    # Q-value the network currently assigns to the action that was taken.
    current_q_values = model(curr_state).gather(0, curr_action)
    # Bootstrapped target; detached so it is treated as a constant.
    # NOTE(review): the stored transition has no "done" flag, so the target
    # bootstraps from next_state even on the final step of an episode.
    max_next_q_values = model(curr_next_state).detach().max()
    expected_q_values = curr_reward + (GAMMA * max_next_q_values)
    innovation = (expected_q_values - current_q_values).data.cpu().numpy()

    # H = d current_q / d theta. Gradients accumulate across backward() calls,
    # so they are cleared first. The seed gradient is built with the
    # device-aware FloatTensor alias so it matches the model's device, and the
    # graph is not retained because nothing backpropagates through it again.
    optimizer.zero_grad()
    current_q_values.backward(FloatTensor([1]))

    H = np.zeros((1, params))
    sizes = [0]  # sizes[i]:sizes[i + 1] is the slice of theta owned by parameter i
    for parameter in model.parameters():
        grad = parameter.grad.data.view(-1).cpu().numpy()
        end = sizes[-1] + grad.shape[0]
        H[0, sizes[-1]:end] = grad
        sizes.append(end)

    Ppred = kalman.P + kalman.Q
    S = np.matmul(np.matmul(H, Ppred), np.transpose(H)) + kalman.R
    K = np.matmul(Ppred, np.transpose(H)) / S
    theta_delta = K * innovation  # shape (params, 1)

    for i, f in enumerate(model.parameters()):
        b = torch.from_numpy(theta_delta[sizes[i]:sizes[i + 1]]).type(FloatTensor)
        f.data += b.view(f.size())

    # Posterior covariance is Ppred - K S K^T. updateP() subtracts its argument
    # from P, so Q is folded into the argument: P - (K S K^T - Q) = Ppred - K S K^T.
    kalman.updateP(np.matmul(K * S, np.transpose(K)) - kalman.Q)

def plot_durations():
    """Redraw figure 2 with the durations stored in ``episode_durations``.

    Once at least 100 episodes are available, the mean over each window of
    100 consecutive episodes is drawn as a second curve, preceded by 99 zeros
    so that it lines up with the raw curve.
    """
    window = 100

    plt.figure(2)
    plt.clf()
    durations_t = torch.FloatTensor(episode_durations)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())

    if len(durations_t) >= window:
        window_means = durations_t.unfold(0, window, 1).mean(1).view(-1)
        padded_means = torch.cat((torch.zeros(window - 1), window_means))
        plt.plot(padded_means.numpy())

    # Short pause so that the plot gets updated.
    plt.pause(0.001)


# Script entry point: run EPISODES episodes back to back on the shared env.
for e in range(EPISODES):
    run_episode(e, env)

print('Complete')
# Tear-down: close the render window and the environment, then switch
# matplotlib's interactive mode off and show the final figure.
env.render(close=True)
env.close()
plt.ioff()
plt.show()


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
1794




[99m Episode 0 finished after 10 steps


<Figure size 640x480 with 1 Axes>



[99m Episode 1 finished after 17 steps


<Figure size 640x480 with 1 Axes>

[99m Episode 2 finished after 20 steps


<Figure size 640x480 with 1 Axes>

[99m Episode 3 finished after 14 steps


<Figure size 640x480 with 1 Axes>

[99m Episode 4 finished after 19 steps


<Figure size 640x480 with 1 Axes>

KeyboardInterrupt: 