In [1]:
import gym
import numpy as np
import random

In [2]:
def getRandomMaxAction(q):
    index = (q==q.max()).nonzero()[0]
    return np.random.choice(index)

###### Q-Learning with learning-rate.

In [146]:
env = gym.make("FrozenLake-v0")
Q = np.zeros((obs_size,action_size))

action_size = env.action_space.n
obs_size = env.observation_space.n

eps = 1
gamma = 0.95
lr = 0.4
cleared = 0
epoch = 2000
use_noise = False

for i in range(2000):
    obs = env.reset()
    while True:
        if use_noise:
            action = getRandomMaxAction(Q[obs]+np.random.randn(4)/(cleared+1))
        else:
            if random.random() < eps/(cleared+1):
                action = env.action_space.sample()
            else:
                action = getRandomMaxAction(Q[obs])
        next_obs, rew, done, etc = env.step(action)
        
        if next_obs != obs:
            Q[obs, action] = (1-lr) * Q[obs,action] + lr*(rew + gamma * Q[next_obs].max())
        
        if done:
            break
        obs = next_obs

    if rew == 1:
        cleared +=1
print("Cleared Rate = %.4f"%(cleared/epoch))

Cleared Rate = 0.5115


###### Q-Network

In [149]:
import torch
import torch.nn as nn
import torch.optim as optim

In [160]:
class Net(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_size)
        )
    def forward(self, x):
        return self.net(x)

In [187]:
def getOneHot(idx, size):
    x = np.zeros((1,size))
    x[0,idx] = 1
    return torch.FloatTensor(x)

In [271]:
env = gym.make("FrozenLake-v0")

action_size = env.action_space.n
obs_size = env.observation_space.n
hidden_size = 64
eps = 0.1

QNet = Net(obs_size, hidden_size, action_size)
loss = nn.MSELoss()
opt = optim.Adam(QNet.parameters(), lr = 0.3)
cleared = 0

for i in range(epoch):
    obs = env.reset()
    while True:
        if random.random() < eps:
            action = env.action_space.sample()
        else:
            opt.zero_grad()
            Q = QNet(getOneHot(obs, obs_size))
            action = Q.data.numpy().argmax()
            
        next_obs, rew, done, etc = env.step(action)
        
        if next_obs != obs:
            Y=Q.clone().detach()
            if done:
                Y[0, action] = rew
                break
            else:
                Y[0, action] = rew + gamma * QNet(getOneHot(next_obs, obs_size).detach()).data.numpy().max()
            loss_ = loss(Q, Y)
            loss_.backward(retain_graph=True)
            opt.step()
            
            obs = next_obs
        
        if done:
            break
    if rew == 1:
        cleared +=1
    if i%(epoch/10) == 0:
        print("!",end="")
print()
print("Cleared Rate = %.4f"%(cleared/epoch))

!!!!!!!!!!
Cleared Rate = 0.0265


In [272]:
loss(Q,Y)

tensor([[ 1.4732e-13,  2.3317e-14, -2.1763e-13,  2.2249e-13]],
       grad_fn=<AddmmBackward>) tensor([[ 1.4732e-13,  2.3317e-14, -2.1763e-13,  0.0000e+00]])


In [273]:
loss(Q,Y)

tensor(1.2375e-26, grad_fn=<MseLossBackward>)