In [0]:
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

import random
import math

import collections
from collections import namedtuple

In [0]:
stepInfo = namedtuple('StepInfo',('cur_state', 'action','next_state', 'reward',))

class ReplayList():
    def __init__(self, size):
        self.memory = collections.deque(maxlen = size)
        
    def append(self, data):
        type(self.memory)
        self.memory.append(data)
        
    def sample(self, size):        
        if len(self.memory) < size:
            return None
        else:
            return random.sample(self.memory, size)
    
    def __len__(self):
        return len(self.memory)
    
    def __repr__(self):
        return print(self.memory)

In [0]:
class QNet(nn.Module):
    def __init__(self, in_size, hidden_size, out_size):
        super(QNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_size, hidden_size),
            nn.ReLU(),
#             nn.Linear(hidden_size, hidden_size),
#             nn.ReLU(),
            nn.Linear(hidden_size, out_size)
        )
        
    def forward(self,x):
        return self.net(x)

In [0]:
def getOneHotTensor(pos, size):
    ret = np.zeros((1,size))
    ret[0][pos] = 1
    return torch.FloatTensor(ret)#.cuda()

In [0]:
HIDDEN_SIZE = 128
LR = 0.001
EPOCH = 2000

MEM_SIZE = 30000

GAMMA = 0.95
EPS_START = .9
EPS_END = 0.005
EPS_DECAY = 200

BATCH = 256
TARGET_UPDATE = 10

env = gym.make("FrozenLake-v0")

policyNet = QNet(env.observation_space.n, HIDDEN_SIZE, env.action_space.n)#.cuda()
targetNet = QNet(env.observation_space.n, HIDDEN_SIZE, env.action_space.n)#.cuda()
targetNet.load_state_dict(policyNet.state_dict())
targetNet.eval()

memory = ReplayList(MEM_SIZE)

done_size = 0

optimizer = optim.Adam(policyNet.parameters(), lr = LR)
loss = nn.MSELoss(reduction="sum")

In [57]:
if __name__ == "__main__":
    obs = env.reset()
    loss_list = []
    step_count = 0
    cur_target = 0
    target_list = []
    for i in range(EPOCH):
        if i%TARGET_UPDATE == 0:
            if cur_target/TARGET_UPDATE >= 0.5:
                target_list.append((i,cur_target,targetNet.state_dict()))
            else:
                targetNet.load_state_dict(policyNet.state_dict())
            cur_target = 0
            
            print(i)
        steps = 0
        while True:
            eps = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * step_count / EPS_DECAY)
            if random.random()< eps:
                act = env.action_space.sample()
            else:
                with torch.no_grad():
                    Q = policyNet(getOneHotTensor(obs, env.observation_space.n))
                    act = Q.data.numpy().argmax()
            
            next_obs, rew, done, _ = env.step(act)
            step_count += 1
            steps += 1
            if done:
                rew = rew*4 - 2
            info = stepInfo(obs, act, next_obs, rew)
            
            memory.append(info)
            replay = memory.sample(BATCH)
            
            if replay:
                replay = stepInfo(*zip(*replay))
                
                states = torch.cat(list(map(lambda s:getOneHotTensor(s,env.observation_space.n),replay.cur_state)))
                next_states = torch.cat(list(map(lambda s:getOneHotTensor(s,env.observation_space.n),replay.next_state)))
                actions = torch.LongTensor(replay.action).view(-1,1)
                rewards = torch.FloatTensor(replay.reward).view(-1,1)#.cuda()
                
                optimizer.zero_grad()
                Qpred = policyNet(states).gather(1, actions)
                Qtarget = rewards + GAMMA*targetNet(next_states).max(1)[0].unsqueeze(1).detach()
                
                loss_ = F.smooth_l1_loss(Qpred, Qtarget)
                loss_.backward()
                loss_list.append(loss_.data)
                optimizer.step()
            
            obs = next_obs
            if done:
                if rew > 0:
                    done_size += 1
                    cur_target += 1
                    print("%d, steps = %d, loss = %.4f"%(i, steps, loss_.data), "Cleared")
                obs = env.reset()
                break       

0
10
12, steps = 8, loss = 0.0494 Cleared
20
30
39, steps = 14, loss = 0.0510 Cleared
40
43, steps = 18, loss = 0.0725 Cleared
50
51, steps = 8, loss = 0.0529 Cleared
56, steps = 77, loss = 0.0380 Cleared
60
70
78, steps = 97, loss = 0.0204 Cleared
80
83, steps = 58, loss = 0.0167 Cleared
90
100
102, steps = 11, loss = 0.0126 Cleared
105, steps = 33, loss = 0.0269 Cleared
107, steps = 28, loss = 0.0195 Cleared
110
116, steps = 97, loss = 0.0261 Cleared
118, steps = 73, loss = 0.0130 Cleared
119, steps = 30, loss = 0.0207 Cleared
120
130
130, steps = 35, loss = 0.0162 Cleared
140
150
160
170
170, steps = 35, loss = 0.0201 Cleared
174, steps = 95, loss = 0.0230 Cleared
176, steps = 39, loss = 0.0116 Cleared
180
180, steps = 63, loss = 0.0173 Cleared
181, steps = 18, loss = 0.0339 Cleared
182, steps = 22, loss = 0.0252 Cleared
183, steps = 70, loss = 0.0421 Cleared
186, steps = 71, loss = 0.0170 Cleared
189, steps = 31, loss = 0.0292 Cleared
190
191, steps = 27, loss = 0.0290 Cleared
196,

In [58]:
for i in range(16):
    with torch.no_grad():
        print(policyNet(getOneHotTensor(i,16)).max(1)[1])

tensor([0])
tensor([3])
tensor([3])
tensor([3])
tensor([0])
tensor([2])
tensor([2])
tensor([3])
tensor([3])
tensor([1])
tensor([0])
tensor([2])
tensor([0])
tensor([2])
tensor([1])
tensor([2])


In [59]:
for i in range(16):
    with torch.no_grad():
        print(targetNet(getOneHotTensor(i,16)).max(1)[1])

tensor([0])
tensor([3])
tensor([3])
tensor([3])
tensor([0])
tensor([2])
tensor([2])
tensor([1])
tensor([3])
tensor([1])
tensor([0])
tensor([2])
tensor([0])
tensor([2])
tensor([1])
tensor([0])


In [60]:
done_size

1223