In [1]:
import gym
import torch
import numpy as np
from torch import nn
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('RLresult/hopper/dqn')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
env = gym.make('Hopper-v3') 
print(env.action_space)
print(env.observation_space)

Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)
Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf inf inf inf], (11,), float64)


In [3]:
HIDDEN_SIZE = 32
BATCH_SIZE = 10000
learning_rate = 0.01
HOP_ACTION = 125
CAPACITY = 10000
TRAIN_ITER = CAPACITY*10/BATCH_SIZE
PARAM_PATH = 'DQN_param/hopperDQN.pth'
PARAM_PATH_TEST = 'DQN_param/hopperDQN_test.pth'
#print(Transition(2,1,2,3))
#Transition(state=2, action=1, next_state=2, reward=3)

In [4]:
print(env.observation_space.sample())

[ 0.79789507 -0.53175809  0.42578659  1.87268997 -1.10877438  0.64848218
  0.30792365  0.86519849 -0.45800793 -1.12981825 -0.70058099]


In [5]:
input_state_size = len(env.observation_space.sample())

class DQN(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(DQN, self).__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.ReLU()
        )

    def forward(self, input_element):
        output = self.linear_relu_stack(input_element)
        return output

updatedDQN = DQN(input_state_size, HIDDEN_SIZE, HOP_ACTION).to(device)
baseDQN = DQN(input_state_size, HIDDEN_SIZE, HOP_ACTION).to(device)


In [6]:
updatedDQN.load_state_dict(torch.load(PARAM_PATH))
baseDQN.load_state_dict(updatedDQN.state_dict())

<All keys matched successfully>

In [7]:
class fortest(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(DQN, self).__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.ReLU()
        )

    def forward(self, input_element):
        output = self.linear_relu_stack(input_element)
        return output
    


In [5]:
import random
import numpy as np
np.shape(random.sample(list(range(5)),3))

(3,)

In [7]:
[i for i in range(5)]

[0, 1, 2, 3, 4]

In [11]:
import numpy as np
import random
from collections import namedtuple, deque
from torch.utils.data import Dataset

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))


class EnvData(Dataset):
    
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def __getitem__(self, idx):
        return self.memory[idx]

    def __len__(self):
        return len(self.memory)

    def push(self, *args):
        self.memory.append(Transition(*args))


class CustomDataLoader:

    def __init__(self, dataset, batch_size, suffle):
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_category = len(self.dataset[0])
        self.suffle = suffle
        self.iternum = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.suffle == True:
            index = random.sample(list(range(len(self.dataset))), self.batch_size)
        else:
            index = [i + self.iternum * self.batch_size for i in range(self.batch_size)]
        _batchdata = np.array(self.dataset[index[0]], dtype=np.object)
        _cnum = 0
        while _cnum < self.num_category:
            i = 0
            i = i + 1
            _batchdata[_cnum] = np.array([_batchdata[_cnum]])
            while i < self.batch_size:
                added_data = np.array([self.dataset[index[i]][_cnum]])
                _batchdata[_cnum] = np.concatenate((_batchdata[_cnum], added_data), axis=0)
                i = i + 1
            _cnum = _cnum + 1
        self.iternum = self.iternum + 1
        if (self.iternum + 1) * self.batch_size > len(self.dataset):
            self.iternum = 0
        return _batchdata



In [21]:
    
mycartdata = EnvData(capacity = 3)
#
np_pre_observation = 1
action = 2
observation = 3
reward = 4
done = 5
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
np_pre_observation = 1
action = 2
observation = 3
reward = 4
done = 6

mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
print(mycartdata[1])

Transition(state=1, action=2, next_state=3, reward=4, done=5.0)


In [8]:
print(mycartdata[2])

Transition(state=1, action=2, next_state=3, reward=4, done=6.0)


In [22]:
mydataloader = CustomDataLoader(mycartdata, batch_size=1, suffle = False)

In [13]:
next(iter(mydataloader))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  _batchdata = np.array(self.dataset[index[0]], dtype=np.object)


array([array([1, 1]), array([2, 2]), array([3, 3]), array([4, 4]),
       array([5., 5.], dtype=float32)], dtype=object)

In [14]:
next(iter(mydataloader))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  _batchdata = np.array(self.dataset[index[0]], dtype=np.object)


array([array([1, 1]), array([2, 2]), array([3, 3]), array([4, 4]),
       array([5., 5.], dtype=float32)], dtype=object)

In [15]:
next(iter(mydataloader))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  _batchdata = np.array(self.dataset[index[0]], dtype=np.object)


array([array([1, 1]), array([2, 2]), array([3, 3]), array([4, 4]),
       array([5., 5.], dtype=float32)], dtype=object)

In [23]:
i = 0
while(i < 10):
    print(next(iter(mydataloader)))
    i = i + 1

[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([6.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([6.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([6.], dtype=float32)]
[array([1]) array([2]) array([3]) array([4]) array([5.], dtype=float32)]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  _batchdata = np.array(self.dataset[index[0]], dtype=np.object)


In [17]:
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))

In [18]:
print(mycartdata[1])

Transition(state=1, action=2, next_state=3, reward=4, done=6.0)


In [19]:
i = 0
while(i < 10):
    print(next(iter(mydataloader)))
    i = i + 1

[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]
[array([1, 1]) array([2, 2]) array([3, 3]) array([4, 4])
 array([6., 6.], dtype=float32)]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  _batchdata = np.array(self.dataset[index[0]], dtype=np.object)


In [24]:
len(mycartdata)

3

In [28]:
total_num = 0
pause = 0
memory_capacity = 5
while total_num < memory_capacity - pause:
    t = 0
    while t < memory_capacity - total_num: #if pg, gain accumulate
        print(t)
        t = t + 1
        if t==3:
            total_num += t
            t = 0
            break
    pause = t
    print("p=",pause)

0
1
2
p= 0
0
1
p= 2


In [160]:
from utils.dataset import SimData
from utils.dataloader import CustomDataLoader
import numpy as np
mycartdata = SimData(capacity = 13)
#
np_pre_observation = 1
action = 2
observation = 3.0
reward = np.float64(4)
done = 0
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
np_pre_observation = 1
action = 2
observation = 3.3
reward = 4
done = 1

mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))

np_pre_observation = 1
action = 2
observation = 3.33
reward = np.float64(4)
done = 0
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))
mycartdata.push(np_pre_observation, action, observation, reward, np.float32(done))

In [161]:
mydataloader = CustomDataLoader(mycartdata, 13, False)

In [162]:
pre_observation, action, observation, reward, done = next(iter(mydataloader))

In [163]:
print(done)
print(observation)

[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


In [164]:
#cal per trajectary to_end length ex) 4 3 2 1 6 5 4 3 2 1
#set step to upper bound ex) step = 5 ->  4 3 2 1 5 5 4 3 2 1
global_index = len(done) - 1
local_index = 0
step = 5
done_penalty = 1
while 0 <= global_index:
    
    if done[global_index] == 1:
        local_index =  1
        done[global_index] = local_index
        reward[global_index] -= done_penalty
        print("reset")

    else:
        local_index = local_index + 1
        if local_index > step:
            local_index = step
        done[global_index] = local_index
    
    global_index = global_index - 1

reset


In [167]:
#cal newreward per state-action pair
gamma = 0.9
global_index = 0
while global_index < len(done):
    observation[global_index] = observation[global_index + done[global_index] -1]
    #change observation to last step indexed observation state
    local_index = 1
    while local_index < done[global_index]:
        tmp = reward[global_index + local_index]*gamma**local_index
        reward[global_index] += tmp
        local_index = local_index + 1
    global_index +=1

3.6
3.24
2.9160000000000004
2.6244
3.6
3.24
2.9160000000000004
2.6244
3.6
3.24
2.9160000000000004
1.9683000000000002
3.6
3.24
2.1870000000000003
3.6
2.43
2.7
3.6
3.24
2.9160000000000004
2.6244
3.6
3.24
2.9160000000000004
2.6244
3.6
3.24
2.9160000000000004
3.6
3.24
3.6


array([16.3804, 16.3804, 15.7243, 13.027 , 10.03  ,  6.7   ,  3.    ,
       16.3804, 16.3804, 13.756 , 10.84  ,  7.6   ,  4.    ])