In [1]:
import gym
import torch
import numpy as np
import sys
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor, Lambda
from torch.utils.data import DataLoader
from torch import nn
from collections import namedtuple, deque

import random
import torch.onnx as onnx
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('RLresult/hopper/dqn')
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [2]:
env = gym.make('Hopper-v2') 
print(env.action_space)
print(env.observation_space)

Box([-1. -1. -1.], [1. 1. 1.], (3,), float32)
Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf inf inf inf], (11,), float64)


In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


HIDDEN_SIZE = 32
BATCH_SIZE = 10000
learning_rate = 0.01
HOP_ACTION = 125
CAPACITY = 10000
TRAIN_ITER = CAPACITY*10/BATCH_SIZE
PARAM_PATH = 'DQN_param/hopperDQN.pth'
PARAM_PATH_TEST = 'DQN_param/hopperDQN_test.pth'
#print(Transition(2,1,2,3))
#Transition(state=2, action=1, next_state=2, reward=3)

class envdata(Dataset):
    
    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)
        
    def __getitem__(self, idx):
        return self.memory[idx]
        
    def __len__(self):
        return len(self.memory)
    
    def push(self, *args):
        self.memory.append(Transition(*args))
        

In [4]:
class DQN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(DQN, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, HOP_ACTION),
            nn.ReLU()
        )

        
    def forward(self, input):
        output = self.linear_relu_stack(input)
        return output

In [5]:
class custom_dataloder():
    
    def __init__(self, dataset, batch_size) :
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_category = len(self.dataset[0])
        
    def __iter__(self) :
        return self         

    def __next__(self) :
        index = random.sample(list(range(len(self.dataset))),self.batch_size)
        
        batchdata = np.array(self.dataset[index[0]], dtype=np.object)
        cnum = 0
        while(cnum < self.num_category):
            i = 0
            i = i + 1
            batchdata[cnum] = np.array([batchdata[cnum]])
            while(i < self.batch_size):
                added_data = np.array([self.dataset[index[i]][cnum]])
                batchdata[cnum] = np.concatenate((batchdata[cnum], added_data), axis = 0)
                i = i + 1
            cnum = cnum + 1
            
        return batchdata                  
        

In [6]:
def output_to_action(_input, batch):
    
    if batch == 1:
        first_action = (_input%5/2) -1
        sec_action = ((_input%25 - _input%5)/10) -1
        third_action = ((_input - _input%25)/50) -1
        out = np.array([first_action, sec_action, third_action])
        
    
    else:
        i = 0
        out = np.zeros((batch,3))
        while(i < batch):
            # print(outputnum_to_3array(84)) = [ 1.  -0.5  0.5]
            first_action = (_input[i]%5/2) -1
            sec_action = ((_input[i]%25 - _input[i]%5)/10) -1
            third_action = ((_input[i] - _input[i]%25)/50) -1
            out[i] = np.array([first_action, sec_action, third_action])
            i = i + 1
    
    return out

def action_to_output(_input, batch):
    
    if batch == 1:
        _input = _input + 1
        _input = _input*2
        out = _input[2]*25 + _input[1]*5 + _input[0]
    
    else:    
        i = 0
        out = np.zeros(batch)
        while(i < batch):
            _input[i] = _input[i] + 1
            _input[i] = _input[i]*2
            out[i] = _input[i][2]*25 + _input[i][1]*5 + _input[i][0]
            
            i = i + 1
    return out

def select_random_action():
    return np.random.randint(5, size=3)

In [7]:
def renewal_memory(renewal_capacity, rendering):
    
    total_num = 0
    pause = 0
    
    while(total_num < renewal_capacity - pause):
        pre_observation = env.reset()
        pre_observation = torch.tensor(pre_observation, device = device, dtype=torch.float32)
        t = 0
        
        while(t < renewal_capacity - total_num):
            t = t + 1
            if (rendering == False and random.random()) < 0.1:
                action = (select_random_action()/2)-1
            else:
                with torch.no_grad():
                    baseDQN_action = baseDQN(pre_observation)
                
                max_action = np.argmax(baseDQN_action.cpu().numpy())
                action = output_to_action(max_action, 1)
            observation, reward, done, info = env.step(action)
            myhopdata.push(pre_observation, action, observation, reward -np.float32(done))
            pre_observation = torch.tensor(observation, device = device, dtype=torch.float32)
            if done:
                #print("Episode finished after {} timesteps".format(t+1))
                #env.render()
                total_num += t
                t = 0
                break
                
        pause = t
    
    #print("load_memory_complete")
    return 0

In [8]:
def training(iteration):
    GAMMA = 0.999
    i = 0
    while(i < iteration):
        
        #print(i)
        pre_observation, action, observation, reward = next(iter(dqn_dataloader))
        
        action_idx = action_to_output(action, BATCH_SIZE).astype(np.int64)
        action_idx = torch.from_numpy(action_idx).to(device).unsqueeze(axis=-1)
        
        state_action_values = torch.gather(updatedDQN(pre_observation), 1, action_idx )
        
        obs_to_cuda = torch.tensor(observation, dtype=torch.float32).to(device)
        reward_to_cuda = torch.tensor(reward, dtype=torch.float32).to(device)
        
        with torch.no_grad():
            nextobs = baseDQN(obs_to_cuda)
            expected_state_action_values = GAMMA*torch.argmax(nextobs, dim = 1) + reward_to_cuda
            
        criterion = nn.MSELoss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(axis=-1))
        
        optimizer.zero_grad()
        loss.backward() 
        for param in updatedDQN.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
        
        #for p in updatedDQN.parameters(): 
        #    print("params = ", p.grad)
        i = i + 1
    
    print("loss = ",loss)
    #print("train complete iter = ", iteration)
    return loss

In [16]:
%%time

myhopdata = envdata(capacity = CAPACITY)

#dqn_dataloader = DataLoader(myhopdata, batch_size=64, shuffle=True)

input_state_size = len(env.observation_space.sample())


updatedDQN = DQN(input_state_size,HIDDEN_SIZE).to(device)
updatedDQN.load_state_dict(torch.load(PARAM_PATH))
optimizer = torch.optim.SGD(updatedDQN.parameters(), lr=learning_rate)

baseDQN = DQN(input_state_size,HIDDEN_SIZE).to(device)
baseDQN.load_state_dict(updatedDQN.state_dict())
baseDQN.eval()

dqn_dataloader = DataLoader(myhopdata, batch_size=BATCH_SIZE, shuffle=False)
#len(env.observation_space.sample())

train_iter = 100
i = 0
#for p in updatedDQN.parameters(): 
#    print(p)

while(i < train_iter):
    i = i + 1
    renewal_memory(CAPACITY, rendering = False)
    loss = training(TRAIN_ITER)
    writer.add_scalar("loss", loss, i)
    torch.save(updatedDQN.state_dict(), PARAM_PATH)
    baseDQN.load_state_dict(updatedDQN.state_dict())
    baseDQN.eval()

env.close()


  
  from ipykernel import kernelapp as app


loss =  tensor(1165896.3750, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(178925.9375, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(32147.1152, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24535.7793, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24160.0898, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24238.2773, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24101.2734, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24090.2559, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24030.8535, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(24724.3574, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(23868.5820, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(23921.8750, device='cuda:0', grad_fn=<MseLossBackward>)
loss =  tensor(23947.4082, device='cuda:0', grad_fn=<MseLossBackward>)


KeyboardInterrupt: 

In [10]:
#writer.add_graph(updatedDQN, myhopdata[0][0])
#writer.add_graph(baseDQN, myhopdata[0][0])


In [11]:
writer.flush()
writer.close()

In [12]:
'''
print(env.unwrapped.sim.model.get_joint_qpos_addr('root'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('hip_1'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('hip_2'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('hip_3'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('hip_4'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_1'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_2'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_3'))
print(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_4'))
#if u enter <body> in addr then this will show you available list
#('root', 'hip_1', 'ankle_1', 'hip_2', 'ankle_2', 'hip_3', 'ankle_3', 'hip_4', 'ankle_4')
#qpos is front value index
'''


"\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('root'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('hip_1'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('hip_2'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('hip_3'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('hip_4'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_1'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_2'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_3'))\nprint(env.unwrapped.sim.model.get_joint_qpos_addr('ankle_4'))\n#if u enter <body> in addr then this will show you available list\n#('root', 'hip_1', 'ankle_1', 'hip_2', 'ankle_2', 'hip_3', 'ankle_3', 'hip_4', 'ankle_4')\n#qpos is front value index\n"

In [13]:
target = torch.empty(3, dtype=torch.long).random_(5)

In [14]:
target

tensor([1, 4, 1])

In [15]:
input = torch.randn(3, 5, requires_grad=True)