In [1]:
import gym
import torch
import numpy as np
import sys
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor, Lambda
from torch.utils.data import DataLoader
from torch import nn
from collections import namedtuple, deque

import random
import torch.onnx as onnx
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('RLresult/cartpole/dqn')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [2]:
env = gym.make('CartPole-v1') 
print("act", env.action_space)
print("obs", env.observation_space)


act Discrete(2)
obs Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [3]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

HIDDEN_SIZE = 32
BATCH_SIZE = 10000
learning_rate = 0.01
CART_ACTION = 2
CAPACITY = 10000
TRAIN_ITER = CAPACITY*10/BATCH_SIZE
PARAM_PATH = 'DQN_param/cartpoleDQN.pth'
PARAM_PATH_TEST = 'DQN_param/cartpoleDQN_test.pth'
#print(Transition(2,1,2,3))
#Transition(state=2, action=1, next_state=2, reward=3)

class envdata(Dataset):
    
    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)
        
    def __getitem__(self, idx):
        return self.memory[idx]
        
    def __len__(self):
        return len(self.memory)
    
    def push(self, *args):
        self.memory.append(Transition(*args))
        

In [4]:
class DQN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(DQN, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, CART_ACTION),
            nn.ReLU()
        )

        
    def forward(self, input):
        output = self.linear_relu_stack(input)
        return output

In [5]:
class custom_dataloder():
    
    def __init__(self, dataset, batch_size) :
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_category = len(self.dataset[0])
        
    def __iter__(self) :
        return self         

    def __next__(self) :
        index = random.sample(list(range(len(self.dataset))),self.batch_size)
        
        batchdata = np.array(self.dataset[index[0]], dtype=np.object)
        cnum = 0
        while(cnum < self.num_category):
            i = 0
            i = i + 1
            batchdata[cnum] = np.array([batchdata[cnum]])
            while(i < self.batch_size):
                added_data = np.array([self.dataset[index[i]][cnum]])
                batchdata[cnum] = np.concatenate((batchdata[cnum], added_data), axis = 0)
                i = i + 1
            cnum = cnum + 1
            
        return batchdata                  
        

In [6]:
def output_to_action(_input, batch):
    
    return _input

def action_to_output(_input, batch):
    
    return _input

def select_random_action():
    return np.random.randint(2)

In [7]:
def renewal_memory(renewal_capacity, rendering):
    
    total_num = 0
    pause = 0
    
    while(total_num < renewal_capacity - pause):
        pre_observation_np = env.reset()
        pre_observation = torch.tensor(pre_observation_np, device = device, dtype=torch.float32)
        t = 0
        
        while(t < renewal_capacity - total_num):
            t = t + 1
            if (rendering == False and random.random()) < 0.0001:
                action = np.int64(select_random_action())
            else:
                with torch.no_grad():
                    baseDQN_action = baseDQN(pre_observation)
                
                max_action = np.argmax(baseDQN_action.cpu().numpy())
                
                action = output_to_action(max_action, 1)
            observation, reward, done, info = env.step(action)
            pre_observation_np = pre_observation.cpu().numpy()
            mycartdata.push(pre_observation_np, action, observation, reward - (np.float32(done)*5.0))
            pre_observation = torch.tensor(observation, device = device, dtype=torch.float32)
            if done:
                total_num += t
                t = 0
                if rendering:
                    t = 0
                    #print("Episode finished after {} timesteps".format(t+1))
                else:
                    break
                
            if rendering:
                env.render()
                
        pause = t
    
    print("load_memory_complete")
    return 0

In [11]:
def training(iteration):
    GAMMA = 0.999
    i = 0
    meanloss = 0
    while(i < iteration):
        
        pre_observation_np, action, observation, reward = next(iter(dqn_dataloader))
        
        action_idx = action_to_output(action, BATCH_SIZE).astype(np.int64)
        action_idx = torch.from_numpy(action_idx).to(device).unsqueeze(axis=-1)
        pre_observation = torch.tensor(pre_observation_np, device=device, dtype=torch.float32)
        
        state_action_values = torch.gather(updatedDQN(pre_observation), 1, action_idx )
        
        obs_to_cuda = torch.tensor(observation, dtype=torch.float32).to(device)
        reward_to_cuda = torch.tensor(reward, dtype=torch.float32).to(device)
        
        
        with torch.no_grad():
            nextobs = baseDQN(obs_to_cuda) #error seq why??
            expected_state_action_values = GAMMA*torch.argmax(nextobs, dim = 1) + reward_to_cuda
            
        criterion = nn.MSELoss()
        
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(axis=-1))
        meanloss = meanloss + loss
        #if i == 0:
        #print("loss = ",loss)
        optimizer.zero_grad()
        loss.backward() 
        for param in updatedDQN.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
        
        #for p in baseDQN.parameters(): 
        #    print("params = ", p)
        i = i + 1
    #meanloss = meanloss/iteration
    print("meanloss = ",meanloss/iteration)
    #print("train complete iter = ", iteration)
    return meanloss

In [12]:
%%time

input_state_size = len(env.observation_space.sample())
updatedDQN = DQN(input_state_size,HIDDEN_SIZE).to(device)
updatedDQN.load_state_dict(torch.load(PARAM_PATH_TEST))

baseDQN = DQN(input_state_size,HIDDEN_SIZE).to(device)
baseDQN.load_state_dict(updatedDQN.state_dict())
baseDQN.eval()


#for p in updatedDQN.parameters(): 
#    print(p)

mycartdata = envdata(capacity = CAPACITY)
renewal_memory(CAPACITY, rendering = False)
dqn_dataloader = custom_dataloder(mycartdata, batch_size=BATCH_SIZE)

load_memory_complete
CPU times: user 3.27 s, sys: 769 µs, total: 3.27 s
Wall time: 3.27 s


In [13]:
optimizer = torch.optim.SGD(updatedDQN.parameters(), lr=learning_rate)
train_iter = 100
i = 0
while(i < train_iter):
    i = i + 1
    loss = training(TRAIN_ITER)
    writer.add_scalar("loss", loss, i)
    torch.save(updatedDQN.state_dict(), PARAM_PATH_TEST)
    baseDQN.load_state_dict(updatedDQN.state_dict())
    baseDQN.eval()
    renewal_memory(CAPACITY, rendering = False)
    
env.close()


meanloss =  tensor(1.8030, device='cuda:0', grad_fn=<DivBackward0>)
load_memory_complete
meanloss =  tensor(1.8015, device='cuda:0', grad_fn=<DivBackward0>)
load_memory_complete


KeyboardInterrupt: 

In [None]:
writer.flush()
writer.close()

In [18]:
renewal_memory(CAPACITY, rendering = True)



KeyboardInterrupt: 

In [None]:
#for p in updatedDQN.parameters(): 
#    print("params = ", p)

In [None]:
'''
loss = nn.MSELoss()
input1 = torch.tensor(np.array([[-4,-5]], dtype=np.float32), requires_grad=True)
input2 = torch.tensor(np.array([[2,1]], dtype=np.float32), requires_grad=True)
input3 = torch.tensor(np.array([[-6,-2]], dtype=np.float32), requires_grad=True)
input4 = torch.cat((input1, input2),axis = 0)
input4 = torch.cat((input4, input3),axis = 0)
print(input4)
target = torch.tensor(np.array([[1, 1]], dtype = np.float32))
target1 = torch.cat((target, target), axis = 0)
target = torch.cat((target, target1), axis = 0)
print(target)
output = loss(input4, target)
print(output)
'''

In [None]:
'''
CART_ACTION = 3
class DQN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(DQN, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, CART_ACTION),
            nn.ReLU()
        )

        
    def forward(self, input):
        output = self.linear_relu_stack(input)
        return output
    
testDQN = DQN(2,2)
'''