### Cartpole DQN

In [7]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n): # 버퍼에서 샘플링
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)   
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x): # Q Value 리턴 (음수가 될 수 도 있음)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))    
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random() # 0 ~ 1 
        if coin < epsilon:
            return random.randint(0,1)
        else : 
            return out.argmax().item()
            
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)

        q_out = q(s) # input size (32,4) return size (32,2)
        q_a = q_out.gather(1, a) # 취한 액션의 큐값만 골라냄 (32,1)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict()) # Copy network weights
memory = ReplayBuffer()

print_interval = 50
score = 0.0  
optimizer = optim.Adam(q.parameters(), lr=learning_rate) # No weight updates - q_target

for n_epi in range(2000):
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) # Linear annealing from 8% to 1%
    s = env.reset()
    done = False

    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s, a, r/100.0, s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break

    if memory.size()>2000:
        train(q, q_target, memory, optimizer)

    if n_epi%print_interval==0 and n_epi!=0:
        q_target.load_state_dict(q.state_dict()) # target network updates for every 50 episodes
        print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(n_epi, score/print_interval, memory.size(), epsilon*100))                
        
        if (score/print_interval) > 300:
            break
            
        score = 0.0
        
env.close()

n_episode :50, score : 9.7, n_buffer : 485, eps : 7.8%
n_episode :100, score : 9.7, n_buffer : 972, eps : 7.5%
n_episode :150, score : 9.8, n_buffer : 1462, eps : 7.3%
n_episode :200, score : 9.9, n_buffer : 1956, eps : 7.0%
n_episode :250, score : 11.0, n_buffer : 2508, eps : 6.8%
n_episode :300, score : 9.8, n_buffer : 2999, eps : 6.5%
n_episode :350, score : 11.0, n_buffer : 3549, eps : 6.2%
n_episode :400, score : 12.7, n_buffer : 4185, eps : 6.0%
n_episode :450, score : 27.8, n_buffer : 5573, eps : 5.8%
n_episode :500, score : 85.3, n_buffer : 9837, eps : 5.5%
n_episode :550, score : 75.7, n_buffer : 13620, eps : 5.3%
n_episode :600, score : 166.0, n_buffer : 21920, eps : 5.0%
n_episode :650, score : 311.2, n_buffer : 37480, eps : 4.8%


In [9]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


path = 'cartpole.pt'
# torch.save(q_target.state_dict(), path) # save weights only
# torch.save(q_target, path)
q_target = torch.load(path)

- ### RL Learned Policy

In [10]:
import time
for i_episode in range(3):
    observation = env.reset()
    for t in range(550):
        time.sleep(0.02)
        env.render()
        action = q_target(torch.Tensor(observation)).argmax().item() 
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()

Episode finished after 500 timesteps
Episode finished after 431 timesteps
Episode finished after 500 timesteps


- ### Random or Simple Policy

In [42]:
import time
import gym
env = gym.make('CartPole-v1')


for i_episode in range(5):
    observation = env.reset()
    for t in range(300):
        time.sleep(0.1)
        env.render()
        
#         action = env.action_space.sample()
#         action = random.randint(0,1)

        if observation[0] < 0: # if the pole is on the right side
            action = 1 #  pushing the cart to the left
        else:
            action = 0 #  pushing the cart to the right
 

        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()

Episode finished after 10 timesteps
Episode finished after 35 timesteps
Episode finished after 33 timesteps
Episode finished after 32 timesteps
Episode finished after 33 timesteps


- ### Physics

In [7]:
def theta_omega_policy(obs):
    theta, w = obs[2:4]
    if abs(theta) < 0.03:
        return 0 if w < 0 else 1
    else:
        return 0 if theta < 0 else 1
    
import time
import gym
env = gym.make('CartPole-v1')


for i_episode in range(3):
    observation = env.reset()
    for t in range(550):
        time.sleep(0.01)
        env.render()
        action = theta_omega_policy(observation)
        
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()    

Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps


- ### Keep Pushing to the left or Right

In [3]:
env = gym.make('CartPole-v1')

In [5]:
import gym
import random
import time

env.seed(0)

print('starting states')
s = env.reset()
print(s)
print('\n')

a = 0
for i in range(10):
    time.sleep(0.1)
    env.render()
    s_prime, r, done, info = env.step(a)
    print(i, 'action:', a, 'states:' , s_prime, 'reward:', r)
    
env.close()  

starting states
[-0.04456399  0.04653909  0.01326909 -0.02099827]


0 action: 0 states: [-0.04363321 -0.14877062  0.01284913  0.2758415 ] reward: 1.0
1 action: 0 states: [-0.04660862 -0.3440735   0.01836596  0.5725492 ] reward: 1.0
2 action: 0 states: [-0.05349009 -0.5394481   0.02981694  0.87096095] reward: 1.0
3 action: 0 states: [-0.06427906 -0.73496264  0.04723616  1.1728673 ] reward: 1.0
4 action: 0 states: [-0.07897831 -0.93066573  0.07069351  1.4799768 ] reward: 1.0
5 action: 0 states: [-0.09759162 -1.1265757   0.10029304  1.7938743 ] reward: 1.0
6 action: 0 states: [-0.12012314 -1.3226682   0.13617052  2.1159716 ] reward: 1.0
7 action: 0 states: [-0.1465765  -1.5188614   0.17848997  2.4474478 ] reward: 1.0
8 action: 0 states: [-0.17695373 -1.7149992   0.22743891  2.7891784 ] reward: 1.0
9 action: 0 states: [-0.21125372 -1.9108318   0.2832225   3.1416543 ] reward: 0.0


In [6]:
env.seed(0)

s = env.reset()
print('starting states')
print(s)
print('\n')

a = 1
for i in range(11):
    time.sleep(0.1)
    env.render()
    s_prime, r, done, info = env.step(a)
    print(i, 'action:', a, 'states:' , s_prime, 'reward:', r)
    
env.close()

starting states
[-0.04456399  0.04653909  0.01326909 -0.02099827]


0 action: 1 states: [-0.04363321  0.24146827  0.01284913 -0.3094653 ] reward: 1.0
1 action: 1 states: [-0.03880385  0.4364048   0.00665982 -0.5980684 ] reward: 1.0
2 action: 1 states: [-0.03007575  0.63143295 -0.00530154 -0.8886461 ] reward: 1.0
3 action: 1 states: [-0.01744709  0.8266264  -0.02307447 -1.1829909 ] reward: 1.0
4 action: 1 states: [-9.1456366e-04  1.0220401e+00 -4.6734285e-02 -1.4828167e+00] reward: 1.0
5 action: 1 states: [ 0.01952624  1.2176999  -0.07639062 -1.7897208 ] reward: 1.0
6 action: 1 states: [ 0.04388024  1.4135911  -0.11218503 -2.1051378 ] reward: 1.0
7 action: 1 states: [ 0.07215206  1.6096447  -0.15428779 -2.4302828 ] reward: 1.0
8 action: 1 states: [ 0.10434495  1.8057201  -0.20289344 -2.766083  ] reward: 1.0
9 action: 1 states: [ 0.14045936  2.0015864  -0.2582151  -3.1130984 ] reward: 1.0
10 action: 1 states: [ 0.18049109  2.196903   -0.32047707 -3.4714346 ] reward: 0.0
