<a href="https://colab.research.google.com/github/haluowan/pytorch/blob/master/Reinforcement_Learning_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import gym

In [0]:
# Hyper parameters
batch_size = 32
lr = 1e-3
epsilon = 0.9
gamma = 0.9
target_replace_iter = 100
memory_capacity = 2000
env = gym.make('CartPole-v0')
env = env.unwrapped
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]


In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.fc1 = nn.Linear(n_states,10)
        self.fc1.weight.data.normal_(0,0.1) # initialization
        self.out = nn.Linear(10,n_actions)
        self.out.weight.data.normal_(0,0.1) # initialization
        
    def forward(self,x):
        x = self.fc1(x)
        x = F.relu(x)
        actions_value = self.out(x)
        return actions_value
        

In [0]:
class DQN(object):
    def __init__(self):
        self.eval_net,self.target_net = Net(),Net()
        
        self.learn_step_counter = 0                                # for target updating
        self.memory_counter = 0                                    # for storing memory
        self.memory = np.zeros((memory_capacity,n_states * 2 + 2))   # initialize memory
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),lr=lr)
        self.loss_fuc = nn.MSELoss()
        
    def choose_action(self,x):
        x = torch.unsqueeze(torch.FloatTensor(x),0)
        # input only one sample
        if np.random.uniform() < epsilon: # greedy
            actions_value = self.eval_net.forward(x)
            action = torch.max(actions_value,1)[1].data.numpy()[0,0] # return the argmax
        else:
            action = np.random.randint(0,n_actions)
        return action

    def store_transition(self,s,a,r,s_):
        transition = np.hstack((s,[a,r],s_))
        # replace the old memory with new memory
        self.memory[index,:] = transition
        self.memory_counter += 1
        
    def learn(self):
        # target parameters update
        if self.learn_step_counter % target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
    
        # sample batch transitions
        sample_index = np.random.choice(memory_capacity,batch_size)
        b_memory = self.memory[sample_index,:]
        b_s = torch.FloatTensor(b_memory[:,:n_states])
        b_a = torch.LongTensor(b_memory[:,:n_states:n_states+1].astype(int))
        b_r = torch.FloatTensor(b_memory[:,n_states+1:n_states+2])
        b_s_ = torch.FloatTensor(b_memory[:,-n_states:])

        # q_eval w,r,t the action in experience
        q_eval = self.eval_net(b_s).gather(1,b_a)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma*q_next.max(1)[0]

        loss = self.loss_fuc(q_eval,q_target)


        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    
    

In [0]:
dqn = DQN()

In [50]:
for i_episode in range(400):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = dqn.choose_action(s)
        
        # take action
        s_,r,done,info = env.step(a)
        
        # modify the reward
        x,x_dot,theta,theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1+r2
        
        dqn.store_transition(s,a,r,s_)
        
        ep_r += r
        if dqn.memory_counter > memory_capacity:
            dqn.learn()
            if done:
                print('Ep:',i_episode,'|Ep_r',round(ep_r,2))
                
                
        if done:
            break
            
        s = s_
    
    

NameError: ignored