<a href="https://colab.research.google.com/github/khushdeep-singh/Khushdeep-robotics/blob/master/RL%20agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch as T 
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim 
import numpy as np
import gym 
import matplotlib.pyplot as plt 

In [0]:
class DQN(nn.Module):
    def __init__(self,alpha):
        super(DQN,self).__init__()
        
        # We define two networks 
        
        #define convolutional networks with dfferent inputs and outputs
        self.conv1 = nn.Conv2d(1,32,8, stride =4, padding =1)
        self.conv2 = nn.Conv2d(32,64,4, stride =2)
        self.conv3 = nn.Conv2d(64,128, 3)
        
        #initialize fully connected networks 
        self.fc1 = nn.Linear(128*19*8, 512) # 128*19*8 is a number from users' experience , 512 is ouput 
        self.fc2 = nn.Linear(512,6)  # 512 as input and 6 as output ( because the actions are 6 in this specific game) 
        
        self.optimizer = optim.RMSprop(self.parameters(), lr = alpha)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        
        #send the network to the device 
        self.to(self.device) 
     
    # Feed forwarding the network 
    def forward(self, observation):
        # observations are sequence of frames, we can trancate the frame to meet reduced memory reqd.
        # using 3 frames to get sense of motion and then converting them to Tensors
        observation = T.Tensor(observation).to(self.device) # sending converted observation tensors to device
        # re-size the array
        observation = observation.view(-1, 1, 185, 95)
        # activate and feed forward
        observation = F.relu(self.conv1(observation))
        observation = F.relu(self.conv2(observation))
        observation = F.relu(self.conv3(observation))
 
        # Take convolved images and flatten them to feed into fully connected neural networks 
        observation = observation.view(-1, 128*19*8)
        # Activate and feed forward
        observation = F.relu(self.fc1(observation))

        actions = F.relu(self.fc2(observation))

        return actions 



In [0]:
class Agent(object):
  def __init__(self, gamma, epsilon, alpha,
               maxMemorysize, epsEnd= 0.05,
               replace = 10000, actionSpace = [0,1,2,3,4,5]):
    # defining the agent parameters
    self.gamma = gamma
    self.epsilon = epsilon 
    self.epsEnd = epsEnd
    self.actionSpace = actionSpace
    self.memSize = maxMemorysize
    self.steps = 0 
    self.learnstepcounter = 0 
    self.memory = []
    self.memcntr =  0
    self.replacetargetcntr = replace
    self.Qeval = DQN(alpha)
    self.Qnext = DQN(alpha)

  def storeTransition(self, state, action, reward, state_):
    # storing up the stack of SARS
    if self.memcntr < self.memSize:
      self.memory.append([state, action, reward, state_])
    else:
      self.memory[self.memcntr%self.memSize] = [state, action, reward, state_]
    
    self.memcntr += 1

  def chooseAction(self, observation):
    # choosing an agent's actions 
    rand = np.random.random()
    # forward propagate stack of frames through cnn and fully connected nn to get some set of states
    actions = self.Qeval.forward(observation)

    if rand < 1- self.epsilon:
      action = T.argmax(actions[1]).item()
    else:
      action = np.random.choice(self.actionSpace) # choose randomly 
    
    self.steps += 1

    return action

  def learn(self,batchsize):
    # we do batch learning to avoid correlation between state tansitions 
    self.Qeval.optimizer.zero_grad() # zero the gradients after every batch

    if self.replacetargetcntr is not None and (self.learnstepcounter % self.replacetargetcntr == 0):
      # convert into state dictionary 
      self.Qnext.load_state_dict(self.Qeval.state_dict()) 

    # start of memory sub sampling 
    if self.memcntr + batchsize < self.memSize:
      memStart = int(np.random.choice(range(self.memcntr)))
    else:
      memStart = int(np.random.choice(range(self.memSize - batchsize - 1)))  

    # sample the batch of memory and convert to numpy array 
    miniBatch = self.memory[memStart:memStart + batchsize ]
    memory = np.array(miniBatch)

  # convert to list because memory is an array of numpy objects 
  # feed forwarding the current and successor state 

    Qpred = self.Qeval.forward(list(memory[:,0][:])).to(self.Qeval.device)
    Qnext = self.Qnext.forward(list(memory[:,3][:])).to(self.Qeval.device)

    # get the max action 
    maxA = T.argmax(Qnext, dim=1).to(self.Qeval.device)
    # get the reward form 2nd memory element 
    rewards = T.Tensor(list(memory[:,2])).to(self.Qeval.device)

    Qtarget = Qpred.clone()
    indices = np.arange(batchsize)
    Qtarget[indices,maxA] = rewards + self.gamma*T.max(Qnext[1])

    # for converging to small value 
    if self.steps > 500:
        if self.epsilon - 1e-4 > self.epsEnd:
            self.epsilon -= 1e-4
        else:
            self.epsilon = self.epsEnd

    #Qpred.requires_grad_()
    loss = self.Qeval.loss(Qtarget, Qpred).to(self.Qeval.device) # calculate loss
    loss.backward()   # back propagate 
    self.Qeval.optimizer.step() # optimize 
    self.learnstepcounter += 1



In [0]:
def plotLearning(x, scores, epsilons, filename, lines=None):
    fig=plt.figure()
    ax=fig.add_subplot(111, label="1")
    ax2=fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Game", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
	    running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])

    ax2.scatter(x, running_avg, color="C1")
    #ax2.xaxis.tick_top()
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    #ax2.set_xlabel('x label 2', color="C1")
    ax2.set_ylabel('Score', color="C1")
    #ax2.xaxis.set_label_position('top')
    ax2.yaxis.set_label_position('right')
    #ax2.tick_params(axis='x', colors="C1")
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)

class SkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        super(SkipEnv, self).__init__(env)
        self._skip = skip

    def step(self, action):
        t_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            t_reward += reward
            if done:
                break
        return obs, t_reward, done, info

    def reset(self):
        self._obs_buffer = []
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs

class PreProcessFrame(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(PreProcessFrame, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255,
                                                shape=(80,80,1), dtype=np.uint8)
    def observation(self, obs):
        return PreProcessFrame.process(obs)

    @staticmethod
    def process(frame):

        new_frame = np.reshape(frame, frame.shape).astype(np.float32)

        new_frame = 0.299*new_frame[:,:,0] + 0.587*new_frame[:,:,1] + \
                    0.114*new_frame[:,:,2]

        new_frame = new_frame[35:195:2, ::2].reshape(80,80,1)

        return new_frame.astype(np.uint8)

class MoveImgChannel(gym.ObservationWrapper):
    def __init__(self, env):
        super(MoveImgChannel, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                            shape=(self.observation_space.shape[-1],
                                   self.observation_space.shape[0],
                                   self.observation_space.shape[1]),
                            dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)

class ScaleFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0

class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps):
        super(BufferWrapper, self).__init__(env)
        self.observation_space = gym.spaces.Box(
                             env.observation_space.low.repeat(n_steps, axis=0),
                             env.observation_space.high.repeat(n_steps, axis=0),
                             dtype=np.float32)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer

def make_env(env_name):
    env = gym.make(env_name)
    env = SkipEnv(env)
    env = PreProcessFrame(env)
    env = MoveImgChannel(env)
    env = BufferWrapper(env, 4)
    return ScaleFrame(env)

In [37]:
if __name__ == '__main__':
  # the main loop here 
  env = gym.make('SpaceInvaders-v0')
  brain = Agent(gamma= 0.95, epsilon = 1.0, alpha = 0.03, maxMemorysize=5000, replace = None )
  while brain.memcntr < brain.memSize:
        observation = env.reset()
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            if done and info['ale.lives'] == 0:
                reward = -100
            brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward,
                                np.mean(observation_[15:200,30:125], axis=2))
            observation = observation_
  print('done initializing memory')

  scores = []
  epsHistory = []
  numGames = 50
  batch_size=32
  # uncomment the line below to record every episode.
  #env = Wrappers.Monitor(env, "tmp/space-invaders-1", video_callable=lambda episode_id: True, force=True)
  for i in range(numGames):
      print('starting game ', i+1, 'epsilon: %.4f' % brain.epsilon)
      epsHistory.append(brain.epsilon)
      done = False
      observation = env.reset()
      frames = [np.sum(observation[15:200,30:125], axis=2)]
      score = 0
      lastAction = 0
      while not done:
          if len(frames) == 3:
              action = brain.chooseAction(frames)
              frames = []
          else:
              action = lastAction
          observation_, reward, done, info = env.step(action)
          score += reward
          frames.append(np.sum(observation_[15:200,30:125], axis=2))
          if done and info['ale.lives'] == 0:
              reward = -100
          brain.storeTransition(np.mean(observation[15:200,30:125], axis=2), action, reward,
                                np.mean(observation_[15:200,30:125], axis=2))
          observation = observation_
          brain.learn(batch_size)
          lastAction = action
          #env.render(
      scores.append(score)
      print('score:',score)
  x = [i+1 for i in range(numGames)]
  fileName = str(numGames) + 'Games' + 'Gamma' + str(brain.gamma) + \
              'Alpha' + str(brain.alpha) + 'Memory' + str(brain.memSize)+ '.png'
  plotLearning(x, scores, epsHistory, fileName)

done initializing memory
starting game  1 epsilon: 1.0000


RuntimeError: ignored