# In [0]:

# importing libraries 
import gym
from gym import wrappers
import numpy as np # linear algebra
import os
import pybullet_envs



class Hp():
    """Hyperparameters for the random-search training run.

    Every value defaults to the setting this script was written with, so
    ``Hp()`` behaves exactly as before; pass keyword arguments to override.

    Raises:
        ValueError: if ``num_best_directions`` exceeds ``num_directions``.
    """

    def __init__(self, *, num_steps=1000, episode_length=1000,
                 learning_rate=0.02, num_directions=16,
                 num_best_directions=16, noise=0.03, seed=1,
                 env_name="HalfCheetahBulletEnv-v0"):
        # Raise instead of `assert`: assertions are stripped under `python -O`,
        # which would let an inconsistent configuration through silently.
        if num_best_directions > num_directions:
            raise ValueError(
                "num_best_directions (%d) must not exceed num_directions (%d)"
                % (num_best_directions, num_directions)
            )
        self.num_steps = num_steps                      # number of policy updates performed by train()
        self.episode_length = episode_length            # maximum number of steps in one episode
        self.learning_rate = learning_rate              # step size applied in Policy.update()
        self.num_directions = num_directions            # perturbations sampled per update (each tried +/-)
        self.num_best_directions = num_best_directions  # how many of those perturbations feed the update
        self.noise = noise                              # scale (sigma) applied to each perturbation
        self.seed = seed                                # seed handed to np.random.seed
        self.env_name = env_name                        # environment id handed to gym.make
    

class Normalizer():
    """Running per-input mean/variance tracker used to standardise states."""

    def __init__(self, num_inputs):
        self.mean = np.zeros(num_inputs)         # running mean of every input
        self.n = np.zeros(num_inputs)            # number of states observed so far
        self.mean_diff = np.zeros(num_inputs)    # running numerator of the variance
        self.var = np.zeros(num_inputs)          # running variance (floored in observe)

    def observe(self, x):
        """Fold a newly encountered state ``x`` into the running statistics."""
        self.n += 1.
        # Copy is required: `self.mean` is updated in place on the next line,
        # so a plain reference would alias the *new* mean and turn the
        # variance numerator into (x - new_mean)**2.
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        # Floor the variance so normalize() never divides by zero.
        self.var = (self.mean_diff / self.n).clip(min = 1e-2)

    def normalize(self, inputs):
        """Return ``inputs`` shifted by the running mean and scaled by the running std."""
        obs_std = np.sqrt(self.var)
        return (inputs - self.mean) / obs_std

    # Original (misspelled) method name, kept so existing callers keep working.
    normalizer = normalize
    
#building the ai

class Policy():
    """Linear policy (``action = theta . state``) explored through random perturbations.

    The hyperparameters ``noise``, ``num_directions``, ``learning_rate`` and
    ``num_best_directions`` are read from the optional ``hp`` argument, or
    from the module-level ``hp`` object when none is supplied.
    """

    def __init__(self, input_size, output_size, hp = None):
        self.theta = np.zeros((output_size, input_size))   # weight matrix, starts at zero
        self.hp = hp                                        # None -> use the module-level hp

    def _hyperparams(self):
        """Return the injected hyperparameters, else the module-level ``hp``."""
        if self.hp is not None:
            return self.hp
        return hp

    def evaluate(self, input, delta = None, direction = None):
        """Return the action for state ``input``.

        ``direction`` None uses the unperturbed weights; ``'positive'`` adds
        ``noise * delta`` to them; any other value subtracts it.
        """
        if direction is None:
            return self.theta.dot(input)
        scaled_delta = self._hyperparams().noise * delta
        # Compare by value: `is 'positive'` tests object identity and only
        # worked when the caller's string happened to be interned.
        if direction == 'positive':
            return (self.theta + scaled_delta).dot(input)
        return (self.theta - scaled_delta).dot(input)

    def sample_deltas(self):
        """Return ``num_directions`` random perturbation matrices shaped like ``theta``."""
        count = self._hyperparams().num_directions
        return [np.random.randn(*self.theta.shape) for _ in range(count)]

    def update(self, rollouts, sigma_r):
        """Move ``theta`` along the finite-difference estimate built from ``rollouts``.

        ``rollouts`` is an iterable of ``(r_pos, r_neg, delta)`` triples and
        ``sigma_r`` is the reward standard deviation used to scale the step.
        When ``sigma_r`` is zero the update is skipped, because dividing by it
        would fill ``theta`` with inf/NaN (or raise ZeroDivisionError).
        """
        if sigma_r == 0:
            return
        params = self._hyperparams()
        step = np.zeros(self.theta.shape)
        for r_pos, r_neg, d in rollouts:
            step += (r_pos - r_neg) * d         # method of finite differences
        self.theta += params.learning_rate / (params.num_best_directions * sigma_r) * step
    


#explore the policy in one specific direction on one full episode    
def explore(env, normalizer, policy, direction = None, delta = None):
    """Play one episode and return the sum of its clipped rewards.

    Each raw state is first recorded by ``normalizer.observe`` and then
    standardised with ``normalizer.normalize`` before ``policy.evaluate``
    turns it into an action (perturbed by ``delta`` in ``direction`` when
    given). The episode stops when the environment reports done or after
    ``hp.episode_length`` steps, whichever comes first.
    """
    observation = env.reset()
    total_reward = 0
    steps_taken = 0
    finished = False
    while not (finished or steps_taken >= hp.episode_length):
        normalizer.observe(observation)
        scaled_state = normalizer.normalize(observation)
        action = policy.evaluate(scaled_state, delta, direction)
        observation, raw_reward, finished, _ = env.step(action)
        # Clamp every reward to [-1, 1] so outliers cannot dominate the total.
        clipped_reward = max(min(raw_reward, 1), -1)
        total_reward += clipped_reward
        steps_taken += 1
    return total_reward


#training the ai
def train(env, policy, normalizer, hp):
    """Run ``hp.num_steps`` random-search updates of ``policy`` on ``env``.

    Each step samples perturbations, plays one episode per perturbation in
    the positive and in the negative direction, keeps the
    ``hp.num_best_directions`` perturbations with the highest
    ``max(r_pos, r_neg)``, updates the policy from them, and prints the
    reward of one episode played with the updated, unperturbed policy.
    """
    for step in range(hp.num_steps):
        # Fresh perturbations and reward slots for this update.
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.num_directions
        negative_rewards = [0] * hp.num_directions

        # One episode per perturbation in each direction.
        for k in range(hp.num_directions):
            positive_rewards[k] = explore(env, normalizer, policy, direction = "positive", delta = deltas[k])
            negative_rewards[k] = explore(env, normalizer, policy, direction = "negative", delta = deltas[k])

        # Standard deviation over all 2 * num_directions rewards scales the step.
        all_rewards = np.array(positive_rewards + negative_rewards)
        sigma_r = all_rewards.std()

        # Rank perturbations by their better reward, highest first. The sort
        # must be descending: an ascending sort followed by the slice would
        # keep the *worst* directions whenever num_best_directions is smaller
        # than num_directions.
        scores = {k: max(r_pos, r_neg)
                  for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
        order = sorted(scores, key = scores.get, reverse = True)[:hp.num_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]

        # Updating the policy.
        policy.update(rollouts, sigma_r)

        # Reward of the updated policy without any perturbation.
        reward_evaluation = explore(env, normalizer, policy)
        print("step ",step," reward: ",reward_evaluation)
        

        
#running the main code
def mkdir(base, name):
    """Create the directory ``base/name`` (and any missing parents) and return its path.

    An already existing directory is left untouched. ``exist_ok=True``
    replaces the previous exists-then-create sequence, which could fail if
    the directory appeared between the check and the creation.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path
# Script entry: create the output directories, build the environment, policy
# and normalizer, then run training. Everything below runs at import time.
work_dir = mkdir('exp', 'ars')
monitor_dir = mkdir(work_dir, 'monitor')         # directory handed to the Monitor wrapper below

# `hp` has to be a module-level name: Policy and explore() read it as a global.
hp = Hp()
np.random.seed(hp.seed)                          # seeds NumPy's global RNG, which Policy.sample_deltas() draws from
env = gym.make(hp.env_name)                      # creating a gym environment
# NOTE(review): force = True presumably lets Monitor reuse a non-empty
# directory by clearing earlier monitor files - confirm against the installed gym version.
env = wrappers.Monitor(env, monitor_dir, force = True)
num_inputs = env.observation_space.shape[0]                # length of the observation vector
num_outputs = env.action_space.shape[0]                    # length of the action vector
policy = Policy(num_inputs, num_outputs)
normalizer = Normalizer(num_inputs)
train(env, policy, normalizer, hp)