# ARS in tensorflow 

This notebook implements Augmented Random Search (ARS), the method described in the paper *Simple random search provides a competitive approach to reinforcement learning* (Mania, Guy and Recht, 2018). 
The paper can be found [here](https://arxiv.org/pdf/1803.07055.pdf). 

Performance: 

![image](ars-performance.png)


Import the dependencies: TensorFlow, NumPy, Matplotlib and Gym, plus `Pool` from the standard-library `multiprocessing` module.

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plot 
import gym 
from multiprocessing import Pool 

#random seed
np.random.seed(0)



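Define the class that implements the ARS algorithm. The policy is a single linear weight matrix and all of the arithmetic in this class is done with NumPy arrays.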

In [None]:
class ARS(object):
    """Augmented Random Search over a linear policy matrix.

    The policy is an ``(input_size, output_size)`` weight matrix. Each
    iteration samples ``N`` random directions, evaluates the policy
    perturbed by ``+delta`` and ``-delta`` for every direction, and moves
    the policy along the ``b`` best directions.

    Attributes:
        policy: the current unperturbed policy matrix.
        models: array of shape ``(2 * N, input_size, output_size)`` holding
            the candidate policies; after ``pertebate_models`` the first
            ``N`` entries are ``policy + delta`` and the last ``N`` are
            ``policy - delta``.
        delta: the most recently sampled directions, shape
            ``(N, input_size, output_size)``. Only set once
            ``pertebate_models`` has been called.
    """

    def __init__(self, alpha = 0.1, N = 5, v = 0.1, b = 5, input_size = 4, output_size = 2):
        self.alpha = alpha  # step size
        self.N = N  # number of directions sampled
        self.v = v  # standard deviation of exploration noise
        self.b = b  # number of top performing directions to use
        self.input_size = input_size  # size of the state space
        self.output_size = output_size  # size of the action space
        # Start from the all-zero policy.
        self.generate_model(np.zeros((self.input_size, self.output_size)))

    def generate_model(self, policy):
        """Make ``policy`` the current policy and reset every candidate to it.

        Args:
            policy: weight matrix of shape ``(input_size, output_size)``.
        """
        # Keep the unperturbed policy separately: once the candidates are
        # perturbed, no entry of ``self.models`` holds it any more.
        self.policy = np.array(policy)
        self.models = np.array([policy for _ in range(self.N * 2)])

    def pertebate_models(self):
        """Sample ``N`` Gaussian directions and perturb the candidates.

        The first ``N`` candidates receive ``+delta`` and the last ``N``
        receive ``-delta``. The perturbation is added to the current
        contents of ``self.models``, so call this once per update step.

        Returns:
            The perturbed candidates, shape ``(2 * N, input_size, output_size)``.
        """
        self.delta = np.random.normal(0, self.v, size=(self.N, self.input_size, self.output_size))
        self.models = self.models + np.vstack([self.delta, - self.delta])
        return self.models

    def update_model(self, results):
        """Move the policy along the best-scoring directions.

        Args:
            results: one score per sampled direction (length ``N``), in the
                same order as ``self.delta``.

        Returns:
            The updated policy matrix. All candidates are reset to it.
        """
        results = np.asarray(results, dtype=float)
        # Indices of the b highest scores, best first.
        top = np.argsort(results)[::-1][:self.b]
        # If the spread goes to 0 the step size would blow up, so floor it.
        std = max(0.0001, results[top].std())
        # Score-weighted sum of the selected directions -> (input_size, output_size).
        step = np.tensordot(results[top], self.delta[top], axes=1)
        # Step from the unperturbed policy using this instance's own
        # directions (not those of a module-level object).
        policy = self.policy + (self.alpha / (self.b * std)) * step
        self.generate_model(policy)
        return policy
         

In [None]:
# TODO: this needs some functionality to handle an environment that has died.
class PolicyEval(object):
    """Runs a single episode of a gym environment under a linear policy.

    The environment is created and seeded in the constructor, so every
    instance owns its own environment.
    """

    def __init__(self, automate = True, name = "CartPole-v0"):
        # NOTE: ``automate`` is accepted but not used anywhere in this class.
        self.limit = 200  # maximum number of steps per episode
        self.actions = np.array([0, 1])  # the two discrete actions to sample from
        self.env = gym.make(name)  # one environment per evaluator
        self.env.seed(0)

    def eval_policy(self, state, policy):
        """Sample an action for ``state``.

        The scores ``state . policy`` are turned into probabilities with
        ``softmax`` and an action is drawn from ``self.actions`` accordingly.
        """
        return np.random.choice(self.actions, p=softmax(np.dot(state, policy)))

    def run_policy(self, policy):
        """Play one episode with ``policy``.

        The episode stops when the environment reports it is done or after
        ``self.limit`` steps, whichever comes first.

        Returns:
            ``(total_reward, states)`` where ``states`` is an array of the
            observations an action was chosen for, in order.
        """
        visited = []
        total_reward = 0
        observation = self.env.reset()
        for _step in range(self.limit):
            visited.append(observation)
            action = self.eval_policy(observation, policy)
            observation, step_reward, finished, _info = self.env.step(action)
            total_reward += step_reward
            if finished:
                break
        return total_reward, np.array(visited)
    

In [None]:
def automate_PolicyEval(policy):
    """Evaluate ``policy`` for one episode in a freshly built ``PolicyEval``.

    A new evaluator (with its default arguments) is created on every call.
    Being a plain module-level function, this can be handed to a process
    pool as the per-policy task.

    Returns:
        Whatever ``PolicyEval.run_policy`` returns for ``policy``.
    """
    return PolicyEval().run_policy(policy)

In [None]:
def softmax(input_values):
    """Return the softmax of ``input_values`` taken over all elements.

    Args:
        input_values: array-like of real-valued scores.

    Returns:
        A float array of the same shape whose entries are non-negative and
        sum to 1. An empty input yields an empty array.
    """
    values = np.asarray(input_values, dtype=float)
    if values.size == 0:
        return values
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would produce NaNs).
    exponential = np.exp(values - np.max(values))
    return exponential / np.sum(exponential)

In [None]:
def running_average(data, window=20):
    """Smooth ``data`` with a sliding-window mean.

    The value at position ``i`` is the mean of ``data[i - window:i + window]``
    clipped to the bounds of ``data``, i.e. up to ``window`` points before
    ``i`` and ``window`` points starting at ``i``.

    Args:
        data: sequence of numbers.
        window: half-width of the averaging window; must be at least 1.

    Returns:
        A list with one averaged value per element of ``data``.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        # A zero-width window would average an empty slice.
        raise ValueError("window must be a positive integer")
    return [
        np.average(data[max(0, i - window):i + window])
        for i in range(len(data))
    ]
        

In [None]:
# Train a linear CartPole policy with ARS for 1000 update steps.
# Arguments: step size, directions per step, noise std, top directions used,
# state size, action size.
ars = ARS(0.00015,10,0.01,3,4,2)

episodes = 0
total_results = []  # best reward seen among the candidates at each step
# One pool is shared by every iteration; starting a fresh set of worker
# processes for each of the 1000 steps is pure overhead.
with Pool() as pool:
    while episodes < 1000:
        ars.pertebate_models()
        policies = ars.models
        # One (reward, states) pair per candidate, in candidate order.
        output = pool.map(automate_PolicyEval, policies)
        # Episodes can end after different numbers of steps, so the pairs
        # are unpacked individually instead of being packed into a single
        # (ragged) NumPy array.
        results = np.array([reward for reward, _ in output])
        states = [visited for _, visited in output]
        print("Max achieved in episode %d: %s"%(episodes, np.max(results)))
        total_results.append(np.max(results))
        # Row 0 holds the rewards of the +delta candidates, row 1 those of
        # the -delta candidates; their difference scores each direction.
        results = np.reshape(results, (2,ars.N))
        results = np.subtract(results[0], results[1])
        policy = ars.update_model(results)
        episodes += 1



In [None]:
# Plot the best reward of every training step together with its running
# average so the overall trend is visible through the noise.
smoothed_results = running_average(total_results)
plot.title('Reward on CartPole')
plot.xlabel('Episodes')
plot.ylabel('Reward')
plot.plot(total_results)  # raw per-step maximum
plot.plot(smoothed_results)  # smoothed curve
plot.show()