In [None]:
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from keras import backend as K
from keras.layers import Dense, Activation, Input
from keras.models import Model, load_model
from keras.optimizers import Adam
from IPython.display import clear_output

from sklearn.neural_network import MLPRegressor
from warnings import filterwarnings
filterwarnings('ignore')


import seaborn as sns
sns.set_context("paper")
sns.set_style("darkgrid")

In [None]:
# Environment shared by every experiment in this notebook.
# "LunarLander-v2" is an alternative environment id that can be swapped in here.
env = gym.make("CartPole-v1")
# Override the episode step limit (presumably read by gym's TimeLimit wrapper -- confirm).
env._max_episode_steps = 200
env.reset()

# Sizes consumed by the agent configurations in the cells below.
state_dim = env.observation_space.shape
n_actions = env.action_space.n

# 1. Random policy

In [None]:
def random_action(num_actions=None):
    """Return a uniform probability vector over the discrete actions.

    Args:
        num_actions: Number of actions. When omitted, the notebook-level
            ``n_actions`` is used, so the original zero-argument call still works.

    Returns:
        list[float]: ``num_actions`` entries, each equal to ``1/num_actions``.
    """
    if num_actions is None:
        num_actions = n_actions
    return [1/num_actions]*num_actions

In [None]:
# Baseline: play `Iter` episodes with the uniform random policy and store each
# episode's undiscounted return in G_log.
Iter=500
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    G=0
    done=False
    while True:
        action_probas = random_action()
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        G+=r
        s = sp
        if done:
            break
    G_log.append(G)

In [None]:
# Histogram of the recorded returns, with a vertical line at the sample mean.
n_bins=2*int(np.sqrt(len(G_log)))
v=plt.hist(G_log,bins=n_bins)
mean=np.mean(G_log)
mean_label="mean={:.2f}".format(mean)
# v[0] holds the per-bin counts, so the line spans the tallest bar.
plt.vlines(mean,[0],[max(v[0])],label=mean_label)
plt.legend()
plt.show()

# 2. REINFORCE (episodic)

In [None]:
class REINFORCE(object):
    """Monte-Carlo policy gradient (REINFORCE) with a linear-softmax policy.

    The preference of action ``a`` in state ``s`` is ``s . paras_A[:, a]`` and
    the policy is the softmax of those preferences.
    """
    def __init__(self,paras={}):
        """Read the configuration and zero-initialise the policy weights.

        Args:
            paras: dict with keys ``dim_state`` (state vector length),
                ``num_action`` (number of discrete actions) and ``alpha_A``
                (policy step size).
        """
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.paras_A=np.zeros((self.dim_state,self.num_action))
        self.gamma=0.999

    def get_action_proba(self,s):
        """Return the softmax action probabilities for state ``s`` as a list."""
        prefs=s.dot(self.paras_A)
        # Subtract the max preference before exponentiating for numerical stability.
        exp_prefs=np.exp(prefs-prefs.max())
        return list(exp_prefs/exp_prefs.sum())

    def h(self,s,a):
        """Return the linear preference of action ``a`` in state ``s``."""
        return s.dot(self.paras_A)[a]

    def update_paras(self,states,actions,rewards):
        """Apply one REINFORCE update per time step of a finished episode.

        Args:
            states: visited states; ``states[t]`` is the state in which
                ``actions[t]`` was taken.
            actions: indices of the actions taken.
            rewards: ``rewards[t]`` is the reward received after ``actions[t]``.
        """
        T=len(states)
        # rewards[t] follows actions[t], so the return from step t must include
        # it: G_t = sum_{k=t}^{T-1} gamma^(k-t) * rewards[k].  Accumulating
        # backwards gives every G_t in a single O(T) pass.
        returns=np.zeros(T)
        G=0.0
        for t in reversed(range(T)):
            G=rewards[t]+self.gamma*G
            returns[t]=G
        for t in range(T):
            prob=np.asarray(self.get_action_proba(states[t]))
            # Gradient of log pi(a_t|s_t) w.r.t. the preferences: onehot(a_t) - pi.
            grad_log=-prob
            grad_log[actions[t]]+=1.0
            self.paras_A+=self.alpha_A*(self.gamma**t)*returns[t]*np.outer(states[t],grad_log)

In [None]:
# Train the linear REINFORCE agent; parameters are updated once per finished episode.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 3e-2,
}
agent=REINFORCE(agent_config)

Iter=500
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    states,actions,rewards=[],[],[]
    G=0
    done=False
    while not done:
        action_probas = agent.get_action_proba(s)
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        # Store the transition under the state the action was chosen in.
        states.append(s)
        actions.append(a)
        rewards.append(r)
        G+=r
        s = sp
    G_log.append(G)
    agent.update_paras(states,actions,rewards)

    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%100==0:
        print(epoch+1,np.mean(G_log[-100:]))

# Left panel: per-episode return with its 100-episode rolling mean.
# Right panel: histogram of the last 100 returns with the mean marked.
last_returns=G_log[-100:]
fig,(ax_curve,ax_hist)=plt.subplots(1,2,figsize=(18,4))
ax_curve.plot(G_log)
ax_curve.plot(pd.DataFrame(G_log).rolling(100).mean())

v=ax_hist.hist(last_returns,bins=2*int(np.sqrt(len(last_returns))))
mean=np.mean(last_returns)
ax_hist.vlines(mean,[0],[max(v[0])],label="mean={:.2f}".format(mean))
ax_hist.legend()
plt.show()

# 3. REINFORCE with baseline (episodic)

In [None]:
class REINFORCE_with_baseline(object):
    """REINFORCE with a learned linear state-value baseline.

    The policy is a softmax over linear preferences ``s . paras_A[:, a]``; the
    baseline is the linear value estimate ``s . paras_v``.
    """
    def __init__(self,paras={}):
        """Read the configuration and zero-initialise policy and baseline weights.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A``
                (policy step size) and ``alpha_baseline`` (baseline step size).
        """
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.alpha_baseline=paras['alpha_baseline']
        self.paras_A=np.zeros((self.dim_state,self.num_action))
        self.paras_v=np.zeros(self.dim_state)
        self.gamma=0.999

    def get_action_proba(self,s):
        """Return the softmax action probabilities for state ``s`` as a list."""
        prefs=s.dot(self.paras_A)
        # Subtract the max preference before exponentiating for numerical stability.
        exp_prefs=np.exp(prefs-prefs.max())
        return list(exp_prefs/exp_prefs.sum())

    def v(self,s):
        """Return the baseline (linear state-value estimate) for state ``s``."""
        return s.dot(self.paras_v)

    def h(self,s,a):
        """Return the linear preference of action ``a`` in state ``s``."""
        return s.dot(self.paras_A)[a]

    def update_paras(self,states,actions,rewards):
        """Update baseline and policy weights from one finished episode.

        Args:
            states: visited states; ``states[t]`` is the state in which
                ``actions[t]`` was taken.
            actions: indices of the actions taken.
            rewards: ``rewards[t]`` is the reward received after ``actions[t]``.
        """
        T=len(states)
        # rewards[t] follows actions[t], so the return from step t must include
        # it: G_t = sum_{k=t}^{T-1} gamma^(k-t) * rewards[k] (single backward pass).
        returns=np.zeros(T)
        G=0.0
        for t in reversed(range(T)):
            G=rewards[t]+self.gamma*G
            returns[t]=G
        for t in range(T):
            x=states[t]
            # Both delta and the action probabilities use the pre-update weights.
            delta=returns[t]-self.v(x)
            prob=np.asarray(self.get_action_proba(x))
            self.paras_v+=self.alpha_baseline*delta*x
            # Gradient of log pi(a_t|s_t) w.r.t. the preferences: onehot(a_t) - pi.
            grad_log=-prob
            grad_log[actions[t]]+=1.0
            self.paras_A+=self.alpha_A*(self.gamma**t)*delta*np.outer(x,grad_log)

In [None]:
# Train REINFORCE with a learned baseline; parameters are updated once per finished episode.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 1e-2,
    'alpha_baseline': 3e-2,
}
agent=REINFORCE_with_baseline(agent_config)
Iter=500
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    states,actions,rewards=[],[],[]
    G=0
    done=False
    while not done:
        action_probas = agent.get_action_proba(s)
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        # Store the transition under the state the action was chosen in.
        states.append(s)
        actions.append(a)
        rewards.append(r)
        G+=r
        s = sp
    G_log.append(G)
    agent.update_paras(states,actions,rewards)

    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%100==0:
        print(epoch+1,np.mean(G_log[-100:]))

# Left panel: per-episode return with its 100-episode rolling mean.
# Right panel: histogram of the last 100 returns with the mean marked.
last_returns=G_log[-100:]
fig,(ax_curve,ax_hist)=plt.subplots(1,2,figsize=(18,4))
ax_curve.plot(G_log)
ax_curve.plot(pd.DataFrame(G_log).rolling(100).mean())

v=ax_hist.hist(last_returns,bins=2*int(np.sqrt(len(last_returns))))
mean=np.mean(last_returns)
ax_hist.vlines(mean,[0],[max(v[0])],label="mean={:.2f}".format(mean))
ax_hist.legend()
plt.show()

# 4. Actor-Critic (episodic)

In [None]:
class Actor_Critic(object):
    """One-step actor-critic whose actor and critic heads share the hidden layers.

    The actor head outputs one linear preference per action (turned into
    probabilities by the softmax in ``get_action_proba``); the critic head
    outputs a scalar state value. Both are Keras models compiled with MSE loss.

    NOTE(review): another ``Actor_Critic`` class is defined in a later cell of
    this notebook and replaces this one once that cell has run.
    """
    def __init__(self,paras={}):
        """Build the shared network and both heads.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A``
                (actor learning rate), ``alpha_C`` (critic learning rate) and
                ``structure`` (list of hidden-layer widths).
        """
        self.gamma=0.999
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.alpha_C=paras['alpha_C']

        state = Input(shape=[self.dim_state])
        hidden = state
        for units in paras['structure']:
            hidden = Dense(units, activation='relu')(hidden)

        Qvalue = Dense(self.num_action, activation='linear')(hidden)
        value = Dense(1, activation='linear')(hidden)

        # Inputs/outputs are passed positionally; the `input=`/`output=`
        # keywords are the legacy Keras 1 spelling.
        critic = Model([state], [value])
        critic.compile(optimizer=Adam(lr=self.alpha_C), loss="mse")

        actor = Model([state], [Qvalue])
        actor.compile(optimizer=Adam(lr=self.alpha_A), loss="mse")

        self.actor=actor
        self.critic=critic
        # Phi maps a batch of states to the output of the layer that feeds the
        # actor's final Dense layer.
        self.Phi=K.function([self.actor.layers[0].input],[self.actor.layers[-2].output])

    def v(self,s):
        """Return the critic's prediction for a single state as a (1, 1) array."""
        return self.critic.predict(s.reshape(1,-1))

    def get_action_proba(self,s):
        """Return softmax probabilities over the actor's preferences for state ``s``."""
        prefs = self.actor.predict(s.reshape(1,-1))[0]
        # Subtract the max preference before exponentiating for numerical stability.
        exp_prefs=np.exp(prefs-prefs.max())
        return list(exp_prefs/exp_prefs.sum())

    def get_target_q_s_a(self,s,a,td_error,prob):
        """Build the regression target used to move the actor's preferences.

        The target is the current actor output plus ``phi.w + b``, where
        ``w[:, ap] = td_error * (1[a == ap] - prob[ap]) * phi`` and
        ``b[ap] = td_error * (1[a == ap] - prob[ap])``.

        Returns:
            Array of shape (1, num_action).
        """
        x=s.reshape(1,-1)
        # K.function takes a list of input arrays and returns a list of outputs.
        phi=self.Phi([x])[0][0]
        weights=self.actor.get_weights()
        w=np.zeros(weights[-2].shape)
        b=np.zeros(weights[-1].shape)
        # td_error arrives as a (1, 1) array from update_paras; use it as a scalar.
        td=float(np.asarray(td_error).reshape(-1)[0])
        for ap in range(self.num_action):
            scale=td*(int(a==ap)-prob[ap])
            w[:,ap]+=scale*phi
            b[ap]+=scale
        target_q_s_a=phi.dot(w)+b+self.actor.predict(x)
        return target_q_s_a

    def update_paras(self,s,a,sp,r,prob,done):
        """Fit actor and critic on a single transition.

        Args:
            s, sp: current and next state vectors.
            a: index of the action taken.
            r: reward received.
            prob: action probabilities that ``a`` was sampled from.
            done: whether ``sp`` is terminal (its value is then not bootstrapped).
        """
        critic_target=r+self.gamma*self.v(sp)*int(not done)
        td_error=critic_target-self.v(s)
        target_q_s_a=self.get_target_q_s_a(s,a,td_error,prob)

        self.actor.fit(s.reshape(1,-1), target_q_s_a.reshape(1,-1), verbose=0)
        self.critic.fit(s.reshape(1,-1), critic_target, verbose=0)

In [None]:
class Actor_Critic(object):
    """One-step actor-critic with separate softmax-policy and value networks.

    The actor is trained with the loss ``-log(pi) * onehot(action) * td_error``,
    where the TD error is fed to the training model as a second input.
    """
    def __init__(self,paras):
        """Build the actor and critic networks.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A`` and
                ``actor_structure`` (actor learning rate / hidden widths),
                ``alpha_C`` and ``critic_structure`` (same for the critic).
        """
        self.gamma=0.999
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.actor_structure=paras['actor_structure']
        self.alpha_C=paras['alpha_C']
        self.critic_structure=paras['critic_structure']
        self.actor,self.policy=self.build_actor()
        self.critic=self.build_critic()

    def build_actor(self):
        """Return ``(actor, policy)``: the trainable model and a prediction-only view.

        Both models share the same layers; ``actor`` additionally takes the TD
        error as an input so the custom loss can weight the log-likelihood.
        """
        td_error = Input(shape=[1,])
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.actor_structure:
            hidden = Dense(units, activation='relu')(hidden)
        pi = Dense(self.num_action, activation='softmax')(hidden)
        def custom_loss(y_true, prob):
            # Clip so a saturated softmax output cannot produce log(0).
            prob = K.clip(prob, 1e-8, 1-1e-8)
            loss = -K.log(prob)*y_true*td_error
            return loss
        # Inputs/outputs are passed positionally; the `input=`/`output=`
        # keywords are the legacy Keras 1 spelling.
        actor = Model([state,td_error], [pi])
        actor.compile(optimizer=Adam(lr=self.alpha_A), loss=custom_loss)
        policy = Model([state], [pi])
        return actor,policy

    def build_critic(self):
        """Return the state-value network, built from ``critic_structure``."""
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.critic_structure:
            hidden = Dense(units, activation='relu')(hidden)
        value = Dense(1, activation='linear')(hidden)
        critic = Model([state], [value])
        critic.compile(optimizer=Adam(lr=self.alpha_C), loss="mse")
        return critic

    def v(self,s):
        """Return the critic's prediction for a batch of states."""
        return self.critic.predict(s)

    def get_action_proba(self,s):
        """Return the action probabilities of the first state in batch ``s``."""
        return self.policy.predict(s)[0]

    def learn(self,s,a,r,sp,done):
        """Fit critic and actor on a single transition.

        Args:
            s, sp: current and next state vectors (reshaped to one-row batches here).
            a: index of the action taken.
            r: reward received.
            done: whether ``sp`` is terminal (its value is then not bootstrapped).
        """
        s_row=s.reshape(1,-1)
        critic_target=r+self.gamma*self.v(sp.reshape(1,-1))*(1-done)
        td_error=critic_target-self.v(s_row)
        actions = np.zeros(self.num_action)
        actions[a]=1
        self.critic.fit(s_row, critic_target.reshape(1,-1), verbose=0)
        self.actor.fit([s_row, td_error.reshape(1,-1)], actions.reshape(1,-1), verbose=0)
        


In [None]:
# Train the one-step actor-critic agent; it learns online after every environment step.
agent_config={}
agent_config['dim_state']=state_dim[0]
agent_config['num_action']=n_actions
agent_config['alpha_A']=1e-5
agent_config['alpha_C']=1e-3
agent_config['actor_structure']=[128,128]
agent_config['critic_structure']=[128,128]


agent=Actor_Critic(agent_config)
G_log=[]
Iter=500
for epoch in range(Iter):
    done=False
    s=env.reset()
    G=0
    while not done:
        action_probas = agent.get_action_proba(s.reshape(1,-1))
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        # Argument order must match Actor_Critic.learn(s, a, r, sp, done).
        agent.learn(s,a,r,sp,done)
        s = sp
        G+=r
    G_log.append(G)
    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%50==0:
        print(epoch+1,np.mean(G_log[-100:]))

# Left panel: per-episode return with its 100-episode rolling mean.
plt.figure(figsize=(18,4))
plt.subplot(1,2,1)
plt.plot(G_log)
plt.plot(pd.DataFrame(G_log).rolling(100).mean())

# Right panel: histogram of the last 100 returns with the mean marked.
plt.subplot(1,2,2)
v=plt.hist(G_log[-100:],bins=2*int(np.sqrt(len(G_log[-100:]))))
mean=np.mean(G_log[-100:])
plt.vlines(mean,[0],[max(v[0])],label="mean={:.2f}".format(mean))
plt.legend()
plt.show()

# 5. Actor-Critic with PER (episodic)

In [None]:
class Actor_Critic_PER(object):
    """Actor-critic trained on mini-batches sampled from a replay memory.

    Transitions are sampled with probability proportional to
    ``(normalised episode return) ** alpha``, where the episode return is the
    reward sum of the completed episode a transition belongs to.
    """
    def __init__(self,paras):
        """Build the networks and an empty replay memory.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A``,
                ``actor_structure``, ``alpha_C``, ``critic_structure``,
                ``memeory_size`` (max stored transitions), ``batch_size`` and
                ``alpha`` (prioritisation exponent).
        """
        self.gamma=0.999
        self.memeory=[]
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.actor_structure=paras['actor_structure']
        self.alpha_C=paras['alpha_C']
        self.critic_structure=paras['critic_structure']
        self.memeory_size=paras['memeory_size']
        self.batch_size=paras['batch_size']
        self.alpha=paras['alpha']
        self.actor,self.policy=self.build_actor()
        self.critic=self.build_critic()

    def build_actor(self):
        """Return ``(actor, policy)``: the trainable model and a prediction-only view.

        Both models share the same layers; ``actor`` additionally takes the TD
        error as an input so the custom loss can weight the log-likelihood.
        """
        td_error = Input(shape=[1,])
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.actor_structure:
            hidden = Dense(units, activation='relu')(hidden)
        pi = Dense(self.num_action, activation='softmax')(hidden)
        def custom_loss(prob_action, prob_taken_action):
            # prob_action is the one-hot action target, prob_taken_action the
            # softmax output; clipping keeps the log finite.
            prob_taken_action = K.clip(prob_taken_action, 1e-8, 1-1e-8)
            log_lik = prob_action*K.log(prob_taken_action)
            return K.sum(-log_lik*td_error)
        # Inputs/outputs are passed positionally; the `input=`/`output=`
        # keywords are the legacy Keras 1 spelling.
        actor = Model([state,td_error], [pi])
        actor.compile(optimizer=Adam(lr=self.alpha_A), loss=custom_loss)
        policy = Model([state], [pi])
        return actor,policy

    def build_critic(self):
        """Return the state-value network, built from ``critic_structure``."""
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.critic_structure:
            hidden = Dense(units, activation='relu')(hidden)
        value = Dense(1, activation='linear')(hidden)
        critic = Model([state], [value])
        critic.compile(optimizer=Adam(lr=self.alpha_C), loss="mse")
        return critic

    def v(self,s):
        """Return the critic's prediction for a batch of states."""
        return self.critic.predict(s)

    def get_action_proba(self,s):
        """Return the action probabilities of the first state in batch ``s``."""
        return self.policy.predict(s)[0]

    def update_paras(self,s,a,r,sp,done):
        """Fit critic and actor on a batch of transitions.

        Args:
            s, sp: arrays of shape (batch, dim_state).
            a, r, done: column arrays of shape (batch, 1); object dtype (as
                produced by ``learn``) is accepted and converted here.
        """
        # Columns sliced from the replay batch have object dtype; convert them
        # so the targets handed to Keras are plain numeric arrays.
        r=np.asarray(r,dtype=float)
        done=np.asarray(done,dtype=float)
        taken=np.asarray(a).astype(int).reshape(-1)

        critic_target=r+self.gamma*self.v(sp)*(1-done)
        td_error=critic_target-self.v(s)
        actions = np.zeros([len(taken), self.num_action])
        actions[np.arange(len(taken)),taken]=1
        self.critic.fit(s, critic_target, verbose=0)
        self.actor.fit([s, td_error], actions, verbose=0)

    def memeorize(self,s,a,r,sp,done):
        """Append a transition, dropping the oldest one when the memory is full."""
        self.memeory.append([s,a,r,sp,done])
        if len(self.memeory)>self.memeory_size:
            del self.memeory[0]

    def learn(self):
        """Sample a batch from memory and run one update; no-op on empty memory."""
        if not self.memeory:
            return
        sampled_exp=self.sample_exp()
        s=np.stack(list(sampled_exp[:,0]))
        a=sampled_exp[:,1].reshape(-1,1)
        r=sampled_exp[:,2].reshape(-1,1)
        sp=np.stack(list(sampled_exp[:,3]))
        done=sampled_exp[:,4].reshape(-1,1)
        self.update_paras(s,a,r,sp,done)

    def sample_exp(self):
        """Draw up to ``batch_size`` distinct transitions, favouring high-return episodes.

        Returns:
            Object array of shape (batch, 5) whose columns are
            ``s, a, r, sp, done``.
        """
        n=len(self.memeory)
        # Pull the scalar columns out directly: stacking the ragged rows
        # (arrays mixed with scalars) into one array is rejected by recent NumPy.
        rewards=np.array([row[2] for row in self.memeory],dtype=float)
        done_flags=np.array([bool(row[4]) for row in self.memeory])

        # Every transition of a completed episode gets that episode's reward
        # sum; transitions of the still-running episode keep 0.
        G=np.zeros(n)
        start=0
        for end in np.where(done_flags)[0]:
            G[start:end+1]=rewards[start:end+1].sum()
            start=end+1
        # Min-max normalise; the 1e-5 offset keeps every probability positive.
        G=(G-G.min()+1e-5)/(G.max()-G.min()+1e-5)
        weights=G**self.alpha
        prob=weights/weights.sum()

        index_set=np.random.choice(n,size=min(n,self.batch_size),p=prob,replace=False)
        batch=np.empty((len(index_set),5),dtype=object)
        for row,mem_idx in enumerate(index_set):
            for col,item in enumerate(self.memeory[mem_idx]):
                batch[row,col]=item
        return batch

In [None]:
# Train the replay-based actor-critic: every environment step is stored and
# followed by one mini-batch update.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 1e-5,
    'alpha_C': 1e-3,
    'memeory_size': 1e5,
    'batch_size': 32,
    'actor_structure': [128,128],
    'critic_structure': [128,128],
    'alpha': 0.9,
}
agent=Actor_Critic_PER(agent_config)

Iter=200
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    G=0
    done=False
    while not done:
        action_probas = agent.get_action_proba(s.reshape(1,-1))
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        agent.memeorize(s,a,r,sp,done)
        agent.learn()
        G+=r
        s = sp
    G_log.append(G)
    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%10==0:
        print(epoch+1,np.mean(G_log[-100:]))


# Per-episode return with its 100-episode rolling mean.
rolling_mean=pd.DataFrame(G_log).rolling(100).mean()
plt.plot(G_log)
plt.plot(rolling_mean)
plt.show()
      

# 6. Average Reward Actor-Critic with PER (episodic)

In [None]:
class Average_Reward_Actor_Critic_PER(object):
    """Average-reward actor-critic (softmax policy network) with replay sampling.

    The TD error uses ``r - R`` in place of a discounted reward, where ``R`` is
    a running estimate updated from the batch-mean TD error. Transitions are
    sampled with probability proportional to
    ``(normalised episode return) ** alpha``.

    NOTE(review): a second class with the same name is defined in a later cell
    of this notebook and replaces this one once that cell has run.
    """
    def __init__(self,paras):
        """Build the networks and an empty replay memory.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A``,
                ``actor_structure``, ``alpha_C``, ``critic_structure``,
                ``alpha_R`` (step size of the average-reward estimate),
                ``memeory_size``, ``batch_size`` and ``alpha``
                (prioritisation exponent).
        """
        self.R=0
        self.memeory=[]
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.actor_structure=paras['actor_structure']
        self.alpha_C=paras['alpha_C']
        self.critic_structure=paras['critic_structure']
        self.alpha_R=paras['alpha_R']
        self.memeory_size=paras['memeory_size']
        self.batch_size=paras['batch_size']
        self.alpha=paras['alpha']
        self.actor,self.policy=self.build_actor()
        self.critic=self.build_critic()

    def build_actor(self):
        """Return ``(actor, policy)``: the trainable model and a prediction-only view.

        Both models share the same layers; ``actor`` additionally takes the TD
        error as an input so the custom loss can weight the log-likelihood.
        """
        td_error = Input(shape=[1,])
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.actor_structure:
            hidden = Dense(units, activation='relu')(hidden)
        pi = Dense(self.num_action, activation='softmax')(hidden)
        def custom_loss(prob_action, prob_taken_action):
            # prob_action is the one-hot action target, prob_taken_action the
            # softmax output; clipping keeps the log finite.
            prob_taken_action = K.clip(prob_taken_action, 1e-8, 1-1e-8)
            log_lik = prob_action*K.log(prob_taken_action)
            return K.sum(-log_lik*td_error)
        # Inputs/outputs are passed positionally; the `input=`/`output=`
        # keywords are the legacy Keras 1 spelling.
        actor = Model([state,td_error], [pi])
        actor.compile(optimizer=Adam(lr=self.alpha_A), loss=custom_loss)
        policy = Model([state], [pi])
        return actor,policy

    def build_critic(self):
        """Return the state-value network, built from ``critic_structure``."""
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.critic_structure:
            hidden = Dense(units, activation='relu')(hidden)
        value = Dense(1, activation='linear')(hidden)
        critic = Model([state], [value])
        critic.compile(optimizer=Adam(lr=self.alpha_C), loss="mse")
        return critic

    def v(self,s):
        """Return the critic's prediction for a batch of states."""
        return self.critic.predict(s)

    def get_action_proba(self,s):
        """Return the action probabilities of the first state in batch ``s``."""
        return self.policy.predict(s)[0]

    def update_paras(self,s,a,r,sp,done):
        """Fit critic and actor on a batch and update the average-reward estimate.

        Args:
            s, sp: arrays of shape (batch, dim_state).
            a, r, done: column arrays of shape (batch, 1); object dtype (as
                produced by ``learn``) is accepted and converted here.
        """
        # Columns sliced from the replay batch have object dtype; convert them
        # so the targets handed to Keras are plain numeric arrays.
        r=np.asarray(r,dtype=float)
        done=np.asarray(done,dtype=float)
        taken=np.asarray(a).astype(int).reshape(-1)

        # Both targets use the value of R from before this update.
        critic_target=r-self.R+self.v(sp)*(1-done)
        td_error=critic_target-self.v(s)
        self.R+=self.alpha_R*td_error.mean()
        actions = np.zeros([len(taken), self.num_action])
        actions[np.arange(len(taken)),taken]=1
        self.critic.fit(s, critic_target, verbose=0)
        self.actor.fit([s, td_error], actions, verbose=0)

    def memeorize(self,s,a,r,sp,done):
        """Append a transition, dropping the oldest one when the memory is full."""
        self.memeory.append([s,a,r,sp,done])
        if len(self.memeory)>self.memeory_size:
            del self.memeory[0]

    def learn(self):
        """Sample a batch from memory and run one update; no-op on empty memory."""
        if not self.memeory:
            return
        sampled_exp=self.sample_exp()
        s=np.stack(list(sampled_exp[:,0]))
        a=sampled_exp[:,1].reshape(-1,1)
        r=sampled_exp[:,2].reshape(-1,1)
        sp=np.stack(list(sampled_exp[:,3]))
        done=sampled_exp[:,4].reshape(-1,1)
        self.update_paras(s,a,r,sp,done)

    def sample_exp(self):
        """Draw up to ``batch_size`` distinct transitions, favouring high-return episodes.

        Returns:
            Object array of shape (batch, 5) whose columns are
            ``s, a, r, sp, done``.
        """
        n=len(self.memeory)
        # Pull the scalar columns out directly: stacking the ragged rows
        # (arrays mixed with scalars) into one array is rejected by recent NumPy.
        rewards=np.array([row[2] for row in self.memeory],dtype=float)
        done_flags=np.array([bool(row[4]) for row in self.memeory])

        # Every transition of a completed episode gets that episode's reward
        # sum; transitions of the still-running episode keep 0.
        G=np.zeros(n)
        start=0
        for end in np.where(done_flags)[0]:
            G[start:end+1]=rewards[start:end+1].sum()
            start=end+1
        # Min-max normalise; the 1e-5 offset keeps every probability positive.
        G=(G-G.min()+1e-5)/(G.max()-G.min()+1e-5)
        weights=G**self.alpha
        prob=weights/weights.sum()

        index_set=np.random.choice(n,size=min(n,self.batch_size),p=prob,replace=False)
        batch=np.empty((len(index_set),5),dtype=object)
        for row,mem_idx in enumerate(index_set):
            for col,item in enumerate(self.memeory[mem_idx]):
                batch[row,col]=item
        return batch

In [None]:
# Train the average-reward actor-critic: every environment step is stored and
# followed by one mini-batch update.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 1e-5,
    'alpha_C': 1e-3,
    'alpha_R': 1e-2,
    'memeory_size': 1e5,
    'batch_size': 32,
    'actor_structure': [128,128],
    'critic_structure': [128,128],
    'alpha': 0.9,
}
agent=Average_Reward_Actor_Critic_PER(agent_config)

Iter=200
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    G=0
    done=False
    while not done:
        action_probas = agent.get_action_proba(s.reshape(1,-1))
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        agent.memeorize(s,a,r,sp,done)
        agent.learn()
        G+=r
        s = sp
    G_log.append(G)
    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%10==0:
        print(epoch+1,np.mean(G_log[-100:]))


# Per-episode return with its 100-episode rolling mean.
rolling_mean=pd.DataFrame(G_log).rolling(100).mean()
plt.plot(G_log)
plt.plot(rolling_mean)
plt.show()
      

In [None]:
class Average_Reward_Actor_Critic_PER(object):
    """Average-reward actor-critic whose actor outputs linear action preferences.

    The actor is an MSE-trained network; its regression target is the current
    preference vector shifted by the policy-gradient step on the output layer
    (see ``get_target_q_s_a``). Transitions are sampled from a replay memory
    with probability proportional to ``(normalised episode return) ** alpha``.

    NOTE(review): this redefines the ``Average_Reward_Actor_Critic_PER`` class
    from an earlier cell of this notebook.
    """
    def __init__(self,paras):
        """Build the networks and an empty replay memory.

        Args:
            paras: dict with keys ``dim_state``, ``num_action``, ``alpha_A``,
                ``actor_structure``, ``alpha_C``, ``critic_structure``,
                ``alpha_R`` (step size of the average-reward estimate),
                ``memeory_size``, ``batch_size`` and ``alpha``
                (prioritisation exponent).
        """
        self.R=0
        self.memeory=[]
        self.dim_state=paras['dim_state']
        self.num_action=paras['num_action']
        self.alpha_A=paras['alpha_A']
        self.actor_structure=paras['actor_structure']
        self.alpha_C=paras['alpha_C']
        self.critic_structure=paras['critic_structure']
        self.alpha_R=paras['alpha_R']
        self.memeory_size=paras['memeory_size']
        self.batch_size=paras['batch_size']
        self.alpha=paras['alpha']
        self.actor,self.Phi=self.build_actor()
        self.critic=self.build_critic()

    @staticmethod
    def _softmax(prefs):
        """Row-wise softmax of a (batch, num_action) preference array."""
        shifted=prefs-prefs.max(axis=1).reshape(-1,1)
        exp_prefs=np.exp(shifted)
        return exp_prefs/exp_prefs.sum(axis=1).reshape(-1,1)

    def build_actor(self):
        """Return ``(actor, Phi)``.

        ``actor`` maps states to linear action preferences; ``Phi`` is a backend
        function returning the output of the layer that feeds the actor's
        final Dense layer.
        """
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.actor_structure:
            hidden = Dense(units, activation='relu')(hidden)
        Qvalue = Dense(self.num_action, activation='linear')(hidden)
        # Inputs/outputs are passed positionally; the `input=`/`output=`
        # keywords are the legacy Keras 1 spelling.
        actor = Model([state], [Qvalue])
        actor.compile(optimizer=Adam(lr=self.alpha_A), loss="mse")
        Phi=K.function([actor.layers[0].input],[actor.layers[-2].output])
        return actor,Phi

    def build_critic(self):
        """Return the state-value network, built from ``critic_structure``."""
        state = Input(shape=[self.dim_state,])
        hidden = state
        for units in self.critic_structure:
            hidden = Dense(units, activation='relu')(hidden)
        value = Dense(1, activation='linear')(hidden)
        critic = Model([state], [value])
        critic.compile(optimizer=Adam(lr=self.alpha_C), loss="mse")
        return critic

    def v(self,s):
        """Return the critic's prediction for a batch of states."""
        return self.critic.predict(s)

    def get_action_proba(self,s):
        """Return a (batch, num_action) array of softmax action probabilities."""
        return self._softmax(self.actor.predict(s))

    def get_target_q_s_a(self,s,a,td_error):
        """Build the actor's regression targets for a batch.

        For sample ``i`` and action ``ap`` the target is
        ``prefs[i, ap] + td[i] * (1[a[i] == ap] - prob[i, ap]) * (|phi[i]|^2 + 1)``.
        This is the closed form of applying the per-sample step
        ``w[:, ap] = td * (onehot - prob)[ap] * phi`` and
        ``b[ap] = td * (onehot - prob)[ap]`` to the output layer and adding
        ``phi . w + b`` to the current preferences.

        Args:
            s: states, shape (batch, dim_state).
            a: action indices, shape (batch, 1) or (batch,).
            td_error: TD errors, shape (batch, 1).

        Returns:
            Array of shape (batch, num_action).
        """
        prefs=self.actor.predict(s)
        prob=self._softmax(prefs)
        # K.function takes a list of input arrays and returns a list of outputs.
        phi=self.Phi([s])[0]
        td=np.asarray(td_error,dtype=float).reshape(-1,1)
        taken=np.asarray(a).astype(int).reshape(-1)
        onehot=np.zeros(prob.shape)
        onehot[np.arange(len(taken)),taken]=1
        step=td*(onehot-prob)
        gain=(phi**2).sum(axis=1).reshape(-1,1)+1
        target_q_s_a=prefs+step*gain
        return target_q_s_a

    def update_paras(self,s,a,r,sp,done):
        """Fit critic and actor on a batch and update the average-reward estimate.

        Args:
            s, sp: arrays of shape (batch, dim_state).
            a, r, done: column arrays of shape (batch, 1); object dtype (as
                produced by ``learn``) is accepted and converted here.
        """
        # Columns sliced from the replay batch have object dtype; convert them
        # so the targets handed to Keras are plain numeric arrays.
        r=np.asarray(r,dtype=float)
        done=np.asarray(done,dtype=float)

        # Both targets use the value of R from before this update.
        critic_target=r-self.R+self.v(sp)*(1-done)
        td_error=critic_target-self.v(s)
        self.R+=self.alpha_R*td_error.mean()

        self.critic.fit(s, critic_target, verbose=0)

        target_q_s_a=self.get_target_q_s_a(s,a,td_error)
        self.actor.fit(s, target_q_s_a, verbose=0)

    def memeorize(self,s,a,r,sp,done):
        """Append a transition, dropping the oldest one when the memory is full."""
        self.memeory.append([s,a,r,sp,done])
        if len(self.memeory)>self.memeory_size:
            del self.memeory[0]

    def learn(self):
        """Sample a batch from memory and run one update; no-op on empty memory."""
        if not self.memeory:
            return
        sampled_exp=self.sample_exp()
        s=np.stack(list(sampled_exp[:,0]))
        a=sampled_exp[:,1].reshape(-1,1)
        r=sampled_exp[:,2].reshape(-1,1)
        sp=np.stack(list(sampled_exp[:,3]))
        done=sampled_exp[:,4].reshape(-1,1)
        self.update_paras(s,a,r,sp,done)

    def sample_exp(self):
        """Draw up to ``batch_size`` distinct transitions, favouring high-return episodes.

        Returns:
            Object array of shape (batch, 5) whose columns are
            ``s, a, r, sp, done``.
        """
        n=len(self.memeory)
        # Pull the scalar columns out directly: stacking the ragged rows
        # (arrays mixed with scalars) into one array is rejected by recent NumPy.
        rewards=np.array([row[2] for row in self.memeory],dtype=float)
        done_flags=np.array([bool(row[4]) for row in self.memeory])

        # Every transition of a completed episode gets that episode's reward
        # sum; transitions of the still-running episode keep 0.
        G=np.zeros(n)
        start=0
        for end in np.where(done_flags)[0]:
            G[start:end+1]=rewards[start:end+1].sum()
            start=end+1
        # Min-max normalise; the 1e-5 offset keeps every probability positive.
        G=(G-G.min()+1e-5)/(G.max()-G.min()+1e-5)
        weights=G**self.alpha
        prob=weights/weights.sum()

        index_set=np.random.choice(n,size=min(n,self.batch_size),p=prob,replace=False)
        batch=np.empty((len(index_set),5),dtype=object)
        for row,mem_idx in enumerate(index_set):
            for col,item in enumerate(self.memeory[mem_idx]):
                batch[row,col]=item
        return batch

In [None]:
# Train the preference-output average-reward agent (prioritisation exponent 0.9):
# every environment step is stored and followed by one mini-batch update.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 1e-5,
    'alpha_C': 1e-3,
    'alpha_R': 1e-2,
    'memeory_size': 1e5,
    'batch_size': 32,
    'actor_structure': [128,128],
    'critic_structure': [128,128],
    'alpha': 0.9,
}
agent=Average_Reward_Actor_Critic_PER(agent_config)


Iter=200
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    G=0
    done=False
    while not done:
        # get_action_proba returns one row per state; take the single row.
        action_probas = agent.get_action_proba(s.reshape(1,-1))[0]
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        agent.memeorize(s,a,r,sp,done)
        agent.learn()
        G+=r
        s = sp
    G_log.append(G)
    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%10==0:
        print(epoch+1,np.mean(G_log[-100:]))


# Per-episode return with its 100-episode rolling mean.
rolling_mean=pd.DataFrame(G_log).rolling(100).mean()
plt.plot(G_log)
plt.plot(rolling_mean)
plt.show()
      

In [None]:
# Same experiment as the previous cell, with the prioritisation exponent
# lowered from 0.9 to 0.75.
agent_config={
    'dim_state': state_dim[0],
    'num_action': n_actions,
    'alpha_A': 1e-5,
    'alpha_C': 1e-3,
    'alpha_R': 1e-2,
    'memeory_size': 1e5,
    'batch_size': 32,
    'actor_structure': [128,128],
    'critic_structure': [128,128],
    'alpha': 0.75,
}
agent=Average_Reward_Actor_Critic_PER(agent_config)


Iter=200
G_log=[]
for epoch in range(Iter):
    s=env.reset()
    G=0
    done=False
    while not done:
        # get_action_proba returns one row per state; take the single row.
        action_probas = agent.get_action_proba(s.reshape(1,-1))[0]
        a = np.random.choice(n_actions, 1, p=action_probas)[0]
        sp, r, done, info = env.step(a)
        agent.memeorize(s,a,r,sp,done)
        agent.learn()
        G+=r
        s = sp
    G_log.append(G)
    # Progress report: mean return over the latest (up to) 100 episodes.
    if (epoch+1)%10==0:
        print(epoch+1,np.mean(G_log[-100:]))


# Per-episode return with its 100-episode rolling mean.
rolling_mean=pd.DataFrame(G_log).rolling(100).mean()
plt.plot(G_log)
plt.plot(rolling_mean)
plt.show()
      