In [1]:
import gym  # open ai gym
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mp
import os
import time
import random
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout,Activation
from keras.optimizers import Adam
from collections import deque

%matplotlib notebook

Using TensorFlow backend.


In [2]:
class Deep_QN:
    """Deep Q-Network agent with an experience-replay buffer and a target network.

    Two Keras models with the same architecture are kept: ``self.model`` (the
    online network that is trained and used to pick actions) and
    ``self.model_target`` (used to bootstrap the value of the next state, and
    synced from the online network by ``T_network``).
    """

    def __init__(self,env,model_available,file_name):
        """Set hyper-parameters, create the replay memory and build/load the networks.

        Args:
            env: environment object; this class reads ``observation_space.shape``,
                ``action_space.n`` and ``action_space.sample()`` from it.
            model_available: when truthy, both networks are loaded from ``file_name``
                instead of being built from scratch.
            file_name: path used by ``load_model`` and by ``ModelSave``.
        """
        self.model_env = env
        self.gamma = 0.99                        #discount factor for future rewards
        self.epsilon = 1.0                       #current exploration rate (starts fully random)
        self.min_epsilon = 0.01                  #lower bound for epsilon
        self.dec_epsilon = 0.996                 #multiplicative decay applied on every Replay call
        self.learning_rate = 0.0005
        self.model_available = model_available
        self.weights_file = file_name
        self.batch_size = 32                     #number of transitions sampled per Replay call
        self.max_memory = 50_000
        self.memory = deque(maxlen=self.max_memory)   #oldest transitions are dropped once full
        if model_available:
            self.model = load_model(self.weights_file)
            self.model_target = load_model(self.weights_file)
        else:
            self.model = self.BuildModel()
            self.model_target = self.BuildModel()


    def BuildModel(self):
        """Build and compile a fully connected Q-network (512-512-24-n_actions)."""
        n_inputs = self.model_env.observation_space.shape[0]     #size of the input layer
        n_actions = self.model_env.action_space.n                #one linear output per action
        model = Sequential()
        model.add(Dense(512, input_dim=n_inputs,activation="relu"))
        model.add(Dense(512, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(n_actions,activation="linear"))
        model.compile(loss="mean_squared_error",optimizer=Adam(lr=self.learning_rate))
        return model

    def Recall(self, pres_state, action, reward, future_state, done):
        """Append one transition (state, action, reward, next state, done) to the replay memory."""
        self.memory.append([pres_state,action,reward,future_state,done])

    def ModelSave(self):
        """Save the online network to ``self.weights_file``."""
        self.model.save(self.weights_file)

    def Replay(self):
        """Train the online network on a random minibatch and decay epsilon.

        Does nothing (and does not decay epsilon) while the memory holds fewer
        than ``batch_size`` transitions.

        Returns:
            The current value of ``self.epsilon`` (after decay/clamping when a
            training step was performed).
        """
        if len(self.memory) < self.batch_size:           #not enough experience collected yet
            return self.epsilon
        minibatch = random.sample(self.memory,self.batch_size)

        #Stack the sampled transitions so each network is queried once per batch
        #instead of once per sample.
        states = np.array([t[0] for t in minibatch])
        actions = np.array([t[1] for t in minibatch], dtype=int)
        rewards = np.array([t[2] for t in minibatch], dtype=float)
        next_states = np.array([t[3] for t in minibatch])
        not_done = np.array([not t[4] for t in minibatch], dtype=float)   #0.0 for terminal transitions

        #Baseline targets are the online network's own predictions, so outputs of
        #actions that were not taken contribute zero error; np.array makes a copy
        #that is safe to modify.
        targets = np.array(self.model.predict(states))
        next_q = np.asarray(self.model_target.predict(next_states))
        rows = np.arange(len(minibatch))
        #TD target for the action actually taken; the bootstrap term is dropped on terminal steps
        targets[rows, actions] = rewards + self.gamma*next_q.max(axis=1)*not_done

        #batch_size=1 with shuffle=False performs one gradient step per sampled
        #transition, in sample order, inside a single fit call.
        self.model.fit(states,targets,batch_size=1,epochs=1,verbose=0,shuffle=False)

        #decay epsilon but never let it drop below the minimum threshold
        self.epsilon = max(self.min_epsilon, self.epsilon*self.dec_epsilon)
        return self.epsilon


    def ActionChoice(self,states):
        """Pick an action epsilon-greedily for a single observation.

        Args:
            states: 1-D numpy array holding one observation.

        Returns:
            A random action from the environment's action space with probability
            ``epsilon``, otherwise the index of the largest predicted Q-value.
        """
        if np.random.random() < self.epsilon:
            return self.model_env.action_space.sample()     #explore: random action
        q_values = self.model.predict(states[np.newaxis,:])  #add a batch axis before predicting
        return int(np.argmax(q_values[0]))                  #exploit: best predicted action


    def T_network(self):
        """Copy the online network's weights into the target network."""
        model_theta = self.model.get_weights()
        self.model_target.set_weights(model_theta)


    def plot(self,score_history,num_of_episodes):
        """Plot per-episode rewards with a 20-episode moving average and print summary stats.

        Args:
            score_history: sequence of total rewards, one entry per episode.
            num_of_episodes: episode count used in the title and as the divisor of the mean.

        Raises:
            ValueError: if ``score_history`` is empty.
        """
        history = np.asarray(score_history)
        if history.size == 0:
            raise ValueError("score_history is empty; nothing to plot")
        avg = 20                                       #moving-average window length
        #Prefix sums with a leading zero: window sum ending at i is c[i+1] - c[i+1-avg]
        c_sum = np.concatenate(([0], np.cumsum(history)))
        moving_avg = (c_sum[avg:] - c_sum[:-avg]) / avg

        plt.style.use(['dark_background'])
        plt.rcParams["figure.figsize"] = (12,5)        #change plot size
        fig, ax = plt.subplots()
        ax.plot(history)
        if moving_avg.size:
            #each average is drawn at the episode where its window ends
            ax.plot(np.arange(avg - 1, history.size), moving_avg)
        ax.set_title('Plot of Rewards Distribution over %d Complete Episodes'% num_of_episodes)
        ax.set_ylabel('Rewards')
        ax.set_xlabel('Episodes')
        plt.show()
        mn = np.sum(history)/num_of_episodes
        print("Mean value over complete episodes is {}.".format(mn))
        print("Highest reward is:{}.".format(np.max(history)))
        print("Lowest reward is:{}.".format(np.min(history)))


In [None]:
# Train the DQN agent on Acrobot-v1, checkpointing the model periodically,
# then plot the per-episode rewards and save the final model.
file_name = "AcrobatModel_results3.h5"      #filename for saving weights
model_available = os.path.isfile(file_name)  #resume from the saved model when the file already exists
env_Acrobot = gym.make('Acrobot-v1')         #initialize environment and agent.
DQN = Deep_QN(env_Acrobot,model_available,file_name)  #initiate Deep_QN class and pass parameters
num_of_episodes = 5000                             #number of episodes
score_history = []                                  #score history list
update_c = 60                                    #number of steps within an episode between target network updates
save_every = 15                                  #save the model every this many episodes
render_env = True                                #draw the environment on every step; set to False to skip rendering

try:
    for run in range(num_of_episodes):                 #run episodes
        score_num,ind = 0,0                         #initialize score and step counter to zero
        pres_state = env_Acrobot.reset()
        done = False
        while not done:
            if render_env:
                env_Acrobot.render()                    #show the current state of the environment in a window
            action = DQN.ActionChoice(pres_state)
            future_state,reward,done,info = env_Acrobot.step(action)
            score_num += reward                                       #accumulate reward until done is true
            DQN.Recall(pres_state,action,reward,future_state,done)    #save the transition to memory
            pres_state = future_state             #set state x to state x+1
            ind = ind + 1                         #step counter (reset at the start of every episode)
            if (ind%update_c) == 0:               #update target model after every update_c steps
                DQN.T_network()
        eps = DQN.Replay()                         #one training pass (and epsilon decay) per episode
        if (run%save_every) == 0:                  #save the model every save_every episodes (including episode 0)
            DQN.ModelSave()
        score_history.append(score_num)             #store rewards
        print("Episode number is:{} and Reward is:{}".format(run,score_num))   #print episode number and corresponding reward
finally:
    env_Acrobot.close()                        #release the render window even if training is interrupted or fails

DQN.plot(score_history,num_of_episodes)        #plot rewards received
DQN.ModelSave()                                #save model


W0519 14:59:39.022083 29736 deprecation_wrapper.py:119] From C:\Users\kikiy\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0519 14:59:39.053997 29736 deprecation_wrapper.py:119] From C:\Users\kikiy\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0519 14:59:39.200198 29736 deprecation_wrapper.py:119] From C:\Users\kikiy\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0519 14:59:39.201228 29736 deprecation_wrapper.py:119] From C:\Users\kikiy\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:184: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0519 14:59:39.204187 29736 deprecation_wrapper.py:119] From C:\Use