In [1]:
import time
import gym
from gym import wrappers
import random
import collections
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
np.random.seed(0)

In [2]:
# Hyperparameters read by the replay buffer, the Q-network and the training step below.
BUFFERLIMIT = 50_000  # maxlen of the deque that backs ReplayBuffer
MINI_BATCH_SIZE = 32  # upper bound on the number of transitions ReplayBuffer.sample() returns
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer in Qnet
DISCOUNT_RATE = 0.99  # factor applied to the bootstrapped future Q-value in train()

In [3]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions used for experience replay.

    The underlying ``collections.deque`` is created with ``maxlen``, so once the
    buffer is full every ``append`` silently discards the *oldest* transition.
    No manual eviction is needed (and popping from the right would throw away
    the newest transition instead).

    Args:
        limit: Maximum number of transitions kept. Defaults to the
            notebook-level ``BUFFERLIMIT``.
        batch_size: Maximum number of transitions returned by ``sample``.
            Defaults to the notebook-level ``MINI_BATCH_SIZE``, looked up at
            sampling time.
    """

    def __init__(self, limit=None, batch_size=None):
        capacity = BUFFERLIMIT if limit is None else limit
        # Kept as given (possibly None) so the module constant is still read
        # lazily in sample(), exactly as before when no override is supplied.
        self._batch_size = batch_size
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one transition; the deque evicts the oldest entry when full."""
        self.buffer.append(transition)

    def sample(self):
        """Return a list of distinct transitions drawn uniformly at random.

        The list holds ``min(len(buffer), batch_size)`` items, so it is empty
        when the buffer is empty.
        """
        batch_size = MINI_BATCH_SIZE if self._batch_size is None else self._batch_size
        return random.sample(self.buffer, min(len(self.buffer), batch_size))

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [4]:
class Qnet:
    """Fully-connected Q-network wrapped around a compiled Keras model.

    The model maps an observation vector of length ``observation_space`` to
    one linear output per action (``action_space`` outputs) and is compiled
    with Adam and a mean-squared-error loss.

    Args:
        observation_space: Number of features in one observation.
        action_space: Number of discrete actions.
    """

    def __init__(self, observation_space, action_space):
        # Remembered so that random exploration covers every action rather
        # than a hard-coded two.
        self.action_space = action_space
        self.model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(128, input_shape=(observation_space,), activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(action_space, activation="linear"),
        ])
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self.model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss="mse")

    def summary(self):
        """Print the Keras layer summary of the wrapped model."""
        return self.model.summary()

    def get_weights(self):
        """Return the wrapped model's weights as returned by Keras."""
        return self.model.get_weights()

    def set_weights(self, other):
        """Copy the weights of another ``Qnet`` (not a raw weight list) into this one."""
        return self.model.set_weights(other.model.get_weights())

    def train(self, x, y):
        """Run a single gradient update on the batch ``(x, y)`` and return Keras' result."""
        return self.model.train_on_batch(x, y)

    def predict(self, inp):
        """Return the model's Q-value predictions for a batch of observations."""
        return self.model.predict(inp)

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            obs: Batch holding one observation, in a form ``model.predict`` accepts.
            epsilon: Exploration threshold; a uniform draw at or below it
                selects a random action.

        Returns:
            An action index in ``[0, action_space)``.
        """
        coin = np.random.random_sample()
        if coin <= epsilon:
            # `high` is exclusive, so this draws from 0 .. action_space - 1.
            return np.random.randint(low=0, high=self.action_space)
        return np.argmax(self.model.predict(obs))
        

In [5]:
def train(q, q_target, memory, discount_rate=None):
    """Perform one DQN update of ``q`` on a mini-batch drawn from ``memory``.

    For every sampled transition ``(s, a, r, s_prime, done)`` the regression
    target for the taken action is

        r + discount_rate * max_a' Q_target(s_prime, a') * (1 - done)

    while the targets of the other actions are left at ``q``'s current
    predictions, so only the taken action contributes to the loss.

    Args:
        q: Online network; must provide ``predict(batch)`` and ``train(x, y)``.
        q_target: Target network; must provide ``predict(batch)``.
        memory: Replay buffer whose ``sample()`` returns a list of
            ``(s, a, r, s_prime, done)`` tuples.
        discount_rate: Optional override for the notebook-level
            ``DISCOUNT_RATE``.

    Returns:
        Whatever ``q.train`` returns, or ``None`` when the sampled batch is empty.
    """
    gamma = DISCOUNT_RATE if discount_rate is None else discount_rate
    mini_batch = memory.sample()
    if not mini_batch:
        return None

    states, actions, rewards, next_states, dones = zip(*mini_batch)
    states = np.asarray(states, dtype=np.float32)
    next_states = np.asarray(next_states, dtype=np.float32)
    actions = np.asarray(actions, dtype=np.int64)
    rewards = np.asarray(rewards, dtype=np.float32)
    # Arithmetic mask: 1.0 for non-terminal, 0.0 for terminal transitions.
    # (np.invert on an int is a bitwise NOT and yields -1 / -2, not a mask.)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)

    # Bootstrap from the NEXT state, evaluated by the target network, in a
    # single batched call instead of one predict per transition.
    max_future_q = np.max(q_target.predict(next_states), axis=1)

    # Copy the current predictions, then overwrite only the taken action's entry.
    targets = np.array(q.predict(states), dtype=np.float32)
    rows = np.arange(len(mini_batch))
    targets[rows, actions] = rewards + gamma * max_future_q * not_done

    return q.train(states, targets)

In [6]:
def test(test_env, q):
    test_s = test_env.reset()
    test_done = False
    test_score = 0
    while not test_done:
        test_a = np.argmax(q.predict(tf.constant(test_s,shape=(1,4))))
        test_s_prime, test_r, test_done, test_info =test_env.step(test_a)
        test_s = test_s_prime
        test_score += test_r
        test_env.render()
        time.sleep(1/120)
        if test_done:
            break
    return test_score       

In [7]:
if __name__=="__main__":
    # Training driver: collect one epsilon-greedy episode, optionally run one
    # mini-batch update, periodically sync the target network, then score the
    # current network with one greedy evaluation episode.

    EPISODES = 5 # total number of episodes to train for
    MIN_REPLAY_SIZE = 1000 # transitions required in the buffer before any update is attempted
    env = gym.make("CartPole-v1") # select environment. Currently only tested on CartPole-v1
    test_env = gym.make("CartPole-v1")
    # The video callback ignores its argument and reads the notebook-level
    # n_epi, which the loop below binds before the first evaluation episode.
    test_env = wrappers.Monitor(test_env, './videos/' + str(time.time()) + '/', video_callable=lambda episode_id: n_epi%10==0)
    input_shape = env.observation_space.shape[0]
    output_shape = env.action_space.n
    q = Qnet(input_shape,output_shape)
    q_target = Qnet(input_shape, output_shape)
    q_target.set_weights(q)
    memory = ReplayBuffer()
    update_target_interval = 20
    ep_vec = [] # Vector to store ep_number for plotting
    score_vec = [] # vector to store score in this episode for plotting

    try:
        for n_epi in range(EPISODES):
            # Linear decay from 1.0 towards the 0.01 floor over roughly 200 episodes.
            epsilon = max(0.01, (1 - 0.99/(200)*n_epi))
            s = env.reset()
            done = False
            score = 0.

            while not done:
                # Observation length comes from the environment instead of a literal 4.
                a = q.sample_action(tf.constant(s,shape=(1,input_shape)), epsilon) #select action from updated q net
                s_prime, r, done, info = env.step(a)
                memory.put((s,a,r,s_prime,int(done))) # insert into experience replay
                s = s_prime
                score += r
                # No per-step sleep here: nothing is rendered while collecting
                # experience, so a pause would only slow training down.

            # NOTE(review): this performs at most one mini-batch update per
            # episode, and only once the buffer holds MIN_REPLAY_SIZE
            # transitions; with EPISODES = 5 that threshold may never be
            # reached, so no update may run at all - confirm this is intended.
            if memory.size() >= MIN_REPLAY_SIZE:
                train(q, q_target, memory)    # update q net

            # after k eps update target_q params with q_net
            if n_epi != 0 and n_epi % update_target_interval == 0:
                q_target.set_weights(q)

            ep_vec.append(n_epi)
            score_vec.append(test(test_env,q))
            print("Episode: {}. Score: {}".format(ep_vec[-1],score_vec[-1]))
    finally:
        # Release both environments even if an episode raises.
        env.close()
        test_env.close()

Episode: 0. Score: 10.0
Episode: 1. Score: 10.0
Episode: 2. Score: 10.0
Episode: 3. Score: 10.0
Episode: 4. Score: 10.0


In [None]:
# Close the monitored evaluation environment again (the training cell also closes it when it finishes).
test_env.close()

In [None]:
# Scratch: draw one mini-batch from the replay buffer and stack its states into a (batch, 4) tensor.
# Note that the loop rebinds s, a, r, s_prime and done at notebook scope.
mini_batch = memory.sample()
x = []
y = []  # never filled in this cell
for s,a,r,s_prime,done in mini_batch:
    x.append(s)
x= tf.constant(x,shape=(len(x),4))

In [None]:
# Scratch: print the first output (index 0) the network predicts for the first state in x.
print(q.predict(tf.reshape(x[0],shape=(1,4)))[0][0])

In [None]:
# Scratch: stack the state of every transition currently in the replay buffer into a (N, 4) tensor.
x = []
y = []  # never filled in this cell
for s,a,r,s_prime,done in memory.buffer:
    x.append(s)
x= tf.constant(x,shape=(len(x),4))


In [None]:
# Print the Keras layer summary of the network currently bound to q.
q.summary()

In [None]:
# Scratch: run the network on two hand-written 4-feature rows (rebinds x to a plain list).
x=[[1,2,3,4],[5,6,7,8]]
q.predict(x)

In [None]:
# Scratch: split the transitions in `buff` into parallel lists of states and actions.
# NOTE(review): `buff` is not defined anywhere in the visible notebook; presumably it
# was a replay deque such as memory.buffer - confirm, otherwise this cell raises NameError.
s_ =[]
a_ =[]
for transition in buff:
    s,a,r,s_prime, done = transition
    s_.append(s)
    a_.append(a)

In [7]:
# Rebind q to a freshly initialised network with 4 inputs and 2 outputs.
# This replaces the q built by the training cell at notebook scope.
q = Qnet(4,2)

In [None]:
# Scratch: sample an action with epsilon = 1, which always takes the random branch of sample_action.
# NOTE(review): tmp2 is only assigned in a cell further down, so this fails on a fresh top-to-bottom run.
res = q.sample_action(tmp2, 1)
res

In [None]:
# NOTE(review): sample_action returns a single action index, so indexing res[0]
# looks like it would fail on that scalar - confirm or drop this cell.
np.argmax(res[0])

In [None]:
# Print the Keras layer summary of the network currently bound to q.
q.summary()

In [None]:
# Display the first state collected into s_ by the cell that iterates over `buff`.
s_[0]

In [None]:
# Rebind env to a fresh CartPole-v1 environment (replaces the env created by the training cell).
env = gym.make("CartPole-v1")

In [None]:
# Reset the environment and keep the initial observation as a (1, 4) tensor for the scratch cells.
tmp2 = tf.constant(env.reset(),shape=(1,4))

In [None]:
# Display the observation tensor built from env.reset().
tmp2

In [None]:
# Reset the environment and display the observation it returns.
env.reset()

In [None]:
# Report how many GPU devices TensorFlow lists as physical devices.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
# Scratch: build a second 4-input / 2-output network and copy q's weights into it.
# Qnet.set_weights takes the other Qnet object, not a raw weight list.
q2 = Qnet(4,2)
q2.set_weights(q)

In [None]:
# NOTE(review): `buff` is not defined in the visible notebook and the meaning of the
# factor 30 is not evident from the code - confirm or remove this cell.
len(buff)*30

In [8]:
# Load weights from ./Model/dqn_weights into q's Keras model.
# NOTE(review): no visible cell writes this checkpoint, so it must already exist on disk.
q.model.load_weights('./Model/dqn_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7b71230370>

In [20]:
# Predict the Q-values for the initial observation of a freshly reset environment.
tst = q.predict(tf.constant(env.reset(), shape=(1,4)))

In [21]:
# Display the predicted Q-values (one row, one column per action).
tst

array([[-0.00858856,  0.00359186]], dtype=float32)

In [23]:
# Index of the largest predicted Q-value, i.e. the greedy action for that observation.
np.argmax(tst)

1