In [1]:
import numpy as np
import os 
import tensorflow as tf


2023-07-19 00:06:40.731666: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
class DeepQNetwork(object):
    """Convolutional Q-value network built with TensorFlow 1.x graph-mode APIs.

    The instance creates its own ``tf.Session``, builds the graph under a
    variable scope named ``name``, initializes variables, and exposes
    placeholders (``input``, ``actions``, ``q_target``), the output tensor
    ``Q_values``, the scalar ``loss`` and the ``train_op``.

    Args:
        lr: learning rate handed to ``tf.train.AdamOptimizer``.
        n_actions: number of output units (one Q-value per action).
        name: variable-scope name; also used to collect ``params``.
        fc1_dims: width of the fully connected hidden layer.
        input_dims: per-sample input shape (without the batch axis).
        chkpt_dir: directory in which ``deepqnet.ckpt`` is saved/restored.
    """

    def __init__(self, lr, n_actions,name, fc1_dims=256,input_dims= (210,160,4),chkpt_dir='tmp/dqn'):
        self.lr = lr
        self.n_actions = n_actions
        self.name = name
        self.fc1_dims = fc1_dims
        self.input_dims = input_dims
        self.chkpt_dir = chkpt_dir
        self.sess = tf.Session()
        self.build_network()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir,'deepqnet.ckpt')
        # Trainable variables created under this network's scope.
        self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)

    def build_network(self):
        """Create placeholders, three conv layers, two dense layers, loss and train op."""
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,shape=[None,*self.input_dims],name='inputs')

            # One-hot encoding of the action taken for each sample.
            self.actions = tf.placeholder(tf.float32,shape=[None,self.n_actions],name='action_taken')

            # Full row of target Q-values per sample (one entry per action).
            self.q_target = tf.placeholder(tf.float32,shape=[None,self.n_actions],name='q_value')

            conv1 = tf.layers.conv2d(inputs=self.input,filters=32,kernel_size=(8,8),
                                    strides=4,name='conv1',kernel_initializer=tf.variance_scaling_initializer(scale=2))

            conv1_activated = tf.nn.relu(conv1)

            conv2 = tf.layers.conv2d(inputs=conv1_activated,filters=64,kernel_size=(4,4),
                                    strides=2,name='conv2',kernel_initializer=tf.variance_scaling_initializer(scale=2))

            conv2_activated = tf.nn.relu(conv2)

            conv3 = tf.layers.conv2d(inputs=conv2_activated,filters=128,kernel_size=(3,3),
                                    strides=1,name='conv3',kernel_initializer=tf.variance_scaling_initializer(scale=2))

            conv3_activated = tf.nn.relu(conv3)

            flat = tf.layers.flatten(conv3_activated)
            dense1 = tf.layers.dense(flat,units=self.fc1_dims,activation=tf.nn.relu,
                                     kernel_initializer=tf.variance_scaling_initializer(scale=2))

            self.Q_values = tf.layers.dense(dense1,units=self.n_actions,kernel_initializer=tf.variance_scaling_initializer(scale=2))

            # Q-value of the action taken, one value per sample. Reducing over
            # axis 1 keeps samples separate; without an axis the whole batch
            # collapses into a single scalar.
            self.q = tf.reduce_sum(tf.multiply(self.Q_values,self.actions),axis=1)
            # q_target has the same [batch, n_actions] shape as Q_values, so the
            # error is taken element-wise between the two.
            self.loss = tf.reduce_sum(tf.square(self.Q_values - self.q_target))
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    def load_checkpoint(self):
        """Restore this network's variables from ``checkpoint_file``."""
        print('... loading checkpoint ...')
        self.saver.restore(self.sess,self.checkpoint_file)

    def save_checkpoint(self):
        """Write this network's variables to ``checkpoint_file``."""
        print('... saving checkpoint ...')
        # The saver does not create missing parent directories itself.
        directory = os.path.dirname(self.checkpoint_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.saver.save(self.sess,self.checkpoint_file)


In [None]:

class Agent(object):
    """Epsilon-greedy Q-learning agent with replay memory and a target network.

    Two ``DeepQNetwork`` instances are held: ``q_eval`` (trained online and
    used to pick actions) and ``q_next`` (used to compute bootstrap targets
    and refreshed from ``q_eval`` by ``update_graph``).

    Args:
        alpha: learning rate forwarded to both networks.
        gamma: discount factor applied to the bootstrap term.
        mem_size: number of transitions the replay memory can hold.
        n_actions: size of the discrete action space.
        epsilon: initial probability of choosing a random action.
        batch_size: number of transitions sampled per ``learn`` call.
        replace_target: ``update_graph`` runs whenever ``mem_cntr`` is a
            multiple of this value at the time ``learn`` is called.
        input_dims: per-sample observation shape.
        q_next_dir, q_eval_dir: checkpoint directories for the two networks.
    """

    def __init__(self,alpha,gamma,mem_size,n_actions,epsilon,batch_size,replace_target=5000,input_dims=(210,160,4),
                 q_next_dir='tmp/q_next',q_eval_dir='tmp/q_eval'):
        self.n_actions = n_actions
        self.action_space = list(range(self.n_actions))
        self.gamma = gamma
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.replace_target = replace_target
        self.q_next = DeepQNetwork(alpha,n_actions=n_actions,input_dims=input_dims,name='q_next',chkpt_dir=q_next_dir)
        self.q_eval = DeepQNetwork(alpha,n_actions=n_actions,input_dims=input_dims,name='q_eval',chkpt_dir=q_eval_dir)
        self.state_memory = np.zeros((self.mem_size,*input_dims),dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size,*input_dims),dtype=np.float32)
        self.action_memory = np.zeros((self.mem_size,self.n_actions),dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size,dtype=np.float32)
        # Builtin bool: the np.bool alias no longer exists in current NumPy.
        self.terminal_memory = np.zeros(self.mem_size,dtype=bool)

    def store_transition(self,state,action,reward,state_,terminal):
        """Write one transition into the ring buffer, overwriting the oldest slot.

        ``action`` is an integer index and is stored one-hot encoded;
        ``terminal`` is stored as given (True when the episode ended).
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        actions = np.zeros(self.n_actions)
        actions[action] = 1.0
        self.action_memory[index] = actions
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = terminal
        self.mem_cntr += 1

    def choose_action(self,stats):
        """Return an action index: random with probability ``epsilon``, else greedy.

        ``stats`` is fed directly to ``q_eval``'s input placeholder, so it
        must already carry a batch axis.
        """
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.sess.run(self.q_eval.Q_values,feed_dict={self.q_eval.input:stats})
            action = np.argmax(actions)
        return action

    def learn(self):
        """Sample a batch from memory and take one training step on ``q_eval``."""
        if self.mem_cntr == 0:
            # Nothing stored yet, so there is nothing to sample from.
            return
        if self.mem_cntr % self.replace_target == 0:
            self.update_graph()
        max_mem = min(self.mem_cntr,self.mem_size)
        batch = np.random.choice(max_mem,self.batch_size)
        state_batch = self.state_memory[batch]
        new_state_batch = self.new_state_memory[batch]
        action_batch = self.action_memory[batch]
        # Recover the integer action from each one-hot row (any n_actions).
        action_indices = np.argmax(action_batch,axis=1)
        reward_batch = self.reward_memory[batch]
        terminal_batch = self.terminal_memory[batch]
        q_eval = self.q_eval.sess.run(self.q_eval.Q_values,feed_dict={self.q_eval.input:state_batch})

        # Bootstrap from the states that FOLLOW each sampled transition.
        q_next = self.q_next.sess.run(self.q_next.Q_values,feed_dict={self.q_next.input:new_state_batch})
        q_target = q_eval.copy()
        # terminal_memory holds True for episode ends, so the future-value term
        # is kept only where the episode continued.
        not_done = 1.0 - terminal_batch.astype(np.float32)
        # Pair each row with its own action; slicing with ':' would overwrite
        # whole columns for every sample.
        rows = np.arange(self.batch_size)
        q_target[rows,action_indices] = reward_batch + self.gamma*np.max(q_next,axis=1)*not_done
        _ = self.q_eval.sess.run(self.q_eval.train_op,feed_dict={
                                                                self.q_eval.input:state_batch,
                                                                self.q_eval.actions:action_batch,
                                                                self.q_eval.q_target:q_target})

        # Decay exploration once enough transitions were seen, floored at 0.01.
        if self.mem_cntr > 100000:
            self.epsilon = max(self.epsilon * 0.9999999, 0.01)

    def save_models(self):
        """Checkpoint both networks."""
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        """Restore both networks from their checkpoints."""
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def update_graph(self):
        """Copy the online network's trainable variables into the target network.

        Each network is evaluated through its own session, so the values are
        read from ``q_eval.sess`` and loaded into ``q_next.sess``; assigning
        inside ``q_eval.sess`` alone would not change what ``q_next.sess``
        sees. Loading values also avoids adding new ops to the graph on every
        sync.
        """
        t_params = self.q_next.params
        e_params = self.q_eval.params
        values = self.q_eval.sess.run(e_params)
        for target_var, value in zip(t_params, values):
            target_var.load(value, self.q_next.sess)


In [None]:
import gym
from gym import wrappers
import numpy as np

In [None]:
def preprocces(observation):
    """Drop the first 30 rows, average over axis 2, and return a (180, 160, 1) array."""
    cropped = observation[30:, :]
    averaged = np.mean(cropped, axis=2)
    return np.reshape(averaged, (180, 160, 1))

def stack_frames(stacked_frames,frame,buffer_size):
    """Append ``frame`` to a rolling stack of the last ``buffer_size`` frames.

    Args:
        stacked_frames: ``None`` to start a new stack (every slot is filled
            with ``frame``), or a previous stack. A previous stack may be in
            the layout this function returns, ``(1, H, W, buffer_size)``, or
            frame-first as ``(buffer_size, H, W, 1)`` / ``(buffer_size, H, W)``.
        frame: single-channel frame shaped ``(H, W)`` or ``(H, W, 1)``.
        buffer_size: number of frames kept in the stack.

    Returns:
        A new float64 array of shape ``(1, H, W, buffer_size)`` whose last
        axis is ordered oldest to newest. The input stack is not modified.

    Raises:
        ValueError: if ``frame`` is not single-channel or the previous stack
            does not match the frame size and ``buffer_size``.
    """
    frame = np.asarray(frame)
    height, width = frame.shape[0], frame.shape[1]
    # reshape (not squeeze) so a multi-channel frame is rejected loudly.
    plane = frame.reshape(height, width).astype(np.float64)
    newest = plane[np.newaxis, :, :, np.newaxis]

    if stacked_frames is None:
        return np.repeat(newest, buffer_size, axis=3)

    previous = np.asarray(stacked_frames, dtype=np.float64)
    frame_first_shapes = ((buffer_size, height, width, 1), (buffer_size, height, width))
    if buffer_size > 1 and previous.shape in frame_first_shapes:
        # Move the frame axis to the end; a plain reshape would interleave
        # pixels from different frames instead of relocating the axis.
        previous = np.moveaxis(previous.reshape(buffer_size, height, width), 0, -1)
    previous = previous.reshape(1, height, width, buffer_size)

    # Drop the oldest frame and append the newest one.
    return np.concatenate((previous[..., 1:], newest), axis=3)


# Training driver. Uses the gym API in which reset() returns the observation
# and step() returns (observation, reward, done, info).
if __name__ == '__main__':
    env = gym.make('Breakout-v0')
    load_checkpoint = False
    agent = Agent(gamma=0.99,epsilon=1.0,alpha=0.00025,input_dims=(180,160,4),n_actions=3,mem_size=25000,batch_size=32)

    if load_checkpoint:
        agent.load_models()

    scores = []
    score = 0
    num_games = 200
    stack_size = 4

    # Fill the replay memory with random play before any learning happens.
    while agent.mem_cntr < agent.mem_size:
        done = False
        stacked_frames = stack_frames(None,preprocces(env.reset()),stack_size)
        observation = stacked_frames
        while not done:
            action = np.random.choice([0,1,2])
            # The agent's actions 0..2 are sent to the environment as 1..3.
            observation_,reward,done,info = env.step(action + 1)
            # Keep the running stack so consecutive frames really accumulate.
            stacked_frames = stack_frames(stacked_frames,preprocces(observation_),stack_size)
            observation_ = stacked_frames
            agent.store_transition(observation,action,reward,observation_,done)
            observation = observation_

    print('done with pretraining')

    for i in range(num_games):
        done = False
        # `score` still holds the previous episode's total at this point.
        if i %10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0,i-10):(i+1)])
            print('episode ',i,'score ',score,'average score %.3f' % avg_score,'epsilon %.3f' % agent.epsilon)
            agent.save_models()
        else:
            print('episode ',i,'score ',score)

        # Start each episode's total from zero so scores are per-episode.
        score = 0
        stacked_frames = stack_frames(None,preprocces(env.reset()),stack_size)
        observation = stacked_frames
        while not done:
            action = agent.choose_action(observation)
            observation_,reward,done,info = env.step(action + 1)
            stacked_frames = stack_frames(stacked_frames,preprocces(observation_),stack_size)
            observation_ = stacked_frames
            agent.store_transition(observation,action,reward,observation_,done)
            observation = observation_
            agent.learn()
            score += reward
        scores.append(score)
        