In [11]:
import tensorflow as tf

In [2]:
# -*- coding: utf-8 -*-
# https://github.com/openai/gym/wiki/CartPole-v0
import tensorflow as tf
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque

from skimage.transform import resize
from skimage.color import rgb2gray



# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
INITIAL_EPSILON = 0.5      # starting exploration rate for the e-greedy policy
learning_rate = 0.0001     # step size handed to the Adam optimizer
max_episodes = 10000       # number of training episodes
discount_factor = 0.9      # weight of the bootstrapped next-state value

# Size of the frames fed to the network.
HEIGHT, WIDTH = 210, 160

# Per-episode bookkeeping lists (only referenced by logging code that is
# currently commented out in the training loop).
episode_list, train_error_list, actions_list = [], [], []

# Evaluation period, in episodes.
TEST_PERIOD = 100

# Period, in episodes, for copying the trainable variables of the source
# ("main") model into the target model.
COPY_PERIOD = 10



#네트워크 클래스 구성
class DQN:
    """Convolutional Q-network built on a TensorFlow 1.x graph.

    The network maps a batch of single-channel frames of shape
    ``[batch, height, width, 1]`` to one Q-value per action.

    Attributes set by the constructor / ``build_network``:
        session: ``tf.Session`` used to run the graph.
        height, width: spatial size of the input frames.
        output_size: number of actions (width of the Q-value output).
        net_name: variable scope under which the network variables are created.
        X: input frame placeholder.
        Y: target Q-value placeholder, shape ``[batch]``.
        action: one-hot action placeholder, shape ``[batch, output_size]``.
        Qpred: predicted Q-values, shape ``[batch, output_size]``.
        loss: mean squared error between ``Y`` and the Q-value of the taken action.
        optimizer: Adam training op that minimizes ``loss``.
    """

    def __init__(self, session, height, width, output_size, name="main"):
        """Store the network configuration and build the graph.

        Args:
            session: ``tf.Session`` in which the network is evaluated.
            height: input frame height in pixels.
            width: input frame width in pixels.
            output_size: number of discrete actions.
            name: variable scope name for this network's variables.
        """
        self.session = session
        # Use the constructor arguments rather than module-level constants so
        # the instance really has the geometry the caller asked for.
        self.height = height
        self.width = width
        self.output_size = output_size
        self.net_name = name

        self.build_network()

    def build_network(self):
        """Create placeholders, the conv/FC layers, the loss and the train op."""
        with tf.variable_scope(self.net_name):
            # Convolutional network: 3 conv layers followed by 2 FC layers.
            self.X = tf.placeholder(shape=[None, self.height, self.width, 1], dtype=tf.float32)
            self.Y = tf.placeholder(shape=[None], dtype=tf.float32)

            W_conv1 = tf.Variable(tf.truncated_normal([6, 4, 1, 32], stddev=0.1))
            W_conv2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.1))
            W_conv3 = tf.Variable(tf.truncated_normal([5, 3, 64, 64], stddev=0.1))
            b_conv1 = tf.Variable(tf.constant(0.1, shape=[32]))
            b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]))
            b_conv3 = tf.Variable(tf.constant(0.1, shape=[64]))

            h_conv1 = tf.nn.relu(tf.nn.conv2d(self.X, W_conv1, strides=[1, 4, 4, 1], padding='VALID') + b_conv1)
            h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2)
            h_conv3 = tf.nn.relu(tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3)

            # Derive the flattened feature size from the conv output instead of
            # hard-coding it; for 210x160 input this evaluates to 11 * 9 * 64.
            flat_size = int(np.prod(h_conv3.get_shape().as_list()[1:]))

            # Variables are still created in the order conv weights, conv
            # biases, FC weights/biases, so default variable names are unchanged.
            W_fc1 = tf.Variable(tf.truncated_normal([flat_size, 512], stddev=0.1))
            b_fc1 = tf.Variable(tf.constant(0.1, shape=[512]))
            # Use the instance's own output size, not a module-level global.
            W_fc2 = tf.Variable(tf.truncated_normal([512, self.output_size], stddev=0.1))
            b_fc2 = tf.Variable(tf.constant(0.1, shape=[self.output_size]))

            L1 = tf.reshape(h_conv3, [-1, flat_size])
            L2 = tf.nn.relu(tf.matmul(L1, W_fc1) + b_fc1)
            self.Qpred = tf.matmul(L2, W_fc2) + b_fc2

        # Loss and optimizer: the one-hot action mask selects the Q-value of
        # the action that was taken, which is regressed onto the target Y.
        self.action = tf.placeholder(shape=[None, self.output_size], dtype=tf.float32)
        Q_action = tf.reduce_sum(tf.multiply(self.Qpred, self.action), reduction_indices=1)
        self.loss = tf.reduce_mean(tf.square(self.Y - Q_action))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

    def predict(self, state):
        """Return the predicted Q-values for one frame or a batch of frames.

        Args:
            state: array whose size is a multiple of ``height * width``; it is
                reshaped to ``[-1, height, width, 1]`` before being fed.

        Returns:
            The result of evaluating ``Qpred``, one row per frame.
        """
        # The target shape is passed positionally because the ``newshape``
        # keyword is deprecated in recent NumPy releases.
        x = np.reshape(state, [-1, self.height, self.width, 1])
        return self.session.run(self.Qpred, feed_dict={self.X: x})

    def egreedy_action(self, epsilon, env, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            epsilon: probability of sampling a random action.
            env: environment whose ``action_space.sample()`` supplies random actions.
            state: current frame, passed to ``predict`` for the greedy choice.

        Returns:
            The sampled action, or the index of the largest predicted Q-value.
        """
        if np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            Q_h = self.predict(state)
            action = np.argmax(Q_h)
        return action
    
def update_from_memory(mainDQN, targetDQN, batch_size):
    """Run one DQN training step on a minibatch sampled from REPLAY_MEMORY.

    Each stored transition is ``(state, action, reward, new_state, ter)``.
    The regression target for the taken action is ``reward`` when the
    transition is terminal, otherwise
    ``reward + discount_factor * max_a Q_target(new_state, a)``.

    Args:
        mainDQN: network being trained; its ``session``, ``X``, ``Y``,
            ``action``, ``loss``, ``optimizer``, ``height``, ``width`` and
            ``output_size`` attributes are used.
        targetDQN: network whose ``predict`` supplies the bootstrapped
            next-state Q-values.
        batch_size: number of transitions to sample (without replacement).

    Returns:
        The loss value reported by the training step.
    """
    minibatch = random.sample(REPLAY_MEMORY, batch_size)
    states, actions, rewards, new_states, terminals = zip(*minibatch)

    # The network input is the state the action was taken in (not the
    # successor state), so the loss really regresses Q(state, action).
    state_batch = np.reshape(
        np.stack(states), [batch_size, mainDQN.height, mainDQN.width, 1]
    ).astype(np.float32)

    # One-hot encoding of the actions that were taken.
    action_batch = np.zeros([batch_size, mainDQN.output_size], dtype=np.float32)
    action_batch[np.arange(batch_size), np.asarray(actions, dtype=np.int64)] = 1.0

    # Evaluate the target network once for the whole batch instead of once per
    # sample; predict() reshapes its input to [-1, height, width, 1].
    next_q = np.reshape(targetDQN.predict(np.stack(new_states)), [batch_size, -1])
    max_next_q = np.max(next_q, axis=1)

    # The terminal flag comes from the sampled transition itself, never from
    # whatever `done` happens to be in the training loop at this moment.
    rewards = np.asarray(rewards, dtype=np.float32)
    terminals = np.asarray(terminals, dtype=bool)
    y_batch = np.where(terminals, rewards, rewards + discount_factor * max_next_q)

    # Train with the DQN targets.
    loss_value, _ = mainDQN.session.run(
        [mainDQN.loss, mainDQN.optimizer],
        feed_dict={mainDQN.X: state_batch, mainDQN.Y: y_batch, mainDQN.action: action_batch},
    )
    return loss_value




def get_terminal(start_live, l, reward, no_life_game, ter):
    """Mark a step as terminal when a life is lost or a negative reward arrives.

    Args:
        start_live (int): number of lives held before this step (only used for
            games that have lives).
        l (dict): info for the next frame; its ``'ale.lives'`` entry is read to
            detect a lost life.
        reward: reward received for this step.
        no_life_game (bool): True for games without lives, where a negative
            reward is treated as terminal instead.
        ter (bool): terminal flag so far; returned unchanged unless one of the
            conditions above fires.

    Returns:
        list: ``[ter, start_live]`` - the terminal flag and the life count,
        updated to the reduced number of lives when a life was lost.
    """
    if no_life_game:
        # Games without lives: a negative reward ends the pseudo-episode.
        triggered = reward < 0
    else:
        # Games with lives: compare against the life count reported for the next frame.
        lives_now = l['ale.lives']
        triggered = start_live > lives_now
        if triggered:
            start_live = lives_now

    return [True if triggered else ter, start_live]

def bot_play(DQN, env):
    """Play one episode greedily with the given network.

    Every frame is collapsed over its last (channel) axis with ``np.sum``
    before being passed to ``DQN.predict``; the action with the largest
    predicted value is taken until the environment reports ``done``.

    Args:
        DQN: object exposing ``predict(state)``.
        env: environment exposing ``reset()`` and ``step(action)``.

    Returns:
        The sum of the rewards collected during the episode.
    """
    score = 0
    frame = np.sum(env.reset(), axis=2)

    while True:
        q_values = DQN.predict(frame)
        raw_frame, reward, finished, info = env.step(np.argmax(q_values))
        score += reward
        frame = np.sum(raw_frame, axis=2)
        if finished:
            return score

def get_copy_var_ops(*, src_scope_name='main', target_scope_name='target'):
    """Build the ops that copy trainable variables from one scope to another.

    Variables of the two scopes are paired in collection order, and each target
    variable is assigned the value of its source counterpart.

    Args:
        src_scope_name: scope whose trainable variables are read.
        target_scope_name: scope whose trainable variables are overwritten.

    Returns:
        list: one assign op per (source, target) variable pair.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(collection_key, scope=src_scope_name)
    targets = tf.get_collection(collection_key, scope=target_scope_name)

    return [dst.assign(src.value()) for src, dst in zip(sources, targets)]

def saveModel(session, src_scope_name='main', path='./breakout.ckpt'):
    """Save the trainable variables of one scope to a checkpoint file.

    Args:
        session: session holding the variable values to save.
        src_scope_name: scope whose trainable variables are saved.
        path: checkpoint path handed to the saver.
    """
    saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    )
    saver.save(session, path)
    print("Model saved successfully!")
    
if __name__ == "__main__":
    # Training script: fill a replay buffer with e-greedy play, train the main
    # network from it, periodically sync the target network, then save the
    # model and let the trained network play one episode.

    env = gym.make('Breakout-v0')

    # Number of discrete actions offered by the environment.
    output_size = env.action_space.n
    height = HEIGHT
    width = WIDTH

    # Replay buffer. `maxlen` makes the deque drop its oldest transition
    # automatically once the capacity is reached.
    REPLAY_CAPACITY = 10000
    REPLAY_MEMORY = deque(maxlen=REPLAY_CAPACITY)

    # Training starts once the buffer holds this many transitions. It must not
    # exceed REPLAY_CAPACITY: the previous threshold (50000) was larger than
    # the buffer could ever grow, so the network was never trained.
    TRAIN_START = REPLAY_CAPACITY

    # Minibatch size: number of transitions sampled per training step.
    BATCH_SIZE = 32

    try:
        with tf.Session() as sess:
            # Build the network being trained and its target copy.
            mainDQN = DQN(sess, height, width, output_size, name='main')
            targetDQN = DQN(sess, height, width, output_size, name='target')

            # Initialize variables.
            init = tf.global_variables_initializer()
            sess.run(init)
            epsilon = INITIAL_EPSILON

            # Start with the target network identical to the main network.
            copy_ops = get_copy_var_ops(src_scope_name='main', target_scope_name='target')
            sess.run(copy_ops)

            for episode in range(max_episodes):
                state = env.reset()
                # Collapse the channel axis so each frame is a 2-D array.
                state = np.sum(state, axis=2)
                rAll = 0
                done = False
                epsilon *= 0.99
                start_lives = 5

                while not done:
                    # Take an action and observe the new state and reward.
                    action = mainDQN.egreedy_action(epsilon, env, state)
                    new_state, reward, done, info = env.step(action)

                    # Losing a life is stored as terminal for learning
                    # purposes, even though the episode itself continues.
                    ter = done
                    ter, start_lives = get_terminal(start_lives, info, reward, False, ter)

                    new_state = np.sum(new_state, axis=2)

                    # Store (state, action, reward, next_state, terminal).
                    REPLAY_MEMORY.append((state, action, reward, new_state, ter))

                    # Train once enough experience has been collected.
                    if len(REPLAY_MEMORY) >= TRAIN_START:
                        mean_loss_value = update_from_memory(mainDQN, targetDQN, BATCH_SIZE)

                    rAll += reward
                    state = new_state

                # Periodically copy the main network into the target network.
                if episode % COPY_PERIOD == 1:
                    sess.run(copy_ops)

                print("episode: {0}, Epsilon: {1}, Reward: {2}".format(episode, epsilon, rAll))

            saveModel(sess, src_scope_name='main', path='./breakout.ckpt')

            # The environment must still be open here: bot_play resets and
            # steps it, so it is closed only afterwards (in `finally`).
            input("Press Enter to make the trained bot play...")
            bot_play(mainDQN, env)
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

[2017-05-28 22:15:42,116] Making new env: Breakout-v0


episode: 0, Epsilon: 0.495, Reward: 2.0
episode: 1, Epsilon: 0.49005, Reward: 2.0
episode: 2, Epsilon: 0.48514949999999996, Reward: 2.0
episode: 3, Epsilon: 0.480298005, Reward: 2.0
episode: 4, Epsilon: 0.47549502494999996, Reward: 3.0
episode: 5, Epsilon: 0.47074007470049994, Reward: 2.0
episode: 6, Epsilon: 0.46603267395349496, Reward: 2.0
episode: 7, Epsilon: 0.46137234721396, Reward: 3.0
episode: 8, Epsilon: 0.45675862374182036, Reward: 2.0
episode: 9, Epsilon: 0.45219103750440215, Reward: 3.0
episode: 10, Epsilon: 0.44766912712935814, Reward: 3.0
episode: 11, Epsilon: 0.44319243585806456, Reward: 2.0
episode: 12, Epsilon: 0.4387605114994839, Reward: 0.0
episode: 13, Epsilon: 0.43437290638448905, Reward: 2.0
episode: 14, Epsilon: 0.43002917732064416, Reward: 2.0
episode: 15, Epsilon: 0.4257288855474377, Reward: 2.0
episode: 16, Epsilon: 0.4214715966919633, Reward: 0.0
episode: 17, Epsilon: 0.4172568807250437, Reward: 0.0
episode: 18, Epsilon: 0.41308431191779327, Reward: 2.0
episod

KeyboardInterrupt: 

In [2]:
import gym
import numpy as np
import time

# Create the Breakout environment and look at its observation/action spaces.
env = gym.make('Breakout-v0')
output = env.observation_space  # kept for inspection; not printed in this cell
output1 = env.action_space
print(output1)  # the recorded run of this cell printed "Discrete(6)"

[2017-05-27 20:27:45,059] Making new env: Breakout-v0


Discrete(6)


In [10]:
# Scratch cell: print the values produced by range(4), i.e. 0 through 3.
for i in range(4):
    print(i)

0
1
2
3


In [None]:
a = tf.