In [1]:
import os
import random

import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
import retro                 # Retro Environment
from retro.retro_env import RetroEnv

import matplotlib.pyplot as plt # Display graphs
from IPython import display

from collections import deque # Ordered collection with ends

import gym



rom_path = os.path.dirname(os.path.abspath('.')) + '/StreetFighterIISpecialChampionEdition-Genesis'

scenario_name = 'scenario_sunppang'

model_path = '/root/sf2-workspace/sf2-env/sf2/models/sunppang_more_states/model.ckpt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
#log_file = open('./dqn.log', 'w')

### PREPROCESSING HYPERPARAMETERS
#stack_size = 10                 # Number of frames stacked
stack_size = 5
state_element_number = 31

# x축 거리(0~187), y축 거리(0~70), 좌(상대편의 왼쪽), 우(상대편의 오른쪽) (장풍은 제거, TODO: 장풍 state 추가, 공격 범위 추가)
state_size = state_element_number * stack_size
#learning_rate =  0.00025
#learning_rate =  0.0005
learning_rate = 0.005

### TRAINING 관련
total_episodes = 2000            
max_steps = 50000              
batch_size = 64

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
#decay_rate = 0.00001           # exponential decay rate for exploration prob
decay_rate = 0.00005           # exponential decay rate for exploration prob

# Q learning hyperparameters
#gamma = 0.9                    # Discounting rate
gamma = 0.95

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000          # Number of experiences the Memory can keep

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = False




In [2]:



class SF2Env(RetroEnv):
    KEY_LIST = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
    
    
    
    def __init__(self, game, state=retro.State.DEFAULT, scenario=None, info=None, use_restricted_actions=retro.Actions.FILTERED,
                 record=False, players=1, inttype=retro.data.Integrations.STABLE, obs_type=retro.Observations.IMAGE, press_button_print=False):
        self.recent_action = None
        
        # action 입력은 DISCRETE 모드
        use_restricted_actions = retro.Actions.DISCRETE
        self.press_button_print = press_button_print
        
        RetroEnv.__init__(self, game, state, scenario, info, use_restricted_actions,
                 record, players, inttype, obs_type)
        self.buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        self.buttons_dict = {}
        for k,v in enumerate(self.buttons):
            self.buttons_dict[v] = k
        
        self.actions = [
            ['LEFT'],
            ['RIGHT'],
            ['UP'],
            ['DOWN'],
            ['LEFT','UP'],
            ['LEFT', 'DOWN'],
            ['RIGHT', 'UP'],
            ['RIGHT', 'DOWN'],
            ['A'],
            ['B'],
            ['C'],
            ['X'],
            ['Y'],
            ['Z'],
            ['DOWN', 'A'],
            ['DOWN', 'B'],
            ['DOWN', 'C'],
            ['DOWN', 'X'],
            ['DOWN', 'Y'],
            ['DOWN', 'Z']
        ]
        
        self.action_space = gym.spaces.Discrete(len(self.actions) ** players)
    
    def step(self, a):
        self.recent_action = a
        reward_sum = 0
        for i in range(5):
            ob, rew, done, info = RetroEnv.step(self, a)
            reward_sum += rew
        
        if self.press_button_print:
            print(self.action_array_to_keys(self.action_to_array(a)))
        
        return self.get_state_from(), reward_sum, done, info
    
    def reset(self):
        RetroEnv.reset(self)
        self.recent_action = None
        return self.get_state_from()
            
    def action_array_to_keys(self, action_array):
        press_keys = []
        for i,v in enumerate(action_array[0]):
            if v == 1:
                press_keys.append(self.KEY_LIST[i])
        return press_keys
    
    def action_to_array(self, a):
        button_array = [0] * 12
        for button in self.actions[a]:
            button_array[self.buttons_dict[button]] = 1
        return [button_array]
        
    
    # env로부터 state 값을 변환
    def get_state_from(self):
        state = []
        
        is_1p_atackking = 1 if int(self.data.lookup_value('is_first_player_atackking')) in [1, 513, 1537] else 0
        is_2p_atackking = 1 if int(self.data.lookup_value('is_second_player_atackking')) in [1, 513, 1537] else 0
        
        first_player_x = int(self.data.lookup_value('first_player_x'))
        second_player_x = int(self.data.lookup_value('second_player_x'))
        left_or_right = 0
        left_or_right = 1 if first_player_x > second_player_x else 0
        if first_player_x > second_player_x:
            left_or_right = 1
        elif first_player_x < second_player_x: 
            left_or_right = -1
        else:
            left_or_right = 0
        
        state.append(int(self.data.lookup_value('distance_x_between_players'))/188)
        state.append(int(self.data.lookup_value('distance_y_between_players'))/71)
        
        
        state.append(int(self.data.lookup_value('first_player_attack_x')))
        state.append(int(self.data.lookup_value('first_player_x')))
        state.append(int(self.data.lookup_value('first_player_y')))
        state.append(is_1p_atackking)
        #state.append(int(self.data.lookup_value('is_first_player_jangpoong')))
        #state.append(int(self.data.lookup_value('is_first_player_jangpoong_x')))
        #state.append(int(self.data.lookup_value('is_first_player_jangpoong_y')))
        state.append(is_2p_atackking)
        #state.append(int(self.data.lookup_value('is_second_player_jangpoong')))
        #state.append(int(self.data.lookup_value('is_second_player_jangpoong_x')))
        #state.append(int(self.data.lookup_value('is_second_player_jangpoong_y')))
        state.append(int(self.data.lookup_value('second_player_attack_x')))
        state.append(int(self.data.lookup_value('second_player_x')))
        state.append(int(self.data.lookup_value('second_player_y')))
        #state.append(int(self.data.lookup_value('continuetimer'))/153) # max 153
        #state.append(int(self.data.lookup_value('first_player_health'))) # range : -1 ~ 176
        #state.append(int(self.data.lookup_value('second_player_health')))
        #state.append(int(self.data.lookup_value('first_player_action_kind')))
        state.append(left_or_right)
        
        # 에이전트의 최근 액션
        for j in range(len(self.actions)):
            if self.recent_action == j:
                state.append(1)
            else:
                state.append(0)
        
    
        
        return np.asarray(state)


env = SF2Env(rom_path, 
             state='rvsb.state', 
             scenario=scenario_name,
             press_button_print=False)


possible_actions = np.array(list(range(0, env.action_space.n)))





# initialize (deque 사용, max 4개 유지)
#stacked_frames  =  deque([np.zeros(188*71+2, dtype=np.int) for i in range(stack_size)], maxlen=4)
stacked_frames  =  deque([np.zeros(state_element_number, dtype=np.int) for i in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = state
    
    if is_new_episode:
        # clear stacked_frames
        stacked_frames = deque([np.zeros(state_element_number, dtype=np.int) for i in range(stack_size)], maxlen=stack_size)
        
        for i in range(stack_size-1):
            stacked_frames.append(frame)
        
        stacked_state = np.stack(stacked_frames, axis=1)
        
    else:
        stacked_frames.append(frame)
        
        stacked_state = np.stack(stacked_frames) 
    
    return stacked_state, stacked_frames





class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            # We create the placeholders
            # *state_size means that we take each elements of state_size in tuple hence is like if we wrote
            # [None, 84, 84, 4]
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name="actions_")
            
            # Remember that target_Q is the R(s,a) + ymax Qhat(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            self.fc = tf.layers.dense(inputs = self.inputs_,
                                  units = 512,
                                  activation = tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="fc1")
            
            
            self.output = tf.layers.dense(inputs = self.fc, 
                                           kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units = self.action_size, 
                                        activation=None)
  
            # Q is our predicted Q value.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

            
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
DQNetwork = DQNetwork(state_size, env.action_space.n, learning_rate)




class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]



# Instantiate memory
memory = Memory(max_size = memory_size)
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        state = state.flatten()
        
    action = random.randint(0, env.action_space.n - 1)
    next_state, reward, done, _ = env.step(action)
    
    #env.render()
    
    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
    next_state = next_state.flatten()
    
    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        
        memory.add((state, action, reward, next_state, done))
        
        # Start a new episode
        state = env.reset()
        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        state = state.flatten()
        
        
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Our new state is now the next_state
        state = next_state


# Setup TensorBoard Writer
tb_path = os.path.dirname(os.path.abspath('.')) + '/tensorboard/dqn/small_space'

writer = tf.summary.FileWriter(tb_path)

## Losses
tf.summary.scalar("Loss", DQNetwork.loss)

write_op = tf.summary.merge_all()




"""
This function will do the part
With ϵϵ select a random action atat, otherwise select at=argmaxaQ(st,a)
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    ## EPSILON GREEDY STRATEGY
    # Choose action a from state s using epsilon greedy.
    ## First we randomize a number
    exp_exp_tradeoff = np.random.rand()

    # Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        # Make a random action (exploration)
        choice = random.randint(1,len(possible_actions))-1
        action = possible_actions[choice]
        
    else:
        # Get action from Q-network (exploitation)
        # Estimate the Qs values state
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        # Take the biggest Q value (= the best action)
        choice = np.argmax(Qs)
        action = possible_actions[choice]
                
                
    return action, explore_probability


# Saver will help us to save our model
saver = tf.train.Saver()
training = True
episode_render = False



if training == True:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay rate (that will use to reduce epsilon) 
        decay_step = 0
        
        for episode in range(total_episodes):
            # Set step to 0
            step = 0
            
            # Initialize the rewards of the episode
            episode_rewards = []
            
            # Make a new episode and observe the first state
            state = env.reset()
            
            # Remember that stack frame function also call our preprocess function.
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            state = state.flatten()
            
            
            while step < max_steps:
                step += 1
                
                #Increase decay_step
                decay_step +=1
                
                # Predict the action to take and take it
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                #print(action)
                
                #Perform the action and get the next_state, reward, and done information
                next_state, reward, done, _ = env.step(action)
                
                if episode_render and step % 100 == 0 :
                    plt.imshow(env.render(mode='rgb_array'))
                    plt.axis('off')
                    display.clear_output(wait=True)
                    display.display(plt.gcf())
                
                # Add the reward to total reward
                episode_rewards.append(reward)
                
                # If the game is finished
                if done:
                    # The episode ends so no next state
                    next_state = np.zeros(state_element_number, dtype=np.int)
                    
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    next_state = next_state.flatten()

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)
                    
                    if env.data.lookup_value('first_player_health') > env.data.lookup_value('second_player_health'):
                        episode_result = 'win'
                    else:
                        episode_result = 'lose'

                    result_str = '{' + '"episode":{}, "reward":{}, "explore": {:.4f}, "loss": {:.4f}, "result": "{}"'.format(episode+1, total_reward, explore_probability, loss, episode_result) + '}'
                    print(result_str)
                    #log_file.write(result_str + '\n')

                    #rewards_list.append((episode, total_reward))

                    # Store transition <st,at,rt+1,st+1> in memory D
                    memory.add((state, action, reward, next_state, done))

                else:
                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    next_state = next_state.flatten()
                
                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state
                    state = next_state
                    

                ### LEARNING PART            
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                #print(np.array(batch).shape)
                states_mb = np.array([each[0] for each in batch], ndmin=1)
                #print(states_mb.shape)
                
                #actions_mb = np.array([each[1] for each in batch])
                list_agg = []
                for each in batch:
                    a_list = [0]*20
                    a_list[each[1]-1] = 1
                    list_agg.append(a_list)
                actions_mb = np.array(list_agg, ndmin=2)
                
                
                rewards_mb = np.array([each[2] for each in batch]) 
                next_states_mb = np.array([each[3] for each in batch], ndmin=1)
                dones_mb = np.array([each[4] for each in batch])

                target_Qs_batch = []

                # Get Q values for next_state 
                for i, v in enumerate(next_states_mb):
                    #if len(v) == 13350:
                    if len(v) == state_element_number:
                        print(i)
                
                
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # If we are in a terminal state, only equals reward
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    else:
                        target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                        

                targets_mb = np.array([each for each in target_Qs_batch])

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                        feed_dict={DQNetwork.inputs_: states_mb,
                                                   DQNetwork.target_Q: targets_mb,
                                                   DQNetwork.actions_: actions_mb})

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 4:
                save_path = saver.save(sess, model_path)
                print("Model Saved")
                
#log_file.close()
env.close()

{"episode":1, "reward":-34.0, "explore": 0.9995, "loss": 484082.6875, "result": "lose"}
{"episode":2, "reward":-34.0, "explore": 0.9990, "loss": 1563187.1250, "result": "lose"}
{"episode":3, "reward":-35.0, "explore": 0.9985, "loss": 78083.2812, "result": "lose"}
{"episode":4, "reward":6.0, "explore": 0.9981, "loss": 69193.2969, "result": "win"}
{"episode":5, "reward":-34.0, "explore": 0.9976, "loss": 33952.3398, "result": "lose"}
Model Saved
{"episode":6, "reward":-35.0, "explore": 0.9971, "loss": 56155.5000, "result": "lose"}
{"episode":7, "reward":-34.0, "explore": 0.9966, "loss": 8101.9795, "result": "lose"}
{"episode":8, "reward":-35.0, "explore": 0.9961, "loss": 578.1920, "result": "lose"}
{"episode":9, "reward":-34.0, "explore": 0.9954, "loss": 2095.3997, "result": "lose"}
{"episode":10, "reward":4.0, "explore": 0.9950, "loss": 27467.3867, "result": "win"}
Model Saved
{"episode":11, "reward":-35.0, "explore": 0.9945, "loss": 1510.5059, "result": "lose"}
{"episode":12, "reward":-

{"episode":96, "reward":-34.0, "explore": 0.9509, "loss": 465.0902, "result": "lose"}
{"episode":97, "reward":-34.0, "explore": 0.9504, "loss": 135.0242, "result": "lose"}
{"episode":98, "reward":36.0, "explore": 0.9500, "loss": 585.4940, "result": "win"}
{"episode":99, "reward":-34.0, "explore": 0.9496, "loss": 1085.4275, "result": "lose"}
{"episode":100, "reward":-34.0, "explore": 0.9491, "loss": 2585.3457, "result": "lose"}
Model Saved
{"episode":101, "reward":-34.0, "explore": 0.9485, "loss": 384.3636, "result": "lose"}
{"episode":102, "reward":-35.0, "explore": 0.9481, "loss": 119.8109, "result": "lose"}
{"episode":103, "reward":7.0, "explore": 0.9477, "loss": 170.8371, "result": "win"}
{"episode":104, "reward":6.0, "explore": 0.9473, "loss": 1062.7100, "result": "win"}
{"episode":105, "reward":-34.0, "explore": 0.9468, "loss": 91.9634, "result": "lose"}
Model Saved
{"episode":106, "reward":-34.0, "explore": 0.9463, "loss": 151.3534, "result": "lose"}
{"episode":107, "reward":36.0

{"episode":190, "reward":-35.0, "explore": 0.9032, "loss": 213.5036, "result": "lose"}
Model Saved
{"episode":191, "reward":-35.0, "explore": 0.9027, "loss": 80.2912, "result": "lose"}
{"episode":192, "reward":-34.0, "explore": 0.9023, "loss": 152.2427, "result": "lose"}
{"episode":193, "reward":-35.0, "explore": 0.9019, "loss": 163.8259, "result": "lose"}
{"episode":194, "reward":-8.0, "explore": 0.9014, "loss": 821.6469, "result": "lose"}
{"episode":195, "reward":-34.0, "explore": 0.9010, "loss": 120.8628, "result": "lose"}
Model Saved
{"episode":196, "reward":-34.0, "explore": 0.9006, "loss": 410.8427, "result": "lose"}
{"episode":197, "reward":-34.0, "explore": 0.9002, "loss": 445.3503, "result": "lose"}
{"episode":198, "reward":-34.0, "explore": 0.8998, "loss": 1418.3770, "result": "lose"}
{"episode":199, "reward":17.0, "explore": 0.8993, "loss": 106.4480, "result": "win"}
{"episode":200, "reward":-35.0, "explore": 0.8988, "loss": 1589.6980, "result": "lose"}
Model Saved
{"episode

{"episode":283, "reward":-35.0, "explore": 0.8531, "loss": 813.5224, "result": "lose"}
{"episode":284, "reward":-34.0, "explore": 0.8527, "loss": 830.6860, "result": "lose"}
{"episode":285, "reward":-34.0, "explore": 0.8523, "loss": 570.5466, "result": "lose"}
Model Saved
{"episode":286, "reward":-34.0, "explore": 0.8519, "loss": 57.5266, "result": "lose"}
{"episode":287, "reward":-35.0, "explore": 0.8515, "loss": 383.4731, "result": "lose"}
{"episode":288, "reward":36.0, "explore": 0.8511, "loss": 292.8664, "result": "win"}
{"episode":289, "reward":-34.0, "explore": 0.8507, "loss": 725.1093, "result": "lose"}
{"episode":290, "reward":-34.0, "explore": 0.8503, "loss": 134.5602, "result": "lose"}
Model Saved
{"episode":291, "reward":-34.0, "explore": 0.8498, "loss": 854.5136, "result": "lose"}
{"episode":292, "reward":-36.0, "explore": 0.8493, "loss": 138.5237, "result": "lose"}
{"episode":293, "reward":-34.0, "explore": 0.8489, "loss": 198.9728, "result": "lose"}
{"episode":294, "rewar

{"episode":378, "reward":-31.0, "explore": 0.8088, "loss": 58.4693, "result": "lose"}
{"episode":379, "reward":36.0, "explore": 0.8085, "loss": 501.7405, "result": "win"}
{"episode":380, "reward":-34.0, "explore": 0.8081, "loss": 198.4938, "result": "lose"}
Model Saved
{"episode":381, "reward":6.0, "explore": 0.8077, "loss": 101.5464, "result": "win"}
{"episode":382, "reward":6.0, "explore": 0.8073, "loss": 109.9746, "result": "win"}
{"episode":383, "reward":36.0, "explore": 0.8070, "loss": 70.5568, "result": "win"}
{"episode":384, "reward":-35.0, "explore": 0.8066, "loss": 107.9278, "result": "lose"}
{"episode":385, "reward":-34.0, "explore": 0.8061, "loss": 169.1669, "result": "lose"}
Model Saved
{"episode":386, "reward":-35.0, "explore": 0.8057, "loss": 46.3658, "result": "lose"}
{"episode":387, "reward":6.0, "explore": 0.8053, "loss": 99.1673, "result": "win"}
{"episode":388, "reward":17.0, "explore": 0.8050, "loss": 57.4949, "result": "win"}
{"episode":389, "reward":-34.0, "explor

{"episode":473, "reward":-34.0, "explore": 0.7713, "loss": 141.5394, "result": "lose"}
{"episode":474, "reward":-34.0, "explore": 0.7709, "loss": 106.0538, "result": "lose"}
{"episode":475, "reward":-34.0, "explore": 0.7706, "loss": 130.5406, "result": "lose"}
Model Saved
{"episode":476, "reward":-34.0, "explore": 0.7702, "loss": 402.4307, "result": "lose"}
{"episode":477, "reward":-8.0, "explore": 0.7698, "loss": 147.1977, "result": "lose"}
{"episode":478, "reward":31.0, "explore": 0.7695, "loss": 143.4559, "result": "win"}
{"episode":479, "reward":-36.0, "explore": 0.7691, "loss": 357.8600, "result": "lose"}
{"episode":480, "reward":-35.0, "explore": 0.7687, "loss": 348.7650, "result": "lose"}
Model Saved
{"episode":481, "reward":-35.0, "explore": 0.7684, "loss": 79.0845, "result": "lose"}
{"episode":482, "reward":39.0, "explore": 0.7680, "loss": 592.8964, "result": "win"}
{"episode":483, "reward":-35.0, "explore": 0.7676, "loss": 84.5464, "result": "lose"}
{"episode":484, "reward":-

{"episode":569, "reward":-34.0, "explore": 0.7362, "loss": 592.8131, "result": "lose"}
{"episode":570, "reward":-34.0, "explore": 0.7359, "loss": 138.8771, "result": "lose"}
Model Saved
{"episode":571, "reward":-34.0, "explore": 0.7355, "loss": 417.2097, "result": "lose"}
{"episode":572, "reward":-34.0, "explore": 0.7351, "loss": 174.5932, "result": "lose"}
{"episode":573, "reward":-8.0, "explore": 0.7348, "loss": 1276.6383, "result": "lose"}
{"episode":574, "reward":-35.0, "explore": 0.7344, "loss": 464.5731, "result": "lose"}
{"episode":575, "reward":-34.0, "explore": 0.7340, "loss": 370.9875, "result": "lose"}
Model Saved
{"episode":576, "reward":37.0, "explore": 0.7337, "loss": 899.2292, "result": "win"}
{"episode":577, "reward":-34.0, "explore": 0.7334, "loss": 48.2640, "result": "lose"}
{"episode":578, "reward":7.0, "explore": 0.7330, "loss": 579.3259, "result": "win"}
{"episode":579, "reward":-29.0, "explore": 0.7314, "loss": 606.3334, "result": "lose"}
{"episode":580, "reward":

{"episode":663, "reward":-34.0, "explore": 0.7006, "loss": 81.2767, "result": "lose"}
{"episode":664, "reward":36.0, "explore": 0.7003, "loss": 326.3723, "result": "win"}
{"episode":665, "reward":-34.0, "explore": 0.7000, "loss": 204.6219, "result": "lose"}
Model Saved
{"episode":666, "reward":6.0, "explore": 0.6997, "loss": 702.9682, "result": "win"}
{"episode":667, "reward":-34.0, "explore": 0.6994, "loss": 1191.3595, "result": "lose"}
{"episode":668, "reward":-34.0, "explore": 0.6990, "loss": 764.1040, "result": "lose"}
{"episode":669, "reward":-34.0, "explore": 0.6987, "loss": 1022.9277, "result": "lose"}
{"episode":670, "reward":-34.0, "explore": 0.6984, "loss": 124.6200, "result": "lose"}
Model Saved
{"episode":671, "reward":-34.0, "explore": 0.6980, "loss": 1322.4836, "result": "lose"}
{"episode":672, "reward":-8.0, "explore": 0.6977, "loss": 165.8936, "result": "lose"}
{"episode":673, "reward":-34.0, "explore": 0.6974, "loss": 231.9416, "result": "lose"}
{"episode":674, "reward

{"episode":757, "reward":-35.0, "explore": 0.6647, "loss": 155.0875, "result": "lose"}
{"episode":758, "reward":-38.0, "explore": 0.6642, "loss": 157.4898, "result": "lose"}
{"episode":759, "reward":-34.0, "explore": 0.6639, "loss": 149.0390, "result": "lose"}
{"episode":760, "reward":-35.0, "explore": 0.6636, "loss": 182.0482, "result": "lose"}
Model Saved
{"episode":761, "reward":18.0, "explore": 0.6624, "loss": 67.6899, "result": "win"}
{"episode":762, "reward":-34.0, "explore": 0.6621, "loss": 154.6589, "result": "lose"}
{"episode":763, "reward":28.0, "explore": 0.6617, "loss": 260.8558, "result": "win"}
{"episode":764, "reward":2.0, "explore": 0.6614, "loss": 1109.8910, "result": "win"}
{"episode":765, "reward":-35.0, "explore": 0.6611, "loss": 230.0575, "result": "lose"}
Model Saved
{"episode":766, "reward":-34.0, "explore": 0.6608, "loss": 1643.8336, "result": "lose"}
{"episode":767, "reward":-34.0, "explore": 0.6604, "loss": 311.6593, "result": "lose"}
{"episode":768, "reward":

{"episode":853, "reward":-35.0, "explore": 0.6254, "loss": 16368.9941, "result": "lose"}
{"episode":854, "reward":39.0, "explore": 0.6251, "loss": 17218.9551, "result": "win"}
{"episode":855, "reward":23.0, "explore": 0.6247, "loss": 41550.9062, "result": "win"}
Model Saved
{"episode":856, "reward":-34.0, "explore": 0.6244, "loss": 218.7599, "result": "lose"}
{"episode":857, "reward":-34.0, "explore": 0.6241, "loss": 3650.0405, "result": "lose"}
{"episode":858, "reward":-35.0, "explore": 0.6237, "loss": 598.7924, "result": "lose"}
{"episode":859, "reward":-34.0, "explore": 0.6234, "loss": 52030.5859, "result": "lose"}
{"episode":860, "reward":-36.0, "explore": 0.6231, "loss": 250.8664, "result": "lose"}
Model Saved
{"episode":861, "reward":-20.0, "explore": 0.6214, "loss": 4719.4526, "result": "lose"}
{"episode":862, "reward":36.0, "explore": 0.6212, "loss": 77.5056, "result": "win"}
{"episode":863, "reward":-34.0, "explore": 0.6209, "loss": 1231.9846, "result": "lose"}
{"episode":864,

{"episode":949, "reward":17.0, "explore": 0.5928, "loss": 122.5758, "result": "win"}
{"episode":950, "reward":28.0, "explore": 0.5926, "loss": 84.0948, "result": "win"}
Model Saved
{"episode":951, "reward":-34.0, "explore": 0.5923, "loss": 55.6696, "result": "lose"}
{"episode":952, "reward":-34.0, "explore": 0.5920, "loss": 97.1715, "result": "lose"}
{"episode":953, "reward":17.0, "explore": 0.5917, "loss": 64.6087, "result": "win"}
{"episode":954, "reward":-34.0, "explore": 0.5914, "loss": 73.5640, "result": "lose"}
{"episode":955, "reward":-36.0, "explore": 0.5911, "loss": 75.9948, "result": "lose"}
Model Saved
{"episode":956, "reward":-35.0, "explore": 0.5908, "loss": 73.3070, "result": "lose"}
{"episode":957, "reward":-29.0, "explore": 0.5860, "loss": 129.1140, "result": "lose"}
{"episode":958, "reward":-8.0, "explore": 0.5857, "loss": 351.2562, "result": "lose"}
{"episode":959, "reward":6.0, "explore": 0.5855, "loss": 154.0176, "result": "win"}
{"episode":960, "reward":-34.0, "exp

{"episode":1042, "reward":-8.0, "explore": 0.5567, "loss": 357.8149, "result": "lose"}
{"episode":1043, "reward":-35.0, "explore": 0.5563, "loss": 606.6809, "result": "lose"}
{"episode":1044, "reward":-30.0, "explore": 0.5550, "loss": 106.3684, "result": "lose"}
{"episode":1045, "reward":-34.0, "explore": 0.5547, "loss": 214.8562, "result": "lose"}
Model Saved
{"episode":1046, "reward":-34.0, "explore": 0.5544, "loss": 1482.3210, "result": "lose"}
{"episode":1047, "reward":6.0, "explore": 0.5541, "loss": 489.9911, "result": "win"}
{"episode":1048, "reward":-34.0, "explore": 0.5538, "loss": 104.6595, "result": "lose"}
{"episode":1049, "reward":21.0, "explore": 0.5536, "loss": 3657.1235, "result": "win"}
{"episode":1050, "reward":-8.0, "explore": 0.5533, "loss": 299.4715, "result": "lose"}
Model Saved
{"episode":1051, "reward":38.0, "explore": 0.5531, "loss": 542.0915, "result": "win"}
{"episode":1052, "reward":-8.0, "explore": 0.5528, "loss": 7852.8691, "result": "lose"}
{"episode":1053

{"episode":1134, "reward":-34.0, "explore": 0.5287, "loss": 110.8911, "result": "lose"}
{"episode":1135, "reward":36.0, "explore": 0.5285, "loss": 81.5111, "result": "win"}
Model Saved
{"episode":1136, "reward":-35.0, "explore": 0.5282, "loss": 162.1065, "result": "lose"}
{"episode":1137, "reward":-34.0, "explore": 0.5279, "loss": 76.1391, "result": "lose"}
{"episode":1138, "reward":-38.0, "explore": 0.5276, "loss": 124.9069, "result": "lose"}
{"episode":1139, "reward":-34.0, "explore": 0.5274, "loss": 65.9389, "result": "lose"}
{"episode":1140, "reward":29.0, "explore": 0.5271, "loss": 96.4808, "result": "win"}
Model Saved
{"episode":1141, "reward":-35.0, "explore": 0.5269, "loss": 86.6043, "result": "lose"}
{"episode":1142, "reward":-34.0, "explore": 0.5266, "loss": 26.0720, "result": "lose"}
{"episode":1143, "reward":-36.0, "explore": 0.5264, "loss": 74.8747, "result": "lose"}
{"episode":1144, "reward":-35.0, "explore": 0.5261, "loss": 121.0442, "result": "lose"}
{"episode":1145, "r

{"episode":1226, "reward":-34.0, "explore": 0.5021, "loss": 180.9379, "result": "lose"}
{"episode":1227, "reward":37.0, "explore": 0.5019, "loss": 200.7295, "result": "win"}
{"episode":1228, "reward":-35.0, "explore": 0.5016, "loss": 765.2621, "result": "lose"}
{"episode":1229, "reward":-34.0, "explore": 0.5014, "loss": 120.2122, "result": "lose"}
{"episode":1230, "reward":-34.0, "explore": 0.5011, "loss": 60.0753, "result": "lose"}
Model Saved
{"episode":1231, "reward":39.0, "explore": 0.5009, "loss": 192.6845, "result": "win"}
{"episode":1232, "reward":-34.0, "explore": 0.5006, "loss": 610.5862, "result": "lose"}
{"episode":1233, "reward":-34.0, "explore": 0.5004, "loss": 188.4544, "result": "lose"}
{"episode":1234, "reward":-34.0, "explore": 0.5001, "loss": 246.8917, "result": "lose"}
{"episode":1235, "reward":-34.0, "explore": 0.4998, "loss": 571.2111, "result": "lose"}
Model Saved
{"episode":1236, "reward":-4.0, "explore": 0.4996, "loss": 206.3017, "result": "lose"}
{"episode":123

{"episode":1319, "reward":-34.0, "explore": 0.4775, "loss": 222.3855, "result": "lose"}
{"episode":1320, "reward":-34.0, "explore": 0.4773, "loss": 67.9413, "result": "lose"}
Model Saved
{"episode":1321, "reward":-20.0, "explore": 0.4767, "loss": 90.5328, "result": "lose"}
{"episode":1322, "reward":-34.0, "explore": 0.4765, "loss": 152.8195, "result": "lose"}
{"episode":1323, "reward":-34.0, "explore": 0.4762, "loss": 101.3893, "result": "lose"}
{"episode":1324, "reward":-36.0, "explore": 0.4760, "loss": 57.0073, "result": "lose"}
{"episode":1325, "reward":36.0, "explore": 0.4758, "loss": 83.6580, "result": "win"}
Model Saved
{"episode":1326, "reward":6.0, "explore": 0.4756, "loss": 153.4433, "result": "win"}
{"episode":1327, "reward":-34.0, "explore": 0.4754, "loss": 103.7788, "result": "lose"}
{"episode":1328, "reward":-34.0, "explore": 0.4751, "loss": 102.9558, "result": "lose"}
{"episode":1329, "reward":-35.0, "explore": 0.4749, "loss": 106.2927, "result": "lose"}
{"episode":1330, 

{"episode":1412, "reward":-34.0, "explore": 0.4518, "loss": 29585.8203, "result": "lose"}
{"episode":1413, "reward":-34.0, "explore": 0.4515, "loss": 4018.6694, "result": "lose"}
{"episode":1414, "reward":-8.0, "explore": 0.4513, "loss": 1655.6135, "result": "lose"}
{"episode":1415, "reward":-34.0, "explore": 0.4510, "loss": 20.2290, "result": "lose"}
Model Saved
{"episode":1416, "reward":36.0, "explore": 0.4509, "loss": 1014.6330, "result": "win"}
{"episode":1417, "reward":36.0, "explore": 0.4507, "loss": 6339.6348, "result": "win"}
{"episode":1418, "reward":6.0, "explore": 0.4505, "loss": 2489.4160, "result": "win"}
{"episode":1419, "reward":-34.0, "explore": 0.4502, "loss": 137.1387, "result": "lose"}
{"episode":1420, "reward":-34.0, "explore": 0.4500, "loss": 6245.0430, "result": "lose"}
Model Saved
{"episode":1421, "reward":-34.0, "explore": 0.4498, "loss": 5613.1924, "result": "lose"}
{"episode":1422, "reward":-34.0, "explore": 0.4496, "loss": 275.6249, "result": "lose"}
{"episod

{"episode":1507, "reward":-34.0, "explore": 0.4215, "loss": 77.6865, "result": "lose"}
{"episode":1508, "reward":39.0, "explore": 0.4213, "loss": 2447.7930, "result": "win"}
{"episode":1509, "reward":-36.0, "explore": 0.4211, "loss": 198.2673, "result": "lose"}
{"episode":1510, "reward":17.0, "explore": 0.4209, "loss": 576.1729, "result": "win"}
Model Saved
{"episode":1511, "reward":17.0, "explore": 0.4207, "loss": 647.0790, "result": "win"}
{"episode":1512, "reward":-36.0, "explore": 0.4205, "loss": 2362.5186, "result": "lose"}
{"episode":1513, "reward":-35.0, "explore": 0.4203, "loss": 777.1451, "result": "lose"}
{"episode":1514, "reward":-35.0, "explore": 0.4201, "loss": 568.2614, "result": "lose"}
{"episode":1515, "reward":-35.0, "explore": 0.4199, "loss": 584.2629, "result": "lose"}
Model Saved
{"episode":1516, "reward":18.0, "explore": 0.4197, "loss": 156.8675, "result": "win"}
{"episode":1517, "reward":-34.0, "explore": 0.4194, "loss": 534.7960, "result": "lose"}
{"episode":1518

{"episode":1600, "reward":-35.0, "explore": 0.3960, "loss": 170.3258, "result": "lose"}
Model Saved
{"episode":1601, "reward":-8.0, "explore": 0.3958, "loss": 117.8990, "result": "lose"}
{"episode":1602, "reward":29.0, "explore": 0.3957, "loss": 152.9324, "result": "win"}
{"episode":1603, "reward":-29.0, "explore": 0.3942, "loss": 89.7574, "result": "lose"}
{"episode":1604, "reward":8.0, "explore": 0.3941, "loss": 42.0617, "result": "win"}
{"episode":1605, "reward":-34.0, "explore": 0.3939, "loss": 78.2782, "result": "lose"}
Model Saved
{"episode":1606, "reward":-8.0, "explore": 0.3937, "loss": 38.5211, "result": "lose"}
{"episode":1607, "reward":-34.0, "explore": 0.3935, "loss": 156.5865, "result": "lose"}
{"episode":1608, "reward":-34.0, "explore": 0.3933, "loss": 60.3905, "result": "lose"}
{"episode":1609, "reward":-34.0, "explore": 0.3931, "loss": 72.6810, "result": "lose"}
{"episode":1610, "reward":-34.0, "explore": 0.3929, "loss": 49.5269, "result": "lose"}
Model Saved
{"episode"

{"episode":1691, "reward":-35.0, "explore": 0.3769, "loss": 331.7797, "result": "lose"}
{"episode":1692, "reward":-34.0, "explore": 0.3767, "loss": 2500.0403, "result": "lose"}
{"episode":1693, "reward":-34.0, "explore": 0.3765, "loss": 1501.7670, "result": "lose"}
{"episode":1694, "reward":-34.0, "explore": 0.3763, "loss": 3238.9912, "result": "lose"}
{"episode":1695, "reward":20.0, "explore": 0.3762, "loss": 4479.8789, "result": "win"}
Model Saved
{"episode":1696, "reward":-35.0, "explore": 0.3760, "loss": 4967.1802, "result": "lose"}
{"episode":1697, "reward":36.0, "explore": 0.3758, "loss": 2752.7310, "result": "win"}
{"episode":1698, "reward":-34.0, "explore": 0.3756, "loss": 325.2080, "result": "lose"}
{"episode":1699, "reward":-34.0, "explore": 0.3755, "loss": 970.9104, "result": "lose"}
{"episode":1700, "reward":-34.0, "explore": 0.3753, "loss": 782.6624, "result": "lose"}
Model Saved
{"episode":1701, "reward":-34.0, "explore": 0.3751, "loss": 47.5268, "result": "lose"}
{"episo

{"episode":1784, "reward":-34.0, "explore": 0.3585, "loss": 1250.2205, "result": "lose"}
{"episode":1785, "reward":-34.0, "explore": 0.3583, "loss": 6838.7275, "result": "lose"}
Model Saved
{"episode":1786, "reward":-16.0, "explore": 0.3582, "loss": 114.2270, "result": "lose"}
{"episode":1787, "reward":8.0, "explore": 0.3580, "loss": 769.2108, "result": "win"}
{"episode":1788, "reward":-16.0, "explore": 0.3579, "loss": 88.1891, "result": "lose"}
{"episode":1789, "reward":-34.0, "explore": 0.3577, "loss": 166.7666, "result": "lose"}
{"episode":1790, "reward":-34.0, "explore": 0.3575, "loss": 2353.3037, "result": "lose"}
Model Saved
{"episode":1791, "reward":-34.0, "explore": 0.3573, "loss": 903.2786, "result": "lose"}
{"episode":1792, "reward":-35.0, "explore": 0.3572, "loss": 152.0767, "result": "lose"}
{"episode":1793, "reward":-34.0, "explore": 0.3570, "loss": 222.2578, "result": "lose"}
{"episode":1794, "reward":-34.0, "explore": 0.3568, "loss": 634.1150, "result": "lose"}
{"episode

{"episode":1877, "reward":-34.0, "explore": 0.3422, "loss": 246.5388, "result": "lose"}
{"episode":1878, "reward":-34.0, "explore": 0.3421, "loss": 97.3642, "result": "lose"}
{"episode":1879, "reward":30.0, "explore": 0.3419, "loss": 400.6254, "result": "win"}
{"episode":1880, "reward":-34.0, "explore": 0.3418, "loss": 183.3772, "result": "lose"}
Model Saved
{"episode":1881, "reward":-34.0, "explore": 0.3416, "loss": 136.5960, "result": "lose"}
{"episode":1882, "reward":-34.0, "explore": 0.3414, "loss": 107.9190, "result": "lose"}
{"episode":1883, "reward":-8.0, "explore": 0.3413, "loss": 49.3431, "result": "lose"}
{"episode":1884, "reward":-34.0, "explore": 0.3411, "loss": 167.3464, "result": "lose"}
{"episode":1885, "reward":18.0, "explore": 0.3410, "loss": 98.1698, "result": "win"}
Model Saved
{"episode":1886, "reward":36.0, "explore": 0.3408, "loss": 142.0779, "result": "win"}
{"episode":1887, "reward":-34.0, "explore": 0.3406, "loss": 79.3715, "result": "lose"}
{"episode":1888, "r

{"episode":1971, "reward":-8.0, "explore": 0.3249, "loss": 519.6973, "result": "lose"}
{"episode":1972, "reward":-35.0, "explore": 0.3248, "loss": 254.6293, "result": "lose"}
{"episode":1973, "reward":-36.0, "explore": 0.3246, "loss": 3846.0234, "result": "lose"}
{"episode":1974, "reward":-15.0, "explore": 0.3245, "loss": 5024.5996, "result": "lose"}
{"episode":1975, "reward":-34.0, "explore": 0.3244, "loss": 354.8978, "result": "lose"}
Model Saved
{"episode":1976, "reward":1.0, "explore": 0.3242, "loss": 5649.4160, "result": "win"}
{"episode":1977, "reward":-34.0, "explore": 0.3241, "loss": 139.7737, "result": "lose"}
{"episode":1978, "reward":-35.0, "explore": 0.3240, "loss": 2436.2976, "result": "lose"}
{"episode":1979, "reward":-35.0, "explore": 0.3235, "loss": 109.8726, "result": "lose"}
{"episode":1980, "reward":-34.0, "explore": 0.3233, "loss": 2036.5588, "result": "lose"}
Model Saved
{"episode":1981, "reward":-34.0, "explore": 0.3232, "loss": 497.9303, "result": "lose"}
{"episo

In [None]:
plt.figure(3)
plt.clf()

env.close()
saver = tf.train.Saver()
env = env = SF2Env(rom_path, 
                   'rvsb.state', 
                   scenario=scenario_name,
                   players=1,
                   use_restricted_actions=retro.Actions.DISCRETE)

with tf.Session() as sess:
    total_test_rewards = []
    
    # Load the model
    saver.restore(sess, model_path)
    
    for episode in range(1):
        total_rewards = 0
        
        state = env.reset()
        
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        state = state.flatten()
        
        
        print("****************************************************")
        print("EPISODE ", episode)
        
        t = 0
        while True:
            t += 1
            # Reshape the state
            state = state.reshape((1, state_size))
            # Get action from Q-network 
            # Estimate the Qs values state
            Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state})
            
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = possible_actions[choice]
            a_list = [0]*env.action_space.n
            a_list[choice] = 1
            
            #Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = env.step(action)
            
            plt.imshow(env.render(mode='rgb_array'))
            plt.axis('off')
            display.clear_output(wait=True)
            display.display(plt.gcf())
            
            total_rewards += reward

            if done:
                print ("Total reward", total_rewards)
                total_test_rewards.append(total_rewards)
                break
                
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            next_state = next_state.flatten()
            state = next_state
            
    env.close()

In [None]:
print(type(memory.sample(1)))
print(memory.sample(1))

In [None]:
env.close()
