In [189]:
# IMPORTS
##########################

import random
import copy
from keras.models import Sequential
from keras.layers import InputLayer, Dense
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time 
import os, sys
from keras.callbacks import TensorBoard

# Making sure path to SUMO bins correctly specified
if 'SUMO_HOME' in os.environ:
    tools = os.path.join(os.environ['SUMO_HOME'], 'tools')
    sys.path.append(tools)
else:
    sys.exit("please declare environment variable 'SUMO_HOME'")

import sumolib
import traci

In [190]:
# OUTPUT FILE FOLDER
####################################

def get_output_folder(parent_dir, exp_id):
    """Return save folder.
    Assumes folders in the parent_dir have suffix -run{run
    number}. Finds the highest run number and sets the output folder
    to that number + 1. This is just convenient so that if you run the
    same script multiple times tensorboard can plot all of the results
    on the same plots with different names.
    
    Parameters
    ----------
    parent_dir: str
      Path of the directory containing all experiment runs.
    
    Returns
    -------
    parent_dir/run_dir
      Path to this run's save directory.
    
    """
    try:
        # Returns an error if parent_dir already exists
        os.makedirs(parent_dir)
    except:
        pass
    
    if exp_id in os.listdir(parent_dir):
        
        experiment_id = 1
        new_folder = os.path.join(parent_dir,exp_id+"_"+str(experiment_id))
        
        while os.path.exists(new_folder):
            experiment_id +=1
            new_folder = os.path.join(parent_dir,exp_id+"_"+str(experiment_id))
        
        parent_dir = new_folder
        os.makedirs(parent_dir)
        os.mkdir(os.path.join(parent_dir,"model_checkpoints"))
    else:
        parent_dir = os.path.join(parent_dir,exp_id)
        os.makedirs(parent_dir)
        os.mkdir(os.path.join(parent_dir,"model_checkpoints"))
        
    return parent_dir

In [191]:
# MEMORY CLASS
##################################################

class ReplayMemory():
    """ 
    Store transitions.

    Parameters
    ----------

    state_shape: np.array shape of state

    num_actions: number of actions (traffic signal phases)
    """
    def __init__(self, max_size, state_shape, num_actions):
    # initialize the whole memory at once
        self.memory = [SingleSample(state_shape,num_actions) for _ in range(max_size)]
        self.max_size = max_size
        self.itr = 0  # insert the next element here
        self.cur_size = 0
    
    def append(self, state, action, reward, nextstate):
        self.memory[self.itr].assign(state, action, reward, nextstate)
        self.itr += 1
        self.cur_size = min(self.cur_size + 1, self.max_size)
        self.itr %= self.max_size
        
    def sample(self, batch_size):
        # Uniform sampling, later prioritized experience replay can be implemented
        states, actions,rewards,next_states = [],[],[],[]
        for i, idx in enumerate(np.random.randint(0, self.cur_size, size=batch_size)):
            transition = self.memory[idx]
            states.append(transition.state)
            actions.append(transition.action)
            rewards.append(transition.reward)
            next_states.append(transition.nextstate)
        return np.vstack(states), actions, rewards, np.vstack(next_states)
    
    def print_obs(self,obs):
        self.memory[obs].print_obs()

    def get_size(self):
        return self.cur_size
    
    
class SingleSample():
    def __init__(self, state_shape, num_actions): #Num actions not used up to now
        self.state = np.zeros(state_shape)
        self.action = 0
        self.reward = 0
        self.nextstate = np.zeros(state_shape)
        
    def assign(self, state, action, reward, nextstate):
        self.state[:] = state
        self.action = action
        self.reward = reward
        self.nextstate[:] = nextstate
        
    def print_obs(self):
        print( "State: \n\n",self.state,
               "\n\nAction:\n\n",self.action,
               "\n\nReward:\n\n",self.reward,
               "\n\nNext State:\n\n",self.nextstate)

In [192]:
# ROUTING (DEMAND)
#################################################

### TO DO
# Create a class to spcify different types of demand

def generate_routefile():
    random.seed(42)  # make tests reproducible
    N = 3600  # number of time steps
    # demand per second from different directions

    pEW = 1 / 20
    pNS = 1 / 80
    pWE = 1 / 20
    pSN = 1 / 80

    with open("cross.rou.xml", "w") as routes:
        print("""<routes>
        <vType id="car" accel="0.8" decel="4.5" sigma="0.5" length="5" minGap="2.5" maxSpeed="16.67" guiShape="passenger"/>
        <route id="right" edges="51o 1i 2o 52i" />
        <route id="left" edges="52o 2i 1o 51i" />
        <route id="down" edges="54o 4i 3o 53i" />
        <route id="up" edges="53o 3i 4o 54i" />""", file=routes)
        vehNr = 0
        for i in range(N):
            if random.uniform(0, 1) < pWE:
                print('    <vehicle id="right_%i" type="car" route="right" depart="%i" />' % (
                    vehNr, i), file=routes)
                vehNr += 1
            if random.uniform(0, 1) < pEW:
                print('    <vehicle id="left_%i" type="car" route="left" depart="%i" />' % (
                    vehNr, i), file=routes)
                vehNr += 1
            if random.uniform(0, 1) < pNS:
                print('    <vehicle id="down_%i" type="car" route="up" depart="%i" color="1,0,0"/>' % (
                    vehNr, i), file=routes)
                vehNr += 1
            if random.uniform(0, 1) < pSN:
                print('    <vehicle id="UP_%i" type="car" route="down" depart="%i" color="1,0,0"/>' % (
                    vehNr, i), file=routes)
                vehNr += 1
        print("</routes>", file=routes)

In [193]:
# Q NETWORKS
################################

# More ANN arquitectures to be specified here
def get_model(model_name, *args):
    if model_name == 'linear':
        return linear(*args)
    elif model_name == 'simple':
        return simple(*args)
    else:
        raise ValueError()

def linear(input_shape, num_actions):
    model = Sequential()
    model.add(Flatten(input_shape=input_shape)) # If a vector this does not have any effect. (only matrices)
    model.add(Dense(num_actions,activation=None))
    return model

def simple(input_shape, num_actions):
    model = Sequential()
    model.add(Dense(9, input_shape = input_shape,activation='relu'))
    model.add(Dense(9, activation='relu'))
    model.add(Dense(num_actions, activation=None))
    return model     

In [194]:
# ENVIRONMENT CLASS
##################################   
class Env:
    
    """
    SUMO Environment for Traffic Signal Control
    net_file: (str) SUMO .net.xml file
    route_file: (str) SUMO .rou.xml file
    use_gui: (bool) Wheter to run SUMO simulation with GUI visualisation
    delta_time: (int) Simulation seconds between actions
    """
    
    def __init__(self,
                 net_file, 
                 route_file,
                 state_shape,
                 num_actions,
                 use_gui = False,
                 delta_time=10):
        
        self.net = net_file
        self.route = route_file
        self.use_gui = use_gui
        self.time_step = delta_time
        if self.use_gui:
            self.sumo_binary = sumolib.checkBinary('sumo-gui')
        else:
            self.sumo_binary = sumolib.checkBinary('sumo')
            
        self.obs = Observation(state_shape)
        self.action = Action(num_actions)
 
    def start_simulation(self):
        
        sumo_cmd = [self.sumo_binary, 
                    '-n', self.net,
                    '-r' ,self.route]
        
        if self.use_gui:
            sumo_cmd.append('--start')
        traci.start(sumo_cmd)
        self.obs.update_state()
        
    def take_action(self,action):
        #action = 0 -> row vertically
        if action == 0 and traci.trafficlight.getPhase("0") != 0:
            traci.trafficlight.setPhase("0",3)
        #action = 1 -> row horizontally    
        elif action == 1 and traci.trafficlight.getPhase("0") != 2:
            traci.trafficlight.setPhase("0",1)
                
    def compute_reward(self,state,next_state):
        # Here is whre reward is specified
        a = next_state - state
        b = np.round(state,decimals=1)

        aux = np.divide(a, b, out=np.zeros_like(a), where=b!=0)
        
        return -np.sum(aux[0,0:4]) #-np.sum(next_state[0,:4])#np.sum(a[0,4:-1])-np.sum(a[0,:4]) # - delta number of stopped cars
        
    def step( self, action):
        
        state = copy.deepcopy(self.obs.get())
        
        if action != None:
            self.take_action(action)
        
        traci.simulationStep(traci.simulation.getTime() + self.time_step) # Run the simulation time_step (s)
        
        self.obs.update_state()
        next_state = self.obs.get()
        
        reward = self.compute_reward(state,next_state)
        
        return state, reward, next_state, self.done()
    
    def done(self):
        return traci.simulation.getMinExpectedNumber() == 0
    
    def stop_simulation(self):
        traci.close()
        

    
class Observation:
    def __init__(self,
                 shape):
        self.obs = np.zeros(shape)
        
    def update_state(self):
        
        lanes = ["4i_0","2i_0","3i_0","1i_0"]           
        for i,lane in enumerate(lanes):

            self.obs[:,i] = traci.lane.getLastStepHaltingNumber(lane)
            self.obs[:,i+4] = traci.lane.getLastStepMeanSpeed(lane)
        
        self.obs[:,8] = traci.trafficlight.getPhase("0")
        
    def get(self):
        return self.obs
        
    def get_reward(self):
        return -np.sum(self.obs[0:4])
    
class Action:
    """ One-hot encoding of the phase of the traffic signal"""
    def __init__( self, num_actions):
        self.num_actions = num_actions
        self.action_space = np.identity(num_actions)
        
    def select_action(self,policy, **kwargs):
        if policy == "randUni":
            return self.select_randuni()
        elif policy == "greedy":
            # NOt implemented yet
            return self.select_greedy(**kwargs)
        elif policy == "epsGreedy":
            # Not implemented yet
            return self.select_epsgreedy(**kwargs)
    
    def select_randuni(self):
        return np.random.randint(0, self.num_actions)
    
    def select_greedy(self, q_values):
        return np.argmax(q_values)
    
    def select_epsgreedy(self,eps,q_values):
        
        if np.random.uniform() < eps:
            return self.select_randuni()
        else:
            return self.select_greedy(q_values)


In [195]:
# DOUBLE DQN
################################

SAVE_AFTER = 11000 # Save model checkpoint

class DoubleDQN:
    def __init__(self,
                 q_network, 
                 target_q_network,
                 memory, # Replay memory
                 gamma, # Discount factor
                 target_update_freq, # Frequency to update target network
                 num_burn_in, # How many samples to fill replay memory
                 #train_freq,
                 batch_size,
                 optimizer,
                 loss_func,
                 max_ep_length,
                 env_name,
                 output_dir,
                 experiment_id,
                 summary_writer,
                 model_checkpoint = True,
                 opt_metric=None # Used to judge the performance of the ANN
                 ): 
        self.q_network = q_network
        self.target_q_network = target_q_network
        self.target_q_network.set_weights(self.q_network.get_weights())
        self.__compile(optimizer, loss_func,opt_metric)
        self.memory = memory
        #self.policy = policy
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.num_burn_in = num_burn_in
        #self.train_freq = train_freq
        self.batch_size = batch_size
        self.training_reward_seen = 0
        self.trained_episodes = 0
        self.max_ep_len = max_ep_length
        self.output_dir = output_dir
        self.experiment_id = experiment_id
        self.summary_writer=summary_writer
        self.itr=0
        
        #self.learning_type = learning_type
        
    
    def __compile(self, optimizer, loss_func, opt_metric):
        
        self.q_network.compile(optimizer, loss_func, opt_metric)
        self.target_q_network.compile(optimizer, loss_func, opt_metric)
        
    def fill_replay(self,env,policy):
        
        print("Filling experience replay memory...")
        
        env.start_simulation()
        
        for i in range(self.num_burn_in):
            action = env.action.select_action(policy)
            state, reward, nextstate,done = env.step(action)
            self.memory.append(state,action,reward,nextstate)
            
        env.stop_simulation()
        
        print("...Done")
    
    def predict_q(self,network,state):
        return network.predict(state)
    
    def save(self):
        
        filename =  "{}/model_checkpoints/run{}_iter{}.h5" .format(self.output_dir, 
                                               self.experiment_id, 
                                               self.itr)
        self.q_network.save(filename)
    
    def update_network(self):
        
        # Sample mini batch
        states_m,actions_m,rewards_m,states_m_p = self.memory.sample(self.batch_size)
        
        # Compute target
        # Notice that we want to incur in loss in the actions that we have selected.
        # Q_target and Q are set equal for not relevant actions so the loss is 0. 
        # (weights not being updated due to these actions)
        target_batch = self.q_network.predict(states_m) 
        
        target_q = self.target_q_network.predict(states_m_p)
        selected_actions = np.argmax(target_q, axis=1 )
        
        for i,action in enumerate(selected_actions):
            
            target_batch[i,action] =  rewards_m[i] + self.gamma * np.max(target_q[i])

        loss = self.q_network.train_on_batch(states_m,target_batch)
        
        # Update weights every target_update_freq steps
        if self.itr % self.target_update_freq == 0:
            self.target_q_network.set_weights(self.q_network.get_weights())
        
        # Storing Logs
        if self.output_dir != None:
            # Save network every save_after iterations
            if self.itr % SAVE_AFTER == 0:
                self.save()
                
        return loss
            
    def train(self,env,num_episodes,policy,**kwargs):
        
        all_stats = []
        all_rewards = []
        
        start_train_ep = self.trained_episodes
        
        for i in range(num_episodes):
            
            sys.stdout.write("\r"+'Running episode {} / {}'.format(self.trained_episodes+1, 
                                                       start_train_ep + num_episodes))
            
            env.start_simulation()
            nextstate = env.obs.get()
            done = False
            
            stats = {
                'ep_id' : self.trained_episodes,
                'total_reward': 0,
                'episode_length': 0,
                'max_q_value': 0, 
            }
            
            tf_loss = tf.placeholder(tf.float32,shape=None,name='training_loss')
            
            
            while not done and stats["episode_length"] < self.max_ep_len:
                
                q_values = self.q_network.predict(nextstate)
                action = env.action.select_action(policy, q_values = q_values, eps = eps)
                state, reward, nextstate,done = env.step(action)
                self.memory.append(state,action,reward,nextstate)
                
                loss = self.update_network()
                
                self.summary_writer.add_summary(tf.Summary(value=[
                    tf.Summary.Value(tag='loss',
                                     simple_value=loss.item())]),
                    global_step=self.itr)
                
                self.itr += 1
                
                stats["ep_id"] = self.trained_episodes
                stats["episode_length"] += 1
                stats['total_reward'] += reward
                stats['max_q_value'] += max(q_values)
            
            env.stop_simulation()
            self.trained_episodes += 1
        
            all_stats.append(stats)
            all_rewards.append(stats['total_reward'])
            
        print('\nCurrent mean+std: {} {}'.format(np.mean(all_rewards),np.std(all_rewards)))

        
    def evaluate(self,env):
            
            env.start_simulation()
            nextstate = env.obs.get()
            done = False
            it = 0
            
            while not done and it < self.max_ep_len:
                
                q_values = self.q_network.predict(nextstate)
                action = env.action.select_action("greedy", q_values = q_values)
                state, reward, nextstate,done = env.step(action)
                it +=1
                
            env.stop_simulation()
  

In [196]:
# MAIN
##################################

num_actions = 2
state_shape = (1,9) # State var in rows
memory_size = 100000
gamma = 0.8
target_update_frequency = 100
num_init_samples_mem = 1000
batch_size = 50
max_episode_length = 100000
optimizer = 'adam'
loss = "mse"
eps = 0.2
env_name = "Simple_Cross"
experiment_id = "Test"
monitoring = True # Store variables for TensorBoard monitoring and model_checkpoints



# Define logs directory and experiment id. 
# Experiment id to be changed each time there are structural network changes
if monitoring:
    output_dir = get_output_folder("./Logs",experiment_id)
    summary_writer = tf.summary.FileWriter(logdir=output_dir)
else:
    output_dir = None
    summary_writer = None


# Initialize Q-networks (value and target)
q_network = get_model('simple',(state_shape[1],),num_actions)
target_q_network = get_model('simple',(state_shape[1],),num_actions)

# Initialize environment
sumo_env =  Env(    "cross.net.xml",
                    "cross.rou.xml",
                    state_shape,
                    num_actions,
                    use_gui=False
               )

# Initialize replay memory
memory = ReplayMemory(    memory_size,
                          state_shape,
                          num_actions
                     )

# Initialize Double DQN algorithm
ddqn = DoubleDQN(   q_network,
                    target_q_network,
                    memory,
                    gamma,
                    target_update_frequency,
                    num_init_samples_mem,
                    batch_size,
                    optimizer,
                    loss,
                    max_episode_length,
                    sumo_env,
                    output_dir,
                    experiment_id,
                    summary_writer
                )

# Fill Replay Memory
ddqn.fill_replay(sumo_env,'randUni')

# Trains Double DQN
ddqn.train(  sumo_env, 20, "epsGreedy", eps=eps)

Filling experience replay memory...
...Done
Running episode 20 / 20
Current mean+std: 4.307559051122688 18.242929332259347


In [197]:
# Evaluates Double DQN
sumo_env =  Env(    "cross.net.xml",
                    "cross.rou.xml",
                    state_shape,
                    num_actions,
                    use_gui=True
               )
ddqn.evaluate( sumo_env)