In [1]:
import argparse

def args_parser():
    parser = argparse.ArgumentParser()
      
    #RL的参数
    parser.add_argument('--bs', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--lr_decay', type=float, default=0.003, help="lr decay")
    parser.add_argument('--tau', type=float, default=0.001)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--momentum', type=float, default=0.5, help="SGD momentum (default: 0.5)")

    #训练参数
    parser.add_argument('--max_episode', type=int, default=5000)
    parser.add_argument('--max_step', type=int, default=3600)
    parser.add_argument('--max_buffer', type=int, default=10000)
    parser.add_argument('--max_total_reward', type=float)
    parser.add_argument('--epsilon', type=float, default=0.95)
    parser.add_argument('--learning_start', type=int, default=600)
    parser.add_argument('--update_freq', type=int, default=300)
    parser.add_argument('--update_target_freq', type=int, default=1500)

    args = parser.parse_args(args=[])
    return args



In [2]:
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import pickle
from itertools import count

import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from torch.autograd import grad
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from torch.autograd import Variable
import math
import shutil

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import copy
import random
from torchvision import datasets, transforms
import cityflow
import json
import pandas as pd


args = args_parser()

os.environ["CUDA_VISIBLE_DEVICES"] = '1'


In [3]:
class CityFlowEnv():
    '''
    Simulator Environment with CityFlow
    '''
    def __init__(self, args):
        self.env = cityflow.Engine(config_file='examples/config_control.json', thread_num=1)
#         self.eng.load_roadnet(config['roadnet'])
#         self.eng.load_flow(config['flow'])
#         self.config = config
        self.num_step = args.max_step
        self.lane_phase_info = self.parse_roadnet('examples/roadnet.json') # "intersection_1_1"

        self.intersection_id = list(self.lane_phase_info.keys())[0]
        self.start_lane = self.lane_phase_info[self.intersection_id]['start_lane']
        self.phase_list = self.lane_phase_info[self.intersection_id]["phase"]
        self.phase_startLane_mapping = self.lane_phase_info[self.intersection_id]["phase_startLane_mapping"]

        self.current_phase = self.phase_list[0]
        self.current_phase_time = 0
        self.yellow_time = 5

        self.phase_log = []

    def parse_roadnet(self, roadnetFile):
        roadnet = json.load(open(roadnetFile))
        lane_phase_info_dict ={}

        # many intersections exist in the roadnet and virtual intersection is controlled by signal
        for intersection in roadnet["intersections"]:
            if intersection['virtual']:
                continue
            lane_phase_info_dict[intersection['id']] = {"start_lane": [],
                                                         "end_lane": [],
                                                         "phase": [],
                                                         "phase_startLane_mapping": {},
                                                         "phase_roadLink_mapping": {}}
            road_links = intersection["roadLinks"]

            start_lane = []
            end_lane = []
            roadLink_lane_pair = {ri: [] for ri in
                                  range(len(road_links))}  # roadLink includes some lane_pair: (start_lane, end_lane)

            for ri in range(len(road_links)):
                road_link = road_links[ri]
                for lane_link in road_link["laneLinks"]:
                    sl = road_link['startRoad'] + "_" + str(lane_link["startLaneIndex"])
                    el = road_link['endRoad'] + "_" + str(lane_link["endLaneIndex"])
                    start_lane.append(sl)
                    end_lane.append(el)
                    roadLink_lane_pair[ri].append((sl, el))

            lane_phase_info_dict[intersection['id']]["start_lane"] = sorted(list(set(start_lane)))
            lane_phase_info_dict[intersection['id']]["end_lane"] = sorted(list(set(end_lane)))

            for phase_i in range(1, len(intersection["trafficLight"]["lightphases"])):
                p = intersection["trafficLight"]["lightphases"][phase_i]
                lane_pair = []
                start_lane = []
                for ri in p["availableRoadLinks"]:
                    lane_pair.extend(roadLink_lane_pair[ri])
                    if roadLink_lane_pair[ri][0][0] not in start_lane:
                        start_lane.append(roadLink_lane_pair[ri][0][0])
                lane_phase_info_dict[intersection['id']]["phase"].append(phase_i)
                lane_phase_info_dict[intersection['id']]["phase_startLane_mapping"][phase_i] = start_lane
                lane_phase_info_dict[intersection['id']]["phase_roadLink_mapping"][phase_i] = lane_pair

        return lane_phase_info_dict
    
    def reset(self):
        self.env.reset()
        self.phase_log = []

    def step(self, next_phase):
        if self.current_phase == next_phase:
            self.current_phase_time += 1
        else:
            self.current_phase = next_phase
            self.current_phase_time = 1

        self.env.set_tl_phase(self.intersection_id, self.current_phase)
        self.env.next_step()
        self.phase_log.append(self.current_phase)

    def get_state(self):
        state = {}
        state['lane_vehicle_count'] = self.env.get_lane_vehicle_count()  # {lane_id: lane_count, ...}
        state['start_lane_vehicle_count'] = {lane: self.env.get_lane_vehicle_count()[lane] for lane in self.start_lane}
        state['lane_waiting_vehicle_count'] = self.env.get_lane_waiting_vehicle_count()  # {lane_id: lane_waiting_count, ...}
        state['lane_vehicles'] = self.env.get_lane_vehicles()  # {lane_id: [vehicle1_id, vehicle2_id, ...], ...}
        state['vehicle_speed'] = self.env.get_vehicle_speed()  # {vehicle_id: vehicle_speed, ...}
        state['vehicle_distance'] = self.env.get_vehicle_distance() # {vehicle_id: distance, ...}
        state['current_time'] = self.env.get_current_time()
        state['current_phase'] = self.current_phase
        state['current_phase_time'] = self.current_phase_time

        return state

    def get_reward(self):
        # a sample reward function which calculates the total of waiting vehicles
        lane_waiting_vehicle_count = self.env.get_lane_waiting_vehicle_count()
        reward = -1 * sum(list(lane_waiting_vehicle_count.values()))
        return reward

    def log(self):
        #self.eng.print_log(self.config['replay_data_path'] + "/replay_roadnet.json",
        #                   self.config['replay_data_path'] + "/replay_flow.json")
        df = pd.DataFrame({self.intersection_id: self.phase_log[:self.num_step]})
        if not os.path.exists(self.config['data']):
            os.makedirs(self.config["data"])
        df.to_csv(os.path.join(self.config['data'], 'signal_plan_template.txt'), index=None)

In [4]:
class Trainer:
    def __init__(self, state_dim, action_dim, lane_phase_info, args):
        self.args = args
        self.state_size = state_dim
        self.action_size = action_dim
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.update_target_freq = 5
        self.batch_size = 30
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_network()

        intersection_id = list(lane_phase_info.keys())[0]
        self.phase_list = lane_phase_info[intersection_id]['phase']

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(40, input_dim=self.state_size, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def update_target_network(self):
        weights = self.model.get_weights()
        self.target_model.set_weights(weights)

    def remember(self, state, action, reward, next_state):
        action = self.phase_list.index(action)
        self.memory.append((state, action, reward, next_state))

    def choose_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self):
        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state in minibatch:
            target = (reward + self.gamma *
                      np.amax(self.target_model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

In [5]:
env = CityFlowEnv(args)

lane_phase_info = env.lane_phase_info
intersection_id = list(lane_phase_info.keys())[0]
phase_list = lane_phase_info[intersection_id]['phase']

s_dim = len(lane_phase_info[intersection_id]['start_lane']) + 1
a_dim = len(phase_list)

In [6]:
trainer = Trainer(s_dim, a_dim, lane_phase_info, args)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
for i in range(args.max_episode):
    env.reset()
    
    t = 0
    s = env.get_state()
    s = np.array(list(s['start_lane_vehicle_count'].values()) + 
                 [s['current_phase']])
    s = np.reshape(s, [1, s_dim])
    s = s.astype(np.float32)
#     s = torch.tensor(s)
#     print(trainer.choose_action(s))
#     last_action = phase_list[int(trainer.choose_action(torch.tensor(s)))]
    last_action = phase_list[trainer.choose_action(s)]
    
    while t < args.max_step:
#         a_choice = trainer.choose_action(torch.tensor(s))
#         a = phase_list[int(a_choice)]

        a = phase_list[trainer.choose_action(s)]
            
        if a == last_action:
            env.step(a)
        else:
            for _ in range(env.yellow_time):
                env.step(0)
                t += 1
                flag = (t >= args.max_step)
                if flag:
                    break
            if flag:
                break
            env.step(a)
            
        last_action = a
        t += 1
        next_state = env.get_state()
        r = env.get_reward()
        next_state = np.array(list(next_state['start_lane_vehicle_count'].values()) + 
                              [next_state['current_phase']])
        next_state = np.reshape(next_state, [1, s_dim])
        next_state = next_state.astype(np.float32)
#         next_state = torch.tensor(next_state)
        
        trainer.remember(s, a, r, next_state)
        s = next_state
        
        total_time = t + i * args.max_step
        if total_time > args.learning_start and total_time % args.update_freq == 0:
            trainer.replay()
        if total_time > args.learning_start and total_time % args.update_target_freq == 0:
            trainer.replay()
        # 所有车辆的平均行驶时间，除以总时间后越大则越好，[0,1]
#         average_travel_time = eng.get_average_travel_time()
#         reward_travel_time = eng.get_current_time()/average_travel_time
        
        if i % 10 == 0 and t % 1200 == 0:
            print("episode: {}/{}, time: {}, acton: {}, reward: {}"
              .format(i, args.max_episode, t-1, a, r))

episode: 0/5000, time: 2399, acton: 5, reward: -414
episode: 0/5000, time: 3599, acton: 2, reward: -203
episode: 10/5000, time: 1199, acton: 2, reward: -203
episode: 10/5000, time: 3599, acton: 2, reward: -297
episode: 30/5000, time: 3599, acton: 4, reward: -194
episode: 50/5000, time: 2399, acton: 6, reward: -198
episode: 60/5000, time: 2399, acton: 4, reward: -272
episode: 70/5000, time: 1199, acton: 6, reward: -255
episode: 70/5000, time: 3599, acton: 3, reward: -361
episode: 80/5000, time: 2399, acton: 2, reward: -280
episode: 80/5000, time: 3599, acton: 2, reward: -319
episode: 90/5000, time: 1199, acton: 6, reward: -330
episode: 90/5000, time: 2399, acton: 6, reward: -319
episode: 100/5000, time: 2399, acton: 2, reward: -179
episode: 120/5000, time: 1199, acton: 4, reward: -303
episode: 120/5000, time: 2399, acton: 4, reward: -226
episode: 120/5000, time: 3599, acton: 4, reward: -260
episode: 130/5000, time: 3599, acton: 4, reward: -236
episode: 140/5000, time: 1199, acton: 5, re

In [10]:
trainer.save('DQN.h5')