In [259]:
import argparse

def args_parser():
    """Build the hyper-parameter namespace used by the rest of the notebook.

    The parser is always run on an empty argument list (``args=[]``), so every
    option takes its declared default and the kernel's own command line is
    never consulted. Options without a declared default
    (``--max_total_reward``) come back as ``None``.

    :return: ``argparse.Namespace`` holding the RL and training options
    """
    # Reinforcement-learning hyper-parameters: (flag, add_argument keywords).
    rl_options = [
        ('--bs', dict(type=int, default=128)),
        ('--lr', dict(type=float, default=0.001)),
        ('--lr_decay', dict(type=float, default=0.003, help="lr decay")),
        ('--tau', dict(type=float, default=0.001)),
        ('--gamma', dict(type=float, default=0.99)),
        ('--momentum', dict(type=float, default=0.5, help="SGD momentum (default: 0.5)")),
    ]

    # Training-loop parameters.
    training_options = [
        ('--max_episode', dict(type=int, default=5000)),
        ('--max_step', dict(type=int, default=3600)),
        ('--max_buffer', dict(type=int, default=10000)),
        ('--max_total_reward', dict(type=float)),
        ('--epsilon', dict(type=float, default=0.95)),
        ('--learning_start', dict(type=int, default=600)),
        ('--update_freq', dict(type=int, default=5)),
    ]

    parser = argparse.ArgumentParser()
    # Registration order is kept so the generated help text lists options as before.
    for flag, keywords in rl_options + training_options:
        parser.add_argument(flag, **keywords)

    return parser.parse_args(args=[])

In [260]:
import pickle
from itertools import count

import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
from torch.autograd import grad
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from torch.autograd import Variable
import math
import shutil

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import copy
import random
from torchvision import datasets, transforms
from collections import deque
import cityflow
import json

# Default hyper-parameters, parsed once here and shared by the environment,
# trainer and training-loop cells below.
args = args_parser()

In [261]:
class MemoryBuffer:
    """Fixed-capacity FIFO replay memory of ``(s, a, r, s1)`` transitions.

    Once ``size`` transitions are stored, appending a new one evicts the
    oldest (``deque(maxlen=size)``).
    """

    def __init__(self, size):
        """
        :param size: maximum number of transitions kept in the buffer
        """
        self.buffer = deque(maxlen=size)
        self.maxSize = size
        # Number of stored transitions. This stays a plain int attribute because
        # existing code reads ``buffer.len`` directly; the former ``len()``
        # method was shadowed by it and could never be called on an instance.
        self.len = 0

    def __len__(self):
        """Return the number of transitions currently stored (``len(buffer)``)."""
        return len(self.buffer)

    def sample(self, count):
        """
        samples a random batch from the replay memory buffer
        :param count: batch size; capped at the number of stored transitions
        :return: ``(s_arr, a_arr, r_arr, s1_arr)`` float32 numpy arrays, one row
                 per sampled transition
        """
        count = min(count, len(self.buffer))
        batch = random.sample(self.buffer, count)

        s_arr = np.float32([transition[0] for transition in batch])
        a_arr = np.float32([transition[1] for transition in batch])
        r_arr = np.float32([transition[2] for transition in batch])
        s1_arr = np.float32([transition[3] for transition in batch])

        return s_arr, a_arr, r_arr, s1_arr

    def add(self, s, a, r, s1):
        """
        adds a particular transaction in the memory buffer
        :param s: current state
        :param a: action taken
        :param r: reward received
        :param s1: next state
        :return:
        """
        self.buffer.append((s, a, r, s1))
        # The deque already enforces the capacity, so mirror its length rather
        # than maintaining a second, independently clamped counter.
        self.len = len(self.buffer)

In [262]:
import pandas as pd
import os

class CityFlowEnv():
    '''
    Simulator Environment with CityFlow

    Wraps a ``cityflow.Engine`` and controls the traffic light of the first
    non-virtual intersection found in the roadnet file. The wrapper keeps its
    own record of the phase it last requested (``current_phase``), how many
    consecutive steps that phase has been held (``current_phase_time``) and the
    full phase history (``phase_log``).
    '''
    def __init__(self, args, config_file='examples/config_control.json',
                 roadnet_file='examples/roadnet.json', thread_num=1):
        '''
        :param args: namespace providing ``max_step`` (number of steps written by ``log``)
        :param config_file: CityFlow engine configuration file
        :param roadnet_file: roadnet JSON parsed by ``parse_roadnet``
        :param thread_num: thread count handed to ``cityflow.Engine``
        '''
        self.env = cityflow.Engine(config_file=config_file, thread_num=thread_num)
        self.num_step = args.max_step
        self.lane_phase_info = self.parse_roadnet(roadnet_file) # "intersection_1_1"

        # Only the first non-virtual intersection is controlled.
        self.intersection_id = list(self.lane_phase_info.keys())[0]
        self.start_lane = self.lane_phase_info[self.intersection_id]['start_lane']
        self.phase_list = self.lane_phase_info[self.intersection_id]["phase"]
        self.phase_startLane_mapping = self.lane_phase_info[self.intersection_id]["phase_startLane_mapping"]

        self.current_phase = self.phase_list[0]
        self.current_phase_time = 0
        self.yellow_time = 5

        self.phase_log = []

    def parse_roadnet(self, roadnetFile):
        '''
        Extract lane and phase information for every non-virtual intersection.

        :param roadnetFile: path to the roadnet JSON file
        :return: dict keyed by intersection id; each value holds
                 ``start_lane`` / ``end_lane`` (sorted unique lane ids),
                 ``phase`` (phase indices, phase 0 excluded),
                 ``phase_startLane_mapping`` (phase -> start lanes) and
                 ``phase_roadLink_mapping`` (phase -> ``(start_lane, end_lane)`` pairs)
        '''
        # Context manager so the file handle is closed deterministically.
        with open(roadnetFile) as roadnet_fh:
            roadnet = json.load(roadnet_fh)
        lane_phase_info_dict = {}

        for intersection in roadnet["intersections"]:
            # Virtual intersections are skipped.
            if intersection['virtual']:
                continue

            # One list of (start_lane, end_lane) pairs per road link, indexed
            # the same way as ``availableRoadLinks`` refers to road links.
            roadLink_lane_pair = [
                [(road_link['startRoad'] + "_" + str(lane_link["startLaneIndex"]),
                  road_link['endRoad'] + "_" + str(lane_link["endLaneIndex"]))
                 for lane_link in road_link["laneLinks"]]
                for road_link in intersection["roadLinks"]
            ]
            all_pairs = [pair for pairs in roadLink_lane_pair for pair in pairs]

            info = {"start_lane": sorted({sl for sl, _ in all_pairs}),
                    "end_lane": sorted({el for _, el in all_pairs}),
                    "phase": [],
                    "phase_startLane_mapping": {},
                    "phase_roadLink_mapping": {}}

            light_phases = intersection["trafficLight"]["lightphases"]
            # Phase 0 is never offered as a selectable phase (loop starts at 1).
            for phase_i in range(1, len(light_phases)):
                lane_pair = []
                phase_start_lanes = []
                for ri in light_phases[phase_i]["availableRoadLinks"]:
                    pairs = roadLink_lane_pair[ri]
                    if not pairs:
                        # A road link without lane links contributes nothing.
                        continue
                    lane_pair.extend(pairs)
                    # Only the start lane of the road link's first lane pair is recorded.
                    if pairs[0][0] not in phase_start_lanes:
                        phase_start_lanes.append(pairs[0][0])
                info["phase"].append(phase_i)
                info["phase_startLane_mapping"][phase_i] = phase_start_lanes
                info["phase_roadLink_mapping"][phase_i] = lane_pair

            lane_phase_info_dict[intersection['id']] = info

        return lane_phase_info_dict

    def reset(self):
        '''Reset the engine and restore the phase tracking to its constructor values.'''
        self.env.reset()
        # Without this the first observation of a new episode would report the
        # phase (and hold time) left over from the previous episode.
        self.current_phase = self.phase_list[0]
        self.current_phase_time = 0
        self.phase_log = []

    def step(self, next_phase):
        '''
        Apply ``next_phase`` to the controlled intersection and advance one step.

        :param next_phase: phase index to set; repeating the current phase
                           increments its hold time, a new phase restarts it at 1
        '''
        if self.current_phase == next_phase:
            self.current_phase_time += 1
        else:
            self.current_phase = next_phase
            self.current_phase_time = 1

        self.env.set_tl_phase(self.intersection_id, self.current_phase)
        self.env.next_step()
        self.phase_log.append(self.current_phase)

    def get_state(self):
        '''
        Collect the current engine readings together with the tracked phase.

        :return: dict with the engine's lane/vehicle readings, the vehicle count
                 restricted to ``self.start_lane``, the current time, and the
                 tracked ``current_phase`` / ``current_phase_time``
        '''
        # Query the engine once and reuse the result for the per-lane subset.
        lane_vehicle_count = self.env.get_lane_vehicle_count()  # {lane_id: lane_count, ...}

        state = {}
        state['lane_vehicle_count'] = lane_vehicle_count
        state['start_lane_vehicle_count'] = {lane: lane_vehicle_count[lane] for lane in self.start_lane}
        state['lane_waiting_vehicle_count'] = self.env.get_lane_waiting_vehicle_count()  # {lane_id: lane_waiting_count, ...}
        state['lane_vehicles'] = self.env.get_lane_vehicles()  # {lane_id: [vehicle1_id, vehicle2_id, ...], ...}
        state['vehicle_speed'] = self.env.get_vehicle_speed()  # {vehicle_id: vehicle_speed, ...}
        state['vehicle_distance'] = self.env.get_vehicle_distance() # {vehicle_id: distance, ...}
        state['current_time'] = self.env.get_current_time()
        state['current_phase'] = self.current_phase
        state['current_phase_time'] = self.current_phase_time

        return state

    def get_reward(self):
        '''Return minus the total number of waiting vehicles over all lanes.'''
        lane_waiting_vehicle_count = self.env.get_lane_waiting_vehicle_count()
        return -1 * sum(lane_waiting_vehicle_count.values())

    def log(self, data_dir=None):
        '''
        Write the first ``num_step`` logged phases to ``signal_plan_template.txt``.

        :param data_dir: output directory (created if missing). When omitted,
                         ``self.config['data']`` is used if a ``config`` mapping
                         has been attached to the instance.
        :raises ValueError: if no output directory can be determined
        '''
        if data_dir is None:
            # ``config`` is not set by the constructor; honour it only if the
            # caller attached one, instead of failing with an AttributeError.
            config = getattr(self, 'config', None)
            if not config or 'data' not in config:
                raise ValueError("log() needs data_dir (or a self.config mapping with a 'data' entry)")
            data_dir = config['data']

        df = pd.DataFrame({self.intersection_id: self.phase_log[:self.num_step]})
        os.makedirs(data_dir, exist_ok=True)
        df.to_csv(os.path.join(data_dir, 'signal_plan_template.txt'), index=False)

In [263]:
def fanin_init(size, fanin=None):
    """Return a tensor of shape ``size`` drawn uniformly from ``[-1/sqrt(f), 1/sqrt(f)]``.

    :param size: shape of the weight tensor (``torch.Size`` or any sequence of ints)
    :param fanin: fan-in ``f`` to use; when omitted (or 0) it is derived from ``size``
    :return: new float tensor of shape ``size``
    """
    size = tuple(size)
    if not fanin:
        # ``nn.Linear`` stores its weight as (out_features, in_features), so the
        # fan-in is the product of every dimension after the first; ``size[0]``
        # would be the fan-out. A 1-D tensor has only its own length to go by.
        fanin = size[0] if len(size) < 2 else int(np.prod(size[1:]))
    v = 1. / np.sqrt(fanin)
    # ``torch.empty`` always reads ``size`` as a shape; ``torch.Tensor(tuple)``
    # would have treated a plain tuple as data.
    return torch.empty(size).uniform_(-v, v)

class Actor(nn.Module):
    """Policy network: four fully connected layers ending in a ``tanh``.

    ``forward_train`` returns the differentiable per-action scores; ``forward``
    turns those scores into discrete action indices via ``argmax``.
    """

    def __init__(self, state_dim, action_dim, args):
        """
        :param state_dim: size of the state vector fed to the first layer
        :param action_dim: number of outputs (one score per action)
        :param args: namespace; ``args.lr_decay`` is used as the half-width of
                     the uniform init range of the output layer
        """
        super(Actor, self).__init__()
        self.args = args
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.fc1 = nn.Linear(state_dim, 256)
        self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())

        self.fc2 = nn.Linear(256, 128)
        self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())

        self.fc3 = nn.Linear(128, 64)
        self.fc3.weight.data = fanin_init(self.fc3.weight.data.size())

        self.fc4 = nn.Linear(64, action_dim)
        # NOTE(review): the init bound is taken from ``lr_decay`` (default 0.003);
        # it looks like a dedicated "final layer init" constant was intended.
        self.fc4.weight.data.uniform_(-self.args.lr_decay, self.args.lr_decay)

    def forward(self, state):
        """
        Pick the highest-scoring action for every state in the batch.

        :param state: tensor whose rows are state vectors
        :return: list with one ``numpy`` integer action index per row (not a tensor)
        """
        # No gradients are needed for action selection; ``.cpu()`` keeps the
        # numpy conversion valid if the network lives on another device.
        with torch.no_grad():
            scores = self.forward_train(state).cpu().numpy()
        return [np.argmax(row) for row in scores]

    def forward_train(self, state):
        """
        Differentiable forward pass.

        :param state: tensor whose rows are state vectors
        :return: tensor of per-action scores in ``[-1, 1]`` (``tanh`` output)
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # ``torch.tanh`` replaces the deprecated ``F.tanh``; the result is the same.
        return torch.tanh(self.fc4(x))

class Critic(nn.Module):
    """Q-network that scores a (state, action) pair with a single value.

    The state passes through two layers and the action through one; the two
    128-wide feature vectors are concatenated and reduced to a scalar.
    """

    def __init__(self, state_dim, action_dim, args):
        """
        :param state_dim: size of the state vector
        :param action_dim: size of the action vector
        :param args: namespace; ``args.lr_decay`` bounds the uniform init of the output layer
        """
        super(Critic, self).__init__()
        self.args = args
        self.state_dim = state_dim
        self.action_dim = action_dim

        def fanin_linear(n_in, n_out):
            # Create the layer, then immediately overwrite its weights, so the
            # random draws happen in the same order as layer-by-layer setup.
            layer = nn.Linear(n_in, n_out)
            layer.weight.data = fanin_init(layer.weight.data.size())
            return layer

        # State branch.
        self.fcs1 = fanin_linear(state_dim, 256)
        self.fcs2 = fanin_linear(256, 128)

        # Action branch.
        self.fca1 = fanin_linear(action_dim, 128)

        # Joint head over the concatenated (128 + 128) features.
        self.fc2 = fanin_linear(256, 128)

        self.fc3 = nn.Linear(128, 1)
        bound = self.args.lr_decay
        self.fc3.weight.data.uniform_(-bound, bound)

    def forward(self, state, action):
        """
        :param state: batch of state vectors, one per row
        :param action: batch of action vectors, one per row
        :return: tensor with one Q-value per row (last dimension of size 1)
        """
        state_features = F.relu(self.fcs2(F.relu(self.fcs1(state))))
        action_features = F.relu(self.fca1(action))
        joint = torch.cat((state_features, action_features), dim=1)

        hidden = F.relu(self.fc2(joint))
        return self.fc3(hidden)

In [264]:
class OrnsteinUhlenbeckActionNoise:
    """Mean-reverting noise process with one component per action dimension.

    Each ``sample()`` moves the internal state ``X`` towards ``mu`` by a
    fraction ``theta`` and adds Gaussian noise scaled by ``sigma``.
    """

    def __init__(self, action_dim, mu = 0, theta = 0.15, sigma = 0.2):
        """
        :param action_dim: length of the noise vector
        :param mu: value the process reverts to (also the initial state)
        :param theta: fraction of the distance to ``mu`` recovered per sample
        :param sigma: scale of the Gaussian perturbation added per sample
        """
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Set every component of the state back to ``mu``."""
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state array."""
        reversion = self.theta * (self.mu - self.X)
        perturbation = self.sigma * np.random.randn(len(self.X))
        self.X = self.X + (reversion + perturbation)
        return self.X

In [265]:
class Trainer:
    """Actor-critic trainer with target networks and a replay buffer.

    Holds an ``Actor``/``Critic`` pair, slowly tracking target copies of both,
    one Adam optimizer per online network, and the per-update loss history.
    """

    MODEL_DIR = './Models/'

    def __init__(self, state_dim, action_dim, replay_buffer, lane_phase_info, args):
        """
        :param state_dim: size of the state vector
        :param action_dim: number of discrete actions / actor outputs
        :param replay_buffer: object exposing ``sample(batch_size)`` that returns
                              ``(s, a, r, s1)`` numpy arrays
        :param lane_phase_info: mapping of intersection id to its lane/phase
                                description; the first entry's ``'phase'`` list is kept
        :param args: namespace providing ``lr``, ``bs``, ``gamma``, ``tau`` and ``epsilon``
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.replay_buffer = replay_buffer
        self.iter = 0
        # Loss history, one Python float per ``optimize`` call.
        self.loss_critic_save = []
        self.loss_actor_save = []
        self.args = args
        self.noise = OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actor = Actor(self.state_dim, self.action_dim, self.args)
        self.target_actor = Actor(self.state_dim, self.action_dim, self.args)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic = Critic(self.state_dim,
                             self.action_dim, self.args)
        self.target_critic = Critic(self.state_dim,
                                    self.action_dim, self.args)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), self.args.lr)

        # Targets start as exact copies of the online networks.
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

        intersection_id = list(lane_phase_info.keys())[0]
        self.phase_list = lane_phase_info[intersection_id]['phase']

    def soft_update(self, target, source, tau):
        """Move ``target`` towards ``source``: ``target = (1 - tau) * target + tau * source``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - tau) + param.data * tau
            )

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def choose_action(self, state):
        """
        Select an action index for a single state.

        A uniformly random action is taken when ``np.random.rand() > epsilon``
        (i.e. with probability ``1 - epsilon``); otherwise the actor's greedy
        choice for the first row of ``state`` is returned.

        :param state: tensor accepted by ``Actor.forward`` (batch of one state)
        :return: action index
        """
        if np.random.rand() > self.args.epsilon:
            return random.randrange(self.action_dim)
        act_values = self.actor.forward(state)
        return act_values[0]  # returns action

    def optimize(self):
        """Run one critic update, one actor update and a soft target update.

        Does nothing when the replay buffer yields an empty batch.
        """
        # NOTE(review): the sampled actions are not used below; the critic is
        # evaluated on the actor's current output for s1 instead. Confirm that
        # this is intended before relying on the learned Q-values.
        s1, _sampled_actions, r1, s2 = self.replay_buffer.sample(self.args.bs)
        if len(s1) == 0:
            return

        # Stored states carry a singleton dimension at index 1; drop it.
        s1 = torch.from_numpy(s1).squeeze(dim=1)
        s2 = torch.from_numpy(s2).squeeze(dim=1)
        r1 = torch.from_numpy(r1)

        # ---------------------- optimize critic ----------------------
        # TD target built from the target networks; no gradients are needed.
        with torch.no_grad():
            a2 = self.target_actor.forward_train(s2)
            # squeeze(dim=1) keeps the batch dimension even for a batch of one.
            next_val = self.target_critic.forward(s2, a2).squeeze(dim=1)
            y_expected = r1 + self.args.gamma * next_val
            # Only the critic is stepped here, so the actor's output is a constant.
            policy_a1 = self.actor.forward_train(s1)
        y_predicted = self.critic.forward(s1, policy_a1).squeeze(dim=1)

        loss_critic = F.smooth_l1_loss(y_predicted, y_expected)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()
        # Store a float, not the tensor, so the history holds no autograd references.
        self.loss_critic_save.append(loss_critic.item())

        # ---------------------- optimize actor ----------------------
        pred_a1 = self.actor.forward_train(s1)
        loss_actor = -1 * torch.sum(self.critic.forward(s1, pred_a1))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        self.loss_actor_save.append(loss_actor.item())

        self.soft_update(self.target_actor, self.actor, self.args.tau)
        self.soft_update(self.target_critic, self.critic, self.args.tau)

    def save_models(self, episode_count):
        """Save the target actor and target critic weights under ``./Models/``.

        :param episode_count: value used as the file-name prefix
        """
        # torch.save does not create missing directories.
        os.makedirs(self.MODEL_DIR, exist_ok=True)
        torch.save(self.target_actor.state_dict(), self.MODEL_DIR + str(episode_count) + '_actor.pt')
        torch.save(self.target_critic.state_dict(), self.MODEL_DIR + str(episode_count) + '_critic.pt')

    def load_models(self, episode):
        """Load saved weights into the online networks and sync the targets.

        :param episode: file-name prefix that was passed to ``save_models``
        """
        self.actor.load_state_dict(torch.load(self.MODEL_DIR + str(episode) + '_actor.pt'))
        self.critic.load_state_dict(torch.load(self.MODEL_DIR + str(episode) + '_critic.pt'))
        # ``hard_update`` is a method of this class; there is no ``utils`` module here.
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

In [266]:
# Build the simulator; it parses the roadnet and picks the intersection to control.
env = CityFlowEnv(args)

# Re-export what the environment already resolved for its controlled intersection.
lane_phase_info = env.lane_phase_info
intersection_id = env.intersection_id
phase_list = env.phase_list

# State vector: one vehicle count per start lane plus the current phase.
s_dim = 1 + len(env.start_lane)
# One action per selectable phase.
a_dim = len(phase_list)

In [267]:
# Replay memory capped at ``args.max_buffer`` transitions; the trainer samples from it.
replay_buffer = MemoryBuffer(size=args.max_buffer)
trainer = Trainer(
    state_dim=s_dim,
    action_dim=a_dim,
    replay_buffer=replay_buffer,
    lane_phase_info=lane_phase_info,
    args=args,
)

In [268]:
# NOTE(review): this is set after torch has been imported and nothing in this
# notebook moves tensors to a GPU — confirm whether it is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'


def encode_state(raw_state, state_dim):
    """Flatten an ``env.get_state()`` dict into a ``(1, state_dim)`` float32 array.

    The features are the per-start-lane vehicle counts followed by the current phase.
    """
    features = list(raw_state['start_lane_vehicle_count'].values()) + [raw_state['current_phase']]
    return np.reshape(np.array(features), [1, state_dim]).astype(np.float32)


for i in range(args.max_episode):
    env.reset()

    t = 0
    s = encode_state(env.get_state(), s_dim)
    last_action = phase_list[int(trainer.choose_action(torch.tensor(s)))]

    while t < args.max_step:
        a_choice = trainer.choose_action(torch.tensor(s))
        a = phase_list[int(a_choice)]

        if a != last_action:
            # Phase change: hold phase 0 for ``yellow_time`` steps first. The
            # flag is initialised up front so it is defined even if
            # ``yellow_time`` is 0.
            out_of_time = False
            for _ in range(env.yellow_time):
                env.step(0)
                t += 1
                if t >= args.max_step:
                    out_of_time = True
                    break
            if out_of_time:
                # The episode ended during the yellow interval; the pending
                # transition is not stored.
                break
        env.step(a)

        last_action = a
        t += 1
        next_state = encode_state(env.get_state(), s_dim)
        r = env.get_reward()

        trainer.replay_buffer.add(s, a_choice, r, next_state)
        s = next_state

        # Global step counter across episodes; learning starts after a warm-up
        # and then runs every ``update_freq`` steps.
        total_time = t + i * args.max_step
        if total_time > args.learning_start and total_time % args.update_freq == 0:
            trainer.optimize()
        # Idea (disabled): reward based on the average travel time of all
        # vehicles — current time divided by it, larger is better, in [0, 1].
#         average_travel_time = eng.get_average_travel_time()
#         reward_travel_time = eng.get_current_time()/average_travel_time

        if i % 10 == 0 and t % 20 == 0:
            print("episode: {}/{}, time: {}, acton: {}, reward: {}"
              .format(i, args.max_episode, t-1, a, r))

#     if i%100 == 0:
#         trainer.save_models(i)
        
        

episode: 0/5000, time: 39, acton: 6, reward: -22
episode: 0/5000, time: 59, acton: 6, reward: -47
episode: 0/5000, time: 79, acton: 6, reward: -71
episode: 0/5000, time: 99, acton: 2, reward: -75
episode: 0/5000, time: 119, acton: 6, reward: -78
episode: 0/5000, time: 139, acton: 6, reward: -105
episode: 0/5000, time: 179, acton: 6, reward: -128
episode: 0/5000, time: 199, acton: 6, reward: -155
episode: 10/5000, time: 39, acton: 1, reward: -28
episode: 10/5000, time: 59, acton: 1, reward: -61
episode: 10/5000, time: 79, acton: 1, reward: -97
episode: 10/5000, time: 139, acton: 1, reward: -148
episode: 10/5000, time: 159, acton: 1, reward: -170
episode: 20/5000, time: 19, acton: 1, reward: 0
episode: 20/5000, time: 39, acton: 1, reward: -32
episode: 20/5000, time: 79, acton: 1, reward: -82
episode: 20/5000, time: 99, acton: 1, reward: -116
episode: 20/5000, time: 119, acton: 1, reward: -149
episode: 20/5000, time: 139, acton: 1, reward: -183
episode: 20/5000, time: 159, acton: 1, rewar

episode: 240/5000, time: 39, acton: 1, reward: -32
episode: 240/5000, time: 59, acton: 1, reward: -67
episode: 240/5000, time: 79, acton: 1, reward: -68
episode: 240/5000, time: 99, acton: 1, reward: -100
episode: 240/5000, time: 119, acton: 1, reward: -105
episode: 240/5000, time: 139, acton: 1, reward: -149
episode: 240/5000, time: 199, acton: 1, reward: -209
episode: 250/5000, time: 19, acton: 1, reward: 0
episode: 250/5000, time: 39, acton: 1, reward: -32
episode: 250/5000, time: 59, acton: 1, reward: -50
episode: 250/5000, time: 99, acton: 1, reward: -97
episode: 250/5000, time: 119, acton: 1, reward: -130
episode: 250/5000, time: 139, acton: 1, reward: -140
episode: 250/5000, time: 159, acton: 1, reward: -173
episode: 250/5000, time: 179, acton: 1, reward: -205
episode: 250/5000, time: 199, acton: 1, reward: -239
episode: 260/5000, time: 19, acton: 6, reward: 0
episode: 260/5000, time: 39, acton: 1, reward: -28
episode: 260/5000, time: 59, acton: 1, reward: -63
episode: 260/5000,

episode: 480/5000, time: 39, acton: 1, reward: -24
episode: 480/5000, time: 59, acton: 1, reward: -48
episode: 480/5000, time: 79, acton: 1, reward: -83
episode: 480/5000, time: 119, acton: 1, reward: -111
episode: 480/5000, time: 139, acton: 1, reward: -152
episode: 480/5000, time: 159, acton: 1, reward: -184
episode: 480/5000, time: 179, acton: 1, reward: -219
episode: 480/5000, time: 199, acton: 1, reward: -254
episode: 490/5000, time: 19, acton: 1, reward: 0
episode: 490/5000, time: 39, acton: 1, reward: -26
episode: 490/5000, time: 59, acton: 1, reward: -52
episode: 490/5000, time: 99, acton: 1, reward: -106
episode: 490/5000, time: 119, acton: 1, reward: -138
episode: 490/5000, time: 159, acton: 1, reward: -178
episode: 490/5000, time: 179, acton: 1, reward: -210
episode: 490/5000, time: 199, acton: 1, reward: -244
episode: 500/5000, time: 19, acton: 1, reward: 0
episode: 500/5000, time: 39, acton: 1, reward: -32
episode: 500/5000, time: 59, acton: 1, reward: -65
episode: 500/500

episode: 720/5000, time: 59, acton: 1, reward: -46
episode: 720/5000, time: 79, acton: 1, reward: -81
episode: 720/5000, time: 99, acton: 1, reward: -99
episode: 720/5000, time: 119, acton: 1, reward: -131
episode: 720/5000, time: 179, acton: 1, reward: -177
episode: 720/5000, time: 199, acton: 1, reward: -216
episode: 730/5000, time: 39, acton: 1, reward: -28
episode: 730/5000, time: 59, acton: 1, reward: -51
episode: 730/5000, time: 79, acton: 1, reward: -76
episode: 730/5000, time: 99, acton: 1, reward: -110
episode: 730/5000, time: 119, acton: 1, reward: -126
episode: 730/5000, time: 139, acton: 1, reward: -158
episode: 730/5000, time: 159, acton: 1, reward: -192
episode: 730/5000, time: 179, acton: 1, reward: -225
episode: 730/5000, time: 199, acton: 1, reward: -227
episode: 740/5000, time: 19, acton: 1, reward: 0
episode: 740/5000, time: 59, acton: 5, reward: -48
episode: 740/5000, time: 99, acton: 1, reward: -106
episode: 740/5000, time: 139, acton: 1, reward: -154
episode: 740/

episode: 940/5000, time: 179, acton: 5, reward: -190
episode: 940/5000, time: 199, acton: 1, reward: -193
episode: 950/5000, time: 19, acton: 1, reward: 0
episode: 950/5000, time: 39, acton: 1, reward: -21
episode: 950/5000, time: 59, acton: 1, reward: -43
episode: 950/5000, time: 99, acton: 5, reward: -88
episode: 950/5000, time: 119, acton: 1, reward: -111
episode: 950/5000, time: 159, acton: 1, reward: -167
episode: 950/5000, time: 179, acton: 1, reward: -172


KeyboardInterrupt: 