# Dial-a-ride problem using Transformers and Reinforcement Learning

In [None]:
from env import DarpEnv
from log import logger, set_level

## Environment (env.py)

In [None]:

logger = set_level(logger, "debug")

# Load a Cordeau benchmark instance and build the DARP environment around it.
FILE_NAME = 'data/cordeau/a2-16.txt'
env = DarpEnv(size=10, nb_requests=16, nb_vehicles=2, time_end=1440, max_step=1000, dataset=FILE_NAME)
obs = env.representation()

# Roll out the environment's nearest-neighbour action choice for at most 100
# steps, stopping early as soon as the environment reports the episode as done.
rewards = []
for t in range(100):
    action = env.nearest_action_choice()
    obs, reward, done = env.step(action)
    rewards.append(reward)
    if done:
        break

# Only the end-of-episode value is of interest, so query it once after the
# rollout instead of on every step.
all_delivered = env.is_all_delivered()

# NOTE(review): presumably this fills the `env.penalty` entries logged below;
# confirm against env.py.
env.penalize_broken_time_windows()

# Report the results. `total` is the distance travelled summed over all
# vehicles; the log message labels it as the episode reward.
total = sum(v.total_distance_travelled for v in env.vehicles)
logger.info(f"Episode finished after {t + 1} steps, with reward {total}")
for vehicle in env.vehicles:
    logger.info(f'{vehicle} history: {vehicle.history}')

# Count the requests in each state seen at the end of the rollout.
delivered = sum(request.state == "delivered" for request in env.requests)
in_trunk = sum(r.state == "in_trunk" for r in env.requests)
pickup = sum(r.state == "pickup" for r in env.requests)
logger.info(f'delivered: {delivered}, in trunk: {in_trunk}, waiting: {pickup}')

# Break the penalty down by the keys stored in `env.penalty`.
logger.info("*** PENALTY ***")
logger.info("start_window: %s", env.penalty["start_window"])
logger.info("end_window: %s", env.penalty["end_window"])
logger.info("max_route_duration: %s", env.penalty["max_route_duration"])
logger.info("max_ride_time: %s", env.penalty["max_ride_time"])
logger.info("total penalty: %s", env.penalty["sum"])


## Requests and Vehicles (entity.py)

The environment is populated with Request and Vehicle objects. 

In [None]:
from entity import Request, Vehicle
import numpy as np

In [None]:
# Example Vehicle: id 0, positioned at (0, 0), capacity 3 and a maximum
# route duration of 300.
driver = Vehicle(
    id=0,
    position=np.array([0, 0]),
    capacity=3,
    max_route_duration=300,
)

# Example Request: id 0, picked up at (1, 2) and dropped off at (3, 4).
# Both windows are passed as two-element arrays.
# NOTE(review): presumably [earliest, latest] times - confirm in entity.py.
request = Request(
    id=0,
    pickup_position=np.array([1, 2]),
    dropoff_position=np.array([3, 4]),
    service_time=3,
    start_window=np.array([0, 100]),
    end_window=np.array([100, 200]),
    max_ride_time=30,
)

## Generating & loading datasets (generator.py)

Generate a list of DarpEnv objects with randomly located requests.

In [None]:
from generator import generate_environments, dump_data, load_aoyo

logger = set_level(logger, "info")

# Generate 10000 environments, each configured with 4 vehicles and 48
# requests (the same figures that appear in the dump file name below).
envs = generate_environments(
    N=10000,
    size=10,
    nb_vehicles=4,
    nb_requests=48,
    time_end=1440,
    max_step=1000,
    max_route_duration=720,
    capacity=3,
    max_ride_time=30,
    window=True,
)

# Write the generated environments to disk, logging before and after the dump.
logger.info("data dump starts...")
path = "data/processed/generated-10000-a4-48.pkl"
dump_data(envs, path)
logger.info("data successfully dumped")

Loading Aoyu's dataset

In [None]:
# Instance name; it selects what load_aoyo loads and is embedded in both
# output file names.
instance = "a2-16"
train_path = f"data/processed/aoyu-10000-{instance}-train.pkl"
test_path = f"data/processed/aoyu-10000-{instance}-test.pkl"

# load_aoyo returns two collections, unpacked here as the train and test splits.
train_envs, test_envs = load_aoyo(instance)

# Dump each split to its own pickle file.
logger.info("data dump starts...")
dump_data(train_envs, train_path)
dump_data(test_envs, test_path)
logger.info("data successfully dumped")

## Training supervised learning model (supervised.py)

In [None]:
from supervised import supervised_trainer
from model import Aoyu
import torch
from utils import get_device

# Run configuration for supervised training.
instance = "a4-48"
result_path = "models"
supervised_policy = "rf"
trial = "01"
batch_size = 256
nb_epochs = 10
# Identifier for this run. Named `run_id` rather than `id` so the builtin
# `id()` is not shadowed for the rest of the notebook session.
run_id = f"result-{instance}-supervised-{supervised_policy}-{trial}-aoyu256"

# Initialize the policy network; nb_requests/nb_vehicles match the
# "a4-48" instance name (4 vehicles, 48 requests).
policy = Aoyu(d_model=256, nhead=8, nb_requests=48, nb_vehicles=4, num_layers=4, time_end=1440, env_size=10)
device = get_device()
policy = policy.to(device)
logger.info("training on device: %s", device)

# Initialize the optimizer over the policy parameters.
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)

# Start training. Arguments are passed positionally, in the same order as
# before; only the local variable holding the run identifier was renamed.
result = supervised_trainer(run_id,
                        instance,
                        result_path,
                        supervised_policy,
                        batch_size,
                        nb_epochs,
                        policy,
                        optimizer)

## Training a reinforcement learning model (model.py)

In [None]:
from utils import seed_everything
from generator import load_data
from model import reinforce_trainer

seed_everything(1)
logger = set_level(logger, "info")

result_path = "models"
nb_vehicles = 2
nb_requests = 16
variant = "a"
# Instance names follow "<variant><nb_vehicles>-<nb_requests>" (e.g. "a2-16"),
# which is the naming used by the Cordeau files and the saved supervised
# models referenced elsewhere in this notebook. Vehicles come first.
instance = f"{variant}{nb_vehicles}-{nb_requests}"
test_env_path = f'data/cordeau/{instance}.txt'
# Identifier for this run. Named `run_id` rather than `id` so the builtin
# `id()` is not shadowed for the rest of the notebook session.
run_id = f"result-{instance}-reinforce-01-aoyu"

nb_episodes = 1000
update_baseline = 100

# Load the model weights produced by supervised training for the same
# instance; the network is built with the same request and vehicle counts.
policy = Aoyu(d_model=256, nhead=8, nb_requests=nb_requests, nb_vehicles=nb_vehicles, num_layers=4, time_end=1440, env_size=10)
PATH = f"{result_path}/result-{instance}-supervised-rf-01-aoyu256"
r = load_data(PATH)
state = r.policy_dict
policy.load_state_dict(state)

# Move the model to the device returned by get_device (CUDA if available).
device = get_device()
policy = policy.to(device)
logger.info("training on device: %s", device)

# Initialize the optimizer; weight decay is used here, unlike in the
# supervised training cell.
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4, weight_decay=1e-3)

reinforce_trainer(test_env_path, result_path, run_id, nb_episodes, nb_requests, nb_vehicles, update_baseline, policy, optimizer)

## Evaluate model on test set (evaluate.py)

Evaluate model on one instance

In [None]:
from evaluate import evaluate_model, evaluate_aoyu

logger = set_level(logger, "info")

# Build the environment for the DARP instance to evaluate on.
FILE_NAME = "data/cordeau/a2-16.txt"
test_env = DarpEnv(
    size=10,
    nb_requests=16,
    nb_vehicles=2,
    time_end=1440,
    max_step=34,
    dataset=FILE_NAME,
)

# Re-create the network with the hyperparameters used for training.
# WARNING: for now the Result object does not store model hyperparameters,
# so they have to be repeated here by hand.
policy = Aoyu(
    d_model=256,
    nhead=8,
    nb_requests=16,
    nb_vehicles=2,
    num_layers=4,
    time_end=1440,
    env_size=10,
)

# Load the Result object and restore the trained weights from its state dict.
PATH = "models/result-a2-16-supervised-rf-01-aoyu256"
r = load_data(PATH)
state = r.policy_dict
policy.load_state_dict(state)

# Move the model to the available device (CUDA if present) and switch it
# to evaluation mode.
device = get_device()
policy.to(device)
policy.eval()

routing_cost, window_penalty, delivered = evaluate_model(policy, test_env)

Evaluate the model on 1,000 instances from Aoyu's dataset

In [None]:
logger = set_level(logger, "info")

# Test-set file for the chosen instance.
instance = "a2-16"
test_path = f"data/aoyu/{instance}-test.txt"

# Re-create the network with the hyperparameters used for training.
# WARNING: for now the Result object does not store model hyperparameters,
# so they have to be repeated here by hand.
policy = Aoyu(
    d_model=256,
    nhead=8,
    nb_requests=16,
    nb_vehicles=2,
    num_layers=4,
    time_end=1440,
    env_size=10,
)

# Load the Result object and restore the trained weights from its state dict.
PATH = "models/result-a2-16-supervised-rf-01-aoyu256"
r = load_data(PATH)
state = r.policy_dict
policy.load_state_dict(state)

# Move the model to the available device (CUDA if present) and switch it
# to evaluation mode.
device = get_device()
policy.to(device)
policy.eval()

# Evaluate on the test file and write the result out via to_csv.
df = evaluate_aoyu(policy, test_path)
df.to_csv(f"evaluations/data-{instance}-test-model-rf-a2-16-02")