# Vehicle Routing Problem Solved Using Q-Learning

## IE7374 Project Spring 2021

### By Sahil and Peng

In [None]:
# %pip install numpy==1.23.4

In [None]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import matplotlib.pyplot as plt

In [17]:
# %pip install ray==2.0.1

Collecting ray==2.0.1
  Downloading ray-2.0.1-cp38-cp38-win_amd64.whl (20.7 MB)
     ---------------------------------------- 0.0/20.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/20.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/20.7 MB 1.7 MB/s eta 0:00:13
     - -------------------------------------- 0.9/20.7 MB 6.8 MB/s eta 0:00:03
     -- ------------------------------------- 1.0/20.7 MB 8.3 MB/s eta 0:00:03
     --- ------------------------------------ 1.7/20.7 MB 7.8 MB/s eta 0:00:03
     ---- ----------------------------------- 2.1/20.7 MB 8.8 MB/s eta 0:00:03
     ----- ---------------------------------- 2.8/20.7 MB 9.0 MB/s eta 0:00:02
     ------ --------------------------------- 3.1/20.7 MB 10.0 MB/s eta 0:00:02
     ------ --------------------------------- 3.1/20.7 MB 10.0 MB/s eta 0:00:02
     -------- ------------------------------- 4.2/20.7 MB 9.5 MB/s eta 0:00:02
     -------- ------------------------------- 4.2/20.7 MB 9.5 M

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\h.wang\\AppData\\Local\\anaconda3\\envs\\OR_Gym_Env\\Lib\\site-packages\\~.y\\core\\src\\ray\\gcs\\gcs_server.exe'
Consider using the `--user` option or check the permissions.



In [None]:
%%capture
import or_gym
from or_gym.utils import create_env
import ray
from ray.rllib import agents
from ray import tune
import ray.rllib.agents.dqn as dqn

In [None]:
def register_env(env_name, env_config=None):
    """Register an or-gym environment with Ray Tune under ``env_name``.

    Args:
        env_name: Environment identifier. It is passed to ``create_env`` to
            obtain the environment constructor and is also used as the key
            under which the environment is registered with Tune.
        env_config: Optional dict forwarded to the environment constructor
            as the ``env_config`` keyword. Defaults to a fresh empty dict.
    """
    # A literal ``{}`` default would be a single dict object shared by every
    # call; create a new one per call instead.
    if env_config is None:
        env_config = {}

    env_ctor = create_env(env_name)

    # Tune invokes the creator with one positional argument, which is passed
    # straight through to the constructor (as the original code did). The
    # parameter is named distinctly so it does not shadow ``env_name``.
    tune.register_env(
        env_name,
        lambda creator_arg: env_ctor(creator_arg, env_config=env_config),
    )

In [None]:
# Identifier used both to look up the environment and to register it.
env_name = "VehicleRouting-v0"

# Settings forwarded to the environment constructor via `register_env`
# and via `rl_config["env_config"]`.
# NOTE(review): `order_reward_min` / `order_reward_max` have two entries while
# `num_zones` is 1 and the other per-zone lists have one entry -- verify this
# is intended.
env_config = dict(
    # Order / vehicle sizing
    max_orders=2,
    vehicle_capacity=1,
    n_restaurants=1,
    # Order arrival and expiry probabilities
    order_prob=0.75,
    order_timeout_prob=0,
    # Per-zone settings
    num_zones=1,
    order_probs_per_zone=[1],
    order_reward_min=[8, 5],
    order_reward_max=[12, 8],
    half_norm_scale_reward_per_zone=[1],
    # Grid layout and delivery promise
    grid=(4, 4),
    order_promise=60,
    # Penalties and episode length
    penalty_per_timestep=0.1,
    penalty_per_move=0.2,
    order_miss_penalty=0,
    step_limit=1000,
    # Restaurant location(s) and order value(s)
    rest_loc=[(3, 2)],
    order_values=[10],
    info={},
    mask=True,
)

# NOTE(review): not referenced by any other cell visible in this notebook --
# presumably left over from a tabular Q-learning agent; confirm before removing.
agent_init_info = dict(
    num_actions=7,
    step_size=0.8,
    discount=0.7,
)

# Configuration dict handed to the DQN trainer.
# NOTE(review): `epsilon_timesteps` of 1 presumably means epsilon reaches
# `final_epsilon` almost immediately -- confirm this is the intended schedule.
rl_config = {
    "env": env_name,
    "num_workers": 10,
    "env_config": env_config,
    "double_q": True,
    "model": {
        "vf_share_layers": False,
        "fcnet_activation": "relu",
        "fcnet_hiddens": [256, 256],
    },
    "exploration_config": dict(
        type="EpsilonGreedy",
        initial_epsilon=1.0,
        final_epsilon=0.02,
        epsilon_timesteps=1,
    ),
    "evaluation_config": dict(explore=False),
    "timesteps_per_iteration": 1000,
    "target_network_update_freq": 500,
    "buffer_size": 100,
    "adam_epsilon": 1e-8,
    "grad_clip": 40,
    "train_batch_size": 32,
    "framework": "torch",
    "lr": 1e-5,
}

# Number of `agent.train()` iterations run by the training cell.
epochs = 1000

In [None]:
import gym
# Raise gym's logger threshold (40 is presumably gym's ERROR level -- confirm)
# so lower-severity messages do not flood the cell output.
gym.logger.set_level(40)

# Start from a clean Ray state, then register the environment
ray.shutdown()
register_env(env_name, env_config)

# Initialize Ray and build the agent
ray.init(num_cpus=10, ignore_reinit_error=True)

# One result dict per training iteration; read by the plotting cells below.
results = []

# Latest mean episode reward; stays NaN until the first iteration finishes.
rew = np.nan

try:
    agent = dqn.DQNTrainer(env=env_name,
        config=rl_config)

    pbar = tqdm(range(epochs), desc='Training Loop')
    for _ in pbar:
        res = agent.train()
        results.append(res)
        rew = res['episode_reward_mean']
        # Show the most recent mean reward in the progress bar.
        pbar.set_description("reward = %f" % rew)
finally:
    # Always release the Ray runtime, even if building the trainer or a
    # training iteration raises or the run is interrupted; `results` keeps
    # whatever iterations completed.
    ray.shutdown()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
# Collect the mean episode reward reported by each training iteration.
# np.array (rather than np.hstack) also accepts an empty `results` list,
# e.g. when training stopped before the first iteration completed.
rewards = np.array([res['episode_reward_mean'] for res in results], dtype=float)

# Look-back of the trailing window: the window at index i covers points
# i-p .. i (up to p + 1 values), clipped at the start of the series.
p = 100

reward_windows = [rewards[max(0, i - p):i + 1] for i in range(len(rewards))]
mean_rewards = np.array([np.mean(window) for window in reward_windows])
std_rewards = np.array([np.std(window) for window in reward_windows])

fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)
ax0 = fig.add_subplot(gs[:, :-2])
# Shaded band: trailing mean +/- one trailing standard deviation.
ax0.fill_between(np.arange(len(mean_rewards)),
                 mean_rewards - std_rewards,
                 mean_rewards + std_rewards,
                 label='Standard Deviation', alpha=0.3)
ax0.plot(mean_rewards, label='Mean Rewards')
ax0.set_ylabel('Rewards')
# Each point corresponds to one `agent.train()` call, not a single episode.
ax0.set_xlabel('Training iteration')
ax0.set_title('Training Rewards')
ax0.legend()
plt.savefig("Results.svg", dpi=300)
plt.show()

In [None]:
# Pull the mean TD error out of each training iteration's result dict.
td_err = [
    res['info']['learner']['default_policy']['learner_stats']['mean_td_error']
    for res in results]


# Look-back of the trailing window: the window at index i covers points
# i-p .. i (up to p + 1 values), clipped at the start of the series.
p = 100

td_err_windows = [td_err[max(0, i - p):i + 1] for i in range(len(td_err))]
mean_td_err = np.array([np.mean(window) for window in td_err_windows])
std_td_err = np.array([np.std(window) for window in td_err_windows])

fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)
ax0 = fig.add_subplot(gs[:, :-2])
# Shaded band: trailing mean +/- one trailing standard deviation.
ax0.fill_between(np.arange(len(mean_td_err)),
                 mean_td_err - std_td_err,
                 mean_td_err + std_td_err,
                 label='Standard Deviation', alpha=0.3)
ax0.plot(mean_td_err, label='Mean td_err')
ax0.set_ylabel('td_err')
# Each point corresponds to one `agent.train()` call, not a single episode.
ax0.set_xlabel('Training iteration')
ax0.set_title('Training td_err')
ax0.legend()
plt.savefig("Results_TD_err.svg", dpi=300)
plt.show()

In [None]:
# Display the full array of mean episode rewards, one value per training iteration.
rewards

In [None]:
# Display the full list of mean TD errors, one value per training iteration.
td_err