# World of Supply

A simulation environment for multi-echelon supply chain optimization problems. 

In [2]:
import numpy as np
from tqdm import tqdm as tqdm
import importlib

# Core Simulation Logic and Rendering

In this section, we test the core simulator and renderer (without RL adapters and integrations).

In [None]:
import world_of_supply_environment as ws
importlib.reload(ws)
import world_of_supply_renderer as wsr
importlib.reload(wsr)

# Throughput benchmark: drive a fresh world with the hand-coded control policy
# for a fixed number of steps; tqdm reports the resulting steps/sec.
world = ws.WorldBuilder.create(80, 16)
policy = ws.SimpleControlPolicy()
for step in tqdm(range(10000)):
    control = policy.compute_control(world)
    world.act(control)
    
# Rendering smoke test: simulate a fresh world under the hand-coded control
# policy, capturing one rendered frame before every step.
renderer = wsr.AsciiWorldRenderer()
world = ws.WorldBuilder.create(80, 16)
policy = ws.SimpleControlPolicy()
frame_seq = []
for step in tqdm(range(300)):
    # Snapshot the world as an array first, then advance the simulation.
    frame_seq.append(np.asarray(renderer.render(world)))
    control = policy.compute_control(world)
    world.act(control)

# Turn the collected frames into an animation.
print('Rendering the animation...')
wsr.AsciiWorldRenderer.plot_sequence_images(frame_seq)

# Policy Training

In this section, we run RLlib policy trainers. These trainers evaluate the hand-coded policy, learn a new policy from scratch, or learn a new policy by playing against the hand-coded policy.

In [None]:
# The RLlib adapter is imported as `wsrl` (not `wsr`) so that the `wsr` alias,
# which the rendering cell binds to world_of_supply_renderer, is not silently
# rebound to a different module when this cell runs.
import world_of_supply_rllib as wsrl
importlib.reload(wsrl)
import world_of_supply_rllib_training as wst
importlib.reload(wst)

# Policy training
# `trainer` is kept as a notebook-level variable for later cells.
# Alternative run: enable the play_baseline line instead of train_ppo to run
# the baseline trainer rather than PPO training.
#trainer = wst.play_baseline(n_iterations = 2)
trainer = wst.train_ppo(n_iterations=30)

# Policy Evaluation

In this section, we evaluate the trained policy.

### Rendering One Episode for the Trained Policy

In [None]:
import world_of_supply_renderer as wsren
importlib.reload(wsren)
import world_of_supply_rllib as wsrl
importlib.reload(wsrl)
import world_of_supply_rllib_training as wstr
importlib.reload(wstr)

# Parameters of the tracing simulation
# episod_duration: number of environment steps executed by the simulation loop.
episod_duration = 1000
# steps_to_render: half-open (start, stop) range of step indices for which a
# frame is captured; (0, episod_duration) captures every step.
steps_to_render = (0, episod_duration)

# Create the environment
renderer = wsren.AsciiWorldRenderer()
# Rendered frames (as arrays) are accumulated here by the simulation loop.
frame_seq = []
env = wsrl.WorldOfSupplyEnv(wstr.env_config)
# Initial observations; used below as a mapping keyed by facility id.
states = env.reset()
# No per-facility info exists until the first env.step() call returns one.
infos = None
    
def load_policy(facility_id):
    """Return the trainer's policy object assigned to ``facility_id``.

    A copy of the global policy mapping is turned into a mapping function,
    the facility id is resolved through it, and the result is looked up on
    the notebook-level ``trainer``.
    """
    # Alternative: bypass the trainer and use the hand-coded policy directly.
    #return wsrl.SimplePolicy(env.observation_space, env.action_space, wsrl.SimplePolicy.get_config_from_env(env))

    # Work on a copy so per-run overrides never touch the shared global mapping.
    mapping = wstr.policy_mapping_global.copy()
    # Example override:
    #mapping['LumberFactory'] = 'ppo'
    resolve_policy = wstr.create_policy_mapping_fn(mapping)
    return trainer.get_policy(resolve_policy(facility_id))

# Per-facility lookup tables: the policy object that acts for each facility and
# the recurrent state that policy starts from.
policies = {}
rnn_states = {}
for facility_id in states:
    facility_policy = load_policy(facility_id)
    policies[facility_id] = facility_policy
    rnn_states[facility_id] = facility_policy.get_initial_state()
    
# Simulation loop: ask every facility's policy for an action, advance the
# environment by one step, and capture a frame for steps inside steps_to_render.
for epoch in tqdm(range(episod_duration)):
    action_dict = {}
    for facility_id, state in states.items():
        policy = policies[facility_id]

        # Forward the info dict only when the environment has produced one for
        # this facility (infos is None before the first step).
        extra_args = {}
        if infos is not None and facility_id in infos:
            extra_args['info'] = infos[facility_id]

        action, rnn_state, _ = policy.compute_single_action(
            state, state=rnn_states[facility_id], **extra_args
        )
        action_dict[facility_id] = action
        # Persist the returned recurrent state; otherwise every step would be
        # computed from the initial state and a recurrent policy would have no memory.
        rnn_states[facility_id] = rnn_state

    # NOTE(review): `dones` is not checked, so the loop keeps stepping even if
    # the environment reports the episode as finished - confirm this is intended.
    states, reward, dones, infos = env.step(action_dict)

    # steps_to_render is a half-open (start, stop) range of step indices.
    if steps_to_render[0] <= epoch < steps_to_render[1]:
        frame = renderer.render(env.world)
        frame_seq.append(np.asarray(frame))

print('Rendering the animation...')
wsren.AsciiWorldRenderer.plot_sequence_images(frame_seq)