In [1]:
import numpy as np
import sys
import warnings
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
sys.path.insert(1, "../src/utils/")
from agent import Environment

# <span style="color:orange">Input file</span>

Before starting training, we need to define our environment. In this example, the system is a rocket. 
It falls from a starting point, and the goal is to reach a target at minimal speed while keeping as much fuel as possible.
Variable names, their initial values and their boundary limits are defined in a JSON file (or can be given directly as a dictionary).

## <span style="color:orange">Variables</span>

Variables can be categorized into 3 classes:
* `states_variables`: variables used as coordinates to describe our system.
* `agent_variables`: variables controlled by the agents. Their values are changed at each iteration.
* The 3rd category gathers the other variables. They are not used to describe our environment, but they can be useful to monitor information or to compute intermediate values. 
  There is no key for this kind of variable: they are the variables present in the `initial_values` field that are neither `states_variables` nor `agent_variables`.

You can access the names of the state and agent variables with the attributes `states_variables` and `agent_variables`.

    "states_variables" : ["pos_x", "pos_y", "acceleration_x", "acceleration_y", "speed_x", "speed_y", "angle"],
    "agent_variables" : ["booster", "alpha"]

## <span style="color:orange">Initial system</span>

After naming the `states_variables` and `agent_variables`, the next step is to define the initial state.
It will be used as the environment coordinates at the beginning of each episode. It lists first 
the values of the `states_variables`, followed by the `agent_variables`, and finally the other variables that are 
not used for the system's coordinates.

    "initial_values" : {
      "pos_x" : [100.0],
      "pos_y" : [175.0],
      "acceleration_x": [0.0],
      "acceleration_y": [0.0],
      "speed_x": [0.0],
      "speed_y": [0.0],
      "angle" : [0.0],
      "booster" : [0.0],
      "alpha" : [0.0],
      "m_fuel" : [100],
      "G" : [1.62],
      "futur_pos_x" : [100.0],
      "futur_pos_y" : [175.0],
      "m_fuel_ini" : [100.0],
      "pos_y_star": [0.0],
      "pos_x_star": [120.0]
      }


## <span style="color:orange">Limit</span>

Q-learning algorithms model events as a Markov process. Therefore, it is necessary to discretize our environment space. 
We define lower and upper bounds, as well as the number of divisions we want to use to discretize the variable space.


    "limit" : {
      "pos_x" : [0.0, 300, 61],
      "pos_y" : [0.0, 300.0, 61],
      "acceleration_x": [-20.0, 20.0, 21],
      "acceleration_y": [-20.0, 20.0, 21],
      "speed_x": [-50.0, 50.0, 21],
      "speed_y": [-50.0, 50.0, 21],
      "angle" : [-0.8, 0.8, 17],
      "booster" : [0.0, 1.0, 3],
      "alpha" : [-0.1, 0.1, 3],
      "m_fuel" : [0.0, 100, 101]
    }


## <span style="color:orange">Agents' actions</span>
    "n_action" : {
      "booster": {"0" : 0.0, "1" : 0.5, "2" : 1.0},
      "alpha": {"0" : -0.1, "1" : 0.0, "2" : 0.1}
    }

After defining the variables and their initial values, we proceed to define actions that apply to the agent variables. In this example, we have 2 agents, and each agent can take 3 actions:

For the booster:
  * "0": Booster is off.
  * "1": Booster is turned on to half of its power.
  * "2": Booster is turned on to its full power.

For alpha:
  * "0": Rotate rocket to -0.1 radians.
  * "1": No rotation.
  * "2": Rotate rocket to 0.1 radians.

## <span style="color:orange">Actions to take</span>

    "action_to_take" : {
      "booster": {"$booster$" : "$action$"},
      "alpha": {"$alpha$" : "$alpha$ + $action$"}
    }

Actions change the agent variables by modifying their values based on the action taken, which are retrieved from the n_action dictionary.

## <span style="color:orange">System's evolution and reward</span>

The last fields describe how the variables evolve after the agents' actions and how the rewards are computed.
Only variables present in the `initial_values` field are stored. Any other variable is temporary and is lost after each iteration.
Reward values are stored in a dictionary whose keys are the agent variables' names. 

### <span style="color:orange">Equation variables</span>

    "equations_variables": {
        "$F$" : "600",
        "$m_fuel$" : "$m_fuel$ - $booster$",
        "$weight_rocket$" : "5 + $m_fuel$",
        "dt" : "0.5",
        "$theta$" : "0.0",
        "$x_0$" : "$pos_x$",
        "$y_0$" : "$pos_y$",
        "$Vx_0$" : "$speed_x$",
        "$Vy_0$" : "$speed_y$",
        "$angle$" : "$theta$ + $alpha$",
        "$acceleration_x$" : "($F$/(5+$weight_rocket$) * np.sin($angle$)) * $booster$",
        "$acceleration_y$" : "($F$/(5+$weight_rocket$) * np.cos($angle$)) * $booster$ - $G$",
        "$speed_x$" : "($F$/(5+$weight_rocket$) * np.sin($angle$)) * $booster$ * $dt$ + $Vx_0$",
        "$speed_y$" : "($F$/(5+$weight_rocket$) * np.cos($angle$)) * $booster$ * $dt$ - $G$ * $dt$ + $Vy_0$",
        "$pos_x$": "(0.5 * $F$/(5+$weight_rocket$) * np.sin($angle$)) * $booster$ * $dt$**2 + $Vx_0$ * $dt$ + $x_0$",
        "$pos_y$": "(0.5 * $F$/(5+$weight_rocket$) * np.cos($angle$)) * $booster$ * $dt$**2 - $G$ * $dt$**2 + $Vy_0$ * $dt$ + $y_0$",
        "$futur_pos_y$" : "$pos_y$ + 3 * $speed_y$",
        "$futur_pos_x$" : "$pos_x$ + 3 * $speed_x$",
        "$normalize_y_min$" :"np.abs(min($normalize_y_min$, $futur_pos_y$) )",
        "max_speed" : "max(($speed_x$**2 - $speed_y$**2), $max_speed$)"
    },

### <span style="color:orange">Reward</span>

    "equations_rewards": {
      "$booster$" : "-($pos_y$ - $pos_y_star$)**2 + $m_fuel$/$m_fuel_ini$",
      "$alpha$" : " -($pos_x$ - $pos_x_star$)**2 - np.sin($alpha$)"
    }

# <span style="color:orange">Let's see how the system evolves</span>

Preliminary work must be done on the reward function. Indeed, we have 2 agents, and each one has an impact on the rocket's trajectory. 

Let's simulate a simple case. Our rocket starts with no speed. The only force applied to it is gravity (G is the gravitational acceleration). 
The rocket falls straight down onto the planetoid (the angle is zero, so there is no need to correct it) and without friction. The goal is to reach a point (pos_x_star, pos_y_star).

We start the booster engine when we are close to the ground. For that, we extrapolate the position from the current speed (`futur_pos_y = pos_y + 3 * speed_y`).
When `futur_pos_y` is below zero, the rocket fires its engine to compensate for the fall speed and avoid a crash. 

In [5]:
# Build the environment from the rules described in the JSON file above.
env = Environment("rocket_tuto.json", check_model = False)
# Booster action key sent at the next step ("0" = off, "2" = full power).
flag = "0"
flag_to_continue = True
# Record the booster action taken at each iteration.
actions = {"action_booster" : []}
while flag_to_continue:
    # First entry is the booster action; the second entry (alpha) is always 1.
    # NOTE(review): booster is passed as a str key and alpha as an int —
    # confirm that Environment.step accepts both forms.
    env.step([flag, 1])
    actions["action_booster"].append(flag)
    # Fire the engine at full power once the projected height reaches the
    # ground, as long as some fuel is left.
    if env.futur_pos_y[-1] <= 0 and env.m_fuel[-1] > 0:
        flag = "2"
    # Stop the engine when there is no fuel. `<=` rather than `==` so the
    # cut-off still triggers if the fuel mass is ever stored slightly below
    # zero (float rounding or an overshoot of the last decrement).
    if env.m_fuel[-1] <= 0:
        flag = "0"
    # Stop the simulation once the rocket has gone through the ground.
    if env.pos_y[-1] < 0:
        flag_to_continue = False
        # Drop the last state because the rocket is below the ground.
        env.delete_last_states()

Each state of our system is saved in `env`. We can access the last state of our system using the method `last_state()`.
If we want to access a range of states, we use the method `all_states()` (by default, all states are loaded).

Let's visualize how the system evolves!

In [6]:
# Put every recorded state side by side with the booster action log in one
# pandas data frame, then append an `iter` column (0-based iteration index)
# that serves as the time axis in the plots.
df_traj = pd.concat(
    [pd.DataFrame(env.all_states()), pd.DataFrame(actions)],
    axis=1,
).assign(iter=lambda frame: np.arange(0, frame.shape[0]))

In [12]:
# Three stacked panels: altitude, vertical kinematics, remaining fuel.
fig = make_subplots(rows=3, cols=1)

# Build each panel with plotly express first, then copy its traces into the
# shared subplot figure below.
traj_by_iter = df_traj.set_index('iter')
fig1 = px.line(traj_by_iter[['pos_y', 'futur_pos_y']])
fig2 = px.line(traj_by_iter[['acceleration_y', 'speed_y']].rename(
    columns = {'acceleration_y' : 'acceleration_y (y/time²)',
               'speed_y': 'speed_y (y/time)'
               }))
fig3 = px.line(traj_by_iter[['m_fuel']])

# One pass per row: copy the traces and label the x axis. The figure has
# exactly 3 rows, so only rows 1..3 are addressed.
for row, panel in enumerate((fig1, fig2, fig3), start=1):
    for d in panel.data:
        fig.add_trace(go.Scatter(x=d['x'], y=d['y'], name=d['name']), row=row, col=1)
    fig.update_xaxes(title_text="time", row=row, col=1)

# Y-axis titles are set on the combined figure `fig`; setting the third one
# on `fig3` would leave the fuel panel untitled.
fig.update_yaxes(title_text="Height (y)", row=1, col=1)
fig.update_yaxes(title_text="Unit", row=2, col=1)
fig.update_yaxes(title_text="fuel mass", row=3, col=1)

fig.update_layout(height=600, width=600, title_text="Rocket first attempt")
fig.show()



At t=0, the rocket is held in the air and its velocity is 0. Once released, the only force acting on the rocket is gravity. 
Therefore, our rocket will accelerate constantly and gain speed. At t=24, the projection of the rocket's position is below 0. 
This means that the rocket will reach the ground in 3 time steps if nothing is done. To avoid catastrophe, the boosters are activated. 
The boosters provide an acceleration that counteracts gravity. 
Moreover, as the rocket burns fuel, its weight decreases, so the acceleration increases and the rocket gains speed in the direction opposite to its fall.

# <span style="color:orange">Design reward functions</span>

Let's visualize the evolution of the reward functions by using the attribute `rewards`. As a reminder, the reward function for the booster corresponds to the negative squared vertical distance to pos_y_star plus the fuel ratio,
while for alpha (which corresponds to the angle of the rocket), the reward function corresponds to the negative squared horizontal distance to pos_x_star, minus the sine of the rocket's tilt angle.

In [13]:
# One line per agent: reward recorded by the environment at every iteration.
fig = px.line(
    pd.DataFrame(env.rewards),
    labels={"index": "time", "value": "Reward", "variable": "Agent"},
)
# Last expression of the cell, so the updated figure is rendered.
fig.update_layout(height=400, width=600, title_text="Agent's reward")


## <span style="color:orange">Constraint acceleration and speed</span>


The current reward function for the booster is far from optimal. As evident from the data, the value increases steadily and peaks at the 26th iteration. However, the engine won't start until then, risking a crash. Instead of using `pos_y`, we can use `futur_pos_y` to allow ample time to start the engine and prevent a collision. While predicting the position 3 time steps ahead works well in this example, in other scenarios the rocket's speed may be excessive, rendering 3 time steps insufficient. To mitigate such scenarios, we can also impose constraints on the vehicle's acceleration and speed.

We can shape the speed penalty so that the system is heavily penalized as soon as it begins to go out of bounds:

$$ -\exp( |speed\_y| - speed\_y\_limit) $$ 

The same reasoning can be applied to acceleration.

$$ -\exp( |acceleration\_y| - acceleration\_y\_limit) $$ 

In [14]:
# Thresholds and sample ranges used to illustrate the two penalty curves.
speed_y_limit = 10
speed_y = np.arange(-15,16)

acceleration_y_limit = 6
acceleration_y = np.arange(-11,12)

# Each penalty is -exp(|value| - limit): close to zero inside the bounds and
# exponentially negative beyond them.
fig = go.Figure(data=[
    go.Scatter(
        x = speed_y,
        y = -np.exp(np.abs(speed_y) - speed_y_limit),
        name='Speed constraint',
        mode='lines+markers',
    ),
    go.Scatter(
        x = acceleration_y,
        y = -np.exp(np.abs(acceleration_y) - acceleration_y_limit),
        name='Acceleration constraint',
        mode='lines+markers',
    ),
])

fig.update_xaxes(title_text = "Speed or acceleration")
fig.update_yaxes(title_text = "Penalty")
# Last expression of the cell, so the figure is rendered.
fig.update_layout(height=400, width=600, title_text="Function to penalize speed")

## <span style="color:orange">Height boundaries for `futur_pos_y`</span>


After we've imposed constraints on acceleration and speed, the final step is to discourage the system from crashing. 
To achieve this, we set a minimum bound for the height.

$$ -exp(y\_lower\_limit - futur\_pos\_y) $$

In [17]:
# Lower bound on the projected height and sample heights around it.
y_lower_limit = 1
futur_pos_y = np.arange(-4, 7)


# Penalty is -exp(y_lower_limit - height). Height is drawn on the vertical
# axis and the penalty on the horizontal axis, matching the axis titles below.
fig = go.Figure()
fig.add_trace(go.Scatter(
    y = futur_pos_y,
    x = -np.exp(- futur_pos_y + y_lower_limit),
    name='height constraint',
    mode='lines+markers'
))

fig.update_yaxes(title_text = "height")
fig.update_xaxes(title_text = "Penalty")
# Title names the height penalty shown here (it previously repeated the
# title of the speed plot).
fig.update_layout(height=400, width=600, title_text="Function to penalize height")

In [39]:
def booster_reward(states, acceleration_y_constraint, speed_y_limit, y_lower_limit):
    """Break the booster reward down into its individual terms, per iteration.

    Parameters
    ----------
    states
        Object indexable by variable name ("pos_y", "pos_y_star",
        "acceleration_y", "speed_y", "futur_pos_y", "m_fuel", "m_fuel_ini").
        Each entry must support element-wise arithmetic
        (presumably numpy arrays or pandas Series — confirm against the caller).
    acceleration_y_constraint
        Limit subtracted from |acceleration_y| inside the exponential penalty.
    speed_y_limit
        Limit subtracted from |speed_y| inside the exponential penalty.
    y_lower_limit
        Lower height bound used in the exp(y_lower_limit - futur_pos_y) penalty.

    Returns
    -------
    pandas.DataFrame
        One row per state with the columns "futur_height",
        "acceleration_constraint", "speed_constraint", "y_lim_constraint",
        "ratio_fuel", their row-wise total "sum_penalty", and "iter"
        (0-based row number).
    """
    # Despite the column name "futur_height", this term is computed from pos_y.
    height_term = -np.square(states["pos_y"] - states["pos_y_star"])
    acceleration_penalty = -np.exp(np.abs(states["acceleration_y"]) - acceleration_y_constraint)
    speed_penalty = -np.exp(np.abs(states["speed_y"]) - speed_y_limit)
    ground_penalty = -np.exp(y_lower_limit - states["futur_pos_y"])
    fuel_ratio = states["m_fuel"] / states["m_fuel_ini"]

    breakdown = pd.DataFrame({
        "futur_height" : height_term,
        "acceleration_constraint" : acceleration_penalty,
        "speed_constraint" : speed_penalty,
        "y_lim_constraint" : ground_penalty,
        "ratio_fuel" : fuel_ratio,
    })
    # The total is taken before "iter" is appended so the counter is not summed.
    breakdown["sum_penalty"] = breakdown.sum(axis=1)
    breakdown["iter"] = np.arange(len(breakdown))
    return breakdown

# Per-iteration reward breakdown for the simulated trajectory, using the same
# limits as in the illustrative plots (acceleration 6, speed 10, height 1).
df_penalty = booster_reward(
    env.all_states(),
    acceleration_y_constraint=6,
    speed_y_limit=10,
    y_lower_limit=1,
)

In [59]:
# Display the per-term penalty breakdown (one row per iteration).
df_penalty

Unnamed: 0,futur_height,acceleration_constraint,speed_constraint,y_lim_constraint,ratio_fuel,sum_penalty,iter
0,-30625.000000,-0.002479,-4.539993e-05,-2.708695e-76,1.0,-3.062400e+04,0
1,-30483.414025,-0.012525,-1.020549e-04,-4.613016e-75,1.0,-3.048243e+04,1
2,-30201.226225,-0.012525,-2.294099e-04,-1.177875e-73,1.0,-3.020024e+04,2
3,-29780.404900,-0.012525,-5.156924e-04,-4.509231e-72,1.0,-2.977942e+04,3
4,-29223.902500,-0.012525,-1.159229e-03,-2.588184e-70,1.0,-2.922292e+04,4
...,...,...,...,...,...,...,...
180,-14405.571967,-0.012525,-6.548595e+20,-6.111582e+23,0.0,-6.118131e+23,180
181,-8217.918924,-0.012525,-1.472064e+21,-3.952983e+37,0.0,-3.952983e+37,181
182,-3706.041017,-0.012525,-3.309064e+21,-3.833413e+51,0.0,-3.833413e+51,182
183,-942.292653,-0.012525,-7.438472e+21,-5.573596e+65,0.0,-5.573596e+65,183


In [87]:
# 2x2 grid: the top row shows a simulated quantity, the bottom row the
# penalty computed from it. Left column: vertical speed; right column:
# projected height.
fig = make_subplots(rows=2, cols=2)

# `add_trace(..., row=, col=)` is used instead of the deprecated
# `append_trace`; traces are added in the same order as before.
fig.add_trace(go.Scatter(x=df_traj['iter'],
                         y=df_traj['speed_y'],
                         name='speed on Y axis',
                         legendgroup = '1'),
              row=1, col=1)

fig.add_trace(go.Scatter(x=df_penalty['iter'],
                         y=df_penalty['speed_constraint'],
                         name='Penality on speed',
                         legendgroup = '2'),
              row=2, col=1)

fig.add_trace(go.Scatter(x=df_traj['iter'],
                         y=df_traj['futur_pos_y'],
                         name='futur_pos_y',
                         legendgroup = '1'),
              row=1, col=2)

fig.add_trace(go.Scatter(x=df_penalty['iter'],
                         y=df_penalty['y_lim_constraint'],
                         name='penality on ground',
                         legendgroup = '2'),
              row=2, col=2)

# Axes are numbered row by row: 1 = (1,1), 2 = (1,2), 3 = (2,1), 4 = (2,2).
fig.update_layout(height=600, width=1400,
                  title_text="Penalty on rocket",
                  legend_tracegroupgap = 280,
                  xaxis1_title = 'Iteration',
                  xaxis2_title = 'Iteration',
                  xaxis3_title = 'Iteration',
                  xaxis4_title = 'Iteration',
                  yaxis1_title = 'Speed',
                  yaxis2_title = 'Height',
                  yaxis3_title = 'Penalty',
                  yaxis4_title = 'Penalty')
fig.show()



In [85]:
# 2x2 grid: the top row shows a simulated quantity, the bottom row the
# penalty term plotted under it. Left column: projected height and the
# "futur_height" term; right column: vertical acceleration.
fig = make_subplots(rows=2, cols=2)

# `add_trace(..., row=, col=)` is used instead of the deprecated
# `append_trace`; traces are added in the same order as before.
fig.add_trace(go.Scatter(x=df_traj['iter'],
                         y=df_traj['futur_pos_y'],
                         name='futur_pos_y',
                         legendgroup = '1'),
              row=1, col=1)

fig.add_trace(go.Scatter(x=df_penalty['iter'],
                         y=df_penalty['futur_height'],
                         name='penality on height',
                         legendgroup = '2'),
              row=2, col=1)

fig.add_trace(go.Scatter(x=df_traj['iter'],
                         y=df_traj['acceleration_y'],
                         name='acceleration_y',
                         legendgroup = '1'),
              row=1, col=2)

fig.add_trace(go.Scatter(x=df_penalty['iter'],
                         y=df_penalty['acceleration_constraint'],
                         name='penality on acceleration',
                         legendgroup = '2'),
              row=2, col=2)

# Axes are numbered row by row: 1 = (1,1), 2 = (1,2), 3 = (2,1), 4 = (2,2).
fig.update_layout(height=600, width=1400,
                  title_text="Penalty on rocket's altitude",
                  legend_tracegroupgap = 280,
                  xaxis1_title = 'Iteration',
                  xaxis2_title = 'Iteration',
                  xaxis3_title = 'Iteration',
                  xaxis4_title = 'Iteration',
                  yaxis1_title = 'Height',
                  yaxis2_title = 'Acceleration',
                  yaxis3_title = 'Penalty',
                  yaxis4_title = 'Penalty')
fig.show()

In [89]:
# Total booster reward (sum of all terms) at each iteration.
fig = go.Figure(data=[
    go.Scatter(
        x = df_penalty["iter"],
        y = df_penalty["sum_penalty"],
        name='Sum penality',
        mode='lines+markers',
    )
])
# Bare last expression so the notebook renders the figure.
fig

In [88]:
# Display the penalty breakdown table again, next to the plots above.
df_penalty

Unnamed: 0,futur_height,acceleration_constraint,speed_constraint,y_lim_constraint,ratio_fuel,sum_penalty,iter
0,-30625.000000,-0.002479,-4.539993e-05,-2.708695e-76,1.0,-3.062400e+04,0
1,-30483.414025,-0.012525,-1.020549e-04,-4.613016e-75,1.0,-3.048243e+04,1
2,-30201.226225,-0.012525,-2.294099e-04,-1.177875e-73,1.0,-3.020024e+04,2
3,-29780.404900,-0.012525,-5.156924e-04,-4.509231e-72,1.0,-2.977942e+04,3
4,-29223.902500,-0.012525,-1.159229e-03,-2.588184e-70,1.0,-2.922292e+04,4
...,...,...,...,...,...,...,...
180,-14405.571967,-0.012525,-6.548595e+20,-6.111582e+23,0.0,-6.118131e+23,180
181,-8217.918924,-0.012525,-1.472064e+21,-3.952983e+37,0.0,-3.952983e+37,181
182,-3706.041017,-0.012525,-3.309064e+21,-3.833413e+51,0.0,-3.833413e+51,182
183,-942.292653,-0.012525,-7.438472e+21,-5.573596e+65,0.0,-5.573596e+65,183
