In [1]:
# Bibliotecas
import sys, pandas as pd, numpy as np
import plotly.express as px
import seaborn as sns, numpy as np, os
from os.path import exists
sys.path.append('../')

from env.RiverProblem import RiverProblem
from rl_utils.VizTools import VizTools

# Modelos
from models.ValueIteration import ValueIteration
from models.PolicyIteration import PolicyIteration
from models.RS_PolicyIteration import RS_PolicyIteration
from models.PieceLinear_RSPI import PieceLinear_RSPI

from evaluation.AverageCosts import AverageCosts

%load_ext autoreload
%autoreload 2

# Experimentos - 1S2023

|Data|Descritivo|
|--|--|
|03-2023|Primeiros experimentos com confecção de artigo|

---

## Referências

**Exponential**

2013 MICAI: Shortest Stochastic Path with Risk Sensitive Evaluation \
ou 2016 BRACIS: Extreme Risk Averse Policy for Goal-Directed Risk-Sensitive MDP

**Piecewise Linear**

2020 MICAI: Risk-Sensitive Piecewise-Linear Policy Iteration for SSP-MDP

In [28]:
# ----------------------------------------------
# -- Initial configuration ---------------------
# ----------------------------------------------

# Switches selecting which experiment sections of the notebook are executed.
run_value_iteration, run_policy_iteration = False, False
run_EUF_RSMDP = True

# -- River environment configuration
grid_size, goal_state = (5, 20), (4, 0)

# Environment instance and its block-type layout, shared by the cells below.
rp = RiverProblem(grid_size, goal_state, dead_end=False)
block_type = rp.build_block_type()

In [29]:
def run_river_problem(river_flow, model_obj, epsilon=None, vl_lambda=None, discount_factor=None,
                      k=None, gamma=None, alpha=None, _log=False):
    """Build a model for the River Problem with the given flow and run it to convergence.

    Relies on the notebook-level globals ``rp``, ``block_type``, ``grid_size``,
    ``goal_state`` and ``costs``.

    The model constructor is chosen from the arguments supplied (``epsilon`` is
    required in every case):

    * ``vl_lambda`` given: ``model_obj(..., costs, vl_lambda, num_actions=4,
      discount_factor=..., epsilon=...)``. Checked first so that ``vl_lambda``
      is never silently dropped when ``discount_factor`` is also supplied.
    * ``discount_factor`` given: ``model_obj(..., costs, num_actions=4,
      discount_factor=..., epsilon=...)``.
    * ``k``, ``gamma`` and ``alpha`` all given: ``model_obj(..., costs, k, alpha,
      gamma, num_actions=4, epsilon=...)``.

    Parameters
    ----------
    river_flow : value forwarded to ``rp.build_transition_probabilities``.
    model_obj : class/callable building the model; the result must expose
        ``run_converge()`` returning ``(num_iterations, elapsed_time)``.
    epsilon, vl_lambda, discount_factor, k, gamma, alpha : model parameters (see above).
    _log : when true, prints the number of iterations and the execution time.

    Returns
    -------
    The model instance after ``run_converge()`` has been called.

    Raises
    ------
    ValueError
        If the arguments do not describe any supported model configuration
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if epsilon is None:
        raise ValueError('run_river_problem requires `epsilon` for every model type.')

    has_piecewise_params = k is not None and gamma is not None and alpha is not None
    if vl_lambda is None and discount_factor is None and not has_piecewise_params:
        raise ValueError(
            'run_river_problem needs `vl_lambda`, `discount_factor`, '
            'or all of `k`, `gamma` and `alpha` in addition to `epsilon`.'
        )

    transition_probabilities = rp.build_transition_probabilities(block_type, river_flow=river_flow)

    # Sanity check on the transition model; only reported, never fatal.
    is_ok, _ = rp._verify_sum_probabilities(transition_probabilities, block_type)
    if not is_ok:
        print(f'Soma das Probabilidades está válida? {is_ok}')

    if vl_lambda is not None:
        model = model_obj(grid_size, goal_state, transition_probabilities, costs, vl_lambda,
                          num_actions=4, discount_factor=discount_factor, epsilon=epsilon)
    elif discount_factor is not None:
        model = model_obj(grid_size, goal_state, transition_probabilities, costs,
                          num_actions=4, discount_factor=discount_factor, epsilon=epsilon)
    else:
        model = model_obj(grid_size, goal_state, transition_probabilities, costs, k, alpha, gamma,
                          num_actions=4, epsilon=epsilon)

    num_iterations, elapsed = model.run_converge()
    if _log:
        print(f'Número de Iterações: {num_iterations}')
        print(f'Tempo de Execução: {elapsed}s')

    return model

# Value Iteration

In [22]:
# Value Iteration - configuration (only assigned when this experiment is enabled)
if run_value_iteration:
    discount_factor, epsilon = 0.95, 0.001

    # Unit cost for each key 0..3 (presumably the action ids; models are built with num_actions=4).
    costs = dict.fromkeys(range(4), 1)
    block_type = rp.build_block_type()

In [23]:
# Value Iteration - Run
if run_value_iteration:
    # One model per river flow; the variable suffix encodes the flow (01 -> 0.1, 05 -> 0.5, 09 -> 0.9).
    value_iteration_01 = run_river_problem(
        river_flow=0.1, model_obj=ValueIteration, epsilon=epsilon, discount_factor=discount_factor)
    # Previously trained with river_flow=0.75, which contradicted both the variable name
    # and the equivalent Policy Iteration run (0.5).
    value_iteration_05 = run_river_problem(
        river_flow=0.5, model_obj=ValueIteration, epsilon=epsilon, discount_factor=discount_factor)
    value_iteration_09 = run_river_problem(
        river_flow=0.9, model_obj=ValueIteration, epsilon=epsilon, discount_factor=discount_factor)

In [24]:
# Value Iteration - show the three trained models, in order of increasing river flow
if run_value_iteration:
    display(value_iteration_01, value_iteration_05, value_iteration_09)

# Policy Iteration

Definimos o algoritmo de Policy Iteration da seguinte forma:

1. Escolhe uma política $\pi_0$ arbitrária;
2. Define $i=0$ como a quantidade de iterações;
3. Para cada iteração, enquanto ($i=0$ ou $\pi_i \neq \pi_{i-1}$) faça: \
    3.1. Policy Evaluation: obtém o valor da política atual $\pi_i$ para cada $s \in S$ resolvendo o seguinte sistema de equações: \
    $$V^{\pi_i}(s)=r(s,\pi_i(s))+\sum_{s'\in S} T(s, \pi_i(s), s') V^{\pi_i}(s') \quad \forall s \in S$$
    3.2. Policy Improvement: melhora a política atual realizando uma atualização para todo $s \in S$: \
    $$\pi_{i+1}(s)=\text{argmax}_{a \in A_s}[r(s,a)+\sum_{s'\in S} T(s, a, s') V^{\pi_i}(s')]$$

In [25]:
# Policy Iteration - configuration (only assigned when this experiment is enabled)
if run_policy_iteration:
    discount_factor, epsilon = 0.99, 0.001
    # Unit cost for each key 0..3 (presumably the action ids; models are built with num_actions=4).
    costs = dict.fromkeys(range(4), 1)

In [26]:
# Policy Iteration - train one model per river flow (0.1, 0.5 and 0.9)
if run_policy_iteration:
    policy_iteration_01, policy_iteration_05, policy_iteration_09 = (
        run_river_problem(river_flow=flow, model_obj=PolicyIteration,
                          epsilon=epsilon, discount_factor=discount_factor)
        for flow in (0.1, 0.5, 0.9)
    )

In [27]:
# Policy Iteration - show the three trained models, in order of increasing river flow
if run_policy_iteration:
    display(policy_iteration_01, policy_iteration_05, policy_iteration_09)

# Exponential Utility Function for RS-MDP

Definimos o algoritmo de Risk Sensitive MDP da seguinte forma:

1. Escolhe uma política $\pi_0$ arbitrária;
2. Define $i=0$ como a quantidade de iterações;
3. Para cada iteração, enquanto ($i=0$ ou $\pi_i \neq \pi_{i-1}$) faça: \
    3.1. Policy Evaluation: obtém o valor da política atual $\pi_i$ para cada $s \in S$ resolvendo o seguinte sistema de equações: \
    $$V^{\pi_i}(s_G)=-\text{sign}(\lambda)$$
    $$V^{\pi_i}(s)=\text{exp}(-\lambda r(s,\pi_i(s)))\sum_{s'\in S} T(s, \pi_i(s), s') V^{\pi_i}(s') \quad \forall s \in S,\ s \neq s_G$$
    3.2. Policy Improvement: melhora a política atual realizando uma atualização para todo $s \in S$: \
    $$\pi_{i+1}(s)=\text{argmax}_{a \in A_s}[\text{exp}(-\lambda r(s,a))\sum_{s'\in S} T(s, a, s') V^{\pi_i}(s')]$$
    
    
When λ < 0 (risk prone) and the policy π is proper, then π is also λ-feasible. However, this is not guaranteed for all policies when λ > 0 (risk averse). Given a GD-RSMDP, there is no known result on how to determine the set of λ > 0 for which a λ-feasible policy exists.


In [30]:
# Exponential Utility Function for RS-MDP - configuration
if run_EUF_RSMDP:
    # Container for the models produced by this experiment.
    RSMDP = dict()

    # Parameter grid: river flows and risk factors (lambda) to evaluate.
    range_river_flow, range_lambda = [0.2], [-1]
    epsilon = 0.001

    # Unit cost for each key 0..3 (presumably the action ids; models are built with num_actions=4).
    costs = dict.fromkeys(range(4), 1)

In [31]:
# Exponential Utility Function for RS-MDP - Run
if run_EUF_RSMDP:
    # `discount_factor` is only assigned by the optional Value/Policy Iteration config cells.
    # It is used here purely as the third element of the result key (no discount factor is
    # passed to the model), so fall back to None instead of failing with a NameError.
    rsmdp_discount_factor = globals().get('discount_factor')

    for river_flow in range_river_flow:
        for vl_lambda in range_lambda:
            # Store into `RSMDP` (the dict created in the config cell and read by every later
            # cell); the previous target `EUF_RSMDP` was never defined.
            RSMDP[(river_flow, vl_lambda, rsmdp_discount_factor)] = run_river_problem(
                river_flow=river_flow, model_obj=RS_PolicyIteration,
                vl_lambda=vl_lambda, epsilon=epsilon)

TypeError: __init__() missing 1 required positional argument: 'vl_lambda'

In [None]:
# Exponential Utility Function for RS-MDP - Keys
# Lists the keys under which models are stored in RSMDP.
if run_EUF_RSMDP:
    display(RSMDP.keys())

In [None]:
# Exponential Utility Function for RS-MDP - Display
if run_EUF_RSMDP:
    # The hard-coded key is kept as the preferred entry, but it is not produced by the
    # current configuration (range_lambda = [-1]) and used to raise a KeyError.
    # Fall back to the first stored model when it is absent.
    preferred_key = (0.2, -5, 0.95)
    shown_key = preferred_key if preferred_key in RSMDP else next(iter(RSMDP), None)
    if shown_key is not None:
        display(RSMDP[shown_key])

# PieceLinear - RSPI (Risk Sensitive Policy Iteration)

Define-se uma transformação linear por partes $X^{(k)}$, que depende do sinal de um parâmetro $x$ (a diferença temporal) e de um fator de risco $k$:

$$X^{(k)}(x)= \left\{ \begin{array}{ll} (1-k)x & \text{, if } x < 0, \\ (1+k)x & \text{, otherwise.} \end{array} \right. $$

1. Escolhe uma política $\pi_0$ arbitrária;
2. Define $i=0$ como a quantidade de iterações;
3. Para cada iteração, enquanto ($i=0$ ou $\pi_i \neq \pi_{i-1}$) faça: \
    3.1. Policy Evaluation: obtém o valor da política atual $\pi_i$ para cada $s \in S$ resolvendo o seguinte sistema de equações: \
    $$
    O^{\pi}_{\alpha k}[V](s) = V(s) + \alpha \sum_{s'\in S} P(s'|s, \pi(s)) X^{(k)}(C(s,\pi(s),s')+\gamma V(s') - V(s))    
    $$
    3.2. Policy Improvement: melhora a política atual realizando uma atualização para todo $s \in S$: \
    $$
    \pi'(s)=\text{argmin}_{a\in A}\sum_{s'\in S} P(s'|s, a) X^{(k)}(C(s,a,s')+\gamma V(s') - V^\pi_k(s))
    $$

In [None]:
# Result container and visualisation helper for the piecewise-linear RSPI experiment.
RSPI, viz_tools = {}, VizTools()

# Environment, declared again with the same values as the initial configuration cell.
grid_size, goal_state = (5, 20), (4, 0)
rp = RiverProblem(grid_size, goal_state, dead_end=False)

# Unit cost for each key 0..3 (presumably the action ids; models are built with num_actions=4).
costs = dict.fromkeys(range(4), 1)
block_type = rp.build_block_type()

# Parameter grid; the trailing comments keep the full sweeps for reference.
range_river_flow = [0.2]  # [rf/10 for rf in range(1, 10)]
range_k = [0.9999999]  # [v/10 for v in range(-9, 10)]
gamma, alpha = 1, 0.95  # TODO: check which alpha value is needed for convergence

In [None]:
# run_river_problem only builds a k/gamma/alpha model when `epsilon` is supplied as well;
# without it no model was created and the call failed. 0.001 is the tolerance used by the
# other experiments in this notebook.
rspi_epsilon = 0.001

for river_flow in range_river_flow:
    for k in range_k:
        # Keys are (river_flow, k, gamma, alpha).
        RSPI[(river_flow, k, gamma, alpha)] = run_river_problem(
            river_flow=river_flow, model_obj=PieceLinear_RSPI,
            gamma=gamma, k=k, alpha=alpha, epsilon=rspi_epsilon)

In [None]:
# Keys of the stored RSPI models; each key is (river_flow, k, gamma, alpha).
RSPI.keys()

In [None]:
# Show one stored RSPI model; the key is (river_flow, k, gamma, alpha).
# Alternative entry kept for reference (not produced by the current parameter grid):
#RSPI[(0.5, -0.9, 1, 0.95)]
RSPI[(0.2, 0.9999999, 1, 0.95)]

# Comparando Políticas Geradas

* Fixar em um ambiente (0.2 por exemplo) em que o neutro atravessa no meio do rio, para observar mais facilmente os comportamentos;
* E para um só tamanho para validar o processo;
* Residual

Policy Evaluation Exponencial

1 - 
PEXP(env, lambda, pi) -> R, avaliação da política para este mundo com lambda/pi;

Gráfico: PEXP(env, lambda, pi_lambda) por lambda
         PEXP(env, lambda, pi_linear) por lambda

pi_linear = argmax_{pi_k} PEXP(env, lambda, pi_k)

2 - Contrário da (1)
P(env, K, pi) -> R, avaliação da política para este mundo com K/pi;

3 -
Lambda, K

k_max = argmax_{k} PEXP(env, lambda, pi_k)

4 - Contrário da (3)

lambda_max = argmax_{lambda} PLINEAR(env, k, pi_lambda)

---

_* Calcular a variância de uma política através de programação dinâmica, sem usar simulações_

### RS-MDP with Exponential Utility Function

In [None]:
def build_res_dataframe_RSMDP(d, c, path, model):
    """Turn a ``{key: value}`` result dict into a DataFrame and save it to Excel.

    Parameters
    ----------
    d : dict whose keys are indexable with at least three elements, read as
        (RiverFlow, Lambda, DiscountFactor).
    c : the two column names for the (key, value) pairs; the first must be ``'Keys'``.
    path : destination passed to ``DataFrame.to_excel``.
    model : value stored in the leading ``'Model'`` column.

    Returns
    -------
    DataFrame with columns ``Model``, ``Keys``, ``RiverFlow``, ``Lambda``,
    ``DiscountFactor`` followed by the value column.
    """
    table = pd.DataFrame(d.items(), columns=c)
    table.insert(0, 'Model', model)

    # Expand each element of the key into its own column, right after 'Keys'.
    keys = table['Keys']
    key_columns = ('RiverFlow', 'Lambda', 'DiscountFactor')
    for element, column in enumerate(key_columns):
        table.insert(element + 2, column, [key[element] for key in keys])

    table.to_excel(path)

    return table

In [None]:
# --------------------------------------------------------------------
# Simulations for RS-MDP with Exponential Utility Function -----------
# --------------------------------------------------------------------

nm_model = 'EXP-UTILITY_RSMDP-v4'
total_simulations = 10000
average_costs, std_costs, dict_costs = {}, {}, {}

nm_RSMDP_path_std = os.path.join('saidas', f'SAIDA_STD_{nm_model}.xlsx')
nm_RSMDP_path_avg = os.path.join('saidas', f'SAIDA_MEAN_{nm_model}.xlsx')

if exists(nm_RSMDP_path_std) and exists(nm_RSMDP_path_avg):
    # Both result files already exist: reuse them instead of re-running the simulations.
    RSMDP_std_costs = pd.read_excel(nm_RSMDP_path_std)
    RSMDP_average_costs = pd.read_excel(nm_RSMDP_path_avg)
else:
    # Create the output folder up front so the Excel export cannot fail on a missing
    # directory after all simulations have already been run.
    os.makedirs(os.path.dirname(nm_RSMDP_path_std), exist_ok=True)

    for key in RSMDP.keys():
        print(f'Processando [{key}]', end='\r')
        ac = AverageCosts(RSMDP[key], costs, give_up=1000)
        res_costs, res_it = ac.run_simulations(total_simulations)
        # One cost per simulation (res_costs is consumed as a mapping).
        lst_costs = list(res_costs.values())

        dict_costs[key] = lst_costs
        average_costs[key] = np.mean(lst_costs)
        std_costs[key] = np.std(lst_costs)

    RSMDP_std_costs = build_res_dataframe_RSMDP(
        std_costs, ['Keys', 'Std'], path=nm_RSMDP_path_std, model=nm_model
    ).sort_values(by=['RiverFlow', 'Lambda'])
    RSMDP_average_costs = build_res_dataframe_RSMDP(
        average_costs, ['Keys', 'Mean'], path=nm_RSMDP_path_avg, model=nm_model
    ).sort_values(by=['RiverFlow', 'Lambda'])

In [None]:
# String version of Lambda, used as the x axis of the line plots.
RSMDP_std_costs['Lambda_STR'] = list(map(str, RSMDP_std_costs['Lambda']))
RSMDP_average_costs['Lambda_STR'] = list(map(str, RSMDP_average_costs['Lambda']))

In [None]:
# Order both summaries by the string form of Lambda, then RiverFlow, descending
# (note: Lambda_STR is compared as text, i.e. lexicographically).
RSMDP_std_costs = RSMDP_std_costs.sort_values(['Lambda_STR', 'RiverFlow'], ascending=False)
RSMDP_average_costs = RSMDP_average_costs.sort_values(['Lambda_STR', 'RiverFlow'], ascending=False)

### PieceLinear - RSPI (Risk Sensitive Policy Iteration)

In [None]:
def build_res_dataframe_RSPI(d, c, path, model):
    """Turn a ``{key: value}`` RSPI result dict into a DataFrame and save it to Excel.

    Parameters
    ----------
    d : dict keyed by ``(river_flow, k, gamma, alpha)`` tuples, the layout used
        when the RSPI models are stored.
    c : the two column names for the (key, value) pairs; the first must be ``'Keys'``.
    path : destination passed to ``DataFrame.to_excel``.
    model : value stored in the leading ``'Model'`` column.

    Returns
    -------
    DataFrame with columns ``Model``, ``Keys``, ``RiverFlow``, ``K``, ``Alpha``,
    ``Gamma`` followed by the value column.
    """
    res = pd.DataFrame(list(d.items()), columns=c)
    res.insert(0, 'Model', model)
    res.insert(2, 'RiverFlow', [v[0] for v in res['Keys']])
    res.insert(3, 'K', [v[1] for v in res['Keys']])
    # Keys are (river_flow, k, gamma, alpha): gamma is element 2 and alpha element 3.
    # The two indices used to be swapped, mislabelling both columns.
    res.insert(4, 'Alpha', [v[3] for v in res['Keys']])
    res.insert(5, 'Gamma', [v[2] for v in res['Keys']])

    res.to_excel(path)

    return res

In [None]:
# --------------------------------------------------------------------------------
# Simulations for PieceLinear - RSPI (Risk Sensitive Policy Iteration) -----------
# --------------------------------------------------------------------------------

nm_model = 'LINEAR_RSPI'
total_simulations = 10000
average_costs, std_costs, dict_costs = {}, {}, {}

nm_LRSPI_path_std = os.path.join('saidas', f'SAIDA_STD_{nm_model}.xlsx')
nm_LRSPI_path_avg = os.path.join('saidas', f'SAIDA_MEAN_{nm_model}.xlsx')

if exists(nm_LRSPI_path_std) and exists(nm_LRSPI_path_avg):
    # Both result files already exist: reuse them instead of re-running the simulations.
    RSPI_std_costs = pd.read_excel(nm_LRSPI_path_std)
    RSPI_average_costs = pd.read_excel(nm_LRSPI_path_avg)
else:
    # Create the output folder up front so the Excel export cannot fail on a missing
    # directory after all simulations have already been run.
    os.makedirs(os.path.dirname(nm_LRSPI_path_std), exist_ok=True)

    for key in RSPI.keys():
        print(f'Processando [{key}]', end='\r')
        ac = AverageCosts(RSPI[key], costs, give_up=1000)
        res_costs, res_it = ac.run_simulations(total_simulations)
        # One cost per simulation (res_costs is consumed as a mapping).
        lst_costs = list(res_costs.values())

        dict_costs[key] = lst_costs
        average_costs[key] = np.mean(lst_costs)
        std_costs[key] = np.std(lst_costs)

    RSPI_std_costs = build_res_dataframe_RSPI(
        std_costs, ['Keys', 'Std'], path=nm_LRSPI_path_std, model=nm_model)
    RSPI_average_costs = build_res_dataframe_RSPI(
        average_costs, ['Keys', 'Mean'], path=nm_LRSPI_path_avg, model=nm_model)

In [None]:
# Order both RSPI summaries by K, then RiverFlow (descending), and drop the rows with K == -1.
RSPI_std_costs = (
    RSPI_std_costs
    .sort_values(by=['K', 'RiverFlow'], ascending=False)
    .loc[lambda frame: frame['K'] != -1]
)

RSPI_average_costs = (
    RSPI_average_costs
    .sort_values(by=['K', 'RiverFlow'], ascending=False)
    .loc[lambda frame: frame['K'] != -1]
)

# Análises

In [None]:
# Standard deviation of the simulated costs per risk parameter, one line per river flow.
# The titles were previously built by concatenating two literals without a separating
# space ("...for eachvalue of ...").
fig = px.line(RSMDP_std_costs, x='Lambda_STR', y='Std', color='RiverFlow',
              title='Standard Deviation for each value of Lambda in different River Flows')
fig.show()

fig = px.line(RSPI_std_costs, x='K', y='Std', color='RiverFlow',
              title='Standard Deviation for each value of K in different River Flows')
fig.show()

In [None]:
# Mean of the simulated costs per risk parameter, one line per river flow.
# The titles were previously built by concatenating two literals without a separating
# space ("Mean for eachvalue of ...").
fig = px.line(RSMDP_average_costs, x='Lambda_STR', y='Mean', color='RiverFlow',
              title='Mean for each value of Lambda in different River Flows')
fig.show()

fig = px.line(RSPI_average_costs, x='K', y='Mean', color='RiverFlow',
              title='Mean for each value of K in different River Flows')
fig.show()

# Outras Análises

In [None]:
# Standard deviation of the simulated RSPI costs against river flow, coloured by K.
fig = px.scatter(
    data_frame=RSPI_std_costs,
    x='RiverFlow',
    y='Std',
    color='K',
)
fig.show()

In [None]:
# Histogram of the simulated costs for one configuration.
# dict_costs only holds entries when the simulations were run in this session (it stays
# empty when the cached Excel files are loaded), so both lookups are guarded instead of
# raising IndexError / KeyError.
simulated_keys = list(dict_costs)
if len(simulated_keys) >= 6:
    print(simulated_keys[-6])

hist_key = (0.9, 0.5, 1, 0.95)  # (river_flow, k, gamma, alpha)
if hist_key in dict_costs:
    p = sns.histplot(dict_costs[hist_key])
else:
    print(f'No simulated costs stored for {hist_key}; available keys: {simulated_keys}')