In [1]:
import numpy as np
from scipy.stats import norm, laplace

from envs.vases_grid import VasesGrid, VasesEnvState #, print_state, str_to_state, state_to_str
from envs.utils import unique_perm, zeros_with_ones, printoptions
from envs.vases_spec import VasesEnvState2x3V2D3, VasesEnvSpec2x3V2D3, VasesEnvState2x3Broken, VasesEnvSpec2x3Broken

from value_iter_and_policy import vi_boltzmann, vi_boltzmann_deterministic
from occupancy_measure import compute_d_deterministic, compute_d_last_step_deterministic

In [2]:
def compute_g_deterministic(mdp, policy, p_0, T, d_last_step_list, feature_matrix):
    # base case
    G_prev = np.zeros((mdp.nS, feature_matrix.shape[1]))
    for s in range(mdp.nS):
        for a in range(mdp.nA):
            # due to env being deterministic, sprime=self.P[s][a][0][1] and p_sprime=1
            G_prev[mdp.P[s][a][0][1], :] += p_0[s] * policy[s, a] * feature_matrix[s,:]
    
    # recursive case
    for t in range(T-1):
        G = np.zeros((mdp.nS, feature_matrix.shape[1]))
        for s in range(mdp.nS):
            for a in range(mdp.nA):
                # due to env being deterministic, sprime=self.P[s][a][0][1] and p_sprime=1
                G[mdp.P[s][a][0][1], :] += policy[s, a] * (G_prev[s] + d_last_step_list[t][s] * feature_matrix[s,:]) 
        
        G_prev = np.copy(G)
    return G

In [3]:
def om_method(mdp, s_current, p_0, horizon, temp=1, epochs=1, learning_rate=0.2, r_prior=None, r_vec=None):
    '''Modified MaxCausalEnt that maximizes last step occupancy measure for the current state'''
     
    if r_vec is None:
        if False: #r_prior!= None:
            # Sample the initial reward from the prior
            r_vec = r_prior.rvs()
        else:
            r_vec = .01*np.random.randn(mdp.f_matrix.shape[1])
    print('Initial reward vector: {}'.format(r_vec))
        
    for i in range(epochs):
            # Compute the Boltzmann rational policy \pi_{s,a} = \exp(Q_{s,a} - V_s) 
            V, Q, policy = vi_boltzmann_deterministic(mdp, 1, mdp.f_matrix @ r_vec, horizon, temp) 
            
            # Compute the gradient
            d_last_step, d_last_step_list = compute_d_last_step_deterministic(mdp, policy, p_0, horizon, return_all=True)   
            G = compute_g_deterministic(mdp, policy, p_0, horizon, d_last_step_list, mdp.f_matrix)
            d_T_step = compute_d_deterministic(mdp, 1, policy, p_0, horizon+1)
            
            g_div_d_last_step = np.zeros(mdp.f_matrix.shape[1])
            if d_last_step[np.where(s_current)]!=0:
                g_div_d_last_step = G[np.where(s_current)]/d_last_step[np.where(s_current)]
            
            dL_dr_vec = g_div_d_last_step.flatten() + (s_current - d_T_step) @ mdp.f_matrix

            # Gradient of the prior
            if r_prior!= None: dL_dr_vec += r_prior.logdistr_grad(r_vec)
            
            # Gradient ascent
            r_vec = r_vec + learning_rate * dL_dr_vec
            
            if i%1==0:
                with printoptions(precision=4, suppress=True):
                    print('Epoch {}; Reward vector: {}'.format(i, r_vec))
    return r_vec

In [4]:
def experiment_wrapper(env,
                     horizon=22, #number of steps we assume the expert was acting previously
                     temp=1,
                     learning_rate=.03,
                     epochs = 200,
                     s_current=None,
                     uniform=False,
                     r_prior=None):

    print('Initial state:')
    env.print_state(env.init_state)

    if not uniform:
        p_0=np.zeros(env.nS)
        p_0[env.state_num[env.state_to_str(env.init_state)]] = 1
    else:
        p_0=np.ones(env.nS) / env.nS
    
    if s_current is None: s_current = np.copy(p_0)
    
    r_vec = om_method(env, s_current, p_0, horizon, temp, epochs, learning_rate, r_prior=r_prior)
    with printoptions(precision=4, suppress=True):
        print(); print('Final reward vector: ', r_vec)
    return r_vec

In [5]:
def forward_rl(env, r, h=40, temp=.1, steps_printed=15, current_s=None):
    '''Given an env and R, runs soft VI for h steps and rolls out the resulting policy'''
    V, Q, policy = vi_boltzmann_deterministic(env, 1, env.f_matrix @ r, h, temp) 
    
    if current_s is None: 
        env.reset()
    else:
        env.s = env.str_to_state(env.num_state[np.where(current_s)[0][0]])
    env.print_state(env.s); print()
    for i in range(steps_printed):
        a = np.random.choice(5,p=policy[env.state_num[env.state_to_str(env.s)],:])
        env.state_step(a)
        env.print_state(env.s)
        
        obs = env.s_to_f(env.s)
        
        print(obs, obs.T @ env.r_vec)
        print()

In [6]:
def grad_gaussian_prior(theta, theta_spec, sigma=1):
    return -(theta-theta_spec)/(sigma**2)

def grad_laplace_prior(theta, theta_spec, b=1):
    return (theta_spec-theta)/(np.fabs(theta-theta_spec)*b)

class norm_distr(object):
    def __init__(self, mu, sigma=1):
        self.mu = mu
        self.sigma = sigma
        self.distribution = norm(loc=mu, scale=sigma)
        
    def rvs(self):
        '''sample'''
        return self.distribution.rvs()
    
    def pdf(self, x):
        return self.distribution.pdf(x)
    
    def logpdf(self, x):
        return self.distribution.logpdf(x)
    
    def logdistr_grad(self, x):
        return (self.mu-x)/(self.sigma**2)
    
class laplace_distr(object):
    def __init__(self, mu, b=1):
        self.mu = mu
        self.b = b
        self.distribution = laplace(loc=mu, scale=b)
        
    def rvs(self):
        '''sample'''
        return self.distribution.rvs()
    
    def pdf(self, x):
        return self.distribution.pdf(x)
    
    def logpdf(self, x):
        return self.distribution.logpdf(x)
    
    def logdistr_grad(self, x):
        return (self.mu-x)/(np.fabs(x-self.mu)*self.b)

Order of the features of the state for the experiments below
- Number of broken vases
- Number of vases on tables
- Number of tablecloths on tables
- Number of tablecloths on floors
- Number of vases on desks
- Number of tablecloths on desks



### 1. Baseline: use $R_{rl}$ that rewards the agent for tablecloths on tables; both vases get broken

In [7]:
r_rl = np.array([0, 0, 1, 0, 0, 0], dtype='float32')
#env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
#forward_rl(env2x3v2d3, r_rl)

### 2. Experiment where we know that the starting state had no broken vases, and the expert broke no vases during its acting in the env

In [8]:
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
r_prior = laplace_distr(r_rl, 1)
r_learned = experiment_wrapper(env2x3v2d3, r_prior=r_prior, horizon=100)

Initial state:
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │
Initial reward vector: [-1.42820890e-02 -1.21519935e-02 -1.99767944e-05  8.98693476e-03
  2.43529692e-03  3.51360023e-03]
Epoch 0; Reward vector: [-0.4508  0.9291  0.0297 -0.0564 -0.0393  0.0869]
Epoch 1; Reward vector: [-0.4208  0.9065  0.0597 -0.0264  0.511   0.0658]
Epoch 2; Reward vector: [-0.3908  0.8769  0.0897  0.0036  0.4798  0.0361]
Epoch 3; Reward vector: [-0.3608  0.8473  0.1197 -0.0265  0.4489  0.007 ]
Epoch 4; Reward vector: [-0.3308  0.8179  0.1497  0.0031  0.4187 -0.0187]
Epoch 5; Reward vector: [-0.3008  0.7894  0.1797 -0.0407  0.395   0.0621]
Epoch 6; Reward vector: [-0.2708  0.7599  0.2097 -0.0107  0.3639  0.0325]
Epoch 7; Reward vector: [-0.2408  0.7305  0.2397  0.0191  0.3332  0.004 ]
Epoch 8; Reward vector: [-0.2108  0.7016  0.2697 -0.0197  0.3063 -0.002 ]
Epoch 9; Reward vector: [-0.1808  0.6727  0.2997  0.008

Epoch 106; Reward vector: [-0.2047  0.1157  0.6777  0.0028  0.0216  0.4416]
Epoch 107; Reward vector: [-0.1747  0.0903  0.7077 -0.0272  0.0165  0.4128]
Epoch 108; Reward vector: [-0.1447  0.067   0.7372  0.0025  0.0186  0.3852]
Epoch 109; Reward vector: [-0.1355  0.1171  0.7007 -0.0472  0.0016  0.4572]
Epoch 110; Reward vector: [-0.1055  0.0913  0.7307 -0.0172  0.0513  0.4291]
Epoch 111; Reward vector: [-0.0777  0.0734  0.7575  0.012   0.0253  0.4049]
Epoch 112; Reward vector: [-0.6406  0.525   0.331  -0.0199  0.3536  0.9059]
Epoch 113; Reward vector: [-0.6106  0.4957  0.361   0.0101  0.322   0.876 ]
Epoch 114; Reward vector: [-0.5806  0.4665  0.391  -0.0199  0.2903  0.8461]
Epoch 115; Reward vector: [-0.5506  0.4373  0.421   0.0101  0.2586  0.8162]
Epoch 116; Reward vector: [-0.5206  0.4082  0.451  -0.0199  0.2268  0.7864]
Epoch 117; Reward vector: [-0.4906  0.3792  0.481   0.0101  0.195   0.7566]
Epoch 118; Reward vector: [-0.4606  0.3503  0.511  -0.0199  0.1633  0.7268]
Epoch 119; R

In [9]:
# Combine the learned R_h + R* with a reward function R_rl that rewards the 
# agent for tablecloths on tables.
# No vases broken!
print(r_learned)
forward_rl(env2x3v2d3, r_learned, steps_printed=40, h=40)

[-0.41211439  0.27304429  0.80283766 -0.01617582  0.14804806  0.85994534]
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↓[0m │  │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↑[0m │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↑[0m │  │
[0. 1. 0. 0. 1. 1.] 0.0

│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  

### 3. Experiment where we know that the starting state had no broken vases, and the expert broke a vase while getting to the current state

In [10]:
env2x3v2d3 = VasesGrid(VasesEnvSpec2x3V2D3(), VasesEnvState2x3V2D3())
np.random.seed(1)
s_current = np.zeros(env2x3v2d3.nS)
s_current[1050] = 1

print('Current state:')
env2x3v2d3.print_state(env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current, r_prior=r_prior, horizon=100)

Current state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial state:
│[0;35;85m█[0m[0;32;85m█[0m│  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m↑[0m │  │  │
Initial reward vector: [ 0.01624345 -0.00611756 -0.00528172 -0.01072969  0.00865408 -0.02301539]
Epoch 0; Reward vector: [-0.5964  1.3927  0.0109 -0.1421 -0.5524  0.3488]
Epoch 1; Reward vector: [ 0.8873  1.3376  0.0409 -0.1121 -0.4991  0.2894]
Epoch 2; Reward vector: [ 0.8289  1.3083  0.0709 -0.0821 -0.4391  0.2623]
Epoch 3; Reward vector: [ 0.7708  1.2791  0.1009 -0.0521 -0.3791  0.2359]
Epoch 4; Reward vector: [ 0.7131  1.25    0.1309 -0.0221 -0.3191  0.2104]
Epoch 5; Reward vector: [ 0.6558  1.2211  0.1609  0.0078 -0.2591  0.1861]
Epoch 6; Reward vector: [ 0.5992  1.1923  0.1909 -0.0223 -0.1991  0.1634]
Epoch 7; Reward vector: [ 0.5433  1.1636  0.2209  0.0076 -0.1391  0.1426]
Ep

Epoch 107; Reward vector: [ 0.1742  3.5647  0.9959 -0.0053 -0.0116  0.592 ]
Epoch 108; Reward vector: [0.1418 3.5349 1.0259 0.0247 0.0483 0.5623]
Epoch 109; Reward vector: [ 0.1181  3.505   0.9959 -0.0053  0.0479  0.5326]
Epoch 110; Reward vector: [0.1039 3.4751 1.0259 0.0247 0.046  0.5029]
Epoch 111; Reward vector: [ 0.1013  3.445   0.9959 -0.0053  0.0382  0.4729]
Epoch 112; Reward vector: [0.0967 3.415  1.0259 0.0247 0.034  0.4433]
Epoch 113; Reward vector: [ 0.0947  3.3851  0.9959 -0.0053  0.0296  0.4138]
Epoch 114; Reward vector: [0.0928 3.3552 1.0259 0.0247 0.0262 0.3845]
Epoch 115; Reward vector: [ 0.0914  3.3255  0.9959 -0.0053  0.0233  0.3554]
Epoch 116; Reward vector: [0.0904 3.2957 1.0259 0.0247 0.0209 0.3266]
Epoch 117; Reward vector: [ 0.0895  3.2661  0.9959 -0.0053  0.0189  0.2983]
Epoch 118; Reward vector: [0.0888 3.2366 1.0258 0.0246 0.0171 0.2703]
Epoch 119; Reward vector: [ 0.0883  3.2076  0.9941 -0.0054  0.0152  0.2449]
Epoch 120; Reward vector: [0.0877 3.1787 1.0225 

In [11]:
# The agent learns that the expert cared about breaking vases and vases on tables,
# and breaks the remaining vase
forward_rl(env2x3v2d3, r_learned_broken, current_s=s_current)

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↓[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m←[0m │  │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↑[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█

### 4. Experiment in which the expert starts out in an env with a broken vase and doesn't break any vases during its acting

In [12]:
env2x3v2d3.init_state = env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050])
env2x3v2d3.reset()

np.random.seed(2)
s_current = np.zeros(env2x3v2d3.nS)
s_current[1050] = 1

print('Current state:')
env2x3v2d3.print_state(env2x3v2d3.str_to_state(env2x3v2d3.num_state[1050]))

r_learned_broken = experiment_wrapper(env2x3v2d3, s_current=s_current, r_prior=r_prior, horizon=100)

Current state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial state:
│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
Initial reward vector: [-0.00416758 -0.00056267 -0.02136196  0.01640271 -0.01793436 -0.00841747]
Epoch 0; Reward vector: [-0.1195  0.3907  0.0077 -1.0125 -0.0212  1.4223]
Epoch 1; Reward vector: [-0.0895  0.3616  0.0377 -0.9825  0.0088  1.3923]
Epoch 2; Reward vector: [-0.0595  0.3326  0.0677 -0.9525 -0.0212  1.3622]
Epoch 3; Reward vector: [-0.0295  0.3038  0.0977 -0.9225  0.0088  1.3321]
Epoch 4; Reward vector: [ 0.0005  0.275   0.1277 -0.8925 -0.0212  1.302 ]
Epoch 5; Reward vector: [-0.0295  0.2465  0.1577 -0.8625  0.0088  1.272 ]
Epoch 6; Reward vector: [ 0.0005  0.2181  0.1877 -0.8325 -0.0212  1.2419]
Epoch 7; Reward vector: [-0.0295  0.19    0.2177 -0.8025  0.0088  1.2118]
Epoch 8

Epoch 104; Reward vector: [-0.1574  0.6748  0.7118  0.0071  0.0009  1.0933]
Epoch 105; Reward vector: [-0.1274  0.6453  0.7418 -0.0229 -0.0291  1.0632]
Epoch 106; Reward vector: [-0.0974  0.6159  0.7718  0.0071  0.0009  1.0332]
Epoch 107; Reward vector: [-0.0674  0.5864  0.8018 -0.0229 -0.0291  1.0031]
Epoch 108; Reward vector: [-0.0373  0.557   0.8318  0.0071  0.0009  0.9731]
Epoch 109; Reward vector: [-0.0073  0.5277  0.8618 -0.0229 -0.0291  0.9431]
Epoch 110; Reward vector: [0.0227 0.4984 0.8918 0.0071 0.0009 0.913 ]
Epoch 111; Reward vector: [-0.0073  0.4691  0.9218 -0.0229 -0.0291  0.883 ]
Epoch 112; Reward vector: [0.0228 0.4399 0.9518 0.0071 0.0009 0.853 ]
Epoch 113; Reward vector: [-0.0072  0.4107  0.9818 -0.0229 -0.0291  0.823 ]
Epoch 114; Reward vector: [0.0229 0.3816 1.0118 0.0071 0.0009 0.793 ]
Epoch 115; Reward vector: [-0.0071  0.3528  0.9814 -0.0229 -0.0292  0.7635]
Epoch 116; Reward vector: [0.023  0.3242 1.011  0.0071 0.0007 0.7341]
Epoch 117; Reward vector: [-0.0031  

In [13]:
# The agent learns that the expert cared about vases on tables, 
# and doesn't break the remaining vase
forward_rl(env2x3v2d3, r_learned_broken, current_s=s_current, h=100, steps_printed=100)

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↑[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│[0m←[0m │  │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m→[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m→[0m[91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m←[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█

│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m→[0m[91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │  │[0m↑[0m[91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m←[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m←[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m←[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m█[0m │[93m█[0m │
│──│──│──│
│  │  │  │
│  │[0m↓[0m │ [91m█[0m│
[1. 1. 0. 0. 0. 1.] 0.0

│[0;35;85m█[0m │  │ [0;32;85m█[0m│
│[0;33;85m█[0m │[0;33;85m