In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import gym

In [2]:
# Переменные класса торговли (Trade_emulator)
NAME = 'BR Splice_TF'
TIMEFRAME = '5' #'16385' #'16408'
MODE = 'train' #'test'
NUMBER_ACTION = 3 # {0: вне рынка,1: buy, 2:sell}
NROWS = 30000 #количество бар истории
env_name = NAME+TIMEFRAME

# Переменные функции потерь
EPSILON_CLIP        = 0.2
#весовые коэффициенты потерь
C0_LOSS_ACTOR       = 30
C1_LOSS_CRITIC      = 30 
C2_LOSS_ENTROPY     = 0.1

#расчет Advantage и Value_target 
GAMMA               = 0.99
GAE_LAMBDA          = 0.95

PPO_BATCH           =128 #размер выборки
EPOCH               =10  #количество эпох

MAX_ITERATION       = 100000 #маскимальное количество проходов

In [3]:
class Trade_emulator:
    def __init__(self, env_name):
        # переменные для анализа
        columns_clas = ['AC_L', 'AC_S','AMA_L', 'AMA_S', 'AO_L', 'AO_S', 'Bears_L', 'Bulls_S', 'CCI_L',
                       'CCI_S', 'DEMA_L', 'DEMA_S', 'DeM_L', 'DeM_S', 'Envel_L', 'Envel_S',
                       'FrAMA_L', 'FrAMA_S', 'MA_L', 'MA_S', 'MACD_L', 'MACD_S', 'RSI_L',
                       'RSI_S', 'RVI_L', 'RVI_S', 'SAR_L', 'SAR_S', 'Stoch_L', 'Stoch_S',
                       'TEMA_L', 'TEMA_S', 'TRIX_L', 'TRIX_S', 'WPR_L', 'WPR_S']
        self.space = len(columns_clas)
        self.number_action = NUMBER_ACTION
        
        # загрузка истории котировок
        df_instr = pd.read_csv(env_name+'.csv', encoding='utf-16', header=0, sep=';',nrows=NROWS)
        df_instr['del_Open'] = -df_instr['Open'].diff().shift(1) #считаем изменение цены открытия от хвоста к 0
        df_instr = df_instr[2:] # удаляем кривые значения в начале и в конце истории
        df_instr.reset_index(inplace=True, drop=True)  #сбрасываем индекс
        
        #делим на подвыборки
        NTrainVal=len(df_instr)//5              
        
        if (MODE == 'train'):
            # отбираем только нужные для обучения столбцы
            self.X_train_rl = df_instr[columns_clas].iloc[NTrainVal:].copy().values/100
            self.Rew_train = df_instr['del_Open'].iloc[NTrainVal:].copy().values 
        else:
            self.X_train_rl = df_instr[columns_clas].iloc[:NTrainVal].copy().values/100
            self.Rew_train = df_instr['del_Open'].iloc[:NTrainVal].copy().values   
    
    def action_space(self):
        def n(self):
            return(self.number_action)
        return(self.number_action)
    
    def action_sample(self):
        return(np.random.randint(self.number_action))
    
    def observation_space(self):
        return(self.space)
    
    def reset(self):
        number_step = 1000
        self.done = False
        # фиксируем границы интервала и текущий шаг
        self.Start_step = np.random.randint(number_step,self.X_train_rl.shape[0])
        self.Stop_step = self.Start_step-number_step
        self.Current_step = self.Start_step
        return(self.X_train_rl[self.Current_step])
    
    def step(self,action):
        
        curent_state = self.X_train_rl[self.Current_step] # фиксируем текущее состояние
        
        if self.done: # если шаги закончились
            reward = 0  
            new_state = curent_state
            return(new_state, reward, self.done, curent_state)
        
       # отрабатываем сигнал  
        if action == 0: # вне рынка
            reward = 0
        elif action == 1: # Buy
            reward = self.Rew_train[self.Current_step]
        elif action == 2: # Sell
            reward = -self.Rew_train[self.Current_step]
        
        # переходим на новый шаг
        self.Current_step-=1

        if self.Current_step==self.Stop_step: # если дошли до конца интервала
            self.done = True # сигнал об окончании

        new_state = self.X_train_rl[self.Current_step]
        
        return (new_state, reward, self.done, curent_state)

In [4]:
#преобразование массива softmax в onehot
def policy_to_onehot(pol):
    b = np.zeros_like(pol)
    a = np.argmax(pol, axis=-1)
    if len(b.shape)>1:       
        b[np.arange(a.size),a] = 1
    else:
        b[a] = 1
    return(b)

# функция потерь Actor
def loss_Actor(y_true,y_pred):
    (pol_old_onehot, pol_old, adv) = tf.split(y_true, num_or_size_splits=[-1,1,1], axis=-1)
    pol_new_full = y_pred
    
    # берем прошлые и соответствующие им новые вероятности Action
    pol_new = K.max(pol_new_full*pol_old_onehot, axis=1, keepdims=True)

    # считаем surrogate losses 
    ratio = pol_new/pol_old
    L_clip = K.mean(K.minimum(ratio*adv,K.clip(ratio, min_value=1 - EPSILON_CLIP, max_value=1 + EPSILON_CLIP)*adv),axis=-1)
    
    # считаем энтропию
    entropy =-K.sum(pol_new * K.log(pol_new+ 1e-20),axis=1)
    
    # итоговая потеря
    loss = K.mean(C0_LOSS_ACTOR*L_clip+C2_LOSS_ENTROPY*entropy)
    return(-loss) # т.к. потеря минимизаруется

# рабочая модель 
def get_model_ActorCritic(input_d,out_d):
    state_input = Input(shape=input_d, dtype = 'float32')
    dense1 = Dense(128, activation='relu')(state_input)
    dense2 = Dense(128, activation='relu')(dense1)
    Policy_prediction = Dense(out_d, activation='softmax', name='Actor')(dense2)
    V_prediction = Dense(1, activation='tanh', name='Critic')(dense2)
    model = Model(state_input,[Policy_prediction, V_prediction])
    model.compile(optimizer='rmsprop',
                  loss={'Actor': loss_Actor,
                        'Critic':'mse'},
                  loss_weights = {'Actor':1.,
                                  'Critic':C1_LOSS_CRITIC})
    return(model)

# расчет Value и Advantage
def get_Vtarget_Advantage(val, next_val, rew, mas, gamma=GAMMA, lmbda=GAE_LAMBDA):
    val = val + [next_val]
    vtarg = []
    gae = 0
    for i in reversed(range(len(rew))):
        delta = rew[i] + gamma * val[i + 1] * mas[i] - val[i]
        gae = delta + gamma * lmbda * mas[i] * gae
        vtarg.insert(0, gae + val[i])

    adv = np.array(vtarg) - val[:-1]
    return (vtarg, (adv - np.mean(adv)) / (np.std(adv) + 1e-20))


# Тестирование модели
def test_reward():
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action_probs,_ = model_ActorCritic.predict_on_batch(state[None,:])
        action = np.argmax(action_probs)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        total_reward += reward
    return total_reward

#Статистика
def record(iters, glob_reward, AC_loss):
    """Helper function to store score and print statistics.
    """
    avg_reward = np.mean([test_reward() for _ in range(3)])
    #ep_reward = test_reward()
    
    if glob_reward == 0:
        glob_reward = avg_reward
    else:
        glob_reward = glob_reward * 0.99 + avg_reward * 0.01
    #Печатаем каждый 100 проход для экономии места
    if (iters%100 == 0):
        print(
              f"Iters: {iters} | "
              f"Global MA Reward: {round(glob_reward,4)} | "
              f"AVG Episode Reward: {round(avg_reward,4)} | "
              f"Loss: {round(AC_loss,4)}"
              )
    return(glob_reward)

In [7]:
# Создаем рабочую среду
env = Trade_emulator(env_name)
state_dims = env.observation_space()
n_actions = env.action_space()

# Создаем модель
model_ActorCritic = get_model_ActorCritic(state_dims,n_actions)

In [5]:
env = gym.make('CartPole-v0')
state_dims = env.observation_space.shape[0]
n_actions = env.action_space.n

model_ActorCritic = get_model_ActorCritic(state_dims,n_actions)
MAX_ITERATION = 10000

In [8]:
%%time
global_reward = 0  #среднее вознаграждение
best_reward   = 0  #лучшее вознаграждение
for iteration in range(MAX_ITERATION):  
    # обнуляем историю
    states = []
    actions = []
    values = []
    masks = []
    rewards = []
    policys = []
    
    # создаем окружение
    state = env.reset()
    done = False
    
    # прогоняем один эпизод
    while (not done):
        Policy_pred,V_pred = model_ActorCritic.predict_on_batch(state[None,:])
        policy = Policy_pred.numpy().ravel()
        value = V_pred.numpy()[0][0]

        action = np.random.choice(n_actions, p=policy)
        new_state, reward, done, curent_state = env.step(action)
        mask = (1- done)

        states.append(state)
        actions.append(action)
        values.append(value)
        masks.append(mask)
        rewards.append(reward)
        policys.append(policy)

        state = new_state
    #считаем последнее значение value
    _,next_V_pred = model_ActorCritic.predict_on_batch(state[None,:])
    next_value = next_V_pred.numpy()[0][0]
    
    #считаем Value и Advantage
    vtargets, advantages = get_Vtarget_Advantage(values, next_value, rewards, masks, gamma=GAMMA, lmbda=GAE_LAMBDA)
    
    #делаем склейку данных, для передачи Advantage в функцию потерь
    policy_targets = np.concatenate((policy_to_onehot(policys),np.amax(policys, axis=1)[:,None], np.array(advantages)[:,None]), axis=-1)
    
    # учим сеть
    ActorCritic_loss = model_ActorCritic.fit([states], [policy_targets, np.array(vtargets)[:,None]],
                                              verbose=0, shuffle=True, epochs=EPOCH, batch_size=PPO_BATCH)
    
    # Считаем награду и печатаем статистику
    global_reward = record(iteration,global_reward,ActorCritic_loss.history['loss'][-1])
    
    # сохраняем лучший результат
    if global_reward > best_reward:
        print('++++++++++++++++++ best reward=',round(global_reward,4),'++++++++++++++++++++++')
        #model_ActorCritic.save('best_{}_{}_{}.hdf5'.format(env_name,iteration, round(global_reward,4)))
        best_reward = global_reward
    # сохраняем каждый 500 проход
    if (iteration%500 == 0):
        model_ActorCritic.save('curent_{}_{}_{}.hdf5'.format(env_name,iteration, round(global_reward,4)))

Iters: 0 | Global MA Reward: 1.81 | AVG Episode Reward: 1.81 | Loss: -1.8057
++++++++++++++++++ best reward= 1.81 ++++++++++++++++++++++
Iters: 100 | Global MA Reward: 1.0367 | AVG Episode Reward: 0.12 | Loss: 0.965
Iters: 200 | Global MA Reward: 0.7488 | AVG Episode Reward: -1.7767 | Loss: -1.851
Iters: 300 | Global MA Reward: 0.5709 | AVG Episode Reward: -0.09 | Loss: -3.9681
Iters: 400 | Global MA Reward: 0.5449 | AVG Episode Reward: -0.4533 | Loss: 1.3794
Iters: 500 | Global MA Reward: 0.5478 | AVG Episode Reward: 0.5067 | Loss: -1.1681
Iters: 600 | Global MA Reward: 0.6747 | AVG Episode Reward: -0.7433 | Loss: 3.6774
Iters: 700 | Global MA Reward: 1.1125 | AVG Episode Reward: 2.3 | Loss: 1.3423
Iters: 800 | Global MA Reward: 1.0812 | AVG Episode Reward: 1.8233 | Loss: 2.9945
Iters: 900 | Global MA Reward: 1.334 | AVG Episode Reward: 1.37 | Loss: 5.573
Iters: 1000 | Global MA Reward: 1.3775 | AVG Episode Reward: 1.5133 | Loss: 1.5617
Iters: 1100 | Global MA Reward: 0.9592 | AVG Epi

++++++++++++++++++ best reward= 2.7614 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7682 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7774 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7884 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7894 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7897 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.7959 ++++++++++++++++++++++
Iters: 3800 | Global MA Reward: 2.6649 | AVG Episode Reward: 2.7167 | Loss: 1.0883
Iters: 3900 | Global MA Reward: 2.4677 | AVG Episode Reward: 3.2033 | Loss: 6.6216
Iters: 4000 | Global MA Reward: 2.5715 | AVG Episode Reward: 4.94 | Loss: 1.4967
++++++++++++++++++ best reward= 2.799 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.8121 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.8205 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.8267 ++++++++++++++++++++++
++++++++++++++++++ best reward= 2.8364 ++++++++++++++++++++++
++++++++++

++++++++++++++++++ best reward= 3.9913 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.0002 ++++++++++++++++++++++
Iters: 5400 | Global MA Reward: 3.9197 | AVG Episode Reward: 1.67 | Loss: 9.8551
++++++++++++++++++ best reward= 4.0218 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.0559 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.0801 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.0925 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.1193 ++++++++++++++++++++++
Iters: 5500 | Global MA Reward: 3.9382 | AVG Episode Reward: 4.54 | Loss: 6.1273
++++++++++++++++++ best reward= 4.1213 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.1394 ++++++++++++++++++++++
Iters: 5600 | Global MA Reward: 4.119 | AVG Episode Reward: 3.0667 | Loss: 4.8145
++++++++++++++++++ best reward= 4.148 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.166 ++++++++++++++++++++++
++++++++++++++++++ best reward= 4.1676 ++++++++++++++++++++++
++++++++++++++

++++++++++++++++++ best reward= 5.8257 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.8455 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.8601 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.8602 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.8685 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.895 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9015 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9045 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9315 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9401 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9674 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9897 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9951 ++++++++++++++++++++++
++++++++++++++++++ best reward= 5.9983 ++++++++++++++++++++++
++++++++++++++++++ best reward= 6.0185 ++++++++++++++++++++++
Iters: 6900 | Global MA Reward: 5.9676 | AVG Episode Reward: 6.67 | Los

++++++++++++++++++ best reward= 7.0589 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.0591 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.0708 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.077 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.0872 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.0889 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1054 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1236 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1284 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1349 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.144 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1452 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.151 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1663 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1901 ++++++++++++++++++++++
++++++++++++++++++ best reward= 7.1985 ++++++++++++++++++++++
+++++++++++

++++++++++++++++++ best reward= 8.2257 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2278 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2305 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2353 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.245 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2452 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2533 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2618 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2803 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.2827 ++++++++++++++++++++++
Iters: 10200 | Global MA Reward: 8.1131 | AVG Episode Reward: 9.5333 | Loss: -4.8617
Iters: 10300 | Global MA Reward: 8.2965 | AVG Episode Reward: 10.9067 | Loss: 0.9199
++++++++++++++++++ best reward= 8.2965 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.297 ++++++++++++++++++++++
++++++++++++++++++ best reward= 8.3108 ++++++++++++++++++++++
++++++++++++++++++ best re

++++++++++++++++++ best reward= 9.1822 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.192 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.2169 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.2175 ++++++++++++++++++++++
Iters: 12400 | Global MA Reward: 9.1771 | AVG Episode Reward: 8.4033 | Loss: 1.8088
Iters: 12500 | Global MA Reward: 9.1169 | AVG Episode Reward: 8.8167 | Loss: -2.2475
Iters: 12600 | Global MA Reward: 9.0779 | AVG Episode Reward: 7.7567 | Loss: -1.3844
Iters: 12700 | Global MA Reward: 9.0931 | AVG Episode Reward: 8.41 | Loss: 2.2264
++++++++++++++++++ best reward= 9.2405 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.2432 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.2705 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.2877 ++++++++++++++++++++++
Iters: 12800 | Global MA Reward: 9.2913 | AVG Episode Reward: 10.2067 | Loss: -2.1223
++++++++++++++++++ best reward= 9.2913 ++++++++++++++++++++++
Iters: 12900 | Global

Iters: 16200 | Global MA Reward: 9.8867 | AVG Episode Reward: 9.7167 | Loss: -3.4677
Iters: 16300 | Global MA Reward: 9.8297 | AVG Episode Reward: 8.4567 | Loss: 3.4564
++++++++++++++++++ best reward= 9.9682 ++++++++++++++++++++++
++++++++++++++++++ best reward= 9.9709 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0035 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0202 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0209 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0294 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0317 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0373 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0522 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.063 ++++++++++++++++++++++
Iters: 16400 | Global MA Reward: 9.9622 | AVG Episode Reward: 11.2233 | Loss: -0.4965
++++++++++++++++++ best reward= 10.0644 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.0662 +++++++++++++++

++++++++++++++++++ best reward= 10.8556 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.8641 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.8677 ++++++++++++++++++++++
Iters: 20100 | Global MA Reward: 10.7794 | AVG Episode Reward: 11.88 | Loss: 4.2239
Iters: 20200 | Global MA Reward: 10.6983 | AVG Episode Reward: 11.6467 | Loss: -1.1216
Iters: 20300 | Global MA Reward: 10.7216 | AVG Episode Reward: 10.86 | Loss: 1.038
Iters: 20400 | Global MA Reward: 10.533 | AVG Episode Reward: 11.51 | Loss: 3.3361
Iters: 20500 | Global MA Reward: 10.5888 | AVG Episode Reward: 9.91 | Loss: 0.732
Iters: 20600 | Global MA Reward: 10.8174 | AVG Episode Reward: 11.93 | Loss: -1.057
++++++++++++++++++ best reward= 10.8768 ++++++++++++++++++++++
++++++++++++++++++ best reward= 10.888 ++++++++++++++++++++++
Iters: 20700 | Global MA Reward: 10.687 | AVG Episode Reward: 10.8667 | Loss: -0.7105
Iters: 20800 | Global MA Reward: 10.6617 | AVG Episode Reward: 10.05 | Loss: -3.6705
Iters: 20900

Iters: 27100 | Global MA Reward: 10.9644 | AVG Episode Reward: 12.0233 | Loss: -1.3767
Iters: 27200 | Global MA Reward: 11.1348 | AVG Episode Reward: 13.61 | Loss: -0.8425
Iters: 27300 | Global MA Reward: 10.831 | AVG Episode Reward: 11.2933 | Loss: -0.6507
Iters: 27400 | Global MA Reward: 10.969 | AVG Episode Reward: 10.63 | Loss: 0.5122
Iters: 27500 | Global MA Reward: 11.019 | AVG Episode Reward: 11.29 | Loss: 3.6049
Iters: 27600 | Global MA Reward: 11.021 | AVG Episode Reward: 11.02 | Loss: -3.8823
Iters: 27700 | Global MA Reward: 11.0411 | AVG Episode Reward: 12.2867 | Loss: 0.2805
Iters: 27800 | Global MA Reward: 10.9778 | AVG Episode Reward: 10.92 | Loss: 0.9215
Iters: 27900 | Global MA Reward: 11.0571 | AVG Episode Reward: 11.5533 | Loss: -2.4551
Iters: 28000 | Global MA Reward: 11.1502 | AVG Episode Reward: 8.5667 | Loss: 4.6445
Iters: 28100 | Global MA Reward: 11.2192 | AVG Episode Reward: 11.4133 | Loss: 1.7057
++++++++++++++++++ best reward= 11.2633 ++++++++++++++++++++++
I

Iters: 34800 | Global MA Reward: 11.2213 | AVG Episode Reward: 12.0767 | Loss: -0.0195
Iters: 34900 | Global MA Reward: 11.2698 | AVG Episode Reward: 10.05 | Loss: -1.15
Iters: 35000 | Global MA Reward: 10.9384 | AVG Episode Reward: 9.04 | Loss: -0.2063
Iters: 35100 | Global MA Reward: 10.9295 | AVG Episode Reward: 12.16 | Loss: -1.4875
Iters: 35200 | Global MA Reward: 10.8637 | AVG Episode Reward: 10.5933 | Loss: -4.1765
Iters: 35300 | Global MA Reward: 10.9535 | AVG Episode Reward: 12.05 | Loss: 2.7792
Iters: 35400 | Global MA Reward: 11.2126 | AVG Episode Reward: 10.31 | Loss: 3.3332
Iters: 35500 | Global MA Reward: 11.2004 | AVG Episode Reward: 13.11 | Loss: 1.5667
Iters: 35600 | Global MA Reward: 10.857 | AVG Episode Reward: 11.3667 | Loss: 20.7535
Iters: 35700 | Global MA Reward: 10.7158 | AVG Episode Reward: 10.8867 | Loss: 1.3152
Iters: 35800 | Global MA Reward: 10.8338 | AVG Episode Reward: 9.0433 | Loss: 3.8765
Iters: 35900 | Global MA Reward: 10.7636 | AVG Episode Reward: 10

Iters: 44500 | Global MA Reward: 10.8343 | AVG Episode Reward: 10.13 | Loss: -0.5909
Iters: 44600 | Global MA Reward: 10.8112 | AVG Episode Reward: 13.7 | Loss: 0.92
Iters: 44700 | Global MA Reward: 10.8862 | AVG Episode Reward: 11.54 | Loss: -1.6607
Iters: 44800 | Global MA Reward: 10.713 | AVG Episode Reward: 13.1767 | Loss: 1.0398
Iters: 44900 | Global MA Reward: 10.6571 | AVG Episode Reward: 11.8967 | Loss: -2.9568
Iters: 45000 | Global MA Reward: 11.0006 | AVG Episode Reward: 9.74 | Loss: -2.084
Iters: 45100 | Global MA Reward: 11.0812 | AVG Episode Reward: 13.46 | Loss: 0.8542
Iters: 45200 | Global MA Reward: 11.0657 | AVG Episode Reward: 11.4933 | Loss: -0.1092
Iters: 45300 | Global MA Reward: 10.9445 | AVG Episode Reward: 15.28 | Loss: 0.241
Iters: 45400 | Global MA Reward: 10.8932 | AVG Episode Reward: 11.5567 | Loss: 2.3414
Iters: 45500 | Global MA Reward: 11.0459 | AVG Episode Reward: 10.83 | Loss: 0.4688
Iters: 45600 | Global MA Reward: 10.9085 | AVG Episode Reward: 12.1867

Iters: 54200 | Global MA Reward: 10.5326 | AVG Episode Reward: 11.9633 | Loss: 0.1827
Iters: 54300 | Global MA Reward: 10.442 | AVG Episode Reward: 10.0867 | Loss: -4.2708
Iters: 54400 | Global MA Reward: 10.7716 | AVG Episode Reward: 9.9967 | Loss: 2.6297
Iters: 54500 | Global MA Reward: 10.8315 | AVG Episode Reward: 7.2967 | Loss: -1.0162
Iters: 54600 | Global MA Reward: 10.835 | AVG Episode Reward: 10.7767 | Loss: -4.0643
Iters: 54700 | Global MA Reward: 10.756 | AVG Episode Reward: 13.12 | Loss: 0.4463
Iters: 54800 | Global MA Reward: 10.7507 | AVG Episode Reward: 12.42 | Loss: -2.4491
Iters: 54900 | Global MA Reward: 10.5054 | AVG Episode Reward: 11.02 | Loss: 0.6655
Iters: 55000 | Global MA Reward: 10.5307 | AVG Episode Reward: 10.7267 | Loss: 5.2981
Iters: 55100 | Global MA Reward: 10.8489 | AVG Episode Reward: 12.7433 | Loss: -0.3586
Iters: 55200 | Global MA Reward: 10.7489 | AVG Episode Reward: 9.4 | Loss: 1.4363
Iters: 55300 | Global MA Reward: 10.607 | AVG Episode Reward: 9.

KeyboardInterrupt: 