# AIRL with TransactionsGraphEnvironment v2

In [2]:
import ast

import gymnasium as gym
import mlflow
import numpy as np
import pandas as pd
import torch
from imitation.algorithms.adversarial.airl import AIRL
from imitation.data.rollout import flatten_trajectories
from imitation.data.types import Trajectory
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util import logger
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecCheckNan
from stable_baselines3.ppo import MlpPolicy

import graph_reinforcement_learning_using_blockchain_data as grl
from graph_reinforcement_learning_using_blockchain_data import config

config.load_dotenv()
mlflow.set_tracking_uri(uri=config.MLFLOW_TRACKING_URI)

[32m2025-04-14 13:52:52.638[0m | [1mINFO    [0m | [36mgraph_reinforcement_learning_using_blockchain_data.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: /Users/liamtessendorf/Programming/Uni/2_Master/4_FS25_Programming/graph-reinforcement-learning-using-blockchain-data[0m


In [3]:
RNG = np.random.default_rng(seed=42)

## Dataset 

In [4]:
df_emb = pd.read_csv(config.FLASHBOTS_Q2_DATA_DIR / "state_embeddings.csv")
df_class0 = pd.read_csv(config.RAW_DATA_DIR / "receipts_class0.csv")
df_class1 = pd.read_csv(config.RAW_DATA_DIR / "receipts_class1.csv")
df_eth_balances_class1 = pd.read_csv(config.RAW_DATA_DIR / "eth_balances_class1.csv")
df_eth_balances_class0 = pd.read_csv(config.RAW_DATA_DIR / "eth_balances_class0.csv")

In [5]:
print(df_class0.columns)
print(df_eth_balances_class0.columns)

Index(['block_number', 'transaction_hash', 'blockHash', 'blockNumber',
       'logsBloom', 'gasUsed', 'contractAddress', 'cumulativeGasUsed',
       'transactionIndex', 'from', 'to', 'type', 'effectiveGasPrice', 'logs',
       'status'],
      dtype='object')
Index(['account', 'block_number', 'balance'], dtype='object')


In [6]:
df_class0_with_eth_balances = df_class0.merge(
    df_eth_balances_class0, left_on=["from", "blockNumber"], right_on=["account", "block_number"], how="inner"
)
df_class1_with_eth_balances = df_class1.merge(
    df_eth_balances_class1, left_on=["from", "blockNumber"], right_on=["account", "block_number"], how="inner"
)

In [7]:
df_class0_multi_occ = df_class0_with_eth_balances[
    df_class0_with_eth_balances["from"].duplicated(keep=False)
]

In [8]:
df_emb["embeddings"] = df_emb["embeddings"].apply(
    lambda x: np.array(ast.literal_eval(x), dtype=np.float32)
)

In [9]:
df_class0_with_eth_balances["label"] = 0
df_class1_with_eth_balances["label"] = 1

In [10]:
df_receipts = pd.concat([df_class0_with_eth_balances, df_class1_with_eth_balances], ignore_index=True)
df_receipts.drop_duplicates("transaction_hash", inplace=True)
df = df_receipts.merge(df_emb, how="right", left_on="transaction_hash", right_on="transactionHash")

In [11]:
df.head()

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,created_at,account_address,profit_token_address,start_amount,end_amount,profit_amount,error,protocols,transactionHash,embeddings
0,16969850,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,0x241b2ebd536a6f546ce2214bcf2146d359ae4e0cc4e3...,16969850,0x00000000000000000000000000000000000000000000...,21000,,1315400,19,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,,,,,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,"[-1.0455092, -0.45075172, -1.2988981, 0.689674..."
1,16975162,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,0x577d4ef683ebd84c73cdbc3635c7177f6ec137eef3bf...,16975162,0x00000000000000000000000080000000000000000000...,116880,,396290,3,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,,,,,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,"[-6.1341934, -4.090339, -6.813894, 2.9215977, ..."
2,16983327,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,0xe79daae4074cb858a3e79b4d95064845295eaddc134c...,16983327,0x00000000000000000000000000000000000000000000...,103421,,3934866,21,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,,,,,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,"[-5.403815, -3.6793585, -6.4153795, 2.0699742,..."
3,16990575,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,0x9c3fa88c63603c9c5696174085177f9559e20b331a64...,16990575,0x00200000000000000000000080004000000000000000...,105953,,4683545,39,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,,,,,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,"[-7.711429, -5.4846554, -9.181214, 2.5007846, ..."
4,16994491,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,0xae6c66f64f13ba5404bbe4cebd33fdd666c7bc8fe601...,16994491,0x00200000000011000000000080000000000000000000...,223722,,3972377,34,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,,,,,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,"[-10.870382, -8.408549, -12.329528, 6.705676, ..."


In [12]:
df_median_gas_prices = pd.DataFrame(
    {
        "median_gas_price": df.groupby(["blockNumber"])["effectiveGasPrice"].median(),
        "std_gas_price": df.groupby(["blockNumber"])["effectiveGasPrice"].std(),
        "max_gas_price": df.groupby(["blockNumber"])["effectiveGasPrice"].max(),
        "min_gas_price": df.groupby(["blockNumber"])["effectiveGasPrice"].min(),
    }
)

df_with_median_gas_prices = df.merge(df_median_gas_prices, how="left", on="blockNumber")
df_with_median_gas_prices.head()

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,end_amount,profit_amount,error,protocols,transactionHash,embeddings,median_gas_price,std_gas_price,max_gas_price,min_gas_price
0,16969850,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,0x241b2ebd536a6f546ce2214bcf2146d359ae4e0cc4e3...,16969850,0x00000000000000000000000000000000000000000000...,21000,,1315400,19,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,"[-1.0455092, -0.45075172, -1.2988981, 0.689674...",28412780000.0,5338623000.0,40046142239,27985774295
1,16975162,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,0x577d4ef683ebd84c73cdbc3635c7177f6ec137eef3bf...,16975162,0x00000000000000000000000080000000000000000000...,116880,,396290,3,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,"[-6.1341934, -4.090339, -6.813894, 2.9215977, ...",31324250000.0,13106710000.0,62912040686,29651658352
2,16983327,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,0xe79daae4074cb858a3e79b4d95064845295eaddc134c...,16983327,0x00000000000000000000000000000000000000000000...,103421,,3934866,21,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,"[-5.403815, -3.6793585, -6.4153795, 2.0699742,...",47015990000.0,94835380000.0,298140379626,44115991364
3,16990575,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,0x9c3fa88c63603c9c5696174085177f9559e20b331a64...,16990575,0x00200000000000000000000080004000000000000000...,105953,,4683545,39,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,"[-7.711429, -5.4846554, -9.181214, 2.5007846, ...",39976070000.0,22168940000.0,72384430845,29976074199
4,16994491,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,0xae6c66f64f13ba5404bbe4cebd33fdd666c7bc8fe601...,16994491,0x00200000000011000000000080000000000000000000...,223722,,3972377,34,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,"[-10.870382, -8.408549, -12.329528, 6.705676, ...",26952080000.0,3320303000.0,30915355327,22915355327


In [13]:
df_with_actions = df_with_median_gas_prices.copy()
df_with_actions["action"] = df_with_median_gas_prices.apply(
    lambda r: 1 if r["effectiveGasPrice"] > r["median_gas_price"] else 0, axis=1
)

In [14]:
df_with_actions["action"].mean()

0.24449359876667337

In [15]:
df_with_actions.head()

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,profit_amount,error,protocols,transactionHash,embeddings,median_gas_price,std_gas_price,max_gas_price,min_gas_price,action
0,16969850,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,0x241b2ebd536a6f546ce2214bcf2146d359ae4e0cc4e3...,16969850,0x00000000000000000000000000000000000000000000...,21000,,1315400,19,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,"[-1.0455092, -0.45075172, -1.2988981, 0.689674...",28412780000.0,5338623000.0,40046142239,27985774295,1
1,16975162,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,0x577d4ef683ebd84c73cdbc3635c7177f6ec137eef3bf...,16975162,0x00000000000000000000000080000000000000000000...,116880,,396290,3,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,"[-6.1341934, -4.090339, -6.813894, 2.9215977, ...",31324250000.0,13106710000.0,62912040686,29651658352,1
2,16983327,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,0xe79daae4074cb858a3e79b4d95064845295eaddc134c...,16983327,0x00000000000000000000000000000000000000000000...,103421,,3934866,21,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,"[-5.403815, -3.6793585, -6.4153795, 2.0699742,...",47015990000.0,94835380000.0,298140379626,44115991364,1
3,16990575,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,0x9c3fa88c63603c9c5696174085177f9559e20b331a64...,16990575,0x00200000000000000000000080004000000000000000...,105953,,4683545,39,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,"[-7.711429, -5.4846554, -9.181214, 2.5007846, ...",39976070000.0,22168940000.0,72384430845,29976074199,0
4,16994491,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,0xae6c66f64f13ba5404bbe4cebd33fdd666c7bc8fe601...,16994491,0x00200000000011000000000080000000000000000000...,223722,,3972377,34,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,"[-10.870382, -8.408549, -12.329528, 6.705676, ...",26952080000.0,3320303000.0,30915355327,22915355327,1


In [16]:
df_with_actions.rename(columns={"balance": "eth_balance"}, inplace=True)
df_with_actions["eth_balance"] = df_with_actions["eth_balance"].astype("float64")
df_with_actions["median_gas_price"] = df_with_actions["median_gas_price"].astype("float64")
df_with_actions["std_gas_price"] = df_with_actions["std_gas_price"].astype("float64")
df_with_actions["from"] = df_with_actions["from"].astype("string")
df_with_actions["to"] = df_with_actions["to"].astype("string")
df_with_actions.head()

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,profit_amount,error,protocols,transactionHash,embeddings,median_gas_price,std_gas_price,max_gas_price,min_gas_price,action
0,16969850,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,0x241b2ebd536a6f546ce2214bcf2146d359ae4e0cc4e3...,16969850,0x00000000000000000000000000000000000000000000...,21000,,1315400,19,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0xe4029c908cfc40f825051cd0957797c66196eb8ba437...,"[-1.0455092, -0.45075172, -1.2988981, 0.689674...",28412780000.0,5338623000.0,40046142239,27985774295,1
1,16975162,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,0x577d4ef683ebd84c73cdbc3635c7177f6ec137eef3bf...,16975162,0x00000000000000000000000080000000000000000000...,116880,,396290,3,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0xbcfb84169287cf7acce33fba2b7390cfe21852871f78...,"[-6.1341934, -4.090339, -6.813894, 2.9215977, ...",31324250000.0,13106710000.0,62912040686,29651658352,1
2,16983327,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,0xe79daae4074cb858a3e79b4d95064845295eaddc134c...,16983327,0x00000000000000000000000000000000000000000000...,103421,,3934866,21,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x7a954e541df7c296b8fec61b77d22ea8bacd63399467...,"[-5.403815, -3.6793585, -6.4153795, 2.0699742,...",47015990000.0,94835380000.0,298140379626,44115991364,1
3,16990575,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,0x9c3fa88c63603c9c5696174085177f9559e20b331a64...,16990575,0x00200000000000000000000080004000000000000000...,105953,,4683545,39,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x016858c7c133cdf545b6934653544c19475c5c111ba0...,"[-7.711429, -5.4846554, -9.181214, 2.5007846, ...",39976070000.0,22168940000.0,72384430845,29976074199,0
4,16994491,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,0xae6c66f64f13ba5404bbe4cebd33fdd666c7bc8fe601...,16994491,0x00200000000011000000000080000000000000000000...,223722,,3972377,34,0x00000000000124d994209fbB955E0217B5C2ECA1,...,,,,0x0f358c5aed8ae456a298eedaf3fc08cb802c3417a6b4...,"[-10.870382, -8.408549, -12.329528, 6.705676, ...",26952080000.0,3320303000.0,30915355327,22915355327,1


In [17]:
df_with_actions[df_with_actions["std_gas_price"].isna() == True]

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,profit_amount,error,protocols,transactionHash,embeddings,median_gas_price,std_gas_price,max_gas_price,min_gas_price,action
63,16975107,0xd05a59ef18204af79ae9bf2a7ba722bca892055819c7...,0x8d8cd7dfa64a3867f12d472be895ee2a1b163d854a02...,16975107,0x00200000000000000000000080000000000008000000...,196537,,685900,5,0x00000006e42915A2B6907f8b3fAF311B68862f60,...,6855415086744551,,"[""uniswap_v2""]",0xd05a59ef18204af79ae9bf2a7ba722bca892055819c7...,"[4.5761786, 6.5636315, 3.3592527, -2.8692067, ...",3.442311e+10,,34423111170,34423111170,0
65,16976564,0x5530313d0b0271506691e3732c517172d5bfa1b2ba3d...,0x458b66a35808bf44a7e332b9f9b326ca8660952ab40a...,16976564,0x00200000000000000000000080000000200000000000...,190691,,1322470,7,0x00000006e42915A2B6907f8b3fAF311B68862f60,...,7796163749018364,,"[""uniswap_v2""]",0x5530313d0b0271506691e3732c517172d5bfa1b2ba3d...,"[3.4024527, 5.283112, 2.2056818, -2.9582765, 8...",4.041179e+10,,40411786608,40411786608,0
76,16979829,0xe47601937f0538ecc2a67c0a1b2481a1d339b52b2ef0...,0x4d1ebc8a72732a87fb083679233a06cf0a33ec984b35...,16979829,0x00200000000000000000000084000000200000000000...,158198,,158198,0,0x00000006e42915A2B6907f8b3fAF311B68862f60,...,24408839782523421,,"[""uniswap_v2"",""uniswap_v3""]",0xe47601937f0538ecc2a67c0a1b2481a1d339b52b2ef0...,"[6.823197, 9.531916, 5.651215, -3.9661467, 12....",1.532201e+11,,153220129205,153220129205,0
98,16992014,0x23e05562df7784836aaf6c8235d2aca5501621e6aab2...,0x8794222099471416ba2721137ee0e0d60a149b2f2029...,16992014,0x00200000000000000000000080000200000000000000...,219788,,219788,0,0x00000006e42915A2B6907f8b3fAF311B68862f60,...,35168219119161781,,"[""uniswap_v2"",""uniswap_v3""]",0x23e05562df7784836aaf6c8235d2aca5501621e6aab2...,"[3.409607, 5.0597196, 2.8262546, -2.1056595, 6...",1.595249e+11,,159524873969,159524873969,0
167,17240935,0x74e6628155b2f61c067a568235c952e3c1fa4aa22d76...,0xa658109201d2dcd46dcc9f5ad1c71d29d6942c44b1c4...,17240935,0x00000000040000000000000000000000000000000000...,210289,,2954344,23,0x00000006e42915A2B6907f8b3fAF311B68862f60,...,13097045417966494,,"[""uniswap_v3""]",0x74e6628155b2f61c067a568235c952e3c1fa4aa22d76...,"[5.1223907, 7.9230337, 4.8604784, -1.8773488, ...",6.190800e+10,,61907998723,61907998723,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149181,17296632,0x2f06cc7f885dd98fbf765ad57706fe505c5b63478928...,0x64522455567aa9a4209551fd938a1e31b448487ac157...,17296632,0x00200000000000000010000080000000000000000000...,178376,,440031,1,0xffFf14106945bCB267B34711c416AA3085B8865F,...,22902832502491512,,"[""uniswap_v2""]",0x2f06cc7f885dd98fbf765ad57706fe505c5b63478928...,"[9.8747, 15.316246, 9.5223255, -5.324061, 16.4...",3.450291e+10,,34502909809,34502909809,0
149182,17297932,0xd477617aaa93aad5aa5f5a2f880f3e934b70a91cd523...,0x493341206c063a7ba6c85137473b17f697ed72701c07...,17297932,0x00200000000000000010000080000000000000000000...,178388,,440065,1,0xffFf14106945bCB267B34711c416AA3085B8865F,...,31467600199400760,,"[""uniswap_v2""]",0xd477617aaa93aad5aa5f5a2f880f3e934b70a91cd523...,"[12.2059765, 19.282898, 11.881564, -6.684133, ...",2.928406e+10,,29284056698,29284056698,0
149183,17298360,0xda0b7b1156a57ff85ef5c2e80f47d6fc0c89c6253fca...,0x21a1c5ae76e2cbff779f308288e1cf2273d999f978e1...,17298360,0x00200000000000000010000080000000000000000000...,178352,,439975,1,0xffFf14106945bCB267B34711c416AA3085B8865F,...,38228015952142520,,"[""uniswap_v2""]",0xda0b7b1156a57ff85ef5c2e80f47d6fc0c89c6253fca...,"[14.737251, 23.417326, 14.310656, -8.357638, 2...",2.959543e+10,,29595427147,29595427147,0
149184,17298379,0x7755b8553cc9f2479e5cc48ebf18dd6f25fe038a668f...,0xa0171ad96bcd73eb6aa92c8da1e3a8629c1c067ea811...,17298379,0x00200000000000000010000080000000000000000000...,178364,,1162669,7,0xffFf14106945bCB267B34711c416AA3085B8865F,...,28570015952142520,,"[""uniswap_v2""]",0x7755b8553cc9f2479e5cc48ebf18dd6f25fe038a668f...,"[8.204366, 13.051107, 7.5100203, -5.3788714, 1...",2.783189e+10,,27831892530,27831892530,0


In [18]:
df_with_actions = df_with_actions.fillna({"std_gas_price": 0})

In [19]:
df_with_actions_0 = df_with_actions[df_with_actions["label"] == 0]
df_with_actions_1 = df_with_actions[df_with_actions["label"] == 1]

In [20]:
unique_accs_0 = df_with_actions_0["from"].unique()
accs_train_0 = unique_accs_0[: int(0.8 * len(unique_accs_0))]
accs_val_0 = unique_accs_0[int(0.8 * len(unique_accs_0)): int(0.9 * len(unique_accs_0))]
accs_test_0 = unique_accs_0[int(0.9 * len(unique_accs_0)):]
df_with_actions_0_train = df_with_actions_0[df_with_actions_0["from"].isin(accs_train_0)]
df_with_actions_0_val = df_with_actions_0[df_with_actions_0["from"].isin(accs_val_0)]
df_with_actions_0_test = df_with_actions_0[df_with_actions_0["from"].isin(accs_test_0)]

In [21]:
unique_accs_1 = df_with_actions_1["from"].unique()
accs_train_1 = unique_accs_1[: int(0.8 * len(unique_accs_1))]
accs_val_1 = unique_accs_1[int(0.8 * len(unique_accs_1)): int(0.9 * len(unique_accs_1))]
accs_test_1 = unique_accs_1[int(0.9 * len(unique_accs_1)):]
df_with_actions_1_train = df_with_actions_1[df_with_actions_1["from"].isin(accs_train_1)]
df_with_actions_1_val = df_with_actions_1[df_with_actions_1["from"].isin(accs_val_1)]
df_with_actions_1_test = df_with_actions_1[df_with_actions_1["from"].isin(accs_test_1)]

In [22]:
df_val = pd.concat([df_with_actions_0_val, df_with_actions_1_val])

In [23]:
df_with_actions_1_train[
    df_with_actions_1_train["from"] == "0x1e6c1c4669f612112a7caCa5596BfE6629e669aA"
    ]

Unnamed: 0,block_number_x,transaction_hash,blockHash,blockNumber,logsBloom,gasUsed,contractAddress,cumulativeGasUsed,transactionIndex,from,...,profit_amount,error,protocols,transactionHash,embeddings,median_gas_price,std_gas_price,max_gas_price,min_gas_price,action


## Creating trajectories

In [24]:
def extract_trajectories(df: pd.DataFrame):
    trajectories = []
    for account, group in df.groupby("from"):
        group = group.sort_values("blockNumber")
        obs_list = group["embeddings"].tolist() + [np.zeros(128, dtype=np.float32)]
        traj = {
            "obs": np.stack(obs_list),  # Convert list of arrays to a single numpy array
            "acts": np.array(group["action"].tolist()),
            "label": group["label"].iloc[0],
        }
        trajectories.append(traj)
    return trajectories


trajectories_1_train = extract_trajectories(df_with_actions_1_train)
trajectories_0_train = extract_trajectories(df_with_actions_0_train)
trajectories_1_test = extract_trajectories(df_with_actions_1_test)
trajectories_0_test = extract_trajectories(df_with_actions_0_test)
trajectories_val = extract_trajectories(df_val)

In [25]:
trajectories_1 = [
    Trajectory(obs=traj["obs"], acts=traj["acts"], infos=None, terminal=True)
    for traj in trajectories_1_train
]
trajectories_0 = [
    Trajectory(obs=traj["obs"], acts=traj["acts"], infos=None, terminal=True)
    for traj in trajectories_0_train
]
trajectories_val = [
    Trajectory(obs=traj["obs"], acts=traj["acts"], infos=None, terminal=True)
    for traj in trajectories_val
]

trajectories_1 = flatten_trajectories(trajectories_1)
trajectories_0 = flatten_trajectories(trajectories_0)
trajectories_val = flatten_trajectories(trajectories_val)

In [26]:
trajectories_1_test = [
    Trajectory(obs=traj["obs"], acts=traj["acts"], infos=None, terminal=True)
    for traj in trajectories_1_test
]
trajectories_0_test = [
    Trajectory(obs=traj["obs"], acts=traj["acts"], infos=None, terminal=True)
    for traj in trajectories_0_test
]

trajectories_1_test = flatten_trajectories(trajectories_1_test)
trajectories_0_test = flatten_trajectories(trajectories_0_test)

## Setting up environments

In [32]:
ID0 = "gymnasium_env/TransactionGraphEnv0-v2"
gym.envs.register(
    id=ID0,
    entry_point=grl.TransactionGraphEnvV2,
    kwargs={"df": df_with_actions_0, "alpha": 0.9, "device": torch.device("mps"), "label": 0},
    max_episode_steps=300,
)

ID1 = "gymnasium_env/TransactionGraphEnv1-v2"
gym.envs.register(
    id=ID1,
    entry_point=grl.TransactionGraphEnvV2,
    kwargs={"df": df_with_actions_1, "alpha": 0.9, "device": torch.device("mps"), "label": 1},
    max_episode_steps=300,
)

In [33]:
gym.pprint_registry()

===== classic_control =====
Acrobot-v1             CartPole-v0            CartPole-v1
MountainCar-v0         MountainCarContinuous-v0 Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0     phys2d/CartPole-v1     phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3       BipedalWalkerHardcore-v3 CarRacing-v2
LunarLander-v2         LunarLanderContinuous-v2
===== toy_text =====
Blackjack-v1           CliffWalking-v0        FrozenLake-v1
FrozenLake8x8-v1       Taxi-v3
===== tabular =====
tabular/Blackjack-v0   tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                 Ant-v3                 Ant-v4
HalfCheetah-v2         HalfCheetah-v3         HalfCheetah-v4
Hopper-v2              Hopper-v3              Hopper-v4
Humanoid-v2            Humanoid-v3            Humanoid-v4
HumanoidStandup-v2     HumanoidStandup-v4     InvertedDoublePendulum-v2
InvertedDoublePendulum-v4 InvertedPendulum-v2    InvertedPendulum-v4
Pusher-v2              Pusher-v4              Reacher-v2
Reacher-v4         

In [35]:
env0 = Monitor(gym.make(ID0))

venv0 = make_vec_env(
    ID0,
    rng=RNG,
    n_envs=1,
    post_wrappers=[lambda env0, _: RolloutInfoWrapper(env0)],
    parallel=False,
)

venv0 = VecCheckNan(venv0, raise_exception=True)  # Check for NaN observations
venv0.reset()

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [29]:
env1 = Monitor(gym.make(ID1))

venv1 = make_vec_env(
    ID1,
    rng=RNG,
    n_envs=1,
    post_wrappers=[lambda env1, _: RolloutInfoWrapper(env1)],
    parallel=False,
)

venv1 = VecCheckNan(venv1, raise_exception=True)  # Check for NaN observations
venv1.reset()

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

array([[ 2.1050785 ,  2.9409041 ,  2.7491593 , -0.90090597,  3.34102   ,
        -1.0163782 , -3.628866  ,  1.7879648 , -0.41493422, -2.257103  ,
         0.17917961, -5.5413423 , -3.187664  , -1.0074005 ,  3.8020494 ,
         1.867261  ,  5.3037596 , -2.5561662 , -3.1998749 , -3.1663272 ,
         2.4884343 ,  3.8236866 ,  0.6439351 , -3.8735547 ,  0.34911406,
         1.3782847 , -4.0826197 ,  0.25616807, -2.5234458 , -1.987067  ,
         0.5799383 , -1.5068438 ,  3.2463756 , -1.0692186 , -3.016827  ,
        -3.0377662 ,  1.5168358 , -3.3340023 , -3.153436  ,  4.7484083 ,
         0.818479  , -2.3471856 , -1.0186008 ,  3.4683278 ,  1.7480986 ,
         1.5406379 , -0.77182394, -1.4625266 , -2.89302   , -2.5207417 ,
        -0.4640103 ,  2.423791  , -1.465664  ,  4.614048  , -1.4057485 ,
        -3.2410553 , -3.822288  , -7.75205   ,  1.7975447 , -1.1890996 ,
        -1.2437905 , -3.8492424 ,  1.6798604 ,  0.6190152 , -3.506558  ,
         3.7543266 ,  0.23060143,  3.0399046 ,  0.4

## AIRL setup

In [30]:
# Set parameters for the PPO algorithm (generator)
learning_rate = 0.001  # Learning rate, can be a function of progress
batch_size = 60  # Mini batch size for each gradient update
n_epochs = 15  # N of epochs when optimizing the surrogate loss

gamma = 0.5  # Discount factor, focus on the recent rewards
gae_lambda = 0  # Generalized advantage estimation
clip_range = 0.1  # Clipping parameter
ent_coef = 0.01  # Entropy coefficient for the loss calculation
vf_coef = 0.5  # Value function coef. for the loss calculation
max_grad_norm = 0.5  # The maximum value for the gradient clipping

verbose = 0  # Verbosity level: 0 no output, 1 info, 2 debug
normalize_advantage = True  # Whether to normalize or not the advantage

clip_range_vf = None  # Clip for the value function
use_sde = False  # Use State Dependent Exploration
sde_sample_freq = -1  # SDE - noise matrix frequency (-1 = disable)

# Set parameters for the AIRL trainer
gen_replay_buffer_capacity = None
allow_variable_horizon = True

disc_opt_kwargs = {
    "lr": 0.001,
}
policy_kwargs = {"use_expln": True}  # Fixing an issue with NaNs

In [31]:
# Set the number of timesteps, batch size and number of disc updates

# Total number of timesteps in the whole training
total_timesteps = 3000 * 100

# Generator
gen_train_timesteps = 3000  # N steps in the environment per one round
n_steps = gen_train_timesteps

# Discriminator batches
demo_minibatch_size = 60  # N samples in minibatch for one discrim. update
demo_batch_size = 300 * 10  # N samples in the batch of expert data (batch)
n_disc_updates_per_round = 4  # N discriminator updates per one round

In [32]:
hier_logger = logger.configure()
hier_logger.default_logger.output_formats.append(grl.MLflowOutputFormat())

In [33]:
# Initialize the learner PPO policy (generator)
learner0 = PPO(
    env=venv0,
    policy=MlpPolicy,
    policy_kwargs=policy_kwargs,
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
    clip_range_vf=clip_range_vf,
    normalize_advantage=normalize_advantage,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    use_sde=use_sde,
    sde_sample_freq=sde_sample_freq,
    verbose=verbose,
    seed=42,
    device="mps",
)

reward_net0 = BasicShapedRewardNet(
    observation_space=venv0.observation_space,
    action_space=venv0.action_space,
    normalize_input_layer=RunningNorm,
)

# Initialize the AIRL trainer
airl_trainer0 = AIRL(
    demonstrations=trajectories_0,
    demo_batch_size=demo_batch_size,
    demo_minibatch_size=demo_minibatch_size,
    n_disc_updates_per_round=n_disc_updates_per_round,
    gen_train_timesteps=gen_train_timesteps,
    gen_replay_buffer_capacity=gen_replay_buffer_capacity,
    venv=venv0,
    gen_algo=learner0,
    reward_net=reward_net0,
    allow_variable_horizon=allow_variable_horizon,
    disc_opt_kwargs=disc_opt_kwargs,
    custom_logger=hier_logger,
)

Running with `allow_variable_horizon` set to True. Some algorithms are biased towards shorter or longer episodes, which may significantly confound results. Additionally, even unbiased algorithms can exploit the information leak from the termination condition, producing spuriously high performance. See https://imitation.readthedocs.io/en/latest/getting-started/variable-horizon.html for more information.


In [34]:
# Initialize the learner PPO policy (generator)
learner1 = PPO(
    env=venv1,
    policy=MlpPolicy,
    policy_kwargs=policy_kwargs,
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
    clip_range_vf=clip_range_vf,
    normalize_advantage=normalize_advantage,
    ent_coef=ent_coef,
    vf_coef=vf_coef,
    max_grad_norm=max_grad_norm,
    use_sde=use_sde,
    sde_sample_freq=sde_sample_freq,
    verbose=verbose,
    seed=42,
    device="mps",
)

reward_net1 = BasicShapedRewardNet(
    observation_space=venv1.observation_space,
    action_space=venv1.action_space,
    normalize_input_layer=RunningNorm,
)

# Initialize the AIRL trainer
airl_trainer1 = AIRL(
    demonstrations=trajectories_1,
    demo_batch_size=demo_batch_size,
    demo_minibatch_size=demo_minibatch_size,
    n_disc_updates_per_round=n_disc_updates_per_round,
    gen_train_timesteps=gen_train_timesteps,
    gen_replay_buffer_capacity=gen_replay_buffer_capacity,
    venv=venv1,
    gen_algo=learner1,
    reward_net=reward_net1,
    allow_variable_horizon=allow_variable_horizon,
    disc_opt_kwargs=disc_opt_kwargs,
    custom_logger=hier_logger,
)

Running with `allow_variable_horizon` set to True. Some algorithms are biased towards shorter or longer episodes, which may significantly confound results. Additionally, even unbiased algorithms can exploit the information leak from the termination condition, producing spuriously high performance. See https://imitation.readthedocs.io/en/latest/getting-started/variable-horizon.html for more information.


## Training AIRL discriminator and generator, stats are saved with mlflow

We need to train 2 distinct airl trainers, one for arbitrage transactions and the other for class 0 transactions. The goal is to use the resulting reward functions to do classification.

In [38]:
mlflow.set_experiment("AIRLv2")

with mlflow.start_run():
    mlflow.log_param("n_steps", n_steps)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("total_timesteps", total_timesteps)

    airl_trainer1.train(total_timesteps=total_timesteps)

    learner1.save(config.MODELS_DIR / "learner1v2")
    torch.save(reward_net1, config.MODELS_DIR / "reward_net1v2")

    mlflow.log_artifact(config.MODELS_DIR / "learner1v2.zip")
    mlflow.log_artifact(config.MODELS_DIR / "reward_net1v2")
    mlflow.end_run()

round:   0%|          | 0/100 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 28.6     |
|    gen/rollout/ep_rew_mean  | 13.6     |
|    gen/time/fps             | 29       |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 101      |
|    gen/time/total_timesteps | 3000     |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.5      |
|    disc/disc_acc_expert             | 1        |
|    disc/disc_acc_gen                | 0        |
|    disc/disc_entropy                | 0.588    |
|    disc/disc_loss                   | 0.0156   |
|    disc/disc_proportion_expert_pred | 1        |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 60       |
|    disc/n_generated                 | 60       |
-

round:   1%|          | 1/100 [01:49<3:00:51, 109.61s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28           |
|    gen/rollout/ep_rew_mean         | 13.3         |
|    gen/rollout/ep_rew_wrapped_mean | 30.7         |
|    gen/time/fps                    | 29           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 101          |
|    gen/time/total_timesteps        | 6000         |
|    gen/train/approx_kl             | 0.0036236835 |
|    gen/train/clip_fraction         | 0.0901       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.691       |
|    gen/train/explained_variance    | -0.0137      |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.148        |
|    gen/train/n_updates             | 15           |
|    gen/train/policy_gradient_loss  | -0.0014      |
|    gen/train/value_loss   

round:   2%|▏         | 2/100 [03:38<2:58:09, 109.07s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 21.4         |
|    gen/rollout/ep_rew_mean         | 10           |
|    gen/rollout/ep_rew_wrapped_mean | -2.11        |
|    gen/time/fps                    | 32           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 93           |
|    gen/time/total_timesteps        | 9000         |
|    gen/train/approx_kl             | 0.0035196855 |
|    gen/train/clip_fraction         | 0.186        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.691       |
|    gen/train/explained_variance    | 0.77         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0073       |
|    gen/train/n_updates             | 30           |
|    gen/train/policy_gradient_loss  | -0.00434     |
|    gen/train/value_loss   

round:   3%|▎         | 3/100 [05:18<2:50:04, 105.20s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 33.9        |
|    gen/rollout/ep_rew_mean         | 16.3        |
|    gen/rollout/ep_rew_wrapped_mean | -5.53       |
|    gen/time/fps                    | 32          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 91          |
|    gen/time/total_timesteps        | 12000       |
|    gen/train/approx_kl             | 0.004437146 |
|    gen/train/clip_fraction         | 0.252       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.691      |
|    gen/train/explained_variance    | 0.837       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | -0.0214     |
|    gen/train/n_updates             | 45          |
|    gen/train/policy_gradient_loss  | -0.00781    |
|    gen/train/value_loss            | 0.0346 

round:   4%|▍         | 4/100 [06:57<2:43:54, 102.45s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 31.5        |
|    gen/rollout/ep_rew_mean         | 15.3        |
|    gen/rollout/ep_rew_wrapped_mean | -21.2       |
|    gen/time/fps                    | 29          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 101         |
|    gen/time/total_timesteps        | 15000       |
|    gen/train/approx_kl             | 0.003634154 |
|    gen/train/clip_fraction         | 0.277       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.684      |
|    gen/train/explained_variance    | 0.9         |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | -0.00733    |
|    gen/train/n_updates             | 60          |
|    gen/train/policy_gradient_loss  | -0.00691    |
|    gen/train/value_loss            | 0.0309 

round:   5%|▌         | 5/100 [08:45<2:45:36, 104.60s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 27.3         |
|    gen/rollout/ep_rew_mean         | 14.5         |
|    gen/rollout/ep_rew_wrapped_mean | -22.9        |
|    gen/time/fps                    | 33           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 90           |
|    gen/time/total_timesteps        | 18000        |
|    gen/train/approx_kl             | 0.0047311992 |
|    gen/train/clip_fraction         | 0.354        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.663       |
|    gen/train/explained_variance    | 0.916        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0342       |
|    gen/train/n_updates             | 75           |
|    gen/train/policy_gradient_loss  | -0.0074      |
|    gen/train/value_loss   

round:   6%|▌         | 6/100 [10:23<2:40:13, 102.27s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 31.9        |
|    gen/rollout/ep_rew_mean         | 17.6        |
|    gen/rollout/ep_rew_wrapped_mean | -20         |
|    gen/time/fps                    | 31          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 94          |
|    gen/time/total_timesteps        | 21000       |
|    gen/train/approx_kl             | 0.006084317 |
|    gen/train/clip_fraction         | 0.432       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.622      |
|    gen/train/explained_variance    | 0.941       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0346      |
|    gen/train/n_updates             | 90          |
|    gen/train/policy_gradient_loss  | -0.0142     |
|    gen/train/value_loss            | 0.0929 

round:   7%|▋         | 7/100 [12:05<2:38:17, 102.12s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 27.2        |
|    gen/rollout/ep_rew_mean         | 15.6        |
|    gen/rollout/ep_rew_wrapped_mean | -32.5       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 86          |
|    gen/time/total_timesteps        | 24000       |
|    gen/train/approx_kl             | 0.008849625 |
|    gen/train/clip_fraction         | 0.313       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.579      |
|    gen/train/explained_variance    | 0.916       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.328       |
|    gen/train/n_updates             | 105         |
|    gen/train/policy_gradient_loss  | -0.00735    |
|    gen/train/value_loss            | 0.254  

round:   8%|▊         | 8/100 [13:38<2:32:19, 99.35s/it] 

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.4        |
|    gen/rollout/ep_rew_mean         | 17.6        |
|    gen/rollout/ep_rew_wrapped_mean | -25.6       |
|    gen/time/fps                    | 32          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 93          |
|    gen/time/total_timesteps        | 27000       |
|    gen/train/approx_kl             | 0.013267774 |
|    gen/train/clip_fraction         | 0.373       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.503      |
|    gen/train/explained_variance    | 0.923       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0961      |
|    gen/train/n_updates             | 120         |
|    gen/train/policy_gradient_loss  | -0.0138     |
|    gen/train/value_loss            | 0.268  

round:   9%|▉         | 9/100 [15:19<2:31:21, 99.79s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 24.5        |
|    gen/rollout/ep_rew_mean         | 18.4        |
|    gen/rollout/ep_rew_wrapped_mean | -17.5       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 83          |
|    gen/time/total_timesteps        | 30000       |
|    gen/train/approx_kl             | 0.023774363 |
|    gen/train/clip_fraction         | 0.23        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.412      |
|    gen/train/explained_variance    | 0.906       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0713      |
|    gen/train/n_updates             | 135         |
|    gen/train/policy_gradient_loss  | -0.00379    |
|    gen/train/value_loss            | 0.263  

round:  10%|█         | 10/100 [16:49<2:25:17, 96.86s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 28.1        |
|    gen/rollout/ep_rew_mean         | 22.3        |
|    gen/rollout/ep_rew_wrapped_mean | -27.9       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 82          |
|    gen/time/total_timesteps        | 33000       |
|    gen/train/approx_kl             | 0.018919883 |
|    gen/train/clip_fraction         | 0.18        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.288      |
|    gen/train/explained_variance    | 0.945       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.111       |
|    gen/train/n_updates             | 150         |
|    gen/train/policy_gradient_loss  | -0.00146    |
|    gen/train/value_loss            | 0.315  

round:  11%|█         | 11/100 [18:19<2:20:39, 94.82s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 27.6       |
|    gen/rollout/ep_rew_mean         | 23.7       |
|    gen/rollout/ep_rew_wrapped_mean | -31.4      |
|    gen/time/fps                    | 34         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 86         |
|    gen/time/total_timesteps        | 36000      |
|    gen/train/approx_kl             | 0.00791615 |
|    gen/train/clip_fraction         | 0.139      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.246     |
|    gen/train/explained_variance    | 0.96       |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0996     |
|    gen/train/n_updates             | 165        |
|    gen/train/policy_gradient_loss  | -0.00165   |
|    gen/train/value_loss            | 0.257      |
------------

round:  12%|█▏        | 12/100 [19:53<2:18:36, 94.51s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.7        |
|    gen/rollout/ep_rew_mean         | 22.9        |
|    gen/rollout/ep_rew_wrapped_mean | -22.1       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 87          |
|    gen/time/total_timesteps        | 39000       |
|    gen/train/approx_kl             | 0.007435648 |
|    gen/train/clip_fraction         | 0.109       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.18       |
|    gen/train/explained_variance    | 0.958       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.078       |
|    gen/train/n_updates             | 180         |
|    gen/train/policy_gradient_loss  | 0.00055     |
|    gen/train/value_loss            | 0.237  

round:  13%|█▎        | 13/100 [21:28<2:17:09, 94.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 34.1        |
|    gen/rollout/ep_rew_mean         | 29.7        |
|    gen/rollout/ep_rew_wrapped_mean | -23.6       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 81          |
|    gen/time/total_timesteps        | 42000       |
|    gen/train/approx_kl             | 0.006146773 |
|    gen/train/clip_fraction         | 0.102       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.157      |
|    gen/train/explained_variance    | 0.938       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.185       |
|    gen/train/n_updates             | 195         |
|    gen/train/policy_gradient_loss  | -0.00107    |
|    gen/train/value_loss            | 0.289  

round:  14%|█▍        | 14/100 [22:56<2:12:55, 92.74s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26          |
|    gen/rollout/ep_rew_mean         | 21.9        |
|    gen/rollout/ep_rew_wrapped_mean | -32.1       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 81          |
|    gen/time/total_timesteps        | 45000       |
|    gen/train/approx_kl             | 0.007947231 |
|    gen/train/clip_fraction         | 0.0729      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.128      |
|    gen/train/explained_variance    | 0.955       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.229       |
|    gen/train/n_updates             | 210         |
|    gen/train/policy_gradient_loss  | 0.00147     |
|    gen/train/value_loss            | 0.407  

round:  15%|█▌        | 15/100 [24:25<2:09:33, 91.46s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 29.2        |
|    gen/rollout/ep_rew_mean         | 26.3        |
|    gen/rollout/ep_rew_wrapped_mean | -44.5       |
|    gen/time/fps                    | 35          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 84          |
|    gen/time/total_timesteps        | 48000       |
|    gen/train/approx_kl             | 0.003172683 |
|    gen/train/clip_fraction         | 0.0864      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.177      |
|    gen/train/explained_variance    | 0.951       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0963      |
|    gen/train/n_updates             | 225         |
|    gen/train/policy_gradient_loss  | -0.0013     |
|    gen/train/value_loss            | 0.442  

round:  16%|█▌        | 16/100 [25:56<2:08:07, 91.52s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.6         |
|    gen/rollout/ep_rew_mean         | 23.4         |
|    gen/rollout/ep_rew_wrapped_mean | -27.3        |
|    gen/time/fps                    | 39           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 76           |
|    gen/time/total_timesteps        | 51000        |
|    gen/train/approx_kl             | 0.0052934075 |
|    gen/train/clip_fraction         | 0.0588       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.103       |
|    gen/train/explained_variance    | 0.926        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.272        |
|    gen/train/n_updates             | 240          |
|    gen/train/policy_gradient_loss  | 0.000112     |
|    gen/train/value_loss   

round:  17%|█▋        | 17/100 [27:20<2:03:16, 89.12s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 34.4         |
|    gen/rollout/ep_rew_mean         | 30.6         |
|    gen/rollout/ep_rew_wrapped_mean | -30.3        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 54000        |
|    gen/train/approx_kl             | 0.0064397072 |
|    gen/train/clip_fraction         | 0.0537       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.104       |
|    gen/train/explained_variance    | 0.952        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.316        |
|    gen/train/n_updates             | 255          |
|    gen/train/policy_gradient_loss  | -0.000932    |
|    gen/train/value_loss   

round:  18%|█▊        | 18/100 [28:52<2:02:49, 89.87s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 25.1         |
|    gen/rollout/ep_rew_mean         | 22.5         |
|    gen/rollout/ep_rew_wrapped_mean | -49.7        |
|    gen/time/fps                    | 33           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 90           |
|    gen/time/total_timesteps        | 57000        |
|    gen/train/approx_kl             | 0.0069625024 |
|    gen/train/clip_fraction         | 0.0736       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.109       |
|    gen/train/explained_variance    | 0.913        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.119        |
|    gen/train/n_updates             | 270          |
|    gen/train/policy_gradient_loss  | 0.00215      |
|    gen/train/value_loss   

round:  19%|█▉        | 19/100 [30:30<2:04:39, 92.34s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 17.8         |
|    gen/rollout/ep_rew_mean         | 16           |
|    gen/rollout/ep_rew_wrapped_mean | -37.9        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 60000        |
|    gen/train/approx_kl             | 0.0051765684 |
|    gen/train/clip_fraction         | 0.0673       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.102       |
|    gen/train/explained_variance    | 0.931        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.371        |
|    gen/train/n_updates             | 285          |
|    gen/train/policy_gradient_loss  | 0.00148      |
|    gen/train/value_loss   

round:  20%|██        | 20/100 [32:00<2:02:19, 91.74s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 27           |
|    gen/rollout/ep_rew_mean         | 25.7         |
|    gen/rollout/ep_rew_wrapped_mean | -19.4        |
|    gen/time/fps                    | 33           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 88           |
|    gen/time/total_timesteps        | 63000        |
|    gen/train/approx_kl             | 0.0072019356 |
|    gen/train/clip_fraction         | 0.0522       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0763      |
|    gen/train/explained_variance    | 0.886        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.366        |
|    gen/train/n_updates             | 300          |
|    gen/train/policy_gradient_loss  | -0.000804    |
|    gen/train/value_loss   

round:  21%|██        | 21/100 [33:36<2:02:18, 92.89s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 39.3         |
|    gen/rollout/ep_rew_mean         | 37.1         |
|    gen/rollout/ep_rew_wrapped_mean | -18.3        |
|    gen/time/fps                    | 34           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 87           |
|    gen/time/total_timesteps        | 66000        |
|    gen/train/approx_kl             | 0.0055146385 |
|    gen/train/clip_fraction         | 0.0241       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.032       |
|    gen/train/explained_variance    | 0.932        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.417        |
|    gen/train/n_updates             | 315          |
|    gen/train/policy_gradient_loss  | -0.000786    |
|    gen/train/value_loss   

round:  22%|██▏       | 22/100 [35:10<2:01:25, 93.40s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 29.1         |
|    gen/rollout/ep_rew_mean         | 26.5         |
|    gen/rollout/ep_rew_wrapped_mean | -46.2        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 69000        |
|    gen/train/approx_kl             | 0.0043525393 |
|    gen/train/clip_fraction         | 0.0451       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.06        |
|    gen/train/explained_variance    | 0.957        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.219        |
|    gen/train/n_updates             | 330          |
|    gen/train/policy_gradient_loss  | 0.00553      |
|    gen/train/value_loss   

round:  23%|██▎       | 23/100 [36:39<1:58:05, 92.02s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.5         |
|    gen/rollout/ep_rew_mean         | 26.6         |
|    gen/rollout/ep_rew_wrapped_mean | -36.2        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 81           |
|    gen/time/total_timesteps        | 72000        |
|    gen/train/approx_kl             | 0.0038782263 |
|    gen/train/clip_fraction         | 0.0413       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0671      |
|    gen/train/explained_variance    | 0.958        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.761        |
|    gen/train/n_updates             | 345          |
|    gen/train/policy_gradient_loss  | 0.00103      |
|    gen/train/value_loss   

round:  24%|██▍       | 24/100 [38:07<1:55:06, 90.88s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.8         |
|    gen/rollout/ep_rew_mean         | 25.3         |
|    gen/rollout/ep_rew_wrapped_mean | -38.5        |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 79           |
|    gen/time/total_timesteps        | 75000        |
|    gen/train/approx_kl             | 0.0044584577 |
|    gen/train/clip_fraction         | 0.0173       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0445      |
|    gen/train/explained_variance    | 0.955        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.331        |
|    gen/train/n_updates             | 360          |
|    gen/train/policy_gradient_loss  | 0.000454     |
|    gen/train/value_loss   

round:  25%|██▌       | 25/100 [39:34<1:52:10, 89.73s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 21.7         |
|    gen/rollout/ep_rew_mean         | 20.5         |
|    gen/rollout/ep_rew_wrapped_mean | -13.6        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 78000        |
|    gen/train/approx_kl             | 0.0016162596 |
|    gen/train/clip_fraction         | 0.0167       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0302      |
|    gen/train/explained_variance    | 0.928        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.316        |
|    gen/train/n_updates             | 375          |
|    gen/train/policy_gradient_loss  | 7.07e-05     |
|    gen/train/value_loss   

round:  26%|██▌       | 26/100 [41:05<1:50:57, 89.97s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.4         |
|    gen/rollout/ep_rew_mean         | 25.1         |
|    gen/rollout/ep_rew_wrapped_mean | -19.3        |
|    gen/time/fps                    | 33           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 88           |
|    gen/time/total_timesteps        | 81000        |
|    gen/train/approx_kl             | 0.0030025193 |
|    gen/train/clip_fraction         | 0.0101       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0207      |
|    gen/train/explained_variance    | 0.948        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0832       |
|    gen/train/n_updates             | 390          |
|    gen/train/policy_gradient_loss  | -0.000833    |
|    gen/train/value_loss   

round:  27%|██▋       | 27/100 [42:41<1:51:37, 91.74s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 25.6         |
|    gen/rollout/ep_rew_mean         | 24.2         |
|    gen/rollout/ep_rew_wrapped_mean | -10.4        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 84000        |
|    gen/train/approx_kl             | 0.0018567545 |
|    gen/train/clip_fraction         | 0.0104       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0237      |
|    gen/train/explained_variance    | 0.949        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.243        |
|    gen/train/n_updates             | 405          |
|    gen/train/policy_gradient_loss  | -0.000708    |
|    gen/train/value_loss   

round:  28%|██▊       | 28/100 [44:10<1:49:18, 91.09s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 21.2        |
|    gen/rollout/ep_rew_mean         | 19.8        |
|    gen/rollout/ep_rew_wrapped_mean | -31.9       |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 77          |
|    gen/time/total_timesteps        | 87000       |
|    gen/train/approx_kl             | 0.004065585 |
|    gen/train/clip_fraction         | 0.0105      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0108     |
|    gen/train/explained_variance    | 0.956       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.14        |
|    gen/train/n_updates             | 420         |
|    gen/train/policy_gradient_loss  | 0.000254    |
|    gen/train/value_loss            | 0.492  

round:  29%|██▉       | 29/100 [45:35<1:45:31, 89.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.8         |
|    gen/rollout/ep_rew_mean         | 22.4         |
|    gen/rollout/ep_rew_wrapped_mean | -22.9        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 90000        |
|    gen/train/approx_kl             | 0.0006727536 |
|    gen/train/clip_fraction         | 0.0195       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.019       |
|    gen/train/explained_variance    | 0.973        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.147        |
|    gen/train/n_updates             | 435          |
|    gen/train/policy_gradient_loss  | -0.00147     |
|    gen/train/value_loss   

round:  30%|███       | 30/100 [47:07<1:44:53, 89.91s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 13.8         |
|    gen/rollout/ep_rew_mean         | 12.7         |
|    gen/rollout/ep_rew_wrapped_mean | -19.1        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 93000        |
|    gen/train/approx_kl             | 0.0058416654 |
|    gen/train/clip_fraction         | 0.0116       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0113      |
|    gen/train/explained_variance    | 0.881        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0549       |
|    gen/train/n_updates             | 450          |
|    gen/train/policy_gradient_loss  | 0.00243      |
|    gen/train/value_loss   

round:  31%|███       | 31/100 [48:32<1:41:56, 88.64s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 29.8        |
|    gen/rollout/ep_rew_mean         | 28.5        |
|    gen/rollout/ep_rew_wrapped_mean | -10.9       |
|    gen/time/fps                    | 37          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 80          |
|    gen/time/total_timesteps        | 96000       |
|    gen/train/approx_kl             | 0.000394083 |
|    gen/train/clip_fraction         | 0.0103      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0195     |
|    gen/train/explained_variance    | 0.953       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.149       |
|    gen/train/n_updates             | 465         |
|    gen/train/policy_gradient_loss  | -0.000257   |
|    gen/train/value_loss            | 0.491  

round:  32%|███▏      | 32/100 [49:59<1:39:56, 88.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.6         |
|    gen/rollout/ep_rew_mean         | 25.4         |
|    gen/rollout/ep_rew_wrapped_mean | -29.9        |
|    gen/time/fps                    | 33           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 89           |
|    gen/time/total_timesteps        | 99000        |
|    gen/train/approx_kl             | 0.0005529264 |
|    gen/train/clip_fraction         | 0.0058       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0105      |
|    gen/train/explained_variance    | 0.928        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.262        |
|    gen/train/n_updates             | 480          |
|    gen/train/policy_gradient_loss  | -0.000615    |
|    gen/train/value_loss   

round:  33%|███▎      | 33/100 [51:36<1:41:11, 90.62s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 24.6         |
|    gen/rollout/ep_rew_mean         | 21.9         |
|    gen/rollout/ep_rew_wrapped_mean | -19.8        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 102000       |
|    gen/train/approx_kl             | 0.0027824056 |
|    gen/train/clip_fraction         | 0.12         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0958      |
|    gen/train/explained_variance    | 0.947        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.15         |
|    gen/train/n_updates             | 495          |
|    gen/train/policy_gradient_loss  | -0.00509     |
|    gen/train/value_loss   

round:  34%|███▍      | 34/100 [53:06<1:39:36, 90.55s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 25.2        |
|    gen/rollout/ep_rew_mean         | 23.2        |
|    gen/rollout/ep_rew_wrapped_mean | -35.6       |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 77          |
|    gen/time/total_timesteps        | 105000      |
|    gen/train/approx_kl             | 0.013487093 |
|    gen/train/clip_fraction         | 0.0582      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.119      |
|    gen/train/explained_variance    | 0.963       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.105       |
|    gen/train/n_updates             | 510         |
|    gen/train/policy_gradient_loss  | 0.000247    |
|    gen/train/value_loss            | 0.43   

round:  35%|███▌      | 35/100 [54:30<1:36:03, 88.67s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 21.2        |
|    gen/rollout/ep_rew_mean         | 19.9        |
|    gen/rollout/ep_rew_wrapped_mean | -31.7       |
|    gen/time/fps                    | 37          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 79          |
|    gen/time/total_timesteps        | 108000      |
|    gen/train/approx_kl             | 0.018475823 |
|    gen/train/clip_fraction         | 0.0204      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0343     |
|    gen/train/explained_variance    | 0.951       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.327       |
|    gen/train/n_updates             | 525         |
|    gen/train/policy_gradient_loss  | -0.00171    |
|    gen/train/value_loss            | 0.378  

round:  36%|███▌      | 36/100 [55:57<1:33:52, 88.01s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 21.8        |
|    gen/rollout/ep_rew_mean         | 20.5        |
|    gen/rollout/ep_rew_wrapped_mean | -25.2       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 82          |
|    gen/time/total_timesteps        | 111000      |
|    gen/train/approx_kl             | 0.009533141 |
|    gen/train/clip_fraction         | 0.0131      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0202     |
|    gen/train/explained_variance    | 0.966       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.161       |
|    gen/train/n_updates             | 540         |
|    gen/train/policy_gradient_loss  | -0.000936   |
|    gen/train/value_loss            | 0.286  

round:  37%|███▋      | 37/100 [57:26<1:32:52, 88.46s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.9         |
|    gen/rollout/ep_rew_mean         | 26.8         |
|    gen/rollout/ep_rew_wrapped_mean | -28.7        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 81           |
|    gen/time/total_timesteps        | 114000       |
|    gen/train/approx_kl             | 0.0010483129 |
|    gen/train/clip_fraction         | 0.0206       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0306      |
|    gen/train/explained_variance    | 0.953        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.29         |
|    gen/train/n_updates             | 555          |
|    gen/train/policy_gradient_loss  | -0.00151     |
|    gen/train/value_loss   

round:  38%|███▊      | 38/100 [58:55<1:31:21, 88.42s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 16.6        |
|    gen/rollout/ep_rew_mean         | 14.8        |
|    gen/rollout/ep_rew_wrapped_mean | -38.5       |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 77          |
|    gen/time/total_timesteps        | 117000      |
|    gen/train/approx_kl             | 0.008957358 |
|    gen/train/clip_fraction         | 0.0338      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0483     |
|    gen/train/explained_variance    | 0.933       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.202       |
|    gen/train/n_updates             | 570         |
|    gen/train/policy_gradient_loss  | 0.00147     |
|    gen/train/value_loss            | 0.609  

round:  39%|███▉      | 39/100 [1:00:20<1:28:50, 87.39s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 25.8          |
|    gen/rollout/ep_rew_mean         | 24.4          |
|    gen/rollout/ep_rew_wrapped_mean | -27           |
|    gen/time/fps                    | 37            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 79            |
|    gen/time/total_timesteps        | 120000        |
|    gen/train/approx_kl             | 0.00088320015 |
|    gen/train/clip_fraction         | 0.026         |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.043        |
|    gen/train/explained_variance    | 0.959         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.348         |
|    gen/train/n_updates             | 585           |
|    gen/train/policy_gradient_loss  | -0.000426     |
|    gen/t

round:  40%|████      | 40/100 [1:01:46<1:27:05, 87.09s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 25.2         |
|    gen/rollout/ep_rew_mean         | 23.8         |
|    gen/rollout/ep_rew_wrapped_mean | -37.9        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 123000       |
|    gen/train/approx_kl             | 0.0005950945 |
|    gen/train/clip_fraction         | 0.0174       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0596      |
|    gen/train/explained_variance    | 0.964        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.29         |
|    gen/train/n_updates             | 600          |
|    gen/train/policy_gradient_loss  | 0.000595     |
|    gen/train/value_loss   

round:  41%|████      | 41/100 [1:03:15<1:26:13, 87.69s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 35           |
|    gen/rollout/ep_rew_mean         | 33.7         |
|    gen/rollout/ep_rew_wrapped_mean | -16.2        |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 80           |
|    gen/time/total_timesteps        | 126000       |
|    gen/train/approx_kl             | 0.0050628115 |
|    gen/train/clip_fraction         | 0.0137       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0268      |
|    gen/train/explained_variance    | 0.94         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.159        |
|    gen/train/n_updates             | 615          |
|    gen/train/policy_gradient_loss  | -0.000235    |
|    gen/train/value_loss   

round:  42%|████▏     | 42/100 [1:04:42<1:24:35, 87.51s/it]

-------------------------------------------------------
| raw/                               |                |
|    gen/rollout/ep_len_mean         | 23.2           |
|    gen/rollout/ep_rew_mean         | 21.9           |
|    gen/rollout/ep_rew_wrapped_mean | -26.3          |
|    gen/time/fps                    | 36             |
|    gen/time/iterations             | 1              |
|    gen/time/time_elapsed           | 81             |
|    gen/time/total_timesteps        | 129000         |
|    gen/train/approx_kl             | 0.000104394916 |
|    gen/train/clip_fraction         | 0.00133        |
|    gen/train/clip_range            | 0.1            |
|    gen/train/entropy_loss          | -0.0101        |
|    gen/train/explained_variance    | 0.944          |
|    gen/train/learning_rate         | 0.001          |
|    gen/train/loss                  | 0.122          |
|    gen/train/n_updates             | 630            |
|    gen/train/policy_gradient_loss  | -0.000303

round:  43%|████▎     | 43/100 [1:06:11<1:23:27, 87.85s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 22.6         |
|    gen/rollout/ep_rew_mean         | 21.2         |
|    gen/rollout/ep_rew_wrapped_mean | -12.3        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 132000       |
|    gen/train/approx_kl             | 0.0012162128 |
|    gen/train/clip_fraction         | 0.00578      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0105      |
|    gen/train/explained_variance    | 0.948        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0631       |
|    gen/train/n_updates             | 645          |
|    gen/train/policy_gradient_loss  | 0.000463     |
|    gen/train/value_loss   

round:  44%|████▍     | 44/100 [1:07:37<1:21:22, 87.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 17.9        |
|    gen/rollout/ep_rew_mean         | 16.8        |
|    gen/rollout/ep_rew_wrapped_mean | -27.8       |
|    gen/time/fps                    | 39          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 76          |
|    gen/time/total_timesteps        | 135000      |
|    gen/train/approx_kl             | 0.006384099 |
|    gen/train/clip_fraction         | 0.00744     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.00762    |
|    gen/train/explained_variance    | 0.941       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.103       |
|    gen/train/n_updates             | 660         |
|    gen/train/policy_gradient_loss  | -0.000247   |
|    gen/train/value_loss            | 0.447  

round:  45%|████▌     | 45/100 [1:09:00<1:18:57, 86.13s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 20.8         |
|    gen/rollout/ep_rew_mean         | 19.2         |
|    gen/rollout/ep_rew_wrapped_mean | -19.6        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 138000       |
|    gen/train/approx_kl             | 0.0025248101 |
|    gen/train/clip_fraction         | 0.0509       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0429      |
|    gen/train/explained_variance    | 0.948        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.647        |
|    gen/train/n_updates             | 675          |
|    gen/train/policy_gradient_loss  | -0.00307     |
|    gen/train/value_loss   

round:  46%|████▌     | 46/100 [1:10:32<1:19:01, 87.81s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.3        |
|    gen/rollout/ep_rew_mean         | 24.7        |
|    gen/rollout/ep_rew_wrapped_mean | -6.87       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 82          |
|    gen/time/total_timesteps        | 141000      |
|    gen/train/approx_kl             | 0.012533704 |
|    gen/train/clip_fraction         | 0.0172      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0308     |
|    gen/train/explained_variance    | 0.912       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.473       |
|    gen/train/n_updates             | 690         |
|    gen/train/policy_gradient_loss  | 0.000296    |
|    gen/train/value_loss            | 0.696  

round:  47%|████▋     | 47/100 [1:12:02<1:18:08, 88.46s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 30.2         |
|    gen/rollout/ep_rew_mean         | 28.9         |
|    gen/rollout/ep_rew_wrapped_mean | -29.4        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 144000       |
|    gen/train/approx_kl             | 0.0047201635 |
|    gen/train/clip_fraction         | 0.0159       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0176      |
|    gen/train/explained_variance    | 0.944        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.209        |
|    gen/train/n_updates             | 705          |
|    gen/train/policy_gradient_loss  | -0.00163     |
|    gen/train/value_loss   

round:  48%|████▊     | 48/100 [1:13:32<1:17:13, 89.10s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.4        |
|    gen/rollout/ep_rew_mean         | 25.2        |
|    gen/rollout/ep_rew_wrapped_mean | -23.9       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 82          |
|    gen/time/total_timesteps        | 147000      |
|    gen/train/approx_kl             | 0.008733641 |
|    gen/train/clip_fraction         | 0.0048      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.00384    |
|    gen/train/explained_variance    | 0.953       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0746      |
|    gen/train/n_updates             | 720         |
|    gen/train/policy_gradient_loss  | 0.000908    |
|    gen/train/value_loss            | 0.305  

round:  49%|████▉     | 49/100 [1:15:02<1:15:44, 89.11s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.3         |
|    gen/rollout/ep_rew_mean         | 21           |
|    gen/rollout/ep_rew_wrapped_mean | -41          |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 79           |
|    gen/time/total_timesteps        | 150000       |
|    gen/train/approx_kl             | 0.0030412897 |
|    gen/train/clip_fraction         | 0.0362       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.032       |
|    gen/train/explained_variance    | 0.958        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.229        |
|    gen/train/n_updates             | 735          |
|    gen/train/policy_gradient_loss  | -0.00289     |
|    gen/train/value_loss   

round:  50%|█████     | 50/100 [1:16:28<1:13:40, 88.42s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 24.2        |
|    gen/rollout/ep_rew_mean         | 22.9        |
|    gen/rollout/ep_rew_wrapped_mean | -40.5       |
|    gen/time/fps                    | 35          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 85          |
|    gen/time/total_timesteps        | 153000      |
|    gen/train/approx_kl             | 0.035239387 |
|    gen/train/clip_fraction         | 0.0877      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0319     |
|    gen/train/explained_variance    | 0.958       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.141       |
|    gen/train/n_updates             | 750         |
|    gen/train/policy_gradient_loss  | 0.000123    |
|    gen/train/value_loss            | 0.422  

round:  51%|█████     | 51/100 [1:18:01<1:13:10, 89.61s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.1         |
|    gen/rollout/ep_rew_mean         | 23.9         |
|    gen/rollout/ep_rew_wrapped_mean | -25.9        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 156000       |
|    gen/train/approx_kl             | 0.0032379993 |
|    gen/train/clip_fraction         | 0.0748       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.064       |
|    gen/train/explained_variance    | 0.964        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.227        |
|    gen/train/n_updates             | 765          |
|    gen/train/policy_gradient_loss  | -0.00205     |
|    gen/train/value_loss   

round:  52%|█████▏    | 52/100 [1:19:26<1:10:33, 88.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.6        |
|    gen/rollout/ep_rew_mean         | 24.9        |
|    gen/rollout/ep_rew_wrapped_mean | -30         |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 78          |
|    gen/time/total_timesteps        | 159000      |
|    gen/train/approx_kl             | 0.013953461 |
|    gen/train/clip_fraction         | 0.0488      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0466     |
|    gen/train/explained_variance    | 0.957       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.186       |
|    gen/train/n_updates             | 780         |
|    gen/train/policy_gradient_loss  | 0.000904    |
|    gen/train/value_loss            | 0.396  

round:  53%|█████▎    | 53/100 [1:20:51<1:08:29, 87.44s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 21.9         |
|    gen/rollout/ep_rew_mean         | 20.2         |
|    gen/rollout/ep_rew_wrapped_mean | -27.1        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 162000       |
|    gen/train/approx_kl             | 0.0030117566 |
|    gen/train/clip_fraction         | 0.0254       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0469      |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.298        |
|    gen/train/n_updates             | 795          |
|    gen/train/policy_gradient_loss  | -0.00246     |
|    gen/train/value_loss   

round:  54%|█████▍    | 54/100 [1:22:23<1:08:01, 88.74s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 31.7         |
|    gen/rollout/ep_rew_mean         | 29.4         |
|    gen/rollout/ep_rew_wrapped_mean | -30          |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 165000       |
|    gen/train/approx_kl             | 0.0020569232 |
|    gen/train/clip_fraction         | 0.0158       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0438      |
|    gen/train/explained_variance    | 0.966        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.267        |
|    gen/train/n_updates             | 810          |
|    gen/train/policy_gradient_loss  | -0.00167     |
|    gen/train/value_loss   

round:  55%|█████▌    | 55/100 [1:23:49<1:05:53, 87.85s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 26.2        |
|    gen/rollout/ep_rew_mean         | 24.8        |
|    gen/rollout/ep_rew_wrapped_mean | -57.8       |
|    gen/time/fps                    | 37          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 80          |
|    gen/time/total_timesteps        | 168000      |
|    gen/train/approx_kl             | 0.008370443 |
|    gen/train/clip_fraction         | 0.0312      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.035      |
|    gen/train/explained_variance    | 0.937       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0658      |
|    gen/train/n_updates             | 825         |
|    gen/train/policy_gradient_loss  | 2.01e-05    |
|    gen/train/value_loss            | 0.51   

round:  56%|█████▌    | 56/100 [1:25:17<1:04:24, 87.84s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.1         |
|    gen/rollout/ep_rew_mean         | 26.1         |
|    gen/rollout/ep_rew_wrapped_mean | -32.7        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 171000       |
|    gen/train/approx_kl             | 0.0018788634 |
|    gen/train/clip_fraction         | 0.0426       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0517      |
|    gen/train/explained_variance    | 0.958        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.322        |
|    gen/train/n_updates             | 840          |
|    gen/train/policy_gradient_loss  | -0.0036      |
|    gen/train/value_loss   

round:  57%|█████▋    | 57/100 [1:26:43<1:02:31, 87.24s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 48.8        |
|    gen/rollout/ep_rew_mean         | 44.8        |
|    gen/rollout/ep_rew_wrapped_mean | -36.4       |
|    gen/time/fps                    | 37          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 80          |
|    gen/time/total_timesteps        | 174000      |
|    gen/train/approx_kl             | 0.014859896 |
|    gen/train/clip_fraction         | 0.0432      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.057      |
|    gen/train/explained_variance    | 0.943       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.311       |
|    gen/train/n_updates             | 855         |
|    gen/train/policy_gradient_loss  | 0.000824    |
|    gen/train/value_loss            | 0.664  

round:  58%|█████▊    | 58/100 [1:28:10<1:01:09, 87.36s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 32.4        |
|    gen/rollout/ep_rew_mean         | 30.5        |
|    gen/rollout/ep_rew_wrapped_mean | -71.3       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 83          |
|    gen/time/total_timesteps        | 177000      |
|    gen/train/approx_kl             | 0.014372646 |
|    gen/train/clip_fraction         | 0.0857      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0914     |
|    gen/train/explained_variance    | 0.968       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.208       |
|    gen/train/n_updates             | 870         |
|    gen/train/policy_gradient_loss  | 0.000147    |
|    gen/train/value_loss            | 0.519  

round:  59%|█████▉    | 59/100 [1:29:41<1:00:18, 88.26s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 32.2        |
|    gen/rollout/ep_rew_mean         | 29.9        |
|    gen/rollout/ep_rew_wrapped_mean | -48.1       |
|    gen/time/fps                    | 35          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 84          |
|    gen/time/total_timesteps        | 180000      |
|    gen/train/approx_kl             | 0.001306185 |
|    gen/train/clip_fraction         | 0.014       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0366     |
|    gen/train/explained_variance    | 0.955       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.132       |
|    gen/train/n_updates             | 885         |
|    gen/train/policy_gradient_loss  | -0.00173    |
|    gen/train/value_loss            | 0.624  

round:  60%|██████    | 60/100 [1:31:12<59:24, 89.11s/it]  

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 25.8        |
|    gen/rollout/ep_rew_mean         | 24.6        |
|    gen/rollout/ep_rew_wrapped_mean | -62.3       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 86          |
|    gen/time/total_timesteps        | 183000      |
|    gen/train/approx_kl             | 0.010114292 |
|    gen/train/clip_fraction         | 0.0262      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0465     |
|    gen/train/explained_variance    | 0.956       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.27        |
|    gen/train/n_updates             | 900         |
|    gen/train/policy_gradient_loss  | -0.00138    |
|    gen/train/value_loss            | 0.715  

round:  61%|██████    | 61/100 [1:32:46<58:52, 90.57s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 26.2         |
|    gen/rollout/ep_rew_mean         | 24.9         |
|    gen/rollout/ep_rew_wrapped_mean | -23.7        |
|    gen/time/fps                    | 34           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 87           |
|    gen/time/total_timesteps        | 186000       |
|    gen/train/approx_kl             | 0.0005353613 |
|    gen/train/clip_fraction         | 0.00571      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.011       |
|    gen/train/explained_variance    | 0.949        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.16         |
|    gen/train/n_updates             | 915          |
|    gen/train/policy_gradient_loss  | -0.000145    |
|    gen/train/value_loss   

round:  62%|██████▏   | 62/100 [1:34:20<58:07, 91.77s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 24.1        |
|    gen/rollout/ep_rew_mean         | 22.9        |
|    gen/rollout/ep_rew_wrapped_mean | -51.4       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 87          |
|    gen/time/total_timesteps        | 189000      |
|    gen/train/approx_kl             | 0.010682328 |
|    gen/train/clip_fraction         | 0.0171      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0217     |
|    gen/train/explained_variance    | 0.966       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.92        |
|    gen/train/n_updates             | 930         |
|    gen/train/policy_gradient_loss  | 0.00106     |
|    gen/train/value_loss            | 0.529  

round:  63%|██████▎   | 63/100 [1:35:54<57:03, 92.53s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 20.9         |
|    gen/rollout/ep_rew_mean         | 19.8         |
|    gen/rollout/ep_rew_wrapped_mean | -35.9        |
|    gen/time/fps                    | 34           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 86           |
|    gen/time/total_timesteps        | 192000       |
|    gen/train/approx_kl             | 8.457071e-05 |
|    gen/train/clip_fraction         | 0.0012       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00468     |
|    gen/train/explained_variance    | 0.965        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.115        |
|    gen/train/n_updates             | 945          |
|    gen/train/policy_gradient_loss  | -0.00022     |
|    gen/train/value_loss   

round:  64%|██████▍   | 64/100 [1:37:28<55:46, 92.96s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 14.6         |
|    gen/rollout/ep_rew_mean         | 13.1         |
|    gen/rollout/ep_rew_wrapped_mean | -20.8        |
|    gen/time/fps                    | 39           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 75           |
|    gen/time/total_timesteps        | 195000       |
|    gen/train/approx_kl             | 0.0016657208 |
|    gen/train/clip_fraction         | 0.0516       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0449      |
|    gen/train/explained_variance    | 0.961        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.229        |
|    gen/train/n_updates             | 960          |
|    gen/train/policy_gradient_loss  | -0.0024      |
|    gen/train/value_loss   

round:  65%|██████▌   | 65/100 [1:38:50<52:18, 89.66s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 17.2        |
|    gen/rollout/ep_rew_mean         | 16.1        |
|    gen/rollout/ep_rew_wrapped_mean | -27.6       |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 78          |
|    gen/time/total_timesteps        | 198000      |
|    gen/train/approx_kl             | 0.043597814 |
|    gen/train/clip_fraction         | 0.0225      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0113     |
|    gen/train/explained_variance    | 0.95        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.518       |
|    gen/train/n_updates             | 975         |
|    gen/train/policy_gradient_loss  | -0.00124    |
|    gen/train/value_loss            | 0.699  

round:  66%|██████▌   | 66/100 [1:40:16<50:09, 88.51s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 25.4          |
|    gen/rollout/ep_rew_mean         | 24.2          |
|    gen/rollout/ep_rew_wrapped_mean | -41.4         |
|    gen/time/fps                    | 38            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 77            |
|    gen/time/total_timesteps        | 201000        |
|    gen/train/approx_kl             | 0.00048134298 |
|    gen/train/clip_fraction         | 0.016         |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.00992      |
|    gen/train/explained_variance    | 0.959         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.116         |
|    gen/train/n_updates             | 990           |
|    gen/train/policy_gradient_loss  | -0.00677      |
|    gen/t

round:  67%|██████▋   | 67/100 [1:41:40<47:54, 87.12s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.4         |
|    gen/rollout/ep_rew_mean         | 22.2         |
|    gen/rollout/ep_rew_wrapped_mean | -52.2        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 204000       |
|    gen/train/approx_kl             | 0.0013732243 |
|    gen/train/clip_fraction         | 0.00242      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00305     |
|    gen/train/explained_variance    | 0.963        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.395        |
|    gen/train/n_updates             | 1005         |
|    gen/train/policy_gradient_loss  | -0.000296    |
|    gen/train/value_loss   

round:  68%|██████▊   | 68/100 [1:43:10<46:52, 87.89s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 32.2         |
|    gen/rollout/ep_rew_mean         | 30.9         |
|    gen/rollout/ep_rew_wrapped_mean | -26.3        |
|    gen/time/fps                    | 32           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 91           |
|    gen/time/total_timesteps        | 207000       |
|    gen/train/approx_kl             | 0.0010272303 |
|    gen/train/clip_fraction         | 0.0107       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00612     |
|    gen/train/explained_variance    | 0.945        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.563        |
|    gen/train/n_updates             | 1020         |
|    gen/train/policy_gradient_loss  | -0.000907    |
|    gen/train/value_loss   

round:  69%|██████▉   | 69/100 [1:44:48<47:01, 91.03s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 24            |
|    gen/rollout/ep_rew_mean         | 22.8          |
|    gen/rollout/ep_rew_wrapped_mean | -50.3         |
|    gen/time/fps                    | 37            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 81            |
|    gen/time/total_timesteps        | 210000        |
|    gen/train/approx_kl             | 0.00026470458 |
|    gen/train/clip_fraction         | 0.00798       |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.00525      |
|    gen/train/explained_variance    | 0.967         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.567         |
|    gen/train/n_updates             | 1035          |
|    gen/train/policy_gradient_loss  | -0.000658     |
|    gen/t

round:  70%|███████   | 70/100 [1:46:16<45:05, 90.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 20.3         |
|    gen/rollout/ep_rew_mean         | 19.2         |
|    gen/rollout/ep_rew_wrapped_mean | -25.1        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 213000       |
|    gen/train/approx_kl             | 0.0013790682 |
|    gen/train/clip_fraction         | 0.0118       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00324     |
|    gen/train/explained_variance    | 0.962        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.217        |
|    gen/train/n_updates             | 1050         |
|    gen/train/policy_gradient_loss  | -0.00105     |
|    gen/train/value_loss   

round:  71%|███████   | 71/100 [1:47:48<43:49, 90.66s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 15.6         |
|    gen/rollout/ep_rew_mean         | 14.4         |
|    gen/rollout/ep_rew_wrapped_mean | -28.4        |
|    gen/time/fps                    | 39           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 76           |
|    gen/time/total_timesteps        | 216000       |
|    gen/train/approx_kl             | 0.0030436257 |
|    gen/train/clip_fraction         | 0.00864      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00816     |
|    gen/train/explained_variance    | 0.959        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.149        |
|    gen/train/n_updates             | 1065         |
|    gen/train/policy_gradient_loss  | -0.00063     |
|    gen/train/value_loss   

round:  72%|███████▏  | 72/100 [1:49:12<41:19, 88.55s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 33.7         |
|    gen/rollout/ep_rew_mean         | 32.5         |
|    gen/rollout/ep_rew_wrapped_mean | -22.8        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 219000       |
|    gen/train/approx_kl             | 0.0008656857 |
|    gen/train/clip_fraction         | 0.0175       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0151      |
|    gen/train/explained_variance    | 0.943        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.306        |
|    gen/train/n_updates             | 1080         |
|    gen/train/policy_gradient_loss  | 0.000678     |
|    gen/train/value_loss   

round:  73%|███████▎  | 73/100 [1:50:42<40:01, 88.93s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 30           |
|    gen/rollout/ep_rew_mean         | 28.8         |
|    gen/rollout/ep_rew_wrapped_mean | -64.4        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 222000       |
|    gen/train/approx_kl             | 0.0003686593 |
|    gen/train/clip_fraction         | 0.00222      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00284     |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.161        |
|    gen/train/n_updates             | 1095         |
|    gen/train/policy_gradient_loss  | -0.000309    |
|    gen/train/value_loss   

round:  74%|███████▍  | 74/100 [1:52:12<38:43, 89.38s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 30.4          |
|    gen/rollout/ep_rew_mean         | 27.7          |
|    gen/rollout/ep_rew_wrapped_mean | -42.1         |
|    gen/time/fps                    | 36            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 82            |
|    gen/time/total_timesteps        | 225000        |
|    gen/train/approx_kl             | 0.00045128667 |
|    gen/train/clip_fraction         | 0.00476       |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.00671      |
|    gen/train/explained_variance    | 0.969         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.0622        |
|    gen/train/n_updates             | 1110          |
|    gen/train/policy_gradient_loss  | -0.000132     |
|    gen/t

round:  75%|███████▌  | 75/100 [1:53:42<37:20, 89.62s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 29.4        |
|    gen/rollout/ep_rew_mean         | 28          |
|    gen/rollout/ep_rew_wrapped_mean | -48.9       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 83          |
|    gen/time/total_timesteps        | 228000      |
|    gen/train/approx_kl             | 0.052398838 |
|    gen/train/clip_fraction         | 0.128       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0293     |
|    gen/train/explained_variance    | 0.953       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.714       |
|    gen/train/n_updates             | 1125        |
|    gen/train/policy_gradient_loss  | 0.00318     |
|    gen/train/value_loss            | 0.739  

round:  76%|███████▌  | 76/100 [1:55:12<35:51, 89.67s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 32.8       |
|    gen/rollout/ep_rew_mean         | 31.3       |
|    gen/rollout/ep_rew_wrapped_mean | -35.9      |
|    gen/time/fps                    | 37         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 79         |
|    gen/time/total_timesteps        | 231000     |
|    gen/train/approx_kl             | 0.00894025 |
|    gen/train/clip_fraction         | 0.0112     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0156    |
|    gen/train/explained_variance    | 0.968      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.16       |
|    gen/train/n_updates             | 1140       |
|    gen/train/policy_gradient_loss  | -0.000398  |
|    gen/train/value_loss            | 0.471      |
------------

round:  77%|███████▋  | 77/100 [1:56:38<33:56, 88.54s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 20.6          |
|    gen/rollout/ep_rew_mean         | 19.1          |
|    gen/rollout/ep_rew_wrapped_mean | -48.7         |
|    gen/time/fps                    | 40            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 73            |
|    gen/time/total_timesteps        | 234000        |
|    gen/train/approx_kl             | 0.00058712944 |
|    gen/train/clip_fraction         | 0.00667       |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.0138       |
|    gen/train/explained_variance    | 0.972         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.296         |
|    gen/train/n_updates             | 1155          |
|    gen/train/policy_gradient_loss  | -0.00112      |
|    gen/t

round:  78%|███████▊  | 78/100 [1:57:59<31:36, 86.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 27.1         |
|    gen/rollout/ep_rew_mean         | 25.3         |
|    gen/rollout/ep_rew_wrapped_mean | -57.2        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 237000       |
|    gen/train/approx_kl             | 0.0022678236 |
|    gen/train/clip_fraction         | 0.0322       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0319      |
|    gen/train/explained_variance    | 0.97         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.151        |
|    gen/train/n_updates             | 1170         |
|    gen/train/policy_gradient_loss  | -0.00361     |
|    gen/train/value_loss   

round:  79%|███████▉  | 79/100 [1:59:28<30:30, 87.16s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.6         |
|    gen/rollout/ep_rew_mean         | 27.1         |
|    gen/rollout/ep_rew_wrapped_mean | -68.8        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 240000       |
|    gen/train/approx_kl             | 0.0037905436 |
|    gen/train/clip_fraction         | 0.0395       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0341      |
|    gen/train/explained_variance    | 0.975        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.172        |
|    gen/train/n_updates             | 1185         |
|    gen/train/policy_gradient_loss  | -0.00214     |
|    gen/train/value_loss   

round:  80%|████████  | 80/100 [2:00:59<29:29, 88.45s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 32.1        |
|    gen/rollout/ep_rew_mean         | 28.5        |
|    gen/rollout/ep_rew_wrapped_mean | -42.2       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 82          |
|    gen/time/total_timesteps        | 243000      |
|    gen/train/approx_kl             | 0.004052431 |
|    gen/train/clip_fraction         | 0.0194      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0295     |
|    gen/train/explained_variance    | 0.967       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.185       |
|    gen/train/n_updates             | 1200        |
|    gen/train/policy_gradient_loss  | -0.00138    |
|    gen/train/value_loss            | 0.65   

round:  81%|████████  | 81/100 [2:02:29<28:06, 88.76s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 33.8        |
|    gen/rollout/ep_rew_mean         | 31.9        |
|    gen/rollout/ep_rew_wrapped_mean | -68.9       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 87          |
|    gen/time/total_timesteps        | 246000      |
|    gen/train/approx_kl             | 0.016530236 |
|    gen/train/clip_fraction         | 0.14        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0901     |
|    gen/train/explained_variance    | 0.967       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.227       |
|    gen/train/n_updates             | 1215        |
|    gen/train/policy_gradient_loss  | -0.00151    |
|    gen/train/value_loss            | 0.823  

round:  82%|████████▏ | 82/100 [2:04:03<27:07, 90.44s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 24.6         |
|    gen/rollout/ep_rew_mean         | 21.7         |
|    gen/rollout/ep_rew_wrapped_mean | -26.8        |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 80           |
|    gen/time/total_timesteps        | 249000       |
|    gen/train/approx_kl             | 0.0034908834 |
|    gen/train/clip_fraction         | 0.0317       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0341      |
|    gen/train/explained_variance    | 0.949        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.194        |
|    gen/train/n_updates             | 1230         |
|    gen/train/policy_gradient_loss  | -0.00187     |
|    gen/train/value_loss   

round:  83%|████████▎ | 83/100 [2:05:31<25:22, 89.58s/it]

-------------------------------------------------
| raw/                               |          |
|    gen/rollout/ep_len_mean         | 31.7     |
|    gen/rollout/ep_rew_mean         | 30.5     |
|    gen/rollout/ep_rew_wrapped_mean | -78.3    |
|    gen/time/fps                    | 35       |
|    gen/time/iterations             | 1        |
|    gen/time/time_elapsed           | 84       |
|    gen/time/total_timesteps        | 252000   |
|    gen/train/approx_kl             | 0.03753  |
|    gen/train/clip_fraction         | 0.129    |
|    gen/train/clip_range            | 0.1      |
|    gen/train/entropy_loss          | -0.0365  |
|    gen/train/explained_variance    | 0.964    |
|    gen/train/learning_rate         | 0.001    |
|    gen/train/loss                  | 0.511    |
|    gen/train/n_updates             | 1245     |
|    gen/train/policy_gradient_loss  | -0.00753 |
|    gen/train/value_loss            | 1.18     |
-------------------------------------------------


round:  84%|████████▍ | 84/100 [2:07:02<24:00, 90.04s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.1         |
|    gen/rollout/ep_rew_mean         | 21.9         |
|    gen/rollout/ep_rew_wrapped_mean | -57.5        |
|    gen/time/fps                    | 39           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 76           |
|    gen/time/total_timesteps        | 255000       |
|    gen/train/approx_kl             | 0.0016706623 |
|    gen/train/clip_fraction         | 0.00256      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00821     |
|    gen/train/explained_variance    | 0.966        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.469        |
|    gen/train/n_updates             | 1260         |
|    gen/train/policy_gradient_loss  | -0.000963    |
|    gen/train/value_loss   

round:  85%|████████▌ | 85/100 [2:08:25<21:59, 87.94s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.1         |
|    gen/rollout/ep_rew_mean         | 26.8         |
|    gen/rollout/ep_rew_wrapped_mean | -22.9        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 258000       |
|    gen/train/approx_kl             | 0.0006252532 |
|    gen/train/clip_fraction         | 0.0111       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00937     |
|    gen/train/explained_variance    | 0.941        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.115        |
|    gen/train/n_updates             | 1275         |
|    gen/train/policy_gradient_loss  | -0.000886    |
|    gen/train/value_loss   

round:  86%|████████▌ | 86/100 [2:09:51<20:21, 87.23s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 33.8         |
|    gen/rollout/ep_rew_mean         | 32.6         |
|    gen/rollout/ep_rew_wrapped_mean | -56.9        |
|    gen/time/fps                    | 34           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 87           |
|    gen/time/total_timesteps        | 261000       |
|    gen/train/approx_kl             | 0.0017295975 |
|    gen/train/clip_fraction         | 0.00451      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.00375     |
|    gen/train/explained_variance    | 0.973        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.249        |
|    gen/train/n_updates             | 1290         |
|    gen/train/policy_gradient_loss  | -0.000489    |
|    gen/train/value_loss   

round:  87%|████████▋ | 87/100 [2:11:25<19:23, 89.47s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 22.4         |
|    gen/rollout/ep_rew_mean         | 20.7         |
|    gen/rollout/ep_rew_wrapped_mean | -63.9        |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 79           |
|    gen/time/total_timesteps        | 264000       |
|    gen/train/approx_kl             | 0.0011606263 |
|    gen/train/clip_fraction         | 0.0279       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0218      |
|    gen/train/explained_variance    | 0.974        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.127        |
|    gen/train/n_updates             | 1305         |
|    gen/train/policy_gradient_loss  | -0.00224     |
|    gen/train/value_loss   

round:  88%|████████▊ | 88/100 [2:12:52<17:41, 88.50s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 29.2        |
|    gen/rollout/ep_rew_mean         | 27.7        |
|    gen/rollout/ep_rew_wrapped_mean | -54.8       |
|    gen/time/fps                    | 36          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 81          |
|    gen/time/total_timesteps        | 267000      |
|    gen/train/approx_kl             | 0.009206994 |
|    gen/train/clip_fraction         | 0.0198      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0156     |
|    gen/train/explained_variance    | 0.965       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.253       |
|    gen/train/n_updates             | 1320        |
|    gen/train/policy_gradient_loss  | -0.00195    |
|    gen/train/value_loss            | 0.8    

round:  89%|████████▉ | 89/100 [2:14:21<16:15, 88.64s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.5         |
|    gen/rollout/ep_rew_mean         | 22.2         |
|    gen/rollout/ep_rew_wrapped_mean | -48          |
|    gen/time/fps                    | 37           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 79           |
|    gen/time/total_timesteps        | 270000       |
|    gen/train/approx_kl             | 0.0019522763 |
|    gen/train/clip_fraction         | 0.00796      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0163      |
|    gen/train/explained_variance    | 0.958        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 1.08         |
|    gen/train/n_updates             | 1335         |
|    gen/train/policy_gradient_loss  | 0.000387     |
|    gen/train/value_loss   

round:  90%|█████████ | 90/100 [2:15:47<14:40, 88.04s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 16.1        |
|    gen/rollout/ep_rew_mean         | 14.7        |
|    gen/rollout/ep_rew_wrapped_mean | -54.7       |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 77          |
|    gen/time/total_timesteps        | 273000      |
|    gen/train/approx_kl             | 0.004050931 |
|    gen/train/clip_fraction         | 0.0123      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0213     |
|    gen/train/explained_variance    | 0.975       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.468       |
|    gen/train/n_updates             | 1350        |
|    gen/train/policy_gradient_loss  | -0.000531   |
|    gen/train/value_loss            | 0.638  

round:  91%|█████████ | 91/100 [2:17:12<13:03, 87.04s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 23.4         |
|    gen/rollout/ep_rew_mean         | 21.6         |
|    gen/rollout/ep_rew_wrapped_mean | -30.5        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 81           |
|    gen/time/total_timesteps        | 276000       |
|    gen/train/approx_kl             | 0.0041920794 |
|    gen/train/clip_fraction         | 0.0176       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0245      |
|    gen/train/explained_variance    | 0.954        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.229        |
|    gen/train/n_updates             | 1365         |
|    gen/train/policy_gradient_loss  | -0.00146     |
|    gen/train/value_loss   

round:  92%|█████████▏| 92/100 [2:18:40<11:39, 87.44s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 28.8         |
|    gen/rollout/ep_rew_mean         | 27.1         |
|    gen/rollout/ep_rew_wrapped_mean | -39.6        |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 82           |
|    gen/time/total_timesteps        | 279000       |
|    gen/train/approx_kl             | 0.0025493652 |
|    gen/train/clip_fraction         | 0.0231       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0296      |
|    gen/train/explained_variance    | 0.94         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.81         |
|    gen/train/n_updates             | 1380         |
|    gen/train/policy_gradient_loss  | 0.00104      |
|    gen/train/value_loss   

round:  93%|█████████▎| 93/100 [2:20:10<10:17, 88.15s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 21.6         |
|    gen/rollout/ep_rew_mean         | 20.4         |
|    gen/rollout/ep_rew_wrapped_mean | -46          |
|    gen/time/fps                    | 36           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 81           |
|    gen/time/total_timesteps        | 282000       |
|    gen/train/approx_kl             | 0.0051858313 |
|    gen/train/clip_fraction         | 0.0261       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0173      |
|    gen/train/explained_variance    | 0.959        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.295        |
|    gen/train/n_updates             | 1395         |
|    gen/train/policy_gradient_loss  | -0.00122     |
|    gen/train/value_loss   

round:  94%|█████████▍| 94/100 [2:21:38<08:49, 88.17s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 20.7         |
|    gen/rollout/ep_rew_mean         | 18.7         |
|    gen/rollout/ep_rew_wrapped_mean | -32.4        |
|    gen/time/fps                    | 34           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 86           |
|    gen/time/total_timesteps        | 285000       |
|    gen/train/approx_kl             | 0.0041862833 |
|    gen/train/clip_fraction         | 0.0441       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0665      |
|    gen/train/explained_variance    | 0.966        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.379        |
|    gen/train/n_updates             | 1410         |
|    gen/train/policy_gradient_loss  | -0.00298     |
|    gen/train/value_loss   

round:  95%|█████████▌| 95/100 [2:23:12<07:28, 89.77s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 29.6        |
|    gen/rollout/ep_rew_mean         | 27.8        |
|    gen/rollout/ep_rew_wrapped_mean | -28.3       |
|    gen/time/fps                    | 34          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 85          |
|    gen/time/total_timesteps        | 288000      |
|    gen/train/approx_kl             | 0.020224981 |
|    gen/train/clip_fraction         | 0.0554      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0506     |
|    gen/train/explained_variance    | 0.971       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.477       |
|    gen/train/n_updates             | 1425        |
|    gen/train/policy_gradient_loss  | -0.00263    |
|    gen/train/value_loss            | 0.836  

round:  96%|█████████▌| 96/100 [2:24:45<06:03, 90.78s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 39.3         |
|    gen/rollout/ep_rew_mean         | 37.8         |
|    gen/rollout/ep_rew_wrapped_mean | -61.1        |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 83           |
|    gen/time/total_timesteps        | 291000       |
|    gen/train/approx_kl             | 0.0063598426 |
|    gen/train/clip_fraction         | 0.0142       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0235      |
|    gen/train/explained_variance    | 0.971        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.304        |
|    gen/train/n_updates             | 1440         |
|    gen/train/policy_gradient_loss  | 0.000134     |
|    gen/train/value_loss   

round:  97%|█████████▋| 97/100 [2:26:15<04:32, 90.67s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 42.4         |
|    gen/rollout/ep_rew_mean         | 40.2         |
|    gen/rollout/ep_rew_wrapped_mean | -61          |
|    gen/time/fps                    | 35           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 84           |
|    gen/time/total_timesteps        | 294000       |
|    gen/train/approx_kl             | 0.0078849355 |
|    gen/train/clip_fraction         | 0.0293       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.046       |
|    gen/train/explained_variance    | 0.971        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.165        |
|    gen/train/n_updates             | 1455         |
|    gen/train/policy_gradient_loss  | -0.00184     |
|    gen/train/value_loss   

round:  98%|█████████▊| 98/100 [2:27:47<03:01, 90.90s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 19.9        |
|    gen/rollout/ep_rew_mean         | 18.5        |
|    gen/rollout/ep_rew_wrapped_mean | -89         |
|    gen/time/fps                    | 38          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 78          |
|    gen/time/total_timesteps        | 297000      |
|    gen/train/approx_kl             | 0.013656101 |
|    gen/train/clip_fraction         | 0.016       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0483     |
|    gen/train/explained_variance    | 0.957       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.799       |
|    gen/train/n_updates             | 1470        |
|    gen/train/policy_gradient_loss  | 0.00197     |
|    gen/train/value_loss            | 1.13   

round:  99%|█████████▉| 99/100 [2:29:12<01:29, 89.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 16.5         |
|    gen/rollout/ep_rew_mean         | 15.3         |
|    gen/rollout/ep_rew_wrapped_mean | -63.3        |
|    gen/time/fps                    | 38           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 78           |
|    gen/time/total_timesteps        | 300000       |
|    gen/train/approx_kl             | 0.0017114796 |
|    gen/train/clip_fraction         | 0.0526       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0467      |
|    gen/train/explained_variance    | 0.969        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.744        |
|    gen/train/n_updates             | 1485         |
|    gen/train/policy_gradient_loss  | -0.00578     |
|    gen/train/value_loss   

round: 100%|██████████| 100/100 [2:30:37<00:00, 90.38s/it]

🏃 View run calm-moose-867 at: http://127.0.0.1:8080/#/experiments/282678262450638424/runs/8781ce0d9d5f453fa45b69b82b1ce30b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/282678262450638424





In [35]:
mlflow.set_experiment("AIRLv2")
with mlflow.start_run():
    mlflow.log_param("n_steps", n_steps)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("total_timesteps", total_timesteps)

    airl_trainer0.train(total_timesteps=total_timesteps)

    learner0.save(config.MODELS_DIR / "learner0v2")
    torch.save(reward_net0, config.MODELS_DIR / "reward_net0v2")

    mlflow.log_artifact(config.MODELS_DIR / "learner0v2.zip")
    mlflow.log_artifact(config.MODELS_DIR / "reward_net0v2")
    mlflow.end_run()

round:   0%|          | 0/100 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 3.8      |
|    gen/rollout/ep_rew_mean  | 1.36     |
|    gen/time/fps             | 50       |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 59       |
|    gen/time/total_timesteps | 3000     |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.5      |
|    disc/disc_acc_expert             | 1        |
|    disc/disc_acc_gen                | 0        |
|    disc/disc_entropy                | 0.603    |
|    disc/disc_loss                   | 0.0158   |
|    disc/disc_proportion_expert_pred | 1        |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 60       |
|    disc/n_generated                 | 60       |
-

round:   1%|          | 1/100 [01:07<1:51:12, 67.40s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.02        |
|    gen/rollout/ep_rew_mean         | 1.03        |
|    gen/rollout/ep_rew_wrapped_mean | 6.06        |
|    gen/time/fps                    | 61          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 48          |
|    gen/time/total_timesteps        | 6000        |
|    gen/train/approx_kl             | 0.001884219 |
|    gen/train/clip_fraction         | 0.0561      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.692      |
|    gen/train/explained_variance    | 0.0105      |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 1.02        |
|    gen/train/n_updates             | 15          |
|    gen/train/policy_gradient_loss  | -0.000958   |
|    gen/train/value_loss            | 3.18   

round:   2%|▏         | 2/100 [02:03<1:39:03, 60.65s/it]

------------------------------------------------------
| raw/                               |               |
|    gen/rollout/ep_len_mean         | 3.98          |
|    gen/rollout/ep_rew_mean         | 1.52          |
|    gen/rollout/ep_rew_wrapped_mean | -0.271        |
|    gen/time/fps                    | 52            |
|    gen/time/iterations             | 1             |
|    gen/time/time_elapsed           | 57            |
|    gen/time/total_timesteps        | 9000          |
|    gen/train/approx_kl             | 0.00091607554 |
|    gen/train/clip_fraction         | 0.0848        |
|    gen/train/clip_range            | 0.1           |
|    gen/train/entropy_loss          | -0.692        |
|    gen/train/explained_variance    | -1.71         |
|    gen/train/learning_rate         | 0.001         |
|    gen/train/loss                  | 0.234         |
|    gen/train/n_updates             | 30            |
|    gen/train/policy_gradient_loss  | 0.000372      |
|    gen/t

round:   3%|▎         | 3/100 [03:08<1:41:07, 62.55s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.77        |
|    gen/rollout/ep_rew_mean         | 3.19        |
|    gen/rollout/ep_rew_wrapped_mean | -1.16       |
|    gen/time/fps                    | 51          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 58          |
|    gen/time/total_timesteps        | 12000       |
|    gen/train/approx_kl             | 0.004343145 |
|    gen/train/clip_fraction         | 0.275       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.688      |
|    gen/train/explained_variance    | -0.0167     |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0257      |
|    gen/train/n_updates             | 45          |
|    gen/train/policy_gradient_loss  | -0.00773    |
|    gen/train/value_loss            | 0.0694 

round:   4%|▍         | 4/100 [04:13<1:42:03, 63.78s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.35        |
|    gen/rollout/ep_rew_mean         | 2.79        |
|    gen/rollout/ep_rew_wrapped_mean | -3.23       |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 15000       |
|    gen/train/approx_kl             | 0.005053894 |
|    gen/train/clip_fraction         | 0.388       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.671      |
|    gen/train/explained_variance    | 0.696       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00323     |
|    gen/train/n_updates             | 60          |
|    gen/train/policy_gradient_loss  | -0.0146     |
|    gen/train/value_loss            | 0.0163 

round:   5%|▌         | 5/100 [05:11<1:37:26, 61.54s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.2        |
|    gen/rollout/ep_rew_mean         | 2.42       |
|    gen/rollout/ep_rew_wrapped_mean | -2.63      |
|    gen/time/fps                    | 61         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 49         |
|    gen/time/total_timesteps        | 18000      |
|    gen/train/approx_kl             | 0.00536494 |
|    gen/train/clip_fraction         | 0.411      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.658     |
|    gen/train/explained_variance    | 0.843      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | -0.0243    |
|    gen/train/n_updates             | 75         |
|    gen/train/policy_gradient_loss  | -0.0212    |
|    gen/train/value_loss            | 0.017      |
------------

round:   6%|▌         | 6/100 [06:07<1:33:38, 59.77s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 5.1          |
|    gen/rollout/ep_rew_mean         | 2.34         |
|    gen/rollout/ep_rew_wrapped_mean | -2.68        |
|    gen/time/fps                    | 46           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 64           |
|    gen/time/total_timesteps        | 21000        |
|    gen/train/approx_kl             | 0.0057654367 |
|    gen/train/clip_fraction         | 0.419        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.633       |
|    gen/train/explained_variance    | 0.845        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | -0.0583      |
|    gen/train/n_updates             | 90           |
|    gen/train/policy_gradient_loss  | -0.0214      |
|    gen/train/value_loss   

round:   7%|▋         | 7/100 [07:18<1:38:27, 63.53s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.82         |
|    gen/rollout/ep_rew_mean         | 2.39         |
|    gen/rollout/ep_rew_wrapped_mean | -3           |
|    gen/time/fps                    | 55           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 53           |
|    gen/time/total_timesteps        | 24000        |
|    gen/train/approx_kl             | 0.0059744106 |
|    gen/train/clip_fraction         | 0.294        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.589       |
|    gen/train/explained_variance    | 0.831        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | -0.00762     |
|    gen/train/n_updates             | 105          |
|    gen/train/policy_gradient_loss  | -0.0148      |
|    gen/train/value_loss   

round:   8%|▊         | 8/100 [08:19<1:36:09, 62.71s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.61        |
|    gen/rollout/ep_rew_mean         | 3.53        |
|    gen/rollout/ep_rew_wrapped_mean | -2.49       |
|    gen/time/fps                    | 40          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 74          |
|    gen/time/total_timesteps        | 27000       |
|    gen/train/approx_kl             | 0.009497246 |
|    gen/train/clip_fraction         | 0.366       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.535      |
|    gen/train/explained_variance    | 0.797       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | -0.0198     |
|    gen/train/n_updates             | 120         |
|    gen/train/policy_gradient_loss  | -0.0176     |
|    gen/train/value_loss            | 0.0722 

round:   9%|▉         | 9/100 [09:42<1:44:17, 68.76s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.19        |
|    gen/rollout/ep_rew_mean         | 3.32        |
|    gen/rollout/ep_rew_wrapped_mean | -3.29       |
|    gen/time/fps                    | 54          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 55          |
|    gen/time/total_timesteps        | 30000       |
|    gen/train/approx_kl             | 0.014226957 |
|    gen/train/clip_fraction         | 0.371       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.465      |
|    gen/train/explained_variance    | 0.619       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0448      |
|    gen/train/n_updates             | 135         |
|    gen/train/policy_gradient_loss  | -0.00977    |
|    gen/train/value_loss            | 0.142  

round:  10%|█         | 10/100 [10:44<1:40:15, 66.84s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.62        |
|    gen/rollout/ep_rew_mean         | 3.81        |
|    gen/rollout/ep_rew_wrapped_mean | -7.09       |
|    gen/time/fps                    | 58          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 33000       |
|    gen/train/approx_kl             | 0.028549304 |
|    gen/train/clip_fraction         | 0.192       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.367      |
|    gen/train/explained_variance    | 0.807       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0185      |
|    gen/train/n_updates             | 150         |
|    gen/train/policy_gradient_loss  | -0.0105     |
|    gen/train/value_loss            | 0.132  

round:  11%|█         | 11/100 [11:43<1:35:22, 64.29s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.16         |
|    gen/rollout/ep_rew_mean         | 2.55         |
|    gen/rollout/ep_rew_wrapped_mean | -2.89        |
|    gen/time/fps                    | 58           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 51           |
|    gen/time/total_timesteps        | 36000        |
|    gen/train/approx_kl             | 0.0048820646 |
|    gen/train/clip_fraction         | 0.0886       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.254       |
|    gen/train/explained_variance    | 0.685        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0274       |
|    gen/train/n_updates             | 165          |
|    gen/train/policy_gradient_loss  | -0.00267     |
|    gen/train/value_loss   

round:  12%|█▏        | 12/100 [12:41<1:31:39, 62.49s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 8.04        |
|    gen/rollout/ep_rew_mean         | 6.31        |
|    gen/rollout/ep_rew_wrapped_mean | -1.72       |
|    gen/time/fps                    | 57          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 39000       |
|    gen/train/approx_kl             | 0.010912265 |
|    gen/train/clip_fraction         | 0.0825      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.233      |
|    gen/train/explained_variance    | 0.682       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0464      |
|    gen/train/n_updates             | 180         |
|    gen/train/policy_gradient_loss  | -0.00711    |
|    gen/train/value_loss            | 0.0925 

round:  13%|█▎        | 13/100 [13:40<1:29:12, 61.52s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.05        |
|    gen/rollout/ep_rew_mean         | 3.59        |
|    gen/rollout/ep_rew_wrapped_mean | -2.6        |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 42000       |
|    gen/train/approx_kl             | 0.020035224 |
|    gen/train/clip_fraction         | 0.0564      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.148      |
|    gen/train/explained_variance    | 0.54        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0618      |
|    gen/train/n_updates             | 195         |
|    gen/train/policy_gradient_loss  | -0.00559    |
|    gen/train/value_loss            | 0.0906 

round:  14%|█▍        | 14/100 [14:38<1:26:30, 60.36s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 3.09         |
|    gen/rollout/ep_rew_mean         | 1.67         |
|    gen/rollout/ep_rew_wrapped_mean | -1.63        |
|    gen/time/fps                    | 60           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 49           |
|    gen/time/total_timesteps        | 45000        |
|    gen/train/approx_kl             | 0.0039874287 |
|    gen/train/clip_fraction         | 0.0357       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.133       |
|    gen/train/explained_variance    | 0.5          |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0798       |
|    gen/train/n_updates             | 210          |
|    gen/train/policy_gradient_loss  | -0.00141     |
|    gen/train/value_loss   

round:  15%|█▌        | 15/100 [15:35<1:24:07, 59.38s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 6.06         |
|    gen/rollout/ep_rew_mean         | 4.41         |
|    gen/rollout/ep_rew_wrapped_mean | -1.03        |
|    gen/time/fps                    | 55           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 54           |
|    gen/time/total_timesteps        | 48000        |
|    gen/train/approx_kl             | 0.0066725966 |
|    gen/train/clip_fraction         | 0.0399       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.109       |
|    gen/train/explained_variance    | 0.565        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0226       |
|    gen/train/n_updates             | 225          |
|    gen/train/policy_gradient_loss  | -0.00189     |
|    gen/train/value_loss   

round:  16%|█▌        | 16/100 [16:36<1:23:57, 59.97s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.12       |
|    gen/rollout/ep_rew_mean         | 3.78       |
|    gen/rollout/ep_rew_wrapped_mean | -2.36      |
|    gen/time/fps                    | 59         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 50         |
|    gen/time/total_timesteps        | 51000      |
|    gen/train/approx_kl             | 0.02247291 |
|    gen/train/clip_fraction         | 0.024      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0475    |
|    gen/train/explained_variance    | 0.699      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.00276    |
|    gen/train/n_updates             | 240        |
|    gen/train/policy_gradient_loss  | -0.00401   |
|    gen/train/value_loss            | 0.0357     |
------------

round:  17%|█▋        | 17/100 [17:34<1:21:52, 59.18s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.94         |
|    gen/rollout/ep_rew_mean         | 3.57         |
|    gen/rollout/ep_rew_wrapped_mean | -1.58        |
|    gen/time/fps                    | 54           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 55           |
|    gen/time/total_timesteps        | 54000        |
|    gen/train/approx_kl             | 0.0027931617 |
|    gen/train/clip_fraction         | 0.0129       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0272      |
|    gen/train/explained_variance    | 0.75         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.00921      |
|    gen/train/n_updates             | 255          |
|    gen/train/policy_gradient_loss  | -0.00155     |
|    gen/train/value_loss   

round:  18%|█▊        | 18/100 [18:36<1:22:17, 60.21s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 3.58         |
|    gen/rollout/ep_rew_mean         | 2.29         |
|    gen/rollout/ep_rew_wrapped_mean | -2.21        |
|    gen/time/fps                    | 57           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 52           |
|    gen/time/total_timesteps        | 57000        |
|    gen/train/approx_kl             | 0.0023142472 |
|    gen/train/clip_fraction         | 0.0105       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0283      |
|    gen/train/explained_variance    | 0.783        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.00743      |
|    gen/train/n_updates             | 270          |
|    gen/train/policy_gradient_loss  | -0.000289    |
|    gen/train/value_loss   

round:  19%|█▉        | 19/100 [19:36<1:20:52, 59.91s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.45        |
|    gen/rollout/ep_rew_mean         | 5.1         |
|    gen/rollout/ep_rew_wrapped_mean | -1.18       |
|    gen/time/fps                    | 57          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 60000       |
|    gen/train/approx_kl             | 0.004022826 |
|    gen/train/clip_fraction         | 0.0761      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0551     |
|    gen/train/explained_variance    | 0.866       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | -0.00475    |
|    gen/train/n_updates             | 285         |
|    gen/train/policy_gradient_loss  | -0.00677    |
|    gen/train/value_loss            | 0.0181 

round:  20%|██        | 20/100 [20:34<1:19:30, 59.63s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.68        |
|    gen/rollout/ep_rew_mean         | 5.3         |
|    gen/rollout/ep_rew_wrapped_mean | -1.63       |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 63000       |
|    gen/train/approx_kl             | 0.011156648 |
|    gen/train/clip_fraction         | 0.0973      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0716     |
|    gen/train/explained_variance    | 0.786       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00792     |
|    gen/train/n_updates             | 300         |
|    gen/train/policy_gradient_loss  | 0.00173     |
|    gen/train/value_loss            | 0.0329 

round:  21%|██        | 21/100 [21:32<1:17:42, 59.02s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 5.88         |
|    gen/rollout/ep_rew_mean         | 4.58         |
|    gen/rollout/ep_rew_wrapped_mean | -3.21        |
|    gen/time/fps                    | 62           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 47           |
|    gen/time/total_timesteps        | 66000        |
|    gen/train/approx_kl             | 0.0021367685 |
|    gen/train/clip_fraction         | 0.0111       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0501      |
|    gen/train/explained_variance    | 0.585        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0216       |
|    gen/train/n_updates             | 315          |
|    gen/train/policy_gradient_loss  | -0.000929    |
|    gen/train/value_loss   

round:  22%|██▏       | 22/100 [22:27<1:15:02, 57.73s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.07        |
|    gen/rollout/ep_rew_mean         | 4.27        |
|    gen/rollout/ep_rew_wrapped_mean | -1.87       |
|    gen/time/fps                    | 63          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 69000       |
|    gen/train/approx_kl             | 0.014990923 |
|    gen/train/clip_fraction         | 0.107       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.109      |
|    gen/train/explained_variance    | 0.787       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00221     |
|    gen/train/n_updates             | 330         |
|    gen/train/policy_gradient_loss  | -0.00544    |
|    gen/train/value_loss            | 0.0373 

round:  23%|██▎       | 23/100 [23:21<1:12:53, 56.80s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.59        |
|    gen/rollout/ep_rew_mean         | 3.92        |
|    gen/rollout/ep_rew_wrapped_mean | -3.32       |
|    gen/time/fps                    | 58          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 72000       |
|    gen/train/approx_kl             | 0.012033543 |
|    gen/train/clip_fraction         | 0.0641      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.16       |
|    gen/train/explained_variance    | 0.803       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00518     |
|    gen/train/n_updates             | 345         |
|    gen/train/policy_gradient_loss  | -0.00731    |
|    gen/train/value_loss            | 0.0373 

round:  24%|██▍       | 24/100 [24:20<1:12:32, 57.27s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 3.46       |
|    gen/rollout/ep_rew_mean         | 2.08       |
|    gen/rollout/ep_rew_wrapped_mean | -2.93      |
|    gen/time/fps                    | 61         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 48         |
|    gen/time/total_timesteps        | 75000      |
|    gen/train/approx_kl             | 0.10586649 |
|    gen/train/clip_fraction         | 0.061      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0786    |
|    gen/train/explained_variance    | 0.726      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.000574   |
|    gen/train/n_updates             | 360        |
|    gen/train/policy_gradient_loss  | -0.013     |
|    gen/train/value_loss            | 0.0498     |
------------

round:  25%|██▌       | 25/100 [25:16<1:11:02, 56.83s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 2.97         |
|    gen/rollout/ep_rew_mean         | 1.67         |
|    gen/rollout/ep_rew_wrapped_mean | -2.08        |
|    gen/time/fps                    | 57           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 51           |
|    gen/time/total_timesteps        | 78000        |
|    gen/train/approx_kl             | 0.0020935268 |
|    gen/train/clip_fraction         | 0.0233       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.073       |
|    gen/train/explained_variance    | 0.794        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.138        |
|    gen/train/n_updates             | 375          |
|    gen/train/policy_gradient_loss  | -0.00191     |
|    gen/train/value_loss   

round:  26%|██▌       | 26/100 [26:14<1:10:49, 57.43s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.33        |
|    gen/rollout/ep_rew_mean         | 4.98        |
|    gen/rollout/ep_rew_wrapped_mean | -1.53       |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 81000       |
|    gen/train/approx_kl             | 0.007288718 |
|    gen/train/clip_fraction         | 0.0394      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0812     |
|    gen/train/explained_variance    | 0.79        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0176      |
|    gen/train/n_updates             | 390         |
|    gen/train/policy_gradient_loss  | -0.00174    |
|    gen/train/value_loss            | 0.0574 

round:  27%|██▋       | 27/100 [27:12<1:09:54, 57.46s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.18         |
|    gen/rollout/ep_rew_mean         | 2.86         |
|    gen/rollout/ep_rew_wrapped_mean | -2.53        |
|    gen/time/fps                    | 56           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 53           |
|    gen/time/total_timesteps        | 84000        |
|    gen/train/approx_kl             | 0.0038245136 |
|    gen/train/clip_fraction         | 0.0174       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0556      |
|    gen/train/explained_variance    | 0.806        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0475       |
|    gen/train/n_updates             | 405          |
|    gen/train/policy_gradient_loss  | -0.000926    |
|    gen/train/value_loss   

round:  28%|██▊       | 28/100 [28:12<1:09:57, 58.30s/it]

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 3.28      |
|    gen/rollout/ep_rew_mean         | 1.92      |
|    gen/rollout/ep_rew_wrapped_mean | -1.45     |
|    gen/time/fps                    | 54        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 55        |
|    gen/time/total_timesteps        | 87000     |
|    gen/train/approx_kl             | 0.2800647 |
|    gen/train/clip_fraction         | 0.109     |
|    gen/train/clip_range            | 0.1       |
|    gen/train/entropy_loss          | -0.135    |
|    gen/train/explained_variance    | 0.794     |
|    gen/train/learning_rate         | 0.001     |
|    gen/train/loss                  | 0.667     |
|    gen/train/n_updates             | 420       |
|    gen/train/policy_gradient_loss  | -0.000711 |
|    gen/train/value_loss            | 0.0646    |
-------------------------------

round:  29%|██▉       | 29/100 [29:15<1:10:30, 59.59s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 2.93        |
|    gen/rollout/ep_rew_mean         | 1.52        |
|    gen/rollout/ep_rew_wrapped_mean | -1.51       |
|    gen/time/fps                    | 60          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 49          |
|    gen/time/total_timesteps        | 90000       |
|    gen/train/approx_kl             | 0.018910674 |
|    gen/train/clip_fraction         | 0.0866      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.168      |
|    gen/train/explained_variance    | 0.721       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0627      |
|    gen/train/n_updates             | 435         |
|    gen/train/policy_gradient_loss  | 0.000193    |
|    gen/train/value_loss            | 0.0652 

round:  30%|███       | 30/100 [30:11<1:08:25, 58.64s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.93        |
|    gen/rollout/ep_rew_mean         | 4.47        |
|    gen/rollout/ep_rew_wrapped_mean | -1.67       |
|    gen/time/fps                    | 48          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 62          |
|    gen/time/total_timesteps        | 93000       |
|    gen/train/approx_kl             | 0.028203405 |
|    gen/train/clip_fraction         | 0.0909      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.112      |
|    gen/train/explained_variance    | 0.583       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0238      |
|    gen/train/n_updates             | 450         |
|    gen/train/policy_gradient_loss  | -0.00941    |
|    gen/train/value_loss            | 0.104  

round:  31%|███       | 31/100 [31:21<1:11:22, 62.06s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.51        |
|    gen/rollout/ep_rew_mean         | 4.13        |
|    gen/rollout/ep_rew_wrapped_mean | -2.37       |
|    gen/time/fps                    | 60          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 49          |
|    gen/time/total_timesteps        | 96000       |
|    gen/train/approx_kl             | 0.008645272 |
|    gen/train/clip_fraction         | 0.0323      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0747     |
|    gen/train/explained_variance    | 0.577       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0224      |
|    gen/train/n_updates             | 465         |
|    gen/train/policy_gradient_loss  | -0.00286    |
|    gen/train/value_loss            | 0.0764 

round:  32%|███▏      | 32/100 [32:18<1:08:28, 60.42s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.2         |
|    gen/rollout/ep_rew_mean         | 2.83        |
|    gen/rollout/ep_rew_wrapped_mean | -2.06       |
|    gen/time/fps                    | 53          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 56          |
|    gen/time/total_timesteps        | 99000       |
|    gen/train/approx_kl             | 0.008406238 |
|    gen/train/clip_fraction         | 0.0168      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0433     |
|    gen/train/explained_variance    | 0.455       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0305      |
|    gen/train/n_updates             | 480         |
|    gen/train/policy_gradient_loss  | 0.000265    |
|    gen/train/value_loss            | 0.0738 

round:  33%|███▎      | 33/100 [33:22<1:08:42, 61.54s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.36        |
|    gen/rollout/ep_rew_mean         | 3.04        |
|    gen/rollout/ep_rew_wrapped_mean | -1.64       |
|    gen/time/fps                    | 57          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 52          |
|    gen/time/total_timesteps        | 102000      |
|    gen/train/approx_kl             | 0.009626873 |
|    gen/train/clip_fraction         | 0.00913     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0238     |
|    gen/train/explained_variance    | 0.0861      |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0271      |
|    gen/train/n_updates             | 495         |
|    gen/train/policy_gradient_loss  | -0.000879   |
|    gen/train/value_loss            | 0.123  

round:  34%|███▍      | 34/100 [34:22<1:07:00, 60.92s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.97         |
|    gen/rollout/ep_rew_mean         | 3.64         |
|    gen/rollout/ep_rew_wrapped_mean | -2.02        |
|    gen/time/fps                    | 63           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 47           |
|    gen/time/total_timesteps        | 105000       |
|    gen/train/approx_kl             | 0.0008259488 |
|    gen/train/clip_fraction         | 0.00373      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.023       |
|    gen/train/explained_variance    | 0.606        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.018        |
|    gen/train/n_updates             | 510          |
|    gen/train/policy_gradient_loss  | -0.000372    |
|    gen/train/value_loss   

round:  35%|███▌      | 35/100 [35:16<1:03:56, 59.02s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.68         |
|    gen/rollout/ep_rew_mean         | 3.32         |
|    gen/rollout/ep_rew_wrapped_mean | -2.41        |
|    gen/time/fps                    | 53           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 56           |
|    gen/time/total_timesteps        | 108000       |
|    gen/train/approx_kl             | 0.0014280246 |
|    gen/train/clip_fraction         | 0.0044       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0171      |
|    gen/train/explained_variance    | 0.724        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.018        |
|    gen/train/n_updates             | 525          |
|    gen/train/policy_gradient_loss  | -0.000296    |
|    gen/train/value_loss   

round:  36%|███▌      | 36/100 [36:20<1:04:25, 60.40s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 7.84       |
|    gen/rollout/ep_rew_mean         | 6.51       |
|    gen/rollout/ep_rew_wrapped_mean | -2.23      |
|    gen/time/fps                    | 61         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 48         |
|    gen/time/total_timesteps        | 111000     |
|    gen/train/approx_kl             | 0.01904975 |
|    gen/train/clip_fraction         | 0.0608     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0451    |
|    gen/train/explained_variance    | 0.584      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0387     |
|    gen/train/n_updates             | 540        |
|    gen/train/policy_gradient_loss  | 0.00556    |
|    gen/train/value_loss            | 0.0649     |
------------

round:  37%|███▋      | 37/100 [37:16<1:01:58, 59.02s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 2.86         |
|    gen/rollout/ep_rew_mean         | 1.56         |
|    gen/rollout/ep_rew_wrapped_mean | -3.36        |
|    gen/time/fps                    | 56           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 52           |
|    gen/time/total_timesteps        | 114000       |
|    gen/train/approx_kl             | 0.0014025895 |
|    gen/train/clip_fraction         | 0.0513       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0707      |
|    gen/train/explained_variance    | 0.568        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0255       |
|    gen/train/n_updates             | 555          |
|    gen/train/policy_gradient_loss  | -0.00382     |
|    gen/train/value_loss   

round:  38%|███▊      | 38/100 [38:16<1:01:21, 59.38s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.39        |
|    gen/rollout/ep_rew_mean         | 5           |
|    gen/rollout/ep_rew_wrapped_mean | -1.27       |
|    gen/time/fps                    | 58          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 117000      |
|    gen/train/approx_kl             | 0.013975498 |
|    gen/train/clip_fraction         | 0.0207      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0514     |
|    gen/train/explained_variance    | 0.568       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0192      |
|    gen/train/n_updates             | 570         |
|    gen/train/policy_gradient_loss  | -0.00125    |
|    gen/train/value_loss            | 0.0491 

round:  39%|███▉      | 39/100 [39:14<1:00:02, 59.06s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.8         |
|    gen/rollout/ep_rew_mean         | 4.52        |
|    gen/rollout/ep_rew_wrapped_mean | -2.64       |
|    gen/time/fps                    | 63          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 120000      |
|    gen/train/approx_kl             | 0.006401848 |
|    gen/train/clip_fraction         | 0.0206      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0216     |
|    gen/train/explained_variance    | 0.767       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0179      |
|    gen/train/n_updates             | 585         |
|    gen/train/policy_gradient_loss  | -0.00239    |
|    gen/train/value_loss            | 0.0369 

round:  40%|████      | 40/100 [40:09<57:41, 57.69s/it]  

--------------------------------------------------
| raw/                               |           |
|    gen/rollout/ep_len_mean         | 8.33      |
|    gen/rollout/ep_rew_mean         | 5.98      |
|    gen/rollout/ep_rew_wrapped_mean | -2.34     |
|    gen/time/fps                    | 57        |
|    gen/time/iterations             | 1         |
|    gen/time/time_elapsed           | 52        |
|    gen/time/total_timesteps        | 123000    |
|    gen/train/approx_kl             | 0.0160168 |
|    gen/train/clip_fraction         | 0.069     |
|    gen/train/clip_range            | 0.1       |
|    gen/train/entropy_loss          | -0.153    |
|    gen/train/explained_variance    | 0.548     |
|    gen/train/learning_rate         | 0.001     |
|    gen/train/loss                  | 0.0191    |
|    gen/train/n_updates             | 600       |
|    gen/train/policy_gradient_loss  | -0.00157  |
|    gen/train/value_loss            | 0.0491    |
-------------------------------

round:  41%|████      | 41/100 [41:08<57:11, 58.16s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 2.97       |
|    gen/rollout/ep_rew_mean         | 1.64       |
|    gen/rollout/ep_rew_wrapped_mean | -3.38      |
|    gen/time/fps                    | 59         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 50         |
|    gen/time/total_timesteps        | 126000     |
|    gen/train/approx_kl             | 0.08537986 |
|    gen/train/clip_fraction         | 0.242      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0724    |
|    gen/train/explained_variance    | 0.667      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0193     |
|    gen/train/n_updates             | 615        |
|    gen/train/policy_gradient_loss  | -0.0151    |
|    gen/train/value_loss            | 0.0493     |
------------

round:  42%|████▏     | 42/100 [42:06<56:10, 58.11s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 8.03        |
|    gen/rollout/ep_rew_mean         | 6.55        |
|    gen/rollout/ep_rew_wrapped_mean | -1.26       |
|    gen/time/fps                    | 53          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 56          |
|    gen/time/total_timesteps        | 129000      |
|    gen/train/approx_kl             | 0.003013008 |
|    gen/train/clip_fraction         | 0.0139      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0333     |
|    gen/train/explained_variance    | 0.75        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0218      |
|    gen/train/n_updates             | 630         |
|    gen/train/policy_gradient_loss  | -0.00151    |
|    gen/train/value_loss            | 0.045  

round:  43%|████▎     | 43/100 [43:09<56:40, 59.66s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.37        |
|    gen/rollout/ep_rew_mean         | 3.05        |
|    gen/rollout/ep_rew_wrapped_mean | -4.28       |
|    gen/time/fps                    | 56          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 53          |
|    gen/time/total_timesteps        | 132000      |
|    gen/train/approx_kl             | 0.015072989 |
|    gen/train/clip_fraction         | 0.0708      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0416     |
|    gen/train/explained_variance    | 0.724       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00536     |
|    gen/train/n_updates             | 645         |
|    gen/train/policy_gradient_loss  | 0.0154      |
|    gen/train/value_loss            | 0.0359 

round:  44%|████▍     | 44/100 [44:10<55:59, 59.99s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.35         |
|    gen/rollout/ep_rew_mean         | 2.99         |
|    gen/rollout/ep_rew_wrapped_mean | -2.27        |
|    gen/time/fps                    | 55           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 53           |
|    gen/time/total_timesteps        | 135000       |
|    gen/train/approx_kl             | 0.0020847328 |
|    gen/train/clip_fraction         | 0.0115       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.033       |
|    gen/train/explained_variance    | 0.596        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0329       |
|    gen/train/n_updates             | 660          |
|    gen/train/policy_gradient_loss  | -0.00139     |
|    gen/train/value_loss   

round:  45%|████▌     | 45/100 [45:11<55:18, 60.33s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 3.83         |
|    gen/rollout/ep_rew_mean         | 2.51         |
|    gen/rollout/ep_rew_wrapped_mean | -2.15        |
|    gen/time/fps                    | 63           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 47           |
|    gen/time/total_timesteps        | 138000       |
|    gen/train/approx_kl             | 0.0013591526 |
|    gen/train/clip_fraction         | 0.0214       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0679      |
|    gen/train/explained_variance    | 0.738        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.024        |
|    gen/train/n_updates             | 675          |
|    gen/train/policy_gradient_loss  | 0.00012      |
|    gen/train/value_loss   

round:  46%|████▌     | 46/100 [46:06<52:49, 58.70s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 6.53         |
|    gen/rollout/ep_rew_mean         | 5.22         |
|    gen/rollout/ep_rew_wrapped_mean | -2.13        |
|    gen/time/fps                    | 58           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 51           |
|    gen/time/total_timesteps        | 141000       |
|    gen/train/approx_kl             | 0.0018863128 |
|    gen/train/clip_fraction         | 0.02         |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0627      |
|    gen/train/explained_variance    | 0.694        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0251       |
|    gen/train/n_updates             | 690          |
|    gen/train/policy_gradient_loss  | 0.00176      |
|    gen/train/value_loss   

round:  47%|████▋     | 47/100 [47:04<51:46, 58.60s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.86         |
|    gen/rollout/ep_rew_mean         | 3.5          |
|    gen/rollout/ep_rew_wrapped_mean | -3.32        |
|    gen/time/fps                    | 59           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 50           |
|    gen/time/total_timesteps        | 144000       |
|    gen/train/approx_kl             | 0.0021023215 |
|    gen/train/clip_fraction         | 0.0573       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0856      |
|    gen/train/explained_variance    | 0.763        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0168       |
|    gen/train/n_updates             | 705          |
|    gen/train/policy_gradient_loss  | -0.00328     |
|    gen/train/value_loss   

round:  48%|████▊     | 48/100 [48:02<50:37, 58.41s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.47         |
|    gen/rollout/ep_rew_mean         | 3.07         |
|    gen/rollout/ep_rew_wrapped_mean | -2.66        |
|    gen/time/fps                    | 62           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 47           |
|    gen/time/total_timesteps        | 147000       |
|    gen/train/approx_kl             | 0.0037811485 |
|    gen/train/clip_fraction         | 0.021        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0594      |
|    gen/train/explained_variance    | 0.668        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0613       |
|    gen/train/n_updates             | 720          |
|    gen/train/policy_gradient_loss  | -0.00172     |
|    gen/train/value_loss   

round:  49%|████▉     | 49/100 [48:58<48:58, 57.61s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.18        |
|    gen/rollout/ep_rew_mean         | 2.36        |
|    gen/rollout/ep_rew_wrapped_mean | -2.64       |
|    gen/time/fps                    | 52          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 56          |
|    gen/time/total_timesteps        | 150000      |
|    gen/train/approx_kl             | 0.014578836 |
|    gen/train/clip_fraction         | 0.0914      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0752     |
|    gen/train/explained_variance    | 0.373       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0299      |
|    gen/train/n_updates             | 735         |
|    gen/train/policy_gradient_loss  | -0.00843    |
|    gen/train/value_loss            | 0.123  

round:  50%|█████     | 50/100 [50:02<49:44, 59.70s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.23       |
|    gen/rollout/ep_rew_mean         | 3.82       |
|    gen/rollout/ep_rew_wrapped_mean | -2.55      |
|    gen/time/fps                    | 60         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 49         |
|    gen/time/total_timesteps        | 153000     |
|    gen/train/approx_kl             | 0.06682358 |
|    gen/train/clip_fraction         | 0.113      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.116     |
|    gen/train/explained_variance    | 0.61       |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.047      |
|    gen/train/n_updates             | 750        |
|    gen/train/policy_gradient_loss  | -0.00404   |
|    gen/train/value_loss            | 0.108      |
------------

round:  51%|█████     | 51/100 [50:59<47:57, 58.72s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.99        |
|    gen/rollout/ep_rew_mean         | 5.67        |
|    gen/rollout/ep_rew_wrapped_mean | -2.38       |
|    gen/time/fps                    | 53          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 56          |
|    gen/time/total_timesteps        | 156000      |
|    gen/train/approx_kl             | 0.008639977 |
|    gen/train/clip_fraction         | 0.15        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.153      |
|    gen/train/explained_variance    | 0.559       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00416     |
|    gen/train/n_updates             | 765         |
|    gen/train/policy_gradient_loss  | -0.00057    |
|    gen/train/value_loss            | 0.0643 

round:  52%|█████▏    | 52/100 [52:02<48:01, 60.03s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 4.6        |
|    gen/rollout/ep_rew_mean         | 3.17       |
|    gen/rollout/ep_rew_wrapped_mean | -2.58      |
|    gen/time/fps                    | 63         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 47         |
|    gen/time/total_timesteps        | 159000     |
|    gen/train/approx_kl             | 0.08857627 |
|    gen/train/clip_fraction         | 0.0794     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.105     |
|    gen/train/explained_variance    | 0.634      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | -0.00693   |
|    gen/train/n_updates             | 780        |
|    gen/train/policy_gradient_loss  | -0.01      |
|    gen/train/value_loss            | 0.063      |
------------

round:  53%|█████▎    | 53/100 [52:56<45:38, 58.27s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 2.68        |
|    gen/rollout/ep_rew_mean         | 1.42        |
|    gen/rollout/ep_rew_wrapped_mean | -2.08       |
|    gen/time/fps                    | 55          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 53          |
|    gen/time/total_timesteps        | 162000      |
|    gen/train/approx_kl             | 0.019586897 |
|    gen/train/clip_fraction         | 0.0333      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0281     |
|    gen/train/explained_variance    | 0.572       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0126      |
|    gen/train/n_updates             | 795         |
|    gen/train/policy_gradient_loss  | -0.00568    |
|    gen/train/value_loss            | 0.0696 

round:  54%|█████▍    | 54/100 [53:57<45:16, 59.05s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 5.87         |
|    gen/rollout/ep_rew_mean         | 4.49         |
|    gen/rollout/ep_rew_wrapped_mean | -1.64        |
|    gen/time/fps                    | 62           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 47           |
|    gen/time/total_timesteps        | 165000       |
|    gen/train/approx_kl             | 0.0030060327 |
|    gen/train/clip_fraction         | 0.114        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.103       |
|    gen/train/explained_variance    | 0.831        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | -0.0111      |
|    gen/train/n_updates             | 810          |
|    gen/train/policy_gradient_loss  | -0.00635     |
|    gen/train/value_loss   

round:  55%|█████▌    | 55/100 [54:52<43:20, 57.80s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.32       |
|    gen/rollout/ep_rew_mean         | 3.99       |
|    gen/rollout/ep_rew_wrapped_mean | -1.74      |
|    gen/time/fps                    | 64         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 46         |
|    gen/time/total_timesteps        | 168000     |
|    gen/train/approx_kl             | 0.02805049 |
|    gen/train/clip_fraction         | 0.054      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.083     |
|    gen/train/explained_variance    | 0.729      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0356     |
|    gen/train/n_updates             | 825        |
|    gen/train/policy_gradient_loss  | -0.00718   |
|    gen/train/value_loss            | 0.0723     |
------------

round:  56%|█████▌    | 56/100 [55:45<41:27, 56.52s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.18        |
|    gen/rollout/ep_rew_mean         | 2.73        |
|    gen/rollout/ep_rew_wrapped_mean | -2.12       |
|    gen/time/fps                    | 54          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 55          |
|    gen/time/total_timesteps        | 171000      |
|    gen/train/approx_kl             | 0.028579738 |
|    gen/train/clip_fraction         | 0.135       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.121      |
|    gen/train/explained_variance    | 0.517       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.03        |
|    gen/train/n_updates             | 840         |
|    gen/train/policy_gradient_loss  | -0.00236    |
|    gen/train/value_loss            | 0.0845 

round:  57%|█████▋    | 57/100 [56:48<41:45, 58.28s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.35        |
|    gen/rollout/ep_rew_mean         | 3.08        |
|    gen/rollout/ep_rew_wrapped_mean | -1.96       |
|    gen/time/fps                    | 61          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 48          |
|    gen/time/total_timesteps        | 174000      |
|    gen/train/approx_kl             | 0.063920505 |
|    gen/train/clip_fraction         | 0.217       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0642     |
|    gen/train/explained_variance    | 0.211       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0293      |
|    gen/train/n_updates             | 855         |
|    gen/train/policy_gradient_loss  | -0.0211     |
|    gen/train/value_loss            | 0.154  

round:  58%|█████▊    | 58/100 [57:45<40:35, 57.99s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.4         |
|    gen/rollout/ep_rew_mean         | 2.12        |
|    gen/rollout/ep_rew_wrapped_mean | -2.22       |
|    gen/time/fps                    | 51          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 58          |
|    gen/time/total_timesteps        | 177000      |
|    gen/train/approx_kl             | 0.011094784 |
|    gen/train/clip_fraction         | 0.013       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0102     |
|    gen/train/explained_variance    | 0.453       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0242      |
|    gen/train/n_updates             | 870         |
|    gen/train/policy_gradient_loss  | -0.00193    |
|    gen/train/value_loss            | 0.11   

round:  59%|█████▉    | 59/100 [58:50<41:07, 60.19s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 6.07         |
|    gen/rollout/ep_rew_mean         | 4.74         |
|    gen/rollout/ep_rew_wrapped_mean | -1.8         |
|    gen/time/fps                    | 57           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 52           |
|    gen/time/total_timesteps        | 180000       |
|    gen/train/approx_kl             | 0.0014983829 |
|    gen/train/clip_fraction         | 0.0223       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0389      |
|    gen/train/explained_variance    | 0.565        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0356       |
|    gen/train/n_updates             | 885          |
|    gen/train/policy_gradient_loss  | 0.00759      |
|    gen/train/value_loss   

round:  60%|██████    | 60/100 [59:50<39:59, 60.00s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 8.63        |
|    gen/rollout/ep_rew_mean         | 6.96        |
|    gen/rollout/ep_rew_wrapped_mean | -2.83       |
|    gen/time/fps                    | 45          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 66          |
|    gen/time/total_timesteps        | 183000      |
|    gen/train/approx_kl             | 0.004199741 |
|    gen/train/clip_fraction         | 0.126       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0909     |
|    gen/train/explained_variance    | 0.647       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00112     |
|    gen/train/n_updates             | 900         |
|    gen/train/policy_gradient_loss  | -0.00533    |
|    gen/train/value_loss            | 0.0408 

round:  61%|██████    | 61/100 [1:01:03<41:34, 63.97s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.8         |
|    gen/rollout/ep_rew_mean         | 4.08        |
|    gen/rollout/ep_rew_wrapped_mean | -1.89       |
|    gen/time/fps                    | 57          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 186000      |
|    gen/train/approx_kl             | 0.014171093 |
|    gen/train/clip_fraction         | 0.084       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.162      |
|    gen/train/explained_variance    | 0.699       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0251      |
|    gen/train/n_updates             | 915         |
|    gen/train/policy_gradient_loss  | -0.00689    |
|    gen/train/value_loss            | 0.0754 

round:  62%|██████▏   | 62/100 [1:02:02<39:32, 62.43s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 4.91       |
|    gen/rollout/ep_rew_mean         | 3.49       |
|    gen/rollout/ep_rew_wrapped_mean | -2.33      |
|    gen/time/fps                    | 54         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 54         |
|    gen/time/total_timesteps        | 189000     |
|    gen/train/approx_kl             | 0.13622138 |
|    gen/train/clip_fraction         | 0.359      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0985    |
|    gen/train/explained_variance    | 0.482      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | -0.00663   |
|    gen/train/n_updates             | 930        |
|    gen/train/policy_gradient_loss  | -0.0238    |
|    gen/train/value_loss            | 0.0547     |
------------

round:  63%|██████▎   | 63/100 [1:03:04<38:23, 62.25s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 9.58        |
|    gen/rollout/ep_rew_mean         | 7.97        |
|    gen/rollout/ep_rew_wrapped_mean | -1.85       |
|    gen/time/fps                    | 53          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 55          |
|    gen/time/total_timesteps        | 192000      |
|    gen/train/approx_kl             | 0.028233811 |
|    gen/train/clip_fraction         | 0.093       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0748     |
|    gen/train/explained_variance    | 0.469       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0106      |
|    gen/train/n_updates             | 945         |
|    gen/train/policy_gradient_loss  | -0.000728   |
|    gen/train/value_loss            | 0.0693 

round:  64%|██████▍   | 64/100 [1:04:07<37:28, 62.46s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.59        |
|    gen/rollout/ep_rew_mean         | 4.97        |
|    gen/rollout/ep_rew_wrapped_mean | -5.56       |
|    gen/time/fps                    | 58          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 195000      |
|    gen/train/approx_kl             | 0.046925765 |
|    gen/train/clip_fraction         | 0.207       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0696     |
|    gen/train/explained_variance    | 0.59        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | -0.00374    |
|    gen/train/n_updates             | 960         |
|    gen/train/policy_gradient_loss  | 0.000983    |
|    gen/train/value_loss            | 0.0396 

round:  65%|██████▌   | 65/100 [1:05:05<35:44, 61.28s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 3.84       |
|    gen/rollout/ep_rew_mean         | 2.09       |
|    gen/rollout/ep_rew_wrapped_mean | -3.71      |
|    gen/time/fps                    | 62         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 47         |
|    gen/time/total_timesteps        | 198000     |
|    gen/train/approx_kl             | 0.02881596 |
|    gen/train/clip_fraction         | 0.249      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.183     |
|    gen/train/explained_variance    | 0.575      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.00238    |
|    gen/train/n_updates             | 975        |
|    gen/train/policy_gradient_loss  | -0.0104    |
|    gen/train/value_loss            | 0.0938     |
------------

round:  66%|██████▌   | 66/100 [1:06:00<33:35, 59.27s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.81        |
|    gen/rollout/ep_rew_mean         | 3.1         |
|    gen/rollout/ep_rew_wrapped_mean | -2.26       |
|    gen/time/fps                    | 61          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 48          |
|    gen/time/total_timesteps        | 201000      |
|    gen/train/approx_kl             | 0.038822617 |
|    gen/train/clip_fraction         | 0.236       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.17       |
|    gen/train/explained_variance    | 0.578       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.034       |
|    gen/train/n_updates             | 990         |
|    gen/train/policy_gradient_loss  | -0.0149     |
|    gen/train/value_loss            | 0.137  

round:  67%|██████▋   | 67/100 [1:06:56<31:59, 58.17s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.68        |
|    gen/rollout/ep_rew_mean         | 3.25        |
|    gen/rollout/ep_rew_wrapped_mean | -2.88       |
|    gen/time/fps                    | 63          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 204000      |
|    gen/train/approx_kl             | 0.065251626 |
|    gen/train/clip_fraction         | 0.0588      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0758     |
|    gen/train/explained_variance    | 0.37        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0417      |
|    gen/train/n_updates             | 1005        |
|    gen/train/policy_gradient_loss  | -0.00353    |
|    gen/train/value_loss            | 0.14   

round:  68%|██████▊   | 68/100 [1:07:50<30:24, 57.01s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 2.87        |
|    gen/rollout/ep_rew_mean         | 1.6         |
|    gen/rollout/ep_rew_wrapped_mean | -1.9        |
|    gen/time/fps                    | 55          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 53          |
|    gen/time/total_timesteps        | 207000      |
|    gen/train/approx_kl             | 0.008153347 |
|    gen/train/clip_fraction         | 0.0284      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0795     |
|    gen/train/explained_variance    | 0.518       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0407      |
|    gen/train/n_updates             | 1020        |
|    gen/train/policy_gradient_loss  | -0.00156    |
|    gen/train/value_loss            | 0.0971 

round:  69%|██████▉   | 69/100 [1:08:51<30:00, 58.09s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 6.75       |
|    gen/rollout/ep_rew_mean         | 5.25       |
|    gen/rollout/ep_rew_wrapped_mean | -1.3       |
|    gen/time/fps                    | 60         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 49         |
|    gen/time/total_timesteps        | 210000     |
|    gen/train/approx_kl             | 0.01446118 |
|    gen/train/clip_fraction         | 0.0463     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0572    |
|    gen/train/explained_variance    | 0.623      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0374     |
|    gen/train/n_updates             | 1035       |
|    gen/train/policy_gradient_loss  | -0.00361   |
|    gen/train/value_loss            | 0.0605     |
------------

round:  70%|███████   | 70/100 [1:09:47<28:50, 57.69s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 3.21       |
|    gen/rollout/ep_rew_mean         | 1.97       |
|    gen/rollout/ep_rew_wrapped_mean | -2.62      |
|    gen/time/fps                    | 59         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 50         |
|    gen/time/total_timesteps        | 213000     |
|    gen/train/approx_kl             | 0.04581873 |
|    gen/train/clip_fraction         | 0.0369     |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0304    |
|    gen/train/explained_variance    | 0.698      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.00499    |
|    gen/train/n_updates             | 1050       |
|    gen/train/policy_gradient_loss  | -0.00508   |
|    gen/train/value_loss            | 0.0431     |
------------

round:  71%|███████   | 71/100 [1:10:45<27:51, 57.63s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 8.43        |
|    gen/rollout/ep_rew_mean         | 7.1         |
|    gen/rollout/ep_rew_wrapped_mean | -1.52       |
|    gen/time/fps                    | 57          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 52          |
|    gen/time/total_timesteps        | 216000      |
|    gen/train/approx_kl             | 0.012693264 |
|    gen/train/clip_fraction         | 0.0191      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.013      |
|    gen/train/explained_variance    | 0.242       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0281      |
|    gen/train/n_updates             | 1065        |
|    gen/train/policy_gradient_loss  | -0.00129    |
|    gen/train/value_loss            | 0.07   

round:  72%|███████▏  | 72/100 [1:11:44<27:09, 58.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.35        |
|    gen/rollout/ep_rew_mean         | 4.08        |
|    gen/rollout/ep_rew_wrapped_mean | -4.2        |
|    gen/time/fps                    | 56          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 52          |
|    gen/time/total_timesteps        | 219000      |
|    gen/train/approx_kl             | 0.020554066 |
|    gen/train/clip_fraction         | 0.0876      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0532     |
|    gen/train/explained_variance    | 0.403       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0226      |
|    gen/train/n_updates             | 1080        |
|    gen/train/policy_gradient_loss  | 0.00774     |
|    gen/train/value_loss            | 0.0682 

round:  73%|███████▎  | 73/100 [1:12:45<26:31, 58.93s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.98         |
|    gen/rollout/ep_rew_mean         | 3.49         |
|    gen/rollout/ep_rew_wrapped_mean | -2.31        |
|    gen/time/fps                    | 60           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 49           |
|    gen/time/total_timesteps        | 222000       |
|    gen/train/approx_kl             | 0.0034634282 |
|    gen/train/clip_fraction         | 0.0426       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0263      |
|    gen/train/explained_variance    | 0.542        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0183       |
|    gen/train/n_updates             | 1095         |
|    gen/train/policy_gradient_loss  | 0.00239      |
|    gen/train/value_loss   

round:  74%|███████▍  | 74/100 [1:13:42<25:14, 58.26s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.89        |
|    gen/rollout/ep_rew_mean         | 6.29        |
|    gen/rollout/ep_rew_wrapped_mean | -2.13       |
|    gen/time/fps                    | 51          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 58          |
|    gen/time/total_timesteps        | 225000      |
|    gen/train/approx_kl             | 0.026487404 |
|    gen/train/clip_fraction         | 0.0572      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0464     |
|    gen/train/explained_variance    | 0.525       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0404      |
|    gen/train/n_updates             | 1110        |
|    gen/train/policy_gradient_loss  | 0.0123      |
|    gen/train/value_loss            | 0.0561 

round:  75%|███████▌  | 75/100 [1:14:47<25:09, 60.38s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 8.29        |
|    gen/rollout/ep_rew_mean         | 6.96        |
|    gen/rollout/ep_rew_wrapped_mean | -3.13       |
|    gen/time/fps                    | 54          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 54          |
|    gen/time/total_timesteps        | 228000      |
|    gen/train/approx_kl             | 0.014477708 |
|    gen/train/clip_fraction         | 0.0231      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.028      |
|    gen/train/explained_variance    | 0.549       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0282      |
|    gen/train/n_updates             | 1125        |
|    gen/train/policy_gradient_loss  | -0.00161    |
|    gen/train/value_loss            | 0.0417 

round:  76%|███████▌  | 76/100 [1:15:49<24:21, 60.88s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.63        |
|    gen/rollout/ep_rew_mean         | 3.25        |
|    gen/rollout/ep_rew_wrapped_mean | -2.12       |
|    gen/time/fps                    | 64          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 46          |
|    gen/time/total_timesteps        | 231000      |
|    gen/train/approx_kl             | 0.008809945 |
|    gen/train/clip_fraction         | 0.0191      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0308     |
|    gen/train/explained_variance    | 0.563       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0411      |
|    gen/train/n_updates             | 1140        |
|    gen/train/policy_gradient_loss  | 0.000369    |
|    gen/train/value_loss            | 0.0749 

round:  77%|███████▋  | 77/100 [1:16:43<22:29, 58.69s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.56        |
|    gen/rollout/ep_rew_mean         | 4.35        |
|    gen/rollout/ep_rew_wrapped_mean | -1.2        |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 234000      |
|    gen/train/approx_kl             | 0.022649473 |
|    gen/train/clip_fraction         | 0.00873     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0142     |
|    gen/train/explained_variance    | 0.49        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0485      |
|    gen/train/n_updates             | 1155        |
|    gen/train/policy_gradient_loss  | -0.00283    |
|    gen/train/value_loss            | 0.0824 

round:  78%|███████▊  | 78/100 [1:17:40<21:24, 58.37s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 3.28         |
|    gen/rollout/ep_rew_mean         | 1.89         |
|    gen/rollout/ep_rew_wrapped_mean | -2.46        |
|    gen/time/fps                    | 59           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 50           |
|    gen/time/total_timesteps        | 237000       |
|    gen/train/approx_kl             | 0.0032700174 |
|    gen/train/clip_fraction         | 0.144        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0835      |
|    gen/train/explained_variance    | 0.33         |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0128       |
|    gen/train/n_updates             | 1170         |
|    gen/train/policy_gradient_loss  | -0.0112      |
|    gen/train/value_loss   

round:  79%|███████▉  | 79/100 [1:18:38<20:21, 58.19s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 3.26       |
|    gen/rollout/ep_rew_mean         | 1.96       |
|    gen/rollout/ep_rew_wrapped_mean | -1.87      |
|    gen/time/fps                    | 60         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 49         |
|    gen/time/total_timesteps        | 240000     |
|    gen/train/approx_kl             | 0.09146827 |
|    gen/train/clip_fraction         | 0.129      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.073     |
|    gen/train/explained_variance    | 0.498      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.044      |
|    gen/train/n_updates             | 1185       |
|    gen/train/policy_gradient_loss  | -0.000311  |
|    gen/train/value_loss            | 0.0746     |
------------

round:  80%|████████  | 80/100 [1:19:35<19:16, 57.85s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.95        |
|    gen/rollout/ep_rew_mean         | 3.95        |
|    gen/rollout/ep_rew_wrapped_mean | -1.62       |
|    gen/time/fps                    | 54          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 54          |
|    gen/time/total_timesteps        | 243000      |
|    gen/train/approx_kl             | 0.010752451 |
|    gen/train/clip_fraction         | 0.141       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.189      |
|    gen/train/explained_variance    | 0.422       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0118      |
|    gen/train/n_updates             | 1200        |
|    gen/train/policy_gradient_loss  | -0.00824    |
|    gen/train/value_loss            | 0.0619 

round:  81%|████████  | 81/100 [1:20:37<18:42, 59.09s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.55        |
|    gen/rollout/ep_rew_mean         | 2.27        |
|    gen/rollout/ep_rew_wrapped_mean | -3.56       |
|    gen/time/fps                    | 61          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 49          |
|    gen/time/total_timesteps        | 246000      |
|    gen/train/approx_kl             | 0.061768122 |
|    gen/train/clip_fraction         | 0.0794      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0969     |
|    gen/train/explained_variance    | 0.745       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00339     |
|    gen/train/n_updates             | 1215        |
|    gen/train/policy_gradient_loss  | -0.015      |
|    gen/train/value_loss            | 0.0414 

round:  82%|████████▏ | 82/100 [1:21:33<17:29, 58.29s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.73        |
|    gen/rollout/ep_rew_mean         | 2.39        |
|    gen/rollout/ep_rew_wrapped_mean | -1.88       |
|    gen/time/fps                    | 63          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 249000      |
|    gen/train/approx_kl             | 0.016444247 |
|    gen/train/clip_fraction         | 0.0193      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0262     |
|    gen/train/explained_variance    | 0.569       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0371      |
|    gen/train/n_updates             | 1230        |
|    gen/train/policy_gradient_loss  | -0.00327    |
|    gen/train/value_loss            | 0.0415 

round:  83%|████████▎ | 83/100 [1:22:28<16:10, 57.10s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 6.5         |
|    gen/rollout/ep_rew_mean         | 5.2         |
|    gen/rollout/ep_rew_wrapped_mean | -1.94       |
|    gen/time/fps                    | 51          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 57          |
|    gen/time/total_timesteps        | 252000      |
|    gen/train/approx_kl             | 0.008666238 |
|    gen/train/clip_fraction         | 0.00824     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0168     |
|    gen/train/explained_variance    | 0.732       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0201      |
|    gen/train/n_updates             | 1245        |
|    gen/train/policy_gradient_loss  | -0.00182    |
|    gen/train/value_loss            | 0.045  

round:  84%|████████▍ | 84/100 [1:23:33<15:52, 59.53s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 6.33         |
|    gen/rollout/ep_rew_mean         | 4.74         |
|    gen/rollout/ep_rew_wrapped_mean | -2.65        |
|    gen/time/fps                    | 58           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 51           |
|    gen/time/total_timesteps        | 255000       |
|    gen/train/approx_kl             | 0.0027033747 |
|    gen/train/clip_fraction         | 0.149        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0916      |
|    gen/train/explained_variance    | 0.726        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.000502     |
|    gen/train/n_updates             | 1260         |
|    gen/train/policy_gradient_loss  | -0.00969     |
|    gen/train/value_loss   

round:  85%|████████▌ | 85/100 [1:24:32<14:49, 59.29s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.61        |
|    gen/rollout/ep_rew_mean         | 5.7         |
|    gen/rollout/ep_rew_wrapped_mean | -3.14       |
|    gen/time/fps                    | 55          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 53          |
|    gen/time/total_timesteps        | 258000      |
|    gen/train/approx_kl             | 0.012933586 |
|    gen/train/clip_fraction         | 0.06        |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0687     |
|    gen/train/explained_variance    | 0.636       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0292      |
|    gen/train/n_updates             | 1275        |
|    gen/train/policy_gradient_loss  | 0.00214     |
|    gen/train/value_loss            | 0.0521 

round:  86%|████████▌ | 86/100 [1:25:33<13:57, 59.81s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.63        |
|    gen/rollout/ep_rew_mean         | 4.37        |
|    gen/rollout/ep_rew_wrapped_mean | -3.55       |
|    gen/time/fps                    | 47          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 62          |
|    gen/time/total_timesteps        | 261000      |
|    gen/train/approx_kl             | 0.022611925 |
|    gen/train/clip_fraction         | 0.0393      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0886     |
|    gen/train/explained_variance    | 0.735       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0178      |
|    gen/train/n_updates             | 1290        |
|    gen/train/policy_gradient_loss  | 0.00583     |
|    gen/train/value_loss            | 0.0391 

round:  87%|████████▋ | 87/100 [1:26:43<13:36, 62.82s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 6.81         |
|    gen/rollout/ep_rew_mean         | 5.04         |
|    gen/rollout/ep_rew_wrapped_mean | -1.99        |
|    gen/time/fps                    | 54           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 54           |
|    gen/time/total_timesteps        | 264000       |
|    gen/train/approx_kl             | 0.0029448092 |
|    gen/train/clip_fraction         | 0.117        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.114       |
|    gen/train/explained_variance    | 0.542        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0196       |
|    gen/train/n_updates             | 1305         |
|    gen/train/policy_gradient_loss  | -0.00289     |
|    gen/train/value_loss   

round:  88%|████████▊ | 88/100 [1:27:44<12:30, 62.51s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 5.47       |
|    gen/rollout/ep_rew_mean         | 4.23       |
|    gen/rollout/ep_rew_wrapped_mean | -2.84      |
|    gen/time/fps                    | 62         |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 47         |
|    gen/time/total_timesteps        | 267000     |
|    gen/train/approx_kl             | 0.14959274 |
|    gen/train/clip_fraction         | 0.134      |
|    gen/train/clip_range            | 0.1        |
|    gen/train/entropy_loss          | -0.0256    |
|    gen/train/explained_variance    | 0.331      |
|    gen/train/learning_rate         | 0.001      |
|    gen/train/loss                  | 0.0146     |
|    gen/train/n_updates             | 1320       |
|    gen/train/policy_gradient_loss  | -0.0129    |
|    gen/train/value_loss            | 0.0764     |
------------

round:  89%|████████▉ | 89/100 [1:28:39<11:02, 60.24s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.11         |
|    gen/rollout/ep_rew_mean         | 2.68         |
|    gen/rollout/ep_rew_wrapped_mean | -3.05        |
|    gen/time/fps                    | 66           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 45           |
|    gen/time/total_timesteps        | 270000       |
|    gen/train/approx_kl             | 0.0011881326 |
|    gen/train/clip_fraction         | 0.00924      |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0149      |
|    gen/train/explained_variance    | 0.307        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0289       |
|    gen/train/n_updates             | 1335         |
|    gen/train/policy_gradient_loss  | -0.000745    |
|    gen/train/value_loss   

round:  90%|█████████ | 90/100 [1:29:32<09:38, 57.89s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 4.07        |
|    gen/rollout/ep_rew_mean         | 2.76        |
|    gen/rollout/ep_rew_wrapped_mean | -2.91       |
|    gen/time/fps                    | 63          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 273000      |
|    gen/train/approx_kl             | 0.040954404 |
|    gen/train/clip_fraction         | 0.0538      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0444     |
|    gen/train/explained_variance    | 0.419       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0209      |
|    gen/train/n_updates             | 1350        |
|    gen/train/policy_gradient_loss  | -0.00302    |
|    gen/train/value_loss            | 0.0797 

round:  91%|█████████ | 91/100 [1:30:26<08:32, 56.95s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.4         |
|    gen/rollout/ep_rew_mean         | 4.17        |
|    gen/rollout/ep_rew_wrapped_mean | -3.01       |
|    gen/time/fps                    | 62          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 47          |
|    gen/time/total_timesteps        | 276000      |
|    gen/train/approx_kl             | 0.056206714 |
|    gen/train/clip_fraction         | 0.0418      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.035      |
|    gen/train/explained_variance    | 0.481       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0423      |
|    gen/train/n_updates             | 1365        |
|    gen/train/policy_gradient_loss  | -0.000377   |
|    gen/train/value_loss            | 0.0716 

round:  92%|█████████▏| 92/100 [1:31:22<07:31, 56.41s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 4.11         |
|    gen/rollout/ep_rew_mean         | 2.77         |
|    gen/rollout/ep_rew_wrapped_mean | -3.25        |
|    gen/time/fps                    | 56           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 53           |
|    gen/time/total_timesteps        | 279000       |
|    gen/train/approx_kl             | 0.0036900905 |
|    gen/train/clip_fraction         | 0.0219       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0322      |
|    gen/train/explained_variance    | 0.552        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0293       |
|    gen/train/n_updates             | 1380         |
|    gen/train/policy_gradient_loss  | -0.00267     |
|    gen/train/value_loss   

round:  93%|█████████▎| 93/100 [1:32:22<06:43, 57.66s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.46        |
|    gen/rollout/ep_rew_mean         | 6.01        |
|    gen/rollout/ep_rew_wrapped_mean | -1.98       |
|    gen/time/fps                    | 56          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 53          |
|    gen/time/total_timesteps        | 282000      |
|    gen/train/approx_kl             | 0.018081434 |
|    gen/train/clip_fraction         | 0.069       |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0513     |
|    gen/train/explained_variance    | 0.38        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0202      |
|    gen/train/n_updates             | 1395        |
|    gen/train/policy_gradient_loss  | -0.00717    |
|    gen/train/value_loss            | 0.0928 

round:  94%|█████████▍| 94/100 [1:33:23<05:50, 58.49s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 2.97         |
|    gen/rollout/ep_rew_mean         | 1.59         |
|    gen/rollout/ep_rew_wrapped_mean | -2.49        |
|    gen/time/fps                    | 49           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 60           |
|    gen/time/total_timesteps        | 285000       |
|    gen/train/approx_kl             | 0.0010984314 |
|    gen/train/clip_fraction         | 0.011        |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.02        |
|    gen/train/explained_variance    | 0.571        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.0182       |
|    gen/train/n_updates             | 1410         |
|    gen/train/policy_gradient_loss  | -0.00114     |
|    gen/train/value_loss   

round:  95%|█████████▌| 95/100 [1:34:30<05:06, 61.29s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.46        |
|    gen/rollout/ep_rew_mean         | 2.12        |
|    gen/rollout/ep_rew_wrapped_mean | -1.19       |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 288000      |
|    gen/train/approx_kl             | 0.016366543 |
|    gen/train/clip_fraction         | 0.0723      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0391     |
|    gen/train/explained_variance    | 0.685       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0115      |
|    gen/train/n_updates             | 1425        |
|    gen/train/policy_gradient_loss  | 0.00197     |
|    gen/train/value_loss            | 0.0408 

round:  96%|█████████▌| 96/100 [1:35:28<04:00, 60.10s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 3.24        |
|    gen/rollout/ep_rew_mean         | 1.99        |
|    gen/rollout/ep_rew_wrapped_mean | -1.25       |
|    gen/time/fps                    | 58          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 51          |
|    gen/time/total_timesteps        | 291000      |
|    gen/train/approx_kl             | 0.013462674 |
|    gen/train/clip_fraction         | 0.00949     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.0179     |
|    gen/train/explained_variance    | 0.501       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0248      |
|    gen/train/n_updates             | 1440        |
|    gen/train/policy_gradient_loss  | -0.000607   |
|    gen/train/value_loss            | 0.0385 

round:  97%|█████████▋| 97/100 [1:36:26<02:58, 59.57s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 5.67        |
|    gen/rollout/ep_rew_mean         | 4.41        |
|    gen/rollout/ep_rew_wrapped_mean | -1.04       |
|    gen/time/fps                    | 56          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 52          |
|    gen/time/total_timesteps        | 294000      |
|    gen/train/approx_kl             | 0.009525475 |
|    gen/train/clip_fraction         | 0.00449     |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.00645    |
|    gen/train/explained_variance    | 0.61        |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.00774     |
|    gen/train/n_updates             | 1455        |
|    gen/train/policy_gradient_loss  | 0.000558    |
|    gen/train/value_loss            | 0.0223 

round:  98%|█████████▊| 98/100 [1:37:26<01:59, 59.70s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 7.33        |
|    gen/rollout/ep_rew_mean         | 6.09        |
|    gen/rollout/ep_rew_wrapped_mean | -2.25       |
|    gen/time/fps                    | 59          |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 50          |
|    gen/time/total_timesteps        | 297000      |
|    gen/train/approx_kl             | 0.002836241 |
|    gen/train/clip_fraction         | 0.0186      |
|    gen/train/clip_range            | 0.1         |
|    gen/train/entropy_loss          | -0.00841    |
|    gen/train/explained_variance    | 0.712       |
|    gen/train/learning_rate         | 0.001       |
|    gen/train/loss                  | 0.0133      |
|    gen/train/n_updates             | 1470        |
|    gen/train/policy_gradient_loss  | -0.00383    |
|    gen/train/value_loss            | 0.0213 

round:  99%|█████████▉| 99/100 [1:38:24<00:59, 59.11s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 5.06         |
|    gen/rollout/ep_rew_mean         | 3.76         |
|    gen/rollout/ep_rew_wrapped_mean | -3.3         |
|    gen/time/fps                    | 61           |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 48           |
|    gen/time/total_timesteps        | 300000       |
|    gen/train/approx_kl             | 0.0006281334 |
|    gen/train/clip_fraction         | 0.0216       |
|    gen/train/clip_range            | 0.1          |
|    gen/train/entropy_loss          | -0.0187      |
|    gen/train/explained_variance    | 0.784        |
|    gen/train/learning_rate         | 0.001        |
|    gen/train/loss                  | 0.00373      |
|    gen/train/n_updates             | 1485         |
|    gen/train/policy_gradient_loss  | -0.00301     |
|    gen/train/value_loss   

round: 100%|██████████| 100/100 [1:39:20<00:00, 59.60s/it]

🏃 View run gregarious-stork-132 at: http://127.0.0.1:8080/#/experiments/282678262450638424/runs/a2ba18e93f1749659fd26f48943646cb
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/282678262450638424





## Metrics

Download the reward network from mlflow.

In [51]:
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri="mlflow-artifacts:/282678262450638424/a2ba18e93f1749659fd26f48943646cb/artifacts/reward_net0v2"
)

reward_net0 = torch.load(local_path, weights_only=False)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [52]:
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri="mlflow-artifacts:/282678262450638424/8781ce0d9d5f453fa45b69b82b1ce30b/artifacts/reward_net1v2"
)

reward_net1 = torch.load(local_path, weights_only=False)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

### Comparing **uncalibrated** mean rewards

In [53]:
states0, obs0, next_states0, dones0 = (
    trajectories_0_test.obs,
    trajectories_0_test.acts,
    trajectories_0_test.next_obs,
    trajectories_0_test.dones,
)
states1, obs1, next_states1, dones1 = (
    trajectories_1_test.obs,
    trajectories_1_test.acts,
    trajectories_1_test.next_obs,
    trajectories_1_test.dones,
)

In [54]:
# For reward_net1
rewards1 = reward_net1.predict(states0, obs0, next_states0, dones0)
norm_rewards1 = (rewards1 - rewards1.mean()) / rewards1.std()
print("Reward network 1 with traj0: ", norm_rewards1.mean())

# For reward_net0
rewards0 = reward_net0.predict(states0, obs0, next_states0, dones0)
norm_rewards0 = (rewards0 - rewards0.mean()) / rewards0.std()
print("Reward network 0 with traj0: ", norm_rewards0.mean())

Reward network 1 with traj0:  4.4192507e-08
Reward network 0 with traj0:  -7.070801e-08


In [55]:
# For reward_net1
rewards1 = reward_net1.predict(states1, obs1, next_states1, dones1)
norm_rewards1 = (rewards1 - rewards1.mean()) / rewards1.std()
print("Reward network 1 with traj1: ", norm_rewards1.mean())

# For reward_net0
rewards0 = reward_net0.predict(states1, obs1, next_states1, dones1)
norm_rewards0 = (rewards0 - rewards0.mean()) / rewards0.std()
print("Reward network 0 with traj1: ", norm_rewards0.mean())

Reward network 1 with traj1:  1.0056319e-08
Reward network 0 with traj1:  -2.9498537e-07


### Calibration using affine transformations

We use a validation set to calculate alpha and beta parameters for our affine transformation, which is simply 
$f(x) = \alpha x + \beta$, where $\alpha = {\sigma_{target}\over\sigma_{x}}$ and $\beta = \mu_{target} - \alpha_x * \mu_x$

In [56]:
states, obs, next_states, dones = (
    trajectories_val.obs,
    trajectories_val.acts,
    trajectories_val.next_obs,
    trajectories_val.dones,
)

In [57]:
outputs_arb = reward_net1.predict(states, obs, next_states, dones)
outputs_nonarb = reward_net0.predict(states, obs, next_states, dones)

# Compute empirical mean and std for each network
mean_arb, std_arb = outputs_arb.mean(), outputs_arb.std()
mean_nonarb, std_nonarb = outputs_nonarb.mean(), outputs_nonarb.std()

# Define target calibration values (e.g., mean=0, std=1)
target_mean, target_std = 0.0, 1.0

# Calculate affine transformation parameters
alpha_arb = target_std / std_arb
beta_arb = target_mean - alpha_arb * mean_arb

alpha_nonarb = target_std / std_nonarb
beta_nonarb = target_mean - alpha_nonarb * mean_nonarb

# Calibrate the outputs
calibrated_arb = outputs_arb * alpha_arb + beta_arb
calibrated_nonarb = outputs_nonarb * alpha_nonarb + beta_nonarb

In [66]:
# For reward_net1
rewards1 = reward_net1.predict(states0, obs0, next_states0, dones0)
calibrated_rewards1 = alpha_arb * rewards1 + beta_arb
print("Reward network 1 with traj0: ", calibrated_rewards1.mean())

# For reward_net0
rewards0 = reward_net0.predict(states0, obs0, next_states0, dones0)
calibrated_rewards0 = alpha_nonarb * rewards0 + beta_nonarb
print("Reward network 0 with traj0: ", calibrated_rewards0.mean())

Reward network 1 with traj0:  -0.34985515
Reward network 0 with traj0:  0.5804129


In [71]:
predictions = np.array([1 if r0 > r1 else 0 for r0, r1 in zip(calibrated_rewards0, calibrated_rewards1)])
true_labels = np.array([1]*len(predictions))

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7898517145505097
Precision: 1.0
Recall: 0.7898517145505097
F1 Score: 0.882588996763754


In [49]:
# For reward_net1
rewards1 = reward_net1.predict(states1, obs1, next_states1, dones1)
calibrated_rewards1 = alpha_arb * rewards1 + beta_arb
print("Reward network 1 with traj1: ", calibrated_rewards1.mean())

# For reward_net0
rewards0 = reward_net0.predict(states1, obs1, next_states1, dones1)
calibrated_rewards0 = alpha_nonarb * rewards0 + beta_nonarb
print("Reward network 0 with traj1: ", calibrated_rewards0.mean())

Reward network 1 with traj1:  0.43942836
Reward network 0 with traj1:  -0.62645113


In [50]:
predictions = np.array([1 if r0 < r1 else 0 for r0, r1 in zip(calibrated_rewards0, calibrated_rewards1)])
true_labels = np.array([1]*len(predictions))

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9740773286467487
Precision: 1.0
Recall: 0.9740773286467487
F1 Score: 0.986868462052081
