In [1]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
# import argparse
from data_generation import generate_data
import os
import json
import pandas as pd
import time
import initpath_alg
#initpath_alg.init_sys_path()
import utilmlab
import data_loader_mlab


def array2str(a):
    s = ''
    for idx, el in enumerate(a):
        s += (' ' if idx > 0 else '') + '{:0.3f}'.format(el)
    return s


def one_hot_encoder(a):
    n_values = np.max(a) + 1
    return np.eye(n_values)[a]


def load_create_data(
        data_type,
        data_out,
        is_logging_enabled=True,
        fn_csv=None,
        label_nm=None):

    df_train, df_test, dset = None, None, None
    features = None
    if data_type in data_loader_mlab.get_available_datasets() + ['show'] \
       or fn_csv is not None:
        if fn_csv is not None:
            rval, dset = data_loader_mlab.load_dataset_from_csv(
                logger, fn_csv, label_nm)
        else:
            rval, dset = data_loader_mlab.get_dataset(data_type)
        assert rval == 0
        data_loader_mlab.dataset_log_properties(logger, dset)
        if is_logging_enabled:
            logger.info('warning no seed')
        df = dset['df']
        features = dset['features']
        labels = dset['targets']
        nsample = len(df)
        train_ratio = 0.8
        idx = np.random.permutation(nsample)
        ntrain = int(nsample * train_ratio)
        df_train = df.iloc[idx[:ntrain]]
        df_test = df.iloc[idx[ntrain:]]

        col_drop = utilmlab.col_with_nan(df)
        if is_logging_enabled and len(col_drop):
            print('warning: dropping features {}'
                  ', contains nan'.format(col_drop))
            time.sleep(2)

        features = [el for el in features if el not in col_drop]

        x_train = df_train[features].values
        y_train = df_train[labels].values
        x_test = df_test[features].values
        y_test = df_test[labels].values

        g_train, g_test = None, None

        y_train = one_hot_encoder(np.ravel(y_train))
        y_test = one_hot_encoder(np.ravel(y_test))
        if is_logging_enabled:
            logger.info('y: train:{} test:{}'.format(
                set(np.ravel(y_train)), set(np.ravel(y_test))))
    else:
        x_train, y_train, g_train = generate_data(
            n=train_N, data_type=data_type, seed=train_seed, out=data_out, x_dim = X_DIM)
        x_test,  y_test,  g_test = generate_data(
            n=test_N,  data_type=data_type, seed=test_seed,  out=data_out, x_dim = X_DIM)
    if is_logging_enabled:
        logger.info('{} {} {} {}'.format(
            x_train.shape,
            y_train.shape,
            x_test.shape,
            y_test.shape))
    return x_train, y_train, g_train, x_test, y_test, \
        g_test, df_train, df_test, dset, features




In [2]:
from IPython import embed
import torch
import torch.nn as nn
import torch.nn.functional as F

# Generator (Actor) in PyTorch
class INVASE_Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(INVASE_Actor, self).__init__()

        self.l1 = nn.Linear(state_dim + action_dim, 100)
        self.l2 = nn.Linear(100, 100)
        self.l3 = nn.Linear(100, action_dim)


    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        
        a = F.selu(self.l1(sa))
        a = F.selu(self.l2(a))
        return torch.sigmoid(self.l3(a))
        
# Discriminator (Critic) in PyTorch    
# Critic in INVASE is a classifier that provide return signal
class INVASE_Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(INVASE_Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 200)
        #self.bn1 = nn.BatchNorm1d(num_features=200)
        self.l2 = nn.Linear(200, 200)
        #self.bn2 = nn.BatchNorm1d(num_features=200)
        self.l3 = nn.Linear(200, state_dim)


    def forward(self, state, action, mask):
        #sa = torch.cat([state, action], 1)
        sa = torch.cat([state, mask* action],1)
        
        #q1 = F.selu(self.bn1(self.l1(sa)))
        #q1 = F.selu(self.bn2(self.l2(q1)))
        q1 = F.selu(self.l1(sa))
        q1 = F.selu(self.l2(q1))
        q1 = self.l3(q1)

        return q1 # prob, actually the binary classification result with softmax activation (logits)
    
# Valuefunction (Baseline) in PyTorch   
# Valuefunction in INVASE is a classifier that provide return signal
class INVASE_Baseline(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(INVASE_Baseline, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 200)
        #self.bn1 = nn.BatchNorm1d(num_features=200)
        self.l2 = nn.Linear(200, 200)
        #self.bn2 = nn.BatchNorm1d(num_features=200)
        self.l3 = nn.Linear(200, state_dim)


    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        #sa = state

        q1 = F.selu(self.l1(sa))
        q1 = F.selu(self.l2(q1))
        q1 = self.l3(q1)

        return q1 # prob, actually the binary classification result with softmax activation (logits)    


class PVS():
    # 1. Initialization
    '''
    x_train: training samples
    data_type: Syn1 to Syn 6
    '''
    def __init__(self, xs_train, data_type, nepoch, is_logging_enabled=True, thres = 0.5):
        self.is_logging_enabled = is_logging_enabled
        self.latent_dim1 = 100      # Dimension of actor (generator) network
        self.latent_dim2 = 200      # Dimension of critic (discriminator) network
        
        self.batch_size = min(1000, xs_train.shape[0])      # Batch size
        self.epochs = nepoch        # Epoch size (large epoch is needed due to the policy gradient framework)
        self.lamda = 1.0           # Hyper-parameter for the number of selected features 
        self.thres = thres
        '''lamda is number of selected features? is it the coefficient?'''
        
        
        self.input_shape_state = xs_train.shape[1]     # state dimension
        self.input_shape_action = xa_train.shape[1]    # action dimension
        logger.info('input shape: {}'.format(self.input_shape_state))
        
        # Actionvation. (For Syn1 and 2, relu, others, selu)
        self.activation = 'relu' if data_type in ['Syn1','Syn2'] else 'selu'       
        
        
        self.generator = INVASE_Actor(state_dim=self.input_shape_state, action_dim = self.input_shape_action)
        self.discriminator = INVASE_Critic(state_dim=self.input_shape_state, action_dim = self.input_shape_action)
        self.valfunction = INVASE_Baseline(state_dim=self.input_shape_state, action_dim = self.input_shape_action)
        
        
        self.generator_optimizer = torch.optim.Adam(self.generator.parameters(), lr=1e-4)#,weight_decay=1e-3)
        self.discriminator_optimizer = torch.optim.Adam(self.discriminator.parameters(), lr=1e-4)#,weight_decay=1e-3)
        self.valfunction_optimizer = torch.optim.Adam(self.valfunction.parameters(), lr=1e-4)#,weight_decay=1e-3)
        
    def my_loss(self, y_true, y_pred,lmd, Thr):
        # dimension of the features
        
        '''
        sel_prob: the mask generated by bernulli sampler [bs, d]
        dis_prob: prediction of the critic               [bs, state_dim]
        val_prob: prediction of the baseline model       [bs, state_dim]
        y_batch: batch of y_train                        [bs, state_dim]
        all of those variables are 'placeholders'
        '''
        
        
        d = y_pred.shape[1]        
        
        # Put all three in y_true 
        # 1. selected probability
        sel_prob = y_true[:,:d] # bs x d
        # 2. discriminator output
        dis_prob = y_true[:,d:(d+self.input_shape_state)] # bs x 2
        # 3. valfunction output
        val_prob = y_true[:,(d+self.input_shape_state):(d+self.input_shape_state*2)] # bs x 2
        # 4. ground truth
        y_final = y_true[:,(d+self.input_shape_state*2):] # bs x 2
        
        # A1. Compute the rewards of the actor network
        #embed()
        Reward1 = torch.norm(y_final - dis_prob, p=2, dim=1)  

        # A2. Compute the rewards of the actor network
        Reward2 = torch.norm(y_final - val_prob, p=2, dim=1)  

        # Difference is the rewards
        Reward =Reward2 -  Reward1

        # B. Policy gradient loss computation. 
        loss1 = Reward * torch.sum(sel_prob * torch.log(y_pred + 1e-8) + (1-sel_prob) * torch.log(1-y_pred + 1e-8), axis = 1) - lmd *torch.mean( torch.abs(y_pred-Thr), axis = 1)
        
        # C. Maximize the loss1
        loss = torch.mean(-loss1)
        #embed()
        return loss
    
    
    def Sample_M(self, gen_prob):
        # Shape of the selection probability
        n = gen_prob.shape[0]
        d = gen_prob.shape[1]
        # Sampling
        samples = np.random.binomial(1, gen_prob, (n,d))

        return samples

    #%% Training procedure
    def train(self, xs_train, xa_train, y_train, lmd, thr):

        # For each epoch (actually iterations)
        for epoch in range(self.epochs):

            #%% Train Discriminator
            # Select a random batch of samples
            idx = np.random.randint(0, xs_train.shape[0], self.batch_size)
            xs_batch = torch.as_tensor(xs_train[idx,:]).float()
            xa_batch = torch.as_tensor(xa_train[idx,:]).float()
            y_batch = torch.as_tensor(y_train[idx,:]).float() 
            # y_batch = torch.as_tensor(np.argmax(y_train[idx,:],1)).long()
            
            # Generate a batch of probabilities of feature selection
            gen_prob = self.generator(xs_batch, xa_batch).cpu().detach().numpy()
            # Sampling the features based on the generated probability
            sel_prob = self.Sample_M(gen_prob)
            '''sel_prob is the mask'''
            
            # Compute the prediction of the critic based on the sampled features (used for generator training)
            dis_prob = self.discriminator(xs_batch, xa_batch, torch.as_tensor(sel_prob).float())
            
            # Train the discriminator
            loss_func_c = nn.MSELoss()
            self.discriminator_optimizer.zero_grad()
            critic_loss = loss_func_c(dis_prob, y_batch)
            critic_loss.backward()
            self.discriminator_optimizer.step()

            #%% Train Valud function

            # Compute the prediction of the baseline based on the sampled features (used for generator training)
            val_prob = self.valfunction(xs_batch, xa_batch)#.cpu().detach().numpy()
            
            # Train the baseline model
            #v_loss = self.valfunction.train_on_batch(x_batch, y_batch)
            loss_func_v = nn.MSELoss()
            self.valfunction_optimizer.zero_grad()
            value_loss = loss_func_v(val_prob, y_batch)
            value_loss.backward()
            self.valfunction_optimizer.step()
            
            
            #%% Train Generator
            # Use three things as the y_true: sel_prob, dis_prob, and ground truth (y_batch)
            '''
            sel_prob: the mask generated by bernulli sampler [bs, d]
            dis_prob: prediction of the critic               [bs, state_dim]
            val_prob: prediction of the baseline model       [bs, state_dim]
            y_batch: batch of y_train                        [bs, state_dim]
            all of those variables are 'placeholders'
            '''
            
            y_batch_final = torch.as_tensor(np.concatenate( (sel_prob, torch.as_tensor(dis_prob).cpu().detach().numpy(), torch.as_tensor(val_prob).cpu().detach().numpy(), y_train[idx,:]), axis = 1 ))
            # Train the generator
            
            actor_pred = self.generator(xs_batch,xa_batch)
            self.generator_optimizer.zero_grad()
            actor_loss = self.my_loss(y_batch_final,actor_pred,lmd,Thr)
            actor_loss.backward()
            self.generator_optimizer.step()
            
            #%% Plot the progress
            dialog = 'Epoch: ' + '{:6d}'.format(epoch) + ', d_loss (Acc)): '
            dialog += '{:0.3f}'.format(critic_loss) + ', v_loss (Acc): '
            dialog += '{:0.3f}'.format(value_loss) + ', g_loss: ' + '{:+6.4f}'.format(actor_loss)

            if epoch % 100 == 0:
                logger.info('{}'.format(dialog))
    
    #%% Selected Features        
    def output(self, xs_train, xa_train):
        
        gen_prob = self.generator(xs_train, xa_train).cpu().detach().numpy()
        
        return np.asarray(gen_prob)
     
    #%% Prediction Results 
    def get_prediction(self, xs, xa, m_train):
        
        val_prediction = self.valfunction(xs,xa).cpu().detach().numpy()
        
        dis_prediction = self.discriminator(xs,xa, m_train).cpu().detach().numpy()
        
        return np.asarray(val_prediction), np.asarray(dis_prediction)

In [3]:
ENV_NAME = 'Pendulum-v0'
alias = 'Fixed_INVASE'
RED_ACTION_DIM = 100
import gym
print('\n now evaluating: \n       ', ENV_NAME)


import matplotlib.pyplot as plt
import numpy as np
import torch
import argparse
import os
import torch.nn.functional as F
import utils
import TD3_INVASE

def eval_policy(policy, eval_episodes=10):
    eval_env = gym.make(ENV_NAME)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action[:-RED_ACTION_DIM])
            avg_reward += reward

    avg_reward /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

env = gym.make(ENV_NAME)
torch.manual_seed(0)
np.random.seed(0)

#spec = env.action_space
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] + RED_ACTION_DIM
max_action = env.action_space.high[0]

args_policy_noise = 0.2
args_noise_clip = 0.5
args_policy_freq = 2
args_max_timesteps = 10000
args_expl_noise = 0.1
args_batch_size = 256
args_eval_freq = 1000
args_start_timesteps = 10000

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": 0.99,
    "tau": 0.005
}

for repeat in range(5):
    kwargs["policy_noise"] = args_policy_noise * max_action
    kwargs["noise_clip"] = args_noise_clip * max_action
    kwargs["policy_freq"] = args_policy_freq
    policy = TD3_INVASE.TD3(**kwargs)
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy)]
    
    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    counter = 0
    msk_list = []        
    temp_curve = [eval_policy(policy)]
    temp_val = []
    for t in range(int(args_max_timesteps)):
        episode_timesteps += 1
        counter += 1
        # Select action randomly or according to policy
        if t < args_start_timesteps:
            action = np.random.uniform(-max_action, max_action, action_dim)
        else:
            if np.random.uniform(0,1) < 0.0:
                action = np.random.uniform(-max_action, max_action, action_dim)
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * args_expl_noise, size=action_dim)
                ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action[:-RED_ACTION_DIM])
        

        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        if t >= args_start_timesteps:
            '''TD3'''
            policy.train(replay_buffer, args_batch_size)
                    
                    
        # Train agent after collecting sufficient data
        if done:
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            msk_list = []
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1 

        # Evaluate episode
        if (t + 1) % args_eval_freq == 0:
            evaluations.append(eval_policy(policy))
            print('recent Evaluation:',evaluations[-1])
            np.save('results/evaluations_alias{}_ENV{}_Repeat{}'.format(alias,ENV_NAME,repeat),evaluations)
            
            
    state_list_train = replay_buffer.state[:args_start_timesteps-5000]
    state_list_test = replay_buffer.state[args_start_timesteps-5000:args_start_timesteps]

    action_list_train = replay_buffer.action[:args_start_timesteps-5000]
    action_list_test = replay_buffer.action[args_start_timesteps-5000:args_start_timesteps]
    next_state_list_train = replay_buffer.next_state[:args_start_timesteps-5000]
    next_state_list_test = replay_buffer.next_state[args_start_timesteps-5000:args_start_timesteps]

    state_delta_train = next_state_list_train - state_list_train
    state_delta_test = next_state_list_test - state_list_test
            
            
    X_DIM = action_list_train.shape[1] # feature dimension Hyper-Param
    
    
    class init_arg(object):
        def __init__(self, it = 10000, o = 'feature_score.csv.gz', dataset = None, i= None, target = None):
            self.it = it
            self.o = o
            self.dataset = dataset
            self.i = i
            self.target = target

    args = init_arg(dataset = 'Syn5', it = 300, )
    ocsv = args.o # 'feature_score.csv.gz'
    odir = os.path.dirname(ocsv)
    odir = '.' if not len(odir) else odir
    fn_csv = args.i #'data.csv'
    label_nm = args.target # 'target'
    nepoch = args.it
    logger = utilmlab.init_logger(odir)

    dataset = args.dataset

    assert dataset is not None or fn_csv is not None
    assert fn_csv is None or label_nm is not None

    # Data output can be either binary (Y) or Probability (Prob)
    data_out_sets = ['Y', 'Prob']
    data_out = data_out_sets[0]

    logger.info('invase: {} {} {} {}'.format(dataset, nepoch, odir, data_out))

    # Number of Training and Testing samples
    train_N = 10000
    test_N = 10000

    # Seeds (different seeds for training and testing)
    train_seed = 0
    test_seed = 1

    xs_train,xa_train, y_train, xs_test, xa_test, y_test= state_list_train, action_list_train, state_delta_train, state_list_test, action_list_test, state_delta_test, 
    g_test = np.zeros((y_test.shape[0],RED_ACTION_DIM + env.action_space.shape[0]))
    g_test[:,0] = 1
    print(g_test)
    print(xs_train.shape, xa_train.shape, y_train.shape, xs_test.shape, xa_test.shape, y_test.shape, g_test.shape)
            
    '''learning INVASE'''
    
    REAL_LMD = 1.0 # 0.0 - 0.5


    import time
    elapsed_time = []

    class init_arg(object):
        def __init__(self, it = 10000, o = 'feature_score.csv.gz', dataset = None, i= None, target = None):
            self.it = it
            self.o = o
            self.dataset = dataset
            self.i = i
            self.target = target


    for DATASET in ['Syn1']:
        args = init_arg(dataset = DATASET, it = 2500,)
        ocsv = args.o # 'feature_score.csv.gz'
        odir = os.path.dirname(ocsv)
        odir = '.' if not len(odir) else odir
        fn_csv = args.i #'data.csv'
        label_nm = args.target # 'target'
        nepoch = args.it
        logger = utilmlab.init_logger(odir)

        dataset = args.dataset

        assert dataset is not None or fn_csv is not None
        assert fn_csv is None or label_nm is not None

        # Data output can be either binary (Y) or Probability (Prob)
        data_out_sets = ['Y', 'Prob']
        data_out = data_out_sets[0]

        logger.info('invase: {} {} {} {}'.format(dataset, nepoch, odir, data_out))


        start_time = time.time()
        for thres_i in [0.0]:
            Predict_Out_temp = np.zeros([3, 2])    

            PVS_Alg = PVS(xs_train, dataset, 100, thres=thres_i)

            print('start training......')

            for train_epoch in range(int(nepoch/100)):

                Lmd = 0.1 #train_epoch*100/nepoch * REAL_LMD
                Thr = 0.0 #0.5*(1 - train_epoch*100/nepoch)
                print('now at training epoch number', int(train_epoch * 100),'hyp-params: lamda %.4f prior %.4f'%(Lmd,Thr))
                PVS_Alg.train(xs_train, xa_train, y_train, lmd = Lmd , thr = Thr)
                # 3. Get the selection probability on the testing set
                #Sel_Prob_Test = PVS_Alg.output(x_test)



                '''recurssive generation'''
                input_batch_xs = xs_test * 1.0
                input_batch_xa = xa_test * 1.0

                sel_prob_tot = 1.0
                for recur_time in range(1):
                    print('rec time now',recur_time,'dataset now:',DATASET)
                    gen_prob = PVS_Alg.generator(torch.as_tensor(input_batch_xs).float(),torch.as_tensor(input_batch_xa).float())
                    #sel_prob = PVS_Alg.Sample_M(gen_prob)
                    sel_prob = 1.*(gen_prob > 0.5)
                    sel_prob_tot_0 = sel_prob_tot * 1.0
                    sel_prob_tot = sel_prob * sel_prob_tot
                    input_batch_xa = sel_prob_tot * input_batch_xa

                    score = sel_prob_tot
                    #print('score',score)



                    # 4. Selected features
                    # 5. Prediction
                    val_predict, dis_predict = PVS_Alg.get_prediction(torch.as_tensor(xs_test).float(),torch.as_tensor(xa_test).float(), score)

                    def performance_metric(score, g_truth):

                        n = len(score)
                        Temp_TPR = np.zeros([n,])
                        Temp_FDR = np.zeros([n,])

                        for i in range(n):

                            # TPR    
                            # embed()
                            TPR_Nom = np.sum((score[i,:] * g_truth[i,:]).cpu().detach().numpy())
                            TPR_Den = np.sum(g_truth[i,:])
                            Temp_TPR[i] = 100 * float(TPR_Nom)/float(TPR_Den+1e-8)

                            # FDR
                            FDR_Nom = np.sum((score[i,:] * (1-g_truth[i,:])).cpu().detach().numpy())
                            FDR_Den = np.sum(score[i,:].cpu().detach().numpy())
                            Temp_FDR[i] = 100 * float(FDR_Nom)/float(FDR_Den+1e-8)

                        return np.mean(Temp_TPR), np.mean(Temp_FDR),\
                            np.std(Temp_TPR), np.std(Temp_FDR)

                    #%% Output

                    TPR_mean, TPR_std = -1, 0
                    FDR_mean, FDR_std = -1, 0
                    if g_test is not None:
                        TPR_mean, FDR_mean, TPR_std, FDR_std = performance_metric(
                            score, g_test)

                        logger.info('TPR mean: {:0.1f}%  std: {:0.1f}%'.format(
                            TPR_mean, TPR_std))
                        logger.info('FDR mean: {:0.1f}%  std: {:0.1f}%'.format(
                            FDR_mean, FDR_std))
                    else:
                        logger.info('no ground truth relevance')



                    #%% Performance Metrics
                    Predict_Out_temp[0,0] = np.linalg.norm(y_test - val_predict,2).mean()
                    Predict_Out_temp[0,1] = np.linalg.norm(y_test - dis_predict,2).mean()
                    print(Predict_Out_temp)

        elapsed_time.append(time.time() - start_time)
        print('PyTorch Version: elapsed time for {}: 11 feature, 10000 sample:'.format(DATASET),np.round(elapsed_time,4),'sec.')


    '''Continue training with fixed INVASE model'''
    PVS_Alg.generator.cuda()
    PVS_Alg.generator.eval()
    for t in range(10000, 50000):
        episode_timesteps += 1
        counter += 1
        # Select action randomly or according to policy
        if t < args_start_timesteps:
            action = np.random.uniform(-max_action, max_action, action_dim)
        else:
            if np.random.uniform(0,1) < 0.0:
                action = np.random.uniform(-max_action, max_action, action_dim)
            else:
                action = (
                    policy.select_action(np.array(state))
                    + np.random.normal(0, max_action * args_expl_noise, size=action_dim)
                ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action[:-RED_ACTION_DIM])


        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        if t >= args_start_timesteps:
            '''TD3'''
            policy.train(replay_buffer, args_batch_size, PVS_Alg)


        # Train agent after collecting sufficient data
        if done:
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            msk_list = []
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1 

        # Evaluate episode
        if (t + 1) % args_eval_freq == 0:
            evaluations.append(eval_policy(policy))
            print('recent Evaluation:',evaluations[-1])
            np.save('results/evaluations_alias{}_ENV{}_Repeat{}'.format(alias,ENV_NAME,repeat),evaluations)



 now evaluating: 
        Pendulum-v0




---------------------------------------
Evaluation over 10 episodes: -1521.553
---------------------------------------
---------------------------------------
Evaluation over 10 episodes: -1538.150
---------------------------------------
Total T: 200 Episode Num: 1 Episode T: 200 Reward: -1019.088
Total T: 400 Episode Num: 2 Episode T: 200 Reward: -1069.087
Total T: 600 Episode Num: 3 Episode T: 200 Reward: -1519.138
Total T: 800 Episode Num: 4 Episode T: 200 Reward: -1278.598
Total T: 1000 Episode Num: 5 Episode T: 200 Reward: -862.967
---------------------------------------
Evaluation over 10 episodes: -1505.696
---------------------------------------
recent Evaluation: -1505.6958268289775
Total T: 1200 Episode Num: 6 Episode T: 200 Reward: -1063.940
Total T: 1400 Episode Num: 7 Episode T: 200 Reward: -1058.628
Total T: 1600 Episode Num: 8 Episode T: 200 Reward: -867.111
Total T: 1800 Episode Num: 9 Episode T: 200 Reward: -953.699
Total T: 2000 Episode Num: 10 Episode T: 200 Reward: 

invase: Syn5 300 . Y
invase: Syn1 2500 . Y
input shape: 3
Epoch:      0, d_loss (Acc)): 0.165, v_loss (Acc): 0.200, g_loss: +4.7825


---------------------------------------
Evaluation over 10 episodes: -1401.752
---------------------------------------
recent Evaluation: -1401.7515668451001
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
(5000, 3) (5000, 101) (5000, 3) (5000, 3) (5000, 101) (5000, 3) (5000, 101)
start training......
now at training epoch number 0 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 60.7%  std: 48.8%
FDR mean: 98.7%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.016, v_loss (Acc): 0.017, g_loss: +0.7524


[[11.30659616  9.51890734]
 [ 0.          0.        ]
 [ 0.          0.        ]]
now at training epoch number 100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 80.4%  std: 39.7%
FDR mean: 98.3%  std: 0.9%
Epoch:      0, d_loss (Acc)): 0.015, v_loss (Acc): 0.011, g_loss: -1.5741


[[9.37581565 8.79986379]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 91.3%  std: 28.2%
FDR mean: 98.0%  std: 0.7%
Epoch:      0, d_loss (Acc)): 0.013, v_loss (Acc): 0.010, g_loss: -1.0525


[[9.33257468 8.66035849]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 96.6%  std: 18.1%
FDR mean: 97.8%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.012, v_loss (Acc): 0.010, g_loss: -1.2542


[[9.28255825 8.46588066]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 98.8%  std: 10.8%
FDR mean: 97.6%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.011, v_loss (Acc): 0.009, g_loss: -0.7170


[[9.23175012 8.25304365]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.7%  std: 5.6%
FDR mean: 97.4%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.009, g_loss: -0.4100


[[9.22430864 7.94217783]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 1.4%
FDR mean: 97.0%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.009, v_loss (Acc): 0.008, g_loss: -0.1417


[[9.14469684 7.53441851]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 96.4%  std: 0.7%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.008, g_loss: -0.0085


[[9.13214928 6.99076363]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 95.3%  std: 1.2%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.007, g_loss: +0.2147


[[9.11100704 6.26746739]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 93.1%  std: 2.4%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.007, g_loss: +0.6591


[[9.10006176 5.3932218 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 88.3%  std: 5.7%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.007, g_loss: +1.2279


[[9.05915203 5.01605229]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 73.7%  std: 18.3%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.006, g_loss: +1.6010


[[9.0092041  4.98577696]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 41.9%  std: 31.3%
Epoch:      0, d_loss (Acc)): 0.003, v_loss (Acc): 0.006, g_loss: +1.9878


[[8.99314541 4.82267922]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 11.2%  std: 22.9%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.005, g_loss: +2.3274


[[8.95038472 4.49118178]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 2.3%  std: 10.9%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.005, g_loss: +2.3986


[[8.89691996 3.79109645]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.3%  std: 3.8%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.004, g_loss: +2.1182


[[8.86118839 3.24079292]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 1.5%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.004, g_loss: +1.8602


[[8.78757784 2.73463979]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.004, g_loss: +1.6634


[[8.71433194 2.33133747]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +1.2671


[[8.61928557 1.93652354]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +1.0533


[[8.46127657 1.65008093]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.7888


[[8.31972727 1.46870443]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.6398


[[8.18960884 1.2793991 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.4770


[[7.98502023 1.13301086]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.001, g_loss: +0.3418


[[7.79259525 1.01431751]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%


[[7.59562277 0.94924869]
 [0.         0.        ]
 [0.         0.        ]]
PyTorch Version: elapsed time for Syn1: 11 feature, 10000 sample: [250.9811] sec.
Total T: 10200 Episode Num: 51 Episode T: 200 Reward: -1440.603
Total T: 10400 Episode Num: 52 Episode T: 200 Reward: -1202.390
Total T: 10600 Episode Num: 53 Episode T: 200 Reward: -1578.900
Total T: 10800 Episode Num: 54 Episode T: 200 Reward: -1812.212
Total T: 11000 Episode Num: 55 Episode T: 200 Reward: -1828.275
---------------------------------------
Evaluation over 10 episodes: -1549.953
---------------------------------------
recent Evaluation: -1549.9529251307645
Total T: 11200 Episode Num: 56 Episode T: 200 Reward: -1306.107
Total T: 11400 Episode Num: 57 Episode T: 200 Reward: -1390.792
Total T: 11600 Episode Num: 58 Episode T: 200 Reward: -1502.800
Total T: 11800 Episode Num: 59 Episode T: 200 Reward: -1510.539
Total T: 12000 Episode Num: 60 Episode T: 200 Reward: -1189.604
---------------------------------------
Eval

Total T: 27200 Episode Num: 136 Episode T: 200 Reward: -121.844
Total T: 27400 Episode Num: 137 Episode T: 200 Reward: -248.529
Total T: 27600 Episode Num: 138 Episode T: 200 Reward: -230.310
Total T: 27800 Episode Num: 139 Episode T: 200 Reward: -116.803
Total T: 28000 Episode Num: 140 Episode T: 200 Reward: -118.463
---------------------------------------
Evaluation over 10 episodes: -174.509
---------------------------------------
recent Evaluation: -174.5094718660324
Total T: 28200 Episode Num: 141 Episode T: 200 Reward: -317.794
Total T: 28400 Episode Num: 142 Episode T: 200 Reward: -119.469
Total T: 28600 Episode Num: 143 Episode T: 200 Reward: -360.110
Total T: 28800 Episode Num: 144 Episode T: 200 Reward: -231.568
Total T: 29000 Episode Num: 145 Episode T: 200 Reward: -298.432
---------------------------------------
Evaluation over 10 episodes: -175.396
---------------------------------------
recent Evaluation: -175.3958409419151
Total T: 29200 Episode Num: 146 Episode T: 200 R

Total T: 44600 Episode Num: 223 Episode T: 200 Reward: -126.215
Total T: 44800 Episode Num: 224 Episode T: 200 Reward: -242.141
Total T: 45000 Episode Num: 225 Episode T: 200 Reward: -237.585
---------------------------------------
Evaluation over 10 episodes: -110.244
---------------------------------------
recent Evaluation: -110.24372030902575
Total T: 45200 Episode Num: 226 Episode T: 200 Reward: -113.963
Total T: 45400 Episode Num: 227 Episode T: 200 Reward: -129.500
Total T: 45600 Episode Num: 228 Episode T: 200 Reward: -120.482
Total T: 45800 Episode Num: 229 Episode T: 200 Reward: -233.996
Total T: 46000 Episode Num: 230 Episode T: 200 Reward: -224.928
---------------------------------------
Evaluation over 10 episodes: -132.038
---------------------------------------
recent Evaluation: -132.03788474813288
Total T: 46200 Episode Num: 231 Episode T: 200 Reward: -129.426
Total T: 46400 Episode Num: 232 Episode T: 200 Reward: -121.902
Total T: 46600 Episode Num: 233 Episode T: 200

invase: Syn5 300 . Y
invase: Syn1 2500 . Y
input shape: 3
Epoch:      0, d_loss (Acc)): 0.172, v_loss (Acc): 0.197, g_loss: +3.6549


---------------------------------------
Evaluation over 10 episodes: -1413.927
---------------------------------------
recent Evaluation: -1413.9265831863352
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
(5000, 3) (5000, 101) (5000, 3) (5000, 3) (5000, 101) (5000, 3) (5000, 101)
start training......
now at training epoch number 0 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 53.0%  std: 49.9%
FDR mean: 99.0%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.015, v_loss (Acc): 0.016, g_loss: +0.6734


[[11.54894208 10.79465737]
 [ 0.          0.        ]
 [ 0.          0.        ]]
now at training epoch number 100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 81.6%  std: 38.7%
FDR mean: 98.4%  std: 0.8%
Epoch:      0, d_loss (Acc)): 0.012, v_loss (Acc): 0.009, g_loss: -1.6189


[[8.99430559 8.70896077]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 93.7%  std: 24.3%
FDR mean: 98.1%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.011, v_loss (Acc): 0.008, g_loss: -1.6676


[[8.97038523 8.61154523]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 97.6%  std: 15.2%
FDR mean: 98.0%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.007, g_loss: -1.2112


[[8.90915804 8.42932089]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.1%  std: 9.3%
FDR mean: 97.8%  std: 0.3%
Epoch:      0, d_loss (Acc)): 0.009, v_loss (Acc): 0.007, g_loss: -1.0194


[[8.84590697 8.28477244]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.7%  std: 5.8%
FDR mean: 97.7%  std: 0.3%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.007, g_loss: -0.9591


[[8.80572715 8.09496896]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.9%  std: 3.2%
FDR mean: 97.4%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.007, g_loss: -0.5849


[[8.77496737 7.82781638]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 2.0%
FDR mean: 96.9%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.006, g_loss: -0.2409


[[8.72425183 7.46283956]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 96.0%  std: 0.9%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.006, g_loss: -0.4366


[[8.67125605 7.05749222]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 94.1%  std: 1.8%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.006, g_loss: -0.2080


[[8.61832132 6.48124077]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 89.4%  std: 5.2%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.2196


[[8.58351094 6.06840076]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 73.7%  std: 20.0%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.005, g_loss: +0.1451


[[8.48221224 5.98983474]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 38.6%  std: 32.7%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.004, g_loss: +0.4652


[[8.41597683 5.59642426]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 10.9%  std: 23.2%
Epoch:      0, d_loss (Acc)): 0.003, v_loss (Acc): 0.004, g_loss: +0.6013


[[8.33116001 5.2626247 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 1.8%  std: 10.3%
Epoch:      0, d_loss (Acc)): 0.003, v_loss (Acc): 0.004, g_loss: +0.8903


[[8.22327391 4.24380055]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.2%  std: 3.6%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.9775


[[8.12039014 3.61371224]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.8985


[[7.99066208 3.06292298]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.8747


[[7.90997242 2.48028102]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.6861


[[7.75939713 2.16129087]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.5938


[[7.58987206 1.92442106]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.4898


[[7.46583139 1.80542781]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.3879


[[7.32805608 1.72988786]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2854


[[7.19757347 1.6890226 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.001, g_loss: +0.2177


[[7.10683049 1.6241501 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%


[[6.99321383 1.60977811]
 [0.         0.        ]
 [0.         0.        ]]
PyTorch Version: elapsed time for Syn1: 11 feature, 10000 sample: [248.5574] sec.
Total T: 10200 Episode Num: 51 Episode T: 200 Reward: -1374.422
Total T: 10400 Episode Num: 52 Episode T: 200 Reward: -1803.300
Total T: 10600 Episode Num: 53 Episode T: 200 Reward: -1626.236
Total T: 10800 Episode Num: 54 Episode T: 200 Reward: -1827.035
Total T: 11000 Episode Num: 55 Episode T: 200 Reward: -1552.003
---------------------------------------
Evaluation over 10 episodes: -1508.162
---------------------------------------
recent Evaluation: -1508.161904787845
Total T: 11200 Episode Num: 56 Episode T: 200 Reward: -1475.327
Total T: 11400 Episode Num: 57 Episode T: 200 Reward: -1385.428
Total T: 11600 Episode Num: 58 Episode T: 200 Reward: -1511.358
Total T: 11800 Episode Num: 59 Episode T: 200 Reward: -1511.440
Total T: 12000 Episode Num: 60 Episode T: 200 Reward: -1537.156
---------------------------------------
Evalu

Total T: 27200 Episode Num: 136 Episode T: 200 Reward: -124.867
Total T: 27400 Episode Num: 137 Episode T: 200 Reward: -116.600
Total T: 27600 Episode Num: 138 Episode T: 200 Reward: -122.501
Total T: 27800 Episode Num: 139 Episode T: 200 Reward: -238.190
Total T: 28000 Episode Num: 140 Episode T: 200 Reward: -2.345
---------------------------------------
Evaluation over 10 episodes: -154.398
---------------------------------------
recent Evaluation: -154.39750302612705
Total T: 28200 Episode Num: 141 Episode T: 200 Reward: -348.221
Total T: 28400 Episode Num: 142 Episode T: 200 Reward: -116.590
Total T: 28600 Episode Num: 143 Episode T: 200 Reward: -340.794
Total T: 28800 Episode Num: 144 Episode T: 200 Reward: -116.426
Total T: 29000 Episode Num: 145 Episode T: 200 Reward: -122.065
---------------------------------------
Evaluation over 10 episodes: -132.250
---------------------------------------
recent Evaluation: -132.2496221276701
Total T: 29200 Episode Num: 146 Episode T: 200 Re

Total T: 44600 Episode Num: 223 Episode T: 200 Reward: -124.464
Total T: 44800 Episode Num: 224 Episode T: 200 Reward: -121.163
Total T: 45000 Episode Num: 225 Episode T: 200 Reward: -119.441
---------------------------------------
Evaluation over 10 episodes: -123.576
---------------------------------------
recent Evaluation: -123.57637149751876
Total T: 45200 Episode Num: 226 Episode T: 200 Reward: -126.586
Total T: 45400 Episode Num: 227 Episode T: 200 Reward: -130.468
Total T: 45600 Episode Num: 228 Episode T: 200 Reward: -126.949
Total T: 45800 Episode Num: 229 Episode T: 200 Reward: -126.292
Total T: 46000 Episode Num: 230 Episode T: 200 Reward: -350.398
---------------------------------------
Evaluation over 10 episodes: -156.166
---------------------------------------
recent Evaluation: -156.16602651961415
Total T: 46200 Episode Num: 231 Episode T: 200 Reward: -122.756
Total T: 46400 Episode Num: 232 Episode T: 200 Reward: -118.177
Total T: 46600 Episode Num: 233 Episode T: 200

invase: Syn5 300 . Y
invase: Syn1 2500 . Y
input shape: 3
Epoch:      0, d_loss (Acc)): 0.132, v_loss (Acc): 0.170, g_loss: +5.1377


---------------------------------------
Evaluation over 10 episodes: -1226.941
---------------------------------------
recent Evaluation: -1226.9412058098767
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
(5000, 3) (5000, 101) (5000, 3) (5000, 3) (5000, 101) (5000, 3) (5000, 101)
start training......
now at training epoch number 0 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 70.2%  std: 45.7%
FDR mean: 98.6%  std: 0.9%
Epoch:      0, d_loss (Acc)): 0.013, v_loss (Acc): 0.021, g_loss: +3.5915


[[18.99687822 10.30139719]
 [ 0.          0.        ]
 [ 0.          0.        ]]
now at training epoch number 100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 90.9%  std: 28.8%
FDR mean: 98.1%  std: 0.6%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.007, g_loss: -1.5068


[[8.71202304 8.49535338]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 96.8%  std: 17.5%
FDR mean: 98.0%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.007, g_loss: -1.6202


[[8.6331217  8.37579312]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 98.6%  std: 11.9%
FDR mean: 97.9%  std: 0.3%
Epoch:      0, d_loss (Acc)): 0.009, v_loss (Acc): 0.006, g_loss: -1.1825


[[8.59350119 8.24436681]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.2%  std: 8.7%
FDR mean: 97.7%  std: 0.3%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.006, g_loss: -0.6398


[[8.56842859 8.15318434]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.7%  std: 5.6%
FDR mean: 97.5%  std: 0.3%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.006, g_loss: -0.4944


[[8.5226007 7.9595547]
 [0.        0.       ]
 [0.        0.       ]]
now at training epoch number 600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.9%  std: 2.4%
FDR mean: 97.2%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.3787


[[8.51404663 7.75168392]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 1.4%
FDR mean: 96.6%  std: 0.6%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.2478


[[8.4460078 7.4689423]
 [0.        0.       ]
 [0.        0.       ]]
now at training epoch number 800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 95.6%  std: 1.1%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.005, g_loss: -0.1332


[[8.44326295 7.07832042]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 93.6%  std: 2.0%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.005, g_loss: -0.1660


[[8.39670035 6.61974646]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 89.3%  std: 4.3%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.004, g_loss: -0.0739


[[8.34909947 5.97921566]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 78.5%  std: 12.5%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.004, g_loss: +0.1117


[[8.3342375  5.44900722]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 52.2%  std: 27.4%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.004, g_loss: +0.2818


[[8.29843355 5.20388209]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 14.1%  std: 25.9%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.004, g_loss: +0.4970


[[8.2674987 4.8709583]
 [0.        0.       ]
 [0.        0.       ]]
now at training epoch number 1400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 2.9%  std: 13.0%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.6961


[[8.23769623 4.434782  ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.7%  std: 6.5%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.7958


[[8.21624818 3.95826025]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.2%  std: 3.2%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.7372


[[8.15764364 3.38389195]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.1%  std: 1.8%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.6883


[[8.15999889 2.81215072]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.6078


[[8.11024826 2.28281649]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.4728


[[8.07635841 1.96044197]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.4505


[[8.07584813 1.70700626]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.3799


[[8.02629744 1.54806896]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2952


[[7.96032253 1.4135955 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2584


[[7.94087507 1.35606331]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%


[[7.91297351 1.20222243]
 [0.         0.        ]
 [0.         0.        ]]
PyTorch Version: elapsed time for Syn1: 11 feature, 10000 sample: [248.1456] sec.
Total T: 10200 Episode Num: 51 Episode T: 200 Reward: -1470.285
Total T: 10400 Episode Num: 52 Episode T: 200 Reward: -1892.845
Total T: 10600 Episode Num: 53 Episode T: 200 Reward: -1897.560
Total T: 10800 Episode Num: 54 Episode T: 200 Reward: -1522.005
Total T: 11000 Episode Num: 55 Episode T: 200 Reward: -1481.015
---------------------------------------
Evaluation over 10 episodes: -1443.279
---------------------------------------
recent Evaluation: -1443.278507582909
Total T: 11200 Episode Num: 56 Episode T: 200 Reward: -1526.883
Total T: 11400 Episode Num: 57 Episode T: 200 Reward: -1511.341
Total T: 11600 Episode Num: 58 Episode T: 200 Reward: -1480.475
Total T: 11800 Episode Num: 59 Episode T: 200 Reward: -1518.332
Total T: 12000 Episode Num: 60 Episode T: 200 Reward: -1503.810
---------------------------------------
Evalu

Total T: 27200 Episode Num: 136 Episode T: 200 Reward: -249.611
Total T: 27400 Episode Num: 137 Episode T: 200 Reward: -121.816
Total T: 27600 Episode Num: 138 Episode T: 200 Reward: -119.581
Total T: 27800 Episode Num: 139 Episode T: 200 Reward: -120.169
Total T: 28000 Episode Num: 140 Episode T: 200 Reward: -115.718
---------------------------------------
Evaluation over 10 episodes: -154.858
---------------------------------------
recent Evaluation: -154.85766384552232
Total T: 28200 Episode Num: 141 Episode T: 200 Reward: -115.530
Total T: 28400 Episode Num: 142 Episode T: 200 Reward: -118.832
Total T: 28600 Episode Num: 143 Episode T: 200 Reward: -119.349
Total T: 28800 Episode Num: 144 Episode T: 200 Reward: -118.797
Total T: 29000 Episode Num: 145 Episode T: 200 Reward: -0.445
---------------------------------------
Evaluation over 10 episodes: -129.446
---------------------------------------
recent Evaluation: -129.4463580501267
Total T: 29200 Episode Num: 146 Episode T: 200 Re

Total T: 44600 Episode Num: 223 Episode T: 200 Reward: -117.204
Total T: 44800 Episode Num: 224 Episode T: 200 Reward: -120.919
Total T: 45000 Episode Num: 225 Episode T: 200 Reward: -230.348
---------------------------------------
Evaluation over 10 episodes: -172.171
---------------------------------------
recent Evaluation: -172.1707715654045
Total T: 45200 Episode Num: 226 Episode T: 200 Reward: -117.491
Total T: 45400 Episode Num: 227 Episode T: 200 Reward: -116.959
Total T: 45600 Episode Num: 228 Episode T: 200 Reward: -225.147
Total T: 45800 Episode Num: 229 Episode T: 200 Reward: -116.358
Total T: 46000 Episode Num: 230 Episode T: 200 Reward: -237.696
---------------------------------------
Evaluation over 10 episodes: -121.607
---------------------------------------
recent Evaluation: -121.60677270286976
Total T: 46200 Episode Num: 231 Episode T: 200 Reward: -125.037
Total T: 46400 Episode Num: 232 Episode T: 200 Reward: -118.795
Total T: 46600 Episode Num: 233 Episode T: 200 

invase: Syn5 300 . Y
invase: Syn1 2500 . Y
input shape: 3
Epoch:      0, d_loss (Acc)): 0.158, v_loss (Acc): 0.181, g_loss: +3.0518


---------------------------------------
Evaluation over 10 episodes: -1237.780
---------------------------------------
recent Evaluation: -1237.7798074394357
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
(5000, 3) (5000, 101) (5000, 3) (5000, 3) (5000, 101) (5000, 3) (5000, 101)
start training......
now at training epoch number 0 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 51.3%  std: 50.0%
FDR mean: 99.0%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.015, v_loss (Acc): 0.015, g_loss: +0.2780


[[12.26240881 11.98523907]
 [ 0.          0.        ]
 [ 0.          0.        ]]
now at training epoch number 100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 76.7%  std: 42.3%
FDR mean: 98.4%  std: 0.9%
Epoch:      0, d_loss (Acc)): 0.011, v_loss (Acc): 0.008, g_loss: -1.5955


[[8.59715498 8.19791247]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 91.0%  std: 28.6%
FDR mean: 98.1%  std: 0.6%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.007, g_loss: -1.3815


[[8.50549645 8.08755093]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 96.7%  std: 17.8%
FDR mean: 97.9%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.010, v_loss (Acc): 0.008, g_loss: -1.0804


[[8.46187195 7.98010032]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 98.9%  std: 10.2%
FDR mean: 97.7%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.009, v_loss (Acc): 0.007, g_loss: -0.9378


[[8.41921702 7.84953108]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.9%  std: 3.5%
FDR mean: 97.4%  std: 0.4%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.006, g_loss: -0.6084


[[8.33723732 7.68108324]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 2.0%
FDR mean: 96.9%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.006, g_loss: -0.5340


[[8.31876618 7.42793303]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 96.1%  std: 0.8%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.2160


[[8.27891052 7.11414071]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 94.5%  std: 1.6%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.3986


[[8.23757673 6.62078794]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 91.1%  std: 3.5%
Epoch:      0, d_loss (Acc)): 0.006, v_loss (Acc): 0.005, g_loss: -0.2207


[[8.18894848 6.07201983]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 81.9%  std: 10.5%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.005, g_loss: -0.1063


[[8.11222471 5.44489681]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 57.4%  std: 27.1%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.004, g_loss: +0.1525


[[8.06081704 5.23654483]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 22.9%  std: 29.7%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.004, g_loss: +0.2946


[[7.95057451 5.04333175]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 6.6%  std: 18.6%
Epoch:      0, d_loss (Acc)): 0.003, v_loss (Acc): 0.004, g_loss: +0.6071


[[7.8867288  4.56934286]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 1.3%  std: 8.6%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.8151


[[7.76947835 4.04787588]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.3%  std: 4.0%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.003, g_loss: +0.8308


[[7.64540102 3.44605529]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 1.6%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.7683


[[7.50718853 2.88346358]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.7543


[[7.37889902 2.35084223]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.6238


[[7.21721895 1.96858354]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.5160


[[7.02680516 1.73365917]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.4009


[[6.83074412 1.53810844]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.3548


[[6.64185543 1.40439588]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2857


[[6.45404723 1.27470015]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2498


[[6.31973664 1.15663587]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%


[[6.17298815 1.07375525]
 [0.         0.        ]
 [0.         0.        ]]
PyTorch Version: elapsed time for Syn1: 11 feature, 10000 sample: [248.2686] sec.
Total T: 10200 Episode Num: 51 Episode T: 200 Reward: -1595.317
Total T: 10400 Episode Num: 52 Episode T: 200 Reward: -1343.515
Total T: 10600 Episode Num: 53 Episode T: 200 Reward: -1687.217
Total T: 10800 Episode Num: 54 Episode T: 200 Reward: -1561.207
Total T: 11000 Episode Num: 55 Episode T: 200 Reward: -1593.242
---------------------------------------
Evaluation over 10 episodes: -1535.392
---------------------------------------
recent Evaluation: -1535.3921261376854
Total T: 11200 Episode Num: 56 Episode T: 200 Reward: -1521.368
Total T: 11400 Episode Num: 57 Episode T: 200 Reward: -1539.733
Total T: 11600 Episode Num: 58 Episode T: 200 Reward: -1522.140
Total T: 11800 Episode Num: 59 Episode T: 200 Reward: -1526.152
Total T: 12000 Episode Num: 60 Episode T: 200 Reward: -1459.885
---------------------------------------
Eval

Total T: 27200 Episode Num: 136 Episode T: 200 Reward: -1.010
Total T: 27400 Episode Num: 137 Episode T: 200 Reward: -116.120
Total T: 27600 Episode Num: 138 Episode T: 200 Reward: -116.650
Total T: 27800 Episode Num: 139 Episode T: 200 Reward: -116.642
Total T: 28000 Episode Num: 140 Episode T: 200 Reward: -118.741
---------------------------------------
Evaluation over 10 episodes: -130.120
---------------------------------------
recent Evaluation: -130.12029046493393
Total T: 28200 Episode Num: 141 Episode T: 200 Reward: -232.904
Total T: 28400 Episode Num: 142 Episode T: 200 Reward: -331.477
Total T: 28600 Episode Num: 143 Episode T: 200 Reward: -114.322
Total T: 28800 Episode Num: 144 Episode T: 200 Reward: -224.744
Total T: 29000 Episode Num: 145 Episode T: 200 Reward: -119.191
---------------------------------------
Evaluation over 10 episodes: -149.846
---------------------------------------
recent Evaluation: -149.8463336644727
Total T: 29200 Episode Num: 146 Episode T: 200 Re

Total T: 44600 Episode Num: 223 Episode T: 200 Reward: -233.678
Total T: 44800 Episode Num: 224 Episode T: 200 Reward: -124.772
Total T: 45000 Episode Num: 225 Episode T: 200 Reward: -121.889
---------------------------------------
Evaluation over 10 episodes: -125.989
---------------------------------------
recent Evaluation: -125.98882117674846
Total T: 45200 Episode Num: 226 Episode T: 200 Reward: -131.719
Total T: 45400 Episode Num: 227 Episode T: 200 Reward: -229.990
Total T: 45600 Episode Num: 228 Episode T: 200 Reward: -121.398
Total T: 45800 Episode Num: 229 Episode T: 200 Reward: -121.778
Total T: 46000 Episode Num: 230 Episode T: 200 Reward: -130.089
---------------------------------------
Evaluation over 10 episodes: -146.797
---------------------------------------
recent Evaluation: -146.79741256472988
Total T: 46200 Episode Num: 231 Episode T: 200 Reward: -233.959
Total T: 46400 Episode Num: 232 Episode T: 200 Reward: -237.035
Total T: 46600 Episode Num: 233 Episode T: 200

invase: Syn5 300 . Y
invase: Syn1 2500 . Y
input shape: 3
Epoch:      0, d_loss (Acc)): 0.166, v_loss (Acc): 0.192, g_loss: +3.7827


---------------------------------------
Evaluation over 10 episodes: -1243.598
---------------------------------------
recent Evaluation: -1243.5984594970098
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
(5000, 3) (5000, 101) (5000, 3) (5000, 3) (5000, 101) (5000, 3) (5000, 101)
start training......
now at training epoch number 0 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 57.2%  std: 49.5%
FDR mean: 98.8%  std: 1.1%
Epoch:      0, d_loss (Acc)): 0.017, v_loss (Acc): 0.018, g_loss: +0.7563


[[13.06122374 11.27666563]
 [ 0.          0.        ]
 [ 0.          0.        ]]
now at training epoch number 100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 82.8%  std: 37.7%
FDR mean: 98.2%  std: 0.9%
Epoch:      0, d_loss (Acc)): 0.013, v_loss (Acc): 0.010, g_loss: -1.4452


[[8.50011809 8.24863635]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 93.9%  std: 24.0%
FDR mean: 97.9%  std: 0.6%
Epoch:      0, d_loss (Acc)): 0.012, v_loss (Acc): 0.009, g_loss: -1.0388


[[8.44313861 8.13931796]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 97.5%  std: 15.6%
FDR mean: 97.7%  std: 0.5%
Epoch:      0, d_loss (Acc)): 0.011, v_loss (Acc): 0.009, g_loss: -0.9505


[[8.41454302 7.99841093]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.1%  std: 9.5%
FDR mean: 97.4%  std: 0.6%
Epoch:      0, d_loss (Acc)): 0.011, v_loss (Acc): 0.009, g_loss: -0.7440


[[8.39631227 7.83912256]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.7%  std: 5.3%
FDR mean: 97.1%  std: 0.7%
Epoch:      0, d_loss (Acc)): 0.009, v_loss (Acc): 0.008, g_loss: -0.5647


[[8.35783986 7.58800149]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 99.9%  std: 2.4%
FDR mean: 96.5%  std: 1.0%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.007, g_loss: -0.3972


[[8.30418536 7.24712875]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 1.4%
FDR mean: 95.4%  std: 1.5%
Epoch:      0, d_loss (Acc)): 0.008, v_loss (Acc): 0.007, g_loss: -0.1318


[[8.28935764 6.79138312]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 93.3%  std: 2.9%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.007, g_loss: +0.0161


[[8.27526348 6.18415644]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 88.7%  std: 6.3%
Epoch:      0, d_loss (Acc)): 0.007, v_loss (Acc): 0.007, g_loss: +0.1496


[[8.22954499 5.52440667]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 76.7%  std: 17.1%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.006, g_loss: +0.3894


[[8.17241244 5.09830179]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 50.4%  std: 30.7%
Epoch:      0, d_loss (Acc)): 0.005, v_loss (Acc): 0.006, g_loss: +0.7830


[[8.09839618 5.06373473]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 17.1%  std: 27.3%
Epoch:      0, d_loss (Acc)): 0.004, v_loss (Acc): 0.005, g_loss: +1.1489


[[8.01713505 4.67306138]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 2.6%  std: 12.2%
Epoch:      0, d_loss (Acc)): 0.003, v_loss (Acc): 0.005, g_loss: +1.3254


[[7.9219678  3.99472516]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.5%  std: 5.0%
Epoch:      0, d_loss (Acc)): 0.002, v_loss (Acc): 0.004, g_loss: +1.5693


[[7.76725735 3.22283277]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1500 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 1.7%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.004, g_loss: +1.4486


[[7.64074122 2.75117001]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1600 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.7%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.004, g_loss: +1.3076


[[7.45187781 2.45381758]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1700 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +1.0591


[[7.22659678 2.13217094]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1800 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.7993


[[6.98073333 1.89995688]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 1900 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.003, g_loss: +0.6690


[[6.77199477 1.69287985]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2000 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.5115


[[6.61051574 1.5819502 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2100 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.4239


[[6.52728705 1.51857365]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2200 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.001, v_loss (Acc): 0.002, g_loss: +0.3256


[[6.47611324 1.4471368 ]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2300 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%
Epoch:      0, d_loss (Acc)): 0.000, v_loss (Acc): 0.002, g_loss: +0.2481


[[6.39301121 1.41759482]
 [0.         0.        ]
 [0.         0.        ]]
now at training epoch number 2400 hyp-params: lamda 0.1000 prior 0.0000
rec time now 0 dataset now: Syn1


TPR mean: 100.0%  std: 0.0%
FDR mean: 0.0%  std: 0.0%


[[6.31229983 1.3850124 ]
 [0.         0.        ]
 [0.         0.        ]]
PyTorch Version: elapsed time for Syn1: 11 feature, 10000 sample: [252.9614] sec.
Total T: 10200 Episode Num: 51 Episode T: 200 Reward: -1531.398
Total T: 10400 Episode Num: 52 Episode T: 200 Reward: -1267.802
Total T: 10600 Episode Num: 53 Episode T: 200 Reward: -1578.750
Total T: 10800 Episode Num: 54 Episode T: 200 Reward: -1679.254
Total T: 11000 Episode Num: 55 Episode T: 200 Reward: -1642.245
---------------------------------------
Evaluation over 10 episodes: -1550.781
---------------------------------------
recent Evaluation: -1550.7805797382757
Total T: 11200 Episode Num: 56 Episode T: 200 Reward: -1561.289
Total T: 11400 Episode Num: 57 Episode T: 200 Reward: -1432.505
Total T: 11600 Episode Num: 58 Episode T: 200 Reward: -1495.888
Total T: 11800 Episode Num: 59 Episode T: 200 Reward: -1515.560
Total T: 12000 Episode Num: 60 Episode T: 200 Reward: -1432.608
---------------------------------------
Eval

Total T: 27200 Episode Num: 136 Episode T: 200 Reward: -2.186
Total T: 27400 Episode Num: 137 Episode T: 200 Reward: -1.057
Total T: 27600 Episode Num: 138 Episode T: 200 Reward: -122.152
Total T: 27800 Episode Num: 139 Episode T: 200 Reward: -234.413
Total T: 28000 Episode Num: 140 Episode T: 200 Reward: -114.183
---------------------------------------
Evaluation over 10 episodes: -142.806
---------------------------------------
recent Evaluation: -142.806257752745
Total T: 28200 Episode Num: 141 Episode T: 200 Reward: -119.651
Total T: 28400 Episode Num: 142 Episode T: 200 Reward: -303.884
Total T: 28600 Episode Num: 143 Episode T: 200 Reward: -1.967
Total T: 28800 Episode Num: 144 Episode T: 200 Reward: -115.118
Total T: 29000 Episode Num: 145 Episode T: 200 Reward: -238.982
---------------------------------------
Evaluation over 10 episodes: -142.078
---------------------------------------
recent Evaluation: -142.07790595400053
Total T: 29200 Episode Num: 146 Episode T: 200 Reward:

Total T: 44600 Episode Num: 223 Episode T: 200 Reward: -364.296
Total T: 44800 Episode Num: 224 Episode T: 200 Reward: -116.867
Total T: 45000 Episode Num: 225 Episode T: 200 Reward: -118.403
---------------------------------------
Evaluation over 10 episodes: -132.169
---------------------------------------
recent Evaluation: -132.1687466874569
Total T: 45200 Episode Num: 226 Episode T: 200 Reward: -123.436
Total T: 45400 Episode Num: 227 Episode T: 200 Reward: -1.821
Total T: 45600 Episode Num: 228 Episode T: 200 Reward: -127.878
Total T: 45800 Episode Num: 229 Episode T: 200 Reward: -225.010
Total T: 46000 Episode Num: 230 Episode T: 200 Reward: -230.348
---------------------------------------
Evaluation over 10 episodes: -122.406
---------------------------------------
recent Evaluation: -122.40551292648338
Total T: 46200 Episode Num: 231 Episode T: 200 Reward: -235.938
Total T: 46400 Episode Num: 232 Episode T: 200 Reward: -117.960
Total T: 46600 Episode Num: 233 Episode T: 200 Re