In [1]:
import os
# import plotly.graph_objects as go
from tqdm import tqdm
from gym import envs
import argparse
import numpy as np
from bigmdp.data.buffer import StandardBuffer,ReplayBuffer, gather_data_in_buffer, get_iter_indexes
from bigmdp.mdp.MDP_GPU import FullMDP
from bigmdp.utils.utils_eval import evaluate_on_env
from bigmdp.mdp.agent import SimpleAgent
from copy import deepcopy as cpy
from os import path
from arg_def import * 
import gym
from sklearn.neighbors import KDTree
from IPython import display
import torch
import time
import copy

In [2]:
parser, ArgumentDict = get_argument_parser()

# Command-line style overrides for this experiment, written as one string so the
# cell reads like the equivalent shell invocation.
options = "--env_name CartPole-v0 --MAX_S_COUNT 15000 --MAX_NS_COUNT 5 --mdp_build_k 5 --policy_k 11 --normalize_by_distance"

# split() with no separator collapses runs of whitespace and ignores leading or
# trailing blanks, so a stray double space in `options` cannot hand argparse an
# empty-string token (which split(" ") would produce).
args = parser.parse_args(options.split())

# Print the parsed arguments, one titled group per entry of ArgumentDict.
for title, arg_names in ArgumentDict.items():
    print_args(args, to_show_args=arg_names, title = title)

##################################################    envArgs    ##################################################
env_name                       : CartPole-v0                    seed                           : 4444
###################################################################################################################
##################################################    mdpBuildArgs    ##################################################
unknown_transition_reward      : -1000                          rmax_reward                    : 10000
balanced_exploration           : 0                              rmax_threshold                 : 2
MAX_S_COUNT                    : 15000                          def_device                     : GPU
fill_with                      : 0Q_src-KNN                     mdp_build_k                    : 5
knn_delta                      : 1e-08                          penalty_type                   : linear
penalty_beta                   : 1     

In [3]:
# Build the environment selected by --env_name.
env = gym.make(args.env_name)

# args.seed is parsed but was never applied. Seed the random sources here so a
# fresh "Restart & Run All" can reproduce the same run: NumPy's global generator
# drives the random data-collection policy, and the environment draws its own
# start states.
np.random.seed(args.seed)
if hasattr(env, "seed"):  # older gym API; newer releases seed via reset(seed=...)
    env.seed(args.seed)

In [4]:
# Buffer that is handed to gather_data_in_buffer and to the agent's MDP build.
# The state dimension is taken from the first axis of the observation space;
# the Atari-specific options are switched off.
train_buffer = ReplayBuffer(
    state_dim=env.observation_space.shape[0],
    is_atari=False,
    atari_preprocessing=None,
    batch_size=32,
    buffer_size=20000,
    device="cpu",
)

In [5]:
class DummyNet():
    """Pass-through stand-in for an encoder network.

    Observations are not transformed; they are only converted to tuples.
    The transition-prediction methods are intentionally unimplemented.
    """

    def __init__(self, sim, add_noise=False):
        """Keep a reference to the simulator.

        Args:
            sim: Object stored on ``self.simulator``; no method of this class reads it.
            add_noise: Accepted but currently unused.
        """
        self.simulator = sim

    def encode_single(self, o):
        """Return the observation ``o`` as a tuple of its elements."""
        return tuple(o)

    def encode_batch(self, o_batch):
        """Return a list holding the tuple form of every observation in ``o_batch``."""
        return [tuple(o) for o in o_batch]

    def predict_single_transition(self, o, a):
        """Not supported by this class.

        Raises:
            NotImplementedError: always.
        """
        # An explicit raise is used instead of `assert False`: assert statements
        # are removed under `python -O`, which would make this silently return None.
        raise NotImplementedError("DummyNet does not implement predict_single_transition")

    def predict_batch_transition(self, o_batch, a_batch):
        """Not supported by this class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("DummyNet does not implement predict_batch_transition")

In [6]:
# Settings forwarded to FullMDP through its `vi_params` argument.
# NOTE(review): "rmin" is hard-coded to -1000 instead of being read from `args`;
# it equals the unknown_transition_reward value printed above. Confirm whether
# the two are meant to stay in sync.
vi_settings = {
    "gamma": args.gamma,
    "slip_prob": args.slip_probability,
    "rmax_reward": args.rmax_reward,
    "rmax_thres": args.rmax_threshold,
    "balanced_explr": args.balanced_exploration,
    "rmin": -1000,
}

# MDP with one action id per discrete action of the environment.
empty_MDP = FullMDP(
    A=list(range(env.action_space.n)),
    ur=args.unknown_transition_reward,
    vi_params=vi_settings,
    knn_delta=args.knn_delta,
    MAX_S_COUNT=args.MAX_S_COUNT,
    MAX_NS_COUNT=args.MAX_NS_COUNT,
    default_mode=args.def_device,
)

# Agent wrapping the MDP. DummyNet is constructed without a simulator, and only
# the first entry of args.policy_k is used as the policy k.
myAgent = SimpleAgent(
    mdp_T=empty_MDP,
    net=DummyNet(None),
    fill_with=args.fill_with,
    mdp_build_k=args.mdp_build_k,
    plcy_k=args.policy_k[0],
    kNN_on_sa=args.smooth_with_seen,
    soft_at_plcy=args.soft_q,
    normalize_by_distance=args.normalize_by_distance,
    penalty_type=args.penalty_type,
    penalty_beta=args.penalty_beta,
    abstraction_flag=False,
)

# myAgent.build_mdp(train_buffer, verbose= True)

In [7]:
# Experiment schedule. The per-round episode count is used both for data
# collection and for the progress label, so it is defined once here.
N_ROUNDS = 20
EPISODES_PER_ROUND = 5
EVAL_EPISODES = 50

# Take the number of actions from the environment (as the MDP construction does)
# instead of hard-coding 2, so the random policy stays valid for other envs.
n_actions = env.action_space.n

eval_rewards = []
for i in range(N_ROUNDS):
    # Add more transitions to the buffer using a uniformly random policy.
    train_buffer, info = gather_data_in_buffer(train_buffer, env,
                                               policy=lambda s: np.random.randint(n_actions),
                                               episode_count=EPISODES_PER_ROUND, frame_count=1000)
    # Rebuild the MDP from the enlarged buffer, then evaluate its "optimal" policy.
    myAgent.build_mdp(train_buffer, verbose=False)
    # Only the first element of evaluate_on_env's return value is kept.
    eval_reward = evaluate_on_env(env, myAgent.policies["optimal"], eps_count=EVAL_EPISODES, progress_bar=True)[0]
    eval_rewards.append(eval_reward)
    # NOTE(review): the "Episode" label is i * EPISODES_PER_ROUND, i.e. it starts at 0
    # although data for round i has already been gathered when this line prints;
    # confirm that this is the intended meaning.
    print("#"*20 + f"\t Episode: {i*EPISODES_PER_ROUND}, Size of MDP:{np.sum(myAgent.mdp_T.filled_mask)}, Eval Reward:{eval_reward}\t" + "#"*20)
        


100%|██████████| 1/1 [00:00<00:00, 321.75it/s]
100%|██████████| 105/105 [00:00<00:00, 6118.65it/s]
100%|██████████| 212/212 [00:00<00:00, 3336.03it/s]

Average Reward of collected trajectories:21.2
Len of to commit transitions 105
ABstraction Faldg False
Len of to commit sa pairs 212





Elapsed Time:0s, VI Error:0.0821991, #Backups: 250
Elapsed Time:1s, VI Error:0.00666046, #Backups: 500


 12%|█▏        | 6/50 [00:00<00:00, 57.50it/s]

Elapsed Time:2s, VI Error:0.00057983, #Backups: 750
Time takedn to solve 2.0695133209228516


100%|██████████| 50/50 [00:00<00:00, 61.48it/s]
100%|██████████| 1/1 [00:00<00:00, 421.37it/s]
100%|██████████| 216/216 [00:00<00:00, 8188.67it/s]
100%|██████████| 434/434 [00:00<00:00, 3375.96it/s]


####################	 Episode: 0, Size of MDP:108, Eval Reward:50.26	####################
Average Reward of collected trajectories:22.2
Len of to commit transitions 216
ABstraction Faldg False
Len of to commit sa pairs 434
Elapsed Time:0s, VI Error:0.00546265, #Backups: 1000


  8%|▊         | 4/50 [00:00<00:01, 39.54it/s]

Elapsed Time:1s, VI Error:0.00027466, #Backups: 1250
Time takedn to solve 1.1998069286346436


100%|██████████| 50/50 [00:01<00:00, 37.62it/s]
100%|██████████| 2/2 [00:00<00:00, 661.46it/s]
100%|██████████| 331/331 [00:00<00:00, 18112.39it/s]
 58%|█████▊    | 386/664 [00:00<00:00, 3856.11it/s]

####################	 Episode: 5, Size of MDP:219, Eval Reward:81.24	####################
Average Reward of collected trajectories:23.0
Len of to commit transitions 331
ABstraction Faldg False
Len of to commit sa pairs 664


100%|██████████| 664/664 [00:00<00:00, 3658.20it/s]


Elapsed Time:0s, VI Error:0.00601959, #Backups: 1500


  4%|▍         | 2/50 [00:00<00:03, 14.74it/s]

Elapsed Time:1s, VI Error:0.00037384, #Backups: 1750
Time takedn to solve 1.2228286266326904


100%|██████████| 50/50 [00:02<00:00, 17.24it/s]
100%|██████████| 2/2 [00:00<00:00, 638.89it/s]
100%|██████████| 456/456 [00:00<00:00, 20368.07it/s]
 41%|████      | 377/914 [00:00<00:00, 3769.14it/s]

####################	 Episode: 10, Size of MDP:334, Eval Reward:177.36	####################
Average Reward of collected trajectories:25.0
Len of to commit transitions 456
ABstraction Faldg False
Len of to commit sa pairs 914


100%|██████████| 914/914 [00:00<00:00, 3454.69it/s]


Elapsed Time:0s, VI Error:0.00102234, #Backups: 2000


  4%|▍         | 2/50 [00:00<00:02, 16.07it/s]

Elapsed Time:1s, VI Error:9.918e-05, #Backups: 2250
Time takedn to solve 1.1410744190216064


100%|██████████| 50/50 [00:03<00:00, 15.39it/s]
100%|██████████| 3/3 [00:00<00:00, 636.43it/s]
100%|██████████| 555/555 [00:00<00:00, 19906.10it/s]
 30%|██▉       | 329/1112 [00:00<00:00, 3277.05it/s]

####################	 Episode: 15, Size of MDP:459, Eval Reward:200.0	####################
Average Reward of collected trajectories:19.8
Len of to commit transitions 555
ABstraction Faldg False
Len of to commit sa pairs 1112


100%|██████████| 1112/1112 [00:00<00:00, 3351.52it/s]
  4%|▍         | 2/50 [00:00<00:03, 15.24it/s]

Elapsed Time:0s, VI Error:0.00038147, #Backups: 2500
Time takedn to solve 0.5700924396514893


100%|██████████| 50/50 [00:03<00:00, 14.60it/s]
100%|██████████| 3/3 [00:00<00:00, 786.14it/s]
100%|██████████| 668/668 [00:00<00:00, 21140.04it/s]
 29%|██▉       | 389/1338 [00:00<00:00, 3886.40it/s]

####################	 Episode: 20, Size of MDP:558, Eval Reward:198.54	####################
Average Reward of collected trajectories:22.6
Len of to commit transitions 668
ABstraction Faldg False
Len of to commit sa pairs 1338


100%|██████████| 1338/1338 [00:00<00:00, 3553.43it/s]
  4%|▍         | 2/50 [00:00<00:02, 16.34it/s]

Elapsed Time:0s, VI Error:0.0, #Backups: 2750
Time takedn to solve 0.570246696472168


100%|██████████| 50/50 [00:03<00:00, 15.09it/s]
100%|██████████| 3/3 [00:00<00:00, 614.40it/s]
100%|██████████| 744/744 [00:00<00:00, 19760.27it/s]
 23%|██▎       | 347/1490 [00:00<00:00, 3459.80it/s]

####################	 Episode: 25, Size of MDP:671, Eval Reward:200.0	####################
Average Reward of collected trajectories:15.2
Len of to commit transitions 744
ABstraction Faldg False
Len of to commit sa pairs 1490


100%|██████████| 1490/1490 [00:00<00:00, 3365.00it/s]
  4%|▍         | 2/50 [00:00<00:02, 16.33it/s]

Elapsed Time:0s, VI Error:0.0, #Backups: 3000
Time takedn to solve 0.5622689723968506


100%|██████████| 50/50 [00:03<00:00, 15.59it/s]
100%|██████████| 4/4 [00:00<00:00, 830.23it/s]
100%|██████████| 859/859 [00:00<00:00, 22128.70it/s]
 22%|██▏       | 386/1720 [00:00<00:00, 3852.24it/s]

####################	 Episode: 30, Size of MDP:747, Eval Reward:200.0	####################
Average Reward of collected trajectories:23.0
Len of to commit transitions 859
ABstraction Faldg False
Len of to commit sa pairs 1720


100%|██████████| 1720/1720 [00:00<00:00, 3822.46it/s]


Elapsed Time:0s, VI Error:0.00180054, #Backups: 3250


  4%|▍         | 2/50 [00:00<00:02, 16.07it/s]

Elapsed Time:1s, VI Error:0.00016022, #Backups: 3500
Time takedn to solve 1.1762535572052002


100%|██████████| 50/50 [00:03<00:00, 15.28it/s]
100%|██████████| 4/4 [00:00<00:00, 772.72it/s]
100%|██████████| 942/942 [00:00<00:00, 15679.95it/s]
 20%|█▉        | 376/1886 [00:00<00:00, 3758.53it/s]

####################	 Episode: 35, Size of MDP:862, Eval Reward:200.0	####################
Average Reward of collected trajectories:16.6
Len of to commit transitions 942
ABstraction Faldg False
Len of to commit sa pairs 1886


100%|██████████| 1886/1886 [00:00<00:00, 3774.91it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:3.052e-05, #Backups: 3750
Time takedn to solve 0.5737464427947998


100%|██████████| 50/50 [00:03<00:00, 15.37it/s]
100%|██████████| 5/5 [00:00<00:00, 846.58it/s]
100%|██████████| 1112/1112 [00:00<00:00, 21210.13it/s]
 17%|█▋        | 374/2226 [00:00<00:00, 3739.92it/s]

####################	 Episode: 40, Size of MDP:945, Eval Reward:200.0	####################
Average Reward of collected trajectories:34.0
Len of to commit transitions 1112
ABstraction Faldg False
Len of to commit sa pairs 2226


100%|██████████| 2226/2226 [00:00<00:00, 3858.03it/s]


Elapsed Time:0s, VI Error:0.00217438, #Backups: 4000


  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:1s, VI Error:0.00018311, #Backups: 4250
Time takedn to solve 1.1389622688293457


100%|██████████| 50/50 [00:03<00:00, 15.07it/s]
100%|██████████| 5/5 [00:00<00:00, 776.12it/s]
100%|██████████| 1216/1216 [00:00<00:00, 22692.39it/s]
 16%|█▌        | 390/2434 [00:00<00:00, 3897.12it/s]

####################	 Episode: 45, Size of MDP:1115, Eval Reward:200.0	####################
Average Reward of collected trajectories:20.8
Len of to commit transitions 1216
ABstraction Faldg False
Len of to commit sa pairs 2434


100%|██████████| 2434/2434 [00:00<00:00, 3832.02it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:1.526e-05, #Backups: 4500
Time takedn to solve 0.5573644638061523


100%|██████████| 50/50 [00:03<00:00, 15.36it/s]
100%|██████████| 6/6 [00:00<00:00, 958.92it/s]
100%|██████████| 1317/1317 [00:00<00:00, 22488.97it/s]
 15%|█▌        | 399/2636 [00:00<00:00, 3983.57it/s]

####################	 Episode: 50, Size of MDP:1219, Eval Reward:200.0	####################
Average Reward of collected trajectories:20.2
Len of to commit transitions 1317
ABstraction Faldg False
Len of to commit sa pairs 2636


100%|██████████| 2636/2636 [00:00<00:00, 3803.21it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:2.289e-05, #Backups: 4750
Time takedn to solve 0.5570847988128662


100%|██████████| 50/50 [00:03<00:00, 15.28it/s]
100%|██████████| 6/6 [00:00<00:00, 688.36it/s]
100%|██████████| 1402/1402 [00:00<00:00, 21933.09it/s]
  0%|          | 0/2806 [00:00<?, ?it/s]

####################	 Episode: 55, Size of MDP:1320, Eval Reward:200.0	####################
Average Reward of collected trajectories:17.0
Len of to commit transitions 1402
ABstraction Faldg False
Len of to commit sa pairs 2806


100%|██████████| 2806/2806 [00:00<00:00, 3684.54it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:0.0, #Backups: 5000
Time takedn to solve 0.5824873447418213


100%|██████████| 50/50 [00:03<00:00, 15.35it/s]
100%|██████████| 6/6 [00:00<00:00, 731.39it/s]
100%|██████████| 1483/1483 [00:00<00:00, 22665.88it/s]
 12%|█▏        | 361/2968 [00:00<00:00, 3606.98it/s]

####################	 Episode: 60, Size of MDP:1405, Eval Reward:200.0	####################
Average Reward of collected trajectories:16.2
Len of to commit transitions 1483
ABstraction Faldg False
Len of to commit sa pairs 2968


100%|██████████| 2968/2968 [00:00<00:00, 3718.42it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:7.63e-06, #Backups: 5250
Time takedn to solve 0.6089859008789062


100%|██████████| 50/50 [00:03<00:00, 15.00it/s]
100%|██████████| 7/7 [00:00<00:00, 910.53it/s]
100%|██████████| 1673/1673 [00:00<00:00, 22089.74it/s]
 11%|█▏        | 380/3348 [00:00<00:00, 3796.59it/s]

####################	 Episode: 65, Size of MDP:1486, Eval Reward:200.0	####################
Average Reward of collected trajectories:38.0
Len of to commit transitions 1673
ABstraction Faldg False
Len of to commit sa pairs 3348


100%|██████████| 3348/3348 [00:00<00:00, 3756.80it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:1.526e-05, #Backups: 5500
Time takedn to solve 0.5567700862884521


100%|██████████| 50/50 [00:03<00:00, 15.33it/s]
100%|██████████| 7/7 [00:00<00:00, 711.54it/s]
100%|██████████| 1746/1746 [00:00<00:00, 22784.92it/s]
  0%|          | 0/3494 [00:00<?, ?it/s]

####################	 Episode: 70, Size of MDP:1676, Eval Reward:200.0	####################
Average Reward of collected trajectories:14.6
Len of to commit transitions 1746
ABstraction Faldg False
Len of to commit sa pairs 3494


100%|██████████| 3494/3494 [00:00<00:00, 3769.66it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:0.0, #Backups: 5750
Time takedn to solve 0.5878043174743652


100%|██████████| 50/50 [00:03<00:00, 14.80it/s]
100%|██████████| 8/8 [00:00<00:00, 129.15it/s]
100%|██████████| 1853/1853 [00:00<00:00, 17608.63it/s]
  0%|          | 0/3708 [00:00<?, ?it/s]

####################	 Episode: 75, Size of MDP:1749, Eval Reward:200.0	####################
Average Reward of collected trajectories:21.4
Len of to commit transitions 1853
ABstraction Faldg False
Len of to commit sa pairs 3708


100%|██████████| 3708/3708 [00:01<00:00, 3624.91it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:5.341e-05, #Backups: 6000
Time takedn to solve 0.556251049041748


100%|██████████| 50/50 [00:03<00:00, 14.14it/s]
100%|██████████| 8/8 [00:00<00:00, 387.72it/s]
100%|██████████| 1936/1936 [00:00<00:00, 21544.35it/s]
  0%|          | 0/3874 [00:00<?, ?it/s]

####################	 Episode: 80, Size of MDP:1856, Eval Reward:200.0	####################
Average Reward of collected trajectories:16.6
Len of to commit transitions 1936
ABstraction Faldg False
Len of to commit sa pairs 3874


100%|██████████| 3874/3874 [00:01<00:00, 3668.60it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:2.289e-05, #Backups: 6250
Time takedn to solve 0.5774431228637695


100%|██████████| 50/50 [00:03<00:00, 14.66it/s]
100%|██████████| 8/8 [00:00<00:00, 441.76it/s]
100%|██████████| 2003/2003 [00:00<00:00, 21553.67it/s]
  0%|          | 0/4008 [00:00<?, ?it/s]

####################	 Episode: 85, Size of MDP:1939, Eval Reward:200.0	####################
Average Reward of collected trajectories:13.4
Len of to commit transitions 2003
ABstraction Faldg False
Len of to commit sa pairs 4008


100%|██████████| 4008/4008 [00:01<00:00, 3760.12it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Elapsed Time:0s, VI Error:1.526e-05, #Backups: 6500
Time takedn to solve 0.5807526111602783


100%|██████████| 50/50 [00:03<00:00, 14.49it/s]
100%|██████████| 9/9 [00:00<00:00, 844.43it/s]
100%|██████████| 2215/2215 [00:00<00:00, 21237.93it/s]
  0%|          | 0/4432 [00:00<?, ?it/s]

####################	 Episode: 90, Size of MDP:2006, Eval Reward:200.0	####################
Average Reward of collected trajectories:42.4
Len of to commit transitions 2215
ABstraction Faldg False
Len of to commit sa pairs 4432


100%|██████████| 4432/4432 [00:01<00:00, 3679.86it/s]


Elapsed Time:0s, VI Error:0.00022125, #Backups: 6750
Time takedn to solve 0.6390211582183838


100%|██████████| 50/50 [00:03<00:00, 14.97it/s]

####################	 Episode: 95, Size of MDP:2218, Eval Reward:200.0	####################



