# Init

In [1]:
from random import random

import gym
from gym import wrappers

import numpy as np
from scipy.spatial import KDTree

# Env Settings

In [2]:
_OBS_DIM = 4
_ACT_DIM = 1
_BIGNUM  = 1e5

# Minimum Viable Components

In [3]:
K     = 3 # ---------------------- Number of neighbors to query
N     = 0 # ---------------------- Number of exemplars
F     = np.zeros( (N,_OBS_DIM,) ) # Approximating function
V     = np.array( [] ) # ------- Action values
A     = np.zeros( (N,_ACT_DIM,) ) # Actions (Output)
KDT   = None # ------------------- Spatial tree
eps   = 1.0 # -------------------- Exploration probability
decay = 0.99
rad   = 0.125 # ------------------ Minimum distance between exemplars (Overwrite radius)
vMar  = 0.10 # ------------------- Allowed margin on value estimates

In [4]:
def points_from_indices( pnts, ndcs ):
    """ Get the subset of `pnts` designated by `ndcs` """
    N = len( ndcs )
    P = np.zeros( (N,pnts.shape[1],) )
    for i, idx in enumerate( ndcs ):
        P[i,:] = pnts[idx,:]
    return P

In [5]:



def add_particle( state, action, value, getTree = False ):
    """ Add a new particle to the value function """
    global N, F, V, A, KDT
    
    print( f"V-Stack: {F.shape} + {state.shape}" )
    
    if N < 1:
        F = state
        V = np.array( [value,] )
        A = action
    else:
        F = np.vstack( (F,state, ) )
        temp = V.tolist()
        temp.append( value )
        V = np.array( temp )
        A = np.vstack( (A,action,) )

    print( f"New Values: {V}" )
    
    N += 1
    if getTree:
        return N, KDTree( F )
    else:
        return N, None
        

def recalc_spatial_tree():
    """ Recalculate spatial tree """
    global N, F, V, A, KDT
    if N > 1:
        KDT = KDTree( F )

    
def get_action_and_value_inv_dist( state ):
    """ Estimate the current optimal action and value for the state """
    global N, F, V, A, KDT
    if (N < 1) or (KDT is None):
        return None, None
    dists, indcs = KDT.query( state, K )
    dists = dists[0]
    indcs = indcs[0]
    # print( dists, indcs )
    fractV = []
    indcsV = []
    for i, d in enumerate( dists ):
        # print( d )
        if d < _BIGNUM:
            fractV.append( 1.0/d )
            indcsV.append( indcs[i] )
    fractV = np.array( fractV )
    normD  = np.linalg.norm( fractV )
    fractV = fractV / normD
    rtnAct = np.zeros( (_ACT_DIM,) )
    rtnVal = 0.0
    for i, frac in enumerate( fractV ):
        idx = indcsV[i]
        rtnAct += (A[ idx ] * frac)
        print( V )
        rtnVal += (V[ idx ] * frac)
    return rtnAct, rtnVal
   

def eval_particle( state, action, value ):
    """ Decide whetner this point represents a particle worth saving """
    global N, F, V, A, KDT
    # 0. Get our estimate of the value of this state
    if N > 1:
        print(N)
        estAct, estVal = get_action_and_value_inv_dist( state )
    else:
        estAct, estVal = None, None
    
    # 1. Find out if there is a particle there
    if KDT is not None:
        ndcs = KDT.query_ball_point( state, rad )
    else:
        ndcs = []

    # 2. If there is a particle already there and the current value is better, then update
    if len( ndcs ) and (value > estVal):
        index = ndcs[0]
        # fNear = points_from_indices( F, ndcs )
        vNear = points_from_indices( V, ndcs )
        if vNear[0] < value:
            A[index,:] = action
            V[index]   = value
        if len( ndcs ) > 1:
            print( "WARNING: NEARNESS CONSTRAINT VIOLATED" )

    # 3. Elif this is an open space that does NOT estimate the value well
    elif (estVal is None) or abs(estVal - value) > abs(value * vMar):
        print( state, action, value )
        add_particle( state, action, value )
        recalc_spatial_tree()
    # Else this is an open space that predicts the value well, No update!
    # N. Return the current number of particles in the estimator
    return N
    
            

# Simple Learning Test

In [6]:
EPISODES = 1000
epLen    =  500
avg_time = 0
max_time = -1
env      = gym.make( 'CartPole-v1' ).env
env      = wrappers.RecordEpisodeStatistics( env, 100 )

In [7]:
for i_episode in range( EPISODES ):
    # instansiating the environment
    obs = env.reset()[0].reshape( (1,_OBS_DIM,) )
    print( f"Episode {i_episode+1}: Starting at {obs}" )
    for t in range( epLen ):
        # uncomment this is you want to see the rendering 
        #env.render()
        if (random() < eps) or (N<2):
            action = env.action_space.sample()
        else:
            action, preVal = get_action_and_value_inv_dist( obs )
            
        sLast = obs
        
        obs, reward, terminated, truncated, info = env.step( action )
        print( obs, obs.shape )
        obs = obs.reshape( (1,_OBS_DIM,) )
        
        eval_particle( sLast, action, reward )
        
        if terminated:
            avg_time = avg_time + t
            if t > max_time:
                max_time = t
                print( f"\tMax. Uptime: {max_time}" )
            #print("Episode finished after {} timesteps".format(t+1))
            break
    # resetting the enviroment
    env.reset()
        

# printing the avg time the game lasted
avg_time = avg_time/EPISODES
print( 'avg time agent survives :', avg_time )

Episode 1: Starting at [[-0.01977153  0.04792293  0.03377539 -0.04437153]]
[-0.01881307 -0.14766665  0.03288796  0.2587736 ] (4,)
[[-0.01977153  0.04792293  0.03377539 -0.04437153]] 0 1.0
V-Stack: (0, 4) + (1, 4)
New Values: [1.]
[-0.0217664   0.04697073  0.03806343 -0.02335727] (4,)
[[-0.01881307 -0.14766665  0.03288796  0.2587736 ]] 1 1.0
V-Stack: (1, 4) + (1, 4)
New Values: [1. 1.]
[-0.02082699 -0.14867583  0.03759629  0.28108793] (4,)
2
[1. 1.]
[1. 1.]


ValueError: could not broadcast input array from shape (4,) into shape (2,)