In [35]:
from TileCoder import TileCoder
from Q_learning import Q_learning
import gym
import numpy as np
import ray
import scipy.stats

In [15]:
NUM_CPUS = 8
ray.init(num_cpus = NUM_CPUS,ignore_reinit_error=True)

2022-05-27 14:43:50,966	INFO resource_spec.py:204 -- Starting Ray with 10.45 GiB memory available for workers and up to 5.25 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2022-05-27 14:43:51,747	INFO services.py:1163 -- View the Ray dashboard at [1m[32mlocalhost:8268[39m[22m


{'node_ip_address': '143.248.39.21',
 'raylet_ip_address': '143.248.39.21',
 'redis_address': '143.248.39.21:59960',
 'object_store_address': '/tmp/ray/session_2022-05-27_14-43-50_965497_859182/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-05-27_14-43-50_965497_859182/sockets/raylet',
 'webui_url': 'localhost:8268',
 'session_dir': '/tmp/ray/session_2022-05-27_14-43-50_965497_859182'}

In [16]:
@ray.remote
def train(model,config):
        
    # gym
    env = gym.make('MountainCar-v0')

    # tile coding part
    features = np.array([env.observation_space.low,env.observation_space.high]).T
    delta = env.observation_space.high-env.observation_space.low

    num_tiles = config['num_tiles']
    tile_width = config['tile_width']
    offset = [i*(delta/tile_width)/num_tiles for i in range(num_tiles)]

    tileCoder = TileCoder(features,num_tiles,tile_width,offset,env)
    
    
    #agent config
    num_states = tileCoder.total_states #number of total tiles 
    num_actions = env.action_space.n
    epsilon = config['epsilon']
    
    agent = model(num_states,num_actions,tileCoder,config)
    
    #trainign log
    episode_reward_list = []
    
    
    for epoch in range(1,config['num_episodes']):

        done = False
        state = env.reset()
            
        # reset logs
        episode_reward = 0
        episode_steps = 0
        epi_max_pos = -2 
        epi_td_error = 0
        epi_avg_pos = 0
        epi_avg_vel = 0
        epi_avg_action = 0

        while not done:

            if episode_steps > config['window_size']:
                break

            if np.random.random()<epsilon:
                action = np.random.randint(0,num_actions)
            else:
                value = np.array([np.einsum('ij,ij->j',tileCoder.get_one_hot_tiles(state,a),agent.w) for a in range(num_actions)])
                action = np.random.choice(np.where( value == value.max())[0])
                           
                
            encoded_state = tileCoder.get_one_hot_tiles(state,action)
            next_state,reward,done,info = env.step(action)
    
            done_mask = 0.0 if done else 1.0
            
            if config['isTrain']:
                td_error = agent.update(encoded_state,next_state,action,reward,done_mask)
                epi_td_error += td_error
            

            state = next_state
            
            #update log
            epi_max_pos = max(epi_max_pos,state[0])
            episode_reward += reward
            episode_steps += 1
            epi_avg_pos += state[0]
            epi_avg_vel += state[1]
            epi_avg_action += action
            
        if config['epsilon_decay']:
            epsilon = max(config['min_epsilon'],epsilon-config['eta_decay_rate'])
        
        agent.on_epoch_end()

        
        episode_reward_list.append(episode_reward)

        if epoch % config['num_print_episodes'] ==0 :
            print(f"Epoch:{epoch}, Episode Reward :{episode_reward}")
    
        
    return episode_reward_list

In [28]:
config = {'eta':0.1,'gamma':0.99,'learning_rate':0.1,'epsilon':0,'epsilon_decay':True,'min_epsilon':0.01,
              'num_tiles':2,
              'tile_width':10,
              'isTrain':True,
              'num_episodes':1000,
              'num_print_episodes':100,
              'window_size':200,
              'eta_decay_rate':0,
                }

In [29]:
reward_list = ray.get([train.remote(Q_learning,config) for i in range(5)])

[2m[36m(pid=859647)[0m Epoch:100, Episode Reward :-200.0
[2m[36m(pid=859646)[0m Epoch:100, Episode Reward :-200.0
[2m[36m(pid=859643)[0m Epoch:100, Episode Reward :-153.0
[2m[36m(pid=859640)[0m Epoch:100, Episode Reward :-200.0
[2m[36m(pid=859642)[0m Epoch:100, Episode Reward :-200.0
[2m[36m(pid=859647)[0m Epoch:200, Episode Reward :-200.0
[2m[36m(pid=859643)[0m Epoch:200, Episode Reward :-200.0
[2m[36m(pid=859646)[0m Epoch:200, Episode Reward :-167.0
[2m[36m(pid=859640)[0m Epoch:200, Episode Reward :-172.0
[2m[36m(pid=859642)[0m Epoch:200, Episode Reward :-200.0
[2m[36m(pid=859647)[0m Epoch:300, Episode Reward :-147.0
[2m[36m(pid=859643)[0m Epoch:300, Episode Reward :-200.0
[2m[36m(pid=859646)[0m Epoch:300, Episode Reward :-192.0
[2m[36m(pid=859640)[0m Epoch:300, Episode Reward :-155.0
[2m[36m(pid=859642)[0m Epoch:300, Episode Reward :-156.0
[2m[36m(pid=859647)[0m Epoch:400, Episode Reward :-200.0
[2m[36m(pid=859646)[0m Epoch:400, Epi

In [37]:
last_scores = np.array(reward_list)[:,-1]
mean = np.mean(last_scores)
se = scipy.stats.sem(last_score)
print(f'mean:{mean},95% CI:{mean-se,mean+se}')

mean:-146.0,95% CI:(-149.42052627529742, -142.57947372470258)
