# This notebook show examples of different options to implement and customize RLlib

## Start Ray Cluster

In [42]:

import ray
from ray import tune

ray.shutdown()
ray.init(num_cpus=5,num_gpus=1)

2021-01-14 22:31:06,333	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '10.0.0.5',
 'raylet_ip_address': '10.0.0.5',
 'redis_address': '10.0.0.5:6379',
 'object_store_address': '/tmp/ray/session_2021-01-14_22-31-05_763742_16267/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-01-14_22-31-05_763742_16267/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-01-14_22-31-05_763742_16267',
 'metrics_export_port': 63504,
 'node_id': 'c2f8206f7715cf3acd7c9a7a4fcab282356e55437331a1909f4f90db'}

## Tune Basic

In [43]:
# #Tune basic
# from ray import tune

import time
def objective(step, alpha, beta):
    return (0.1 + alpha * step / 100)**(-1) + beta * 0.1


# def training_function(config):
#     # Hyperparameters
#     alpha, beta = config["alpha"], config["beta"]
#     for step in range(10):
#         # Iterative training function - can be any arbitrary training procedure.
#         intermediate_score = objective(step, alpha, beta)
#         # Feed the score back back to Tune.
#         tune.report(mean_loss=intermediate_score)


# analysis = tune.run(
#     training_function,
#     config={
#         "alpha": tune.grid_search([0.001, 0.01, 0.1]),
#         "beta": tune.choice([1, 2, 3,6])
#     })

# print("Best config: ", analysis.get_best_config(
#     metric="mean_loss", mode="min"))

# # Get a dataframe for analyzing trial results.
# df = analysis.results_df


class Trainable(tune.Trainable):
    def setup(self, config):
        # config (dict): A dict of hyperparameters
        self.x = 0
        self.a = config["a"]
        self.b = config["b"]

    def step(self):  # This is called iteratively.
        score = objective(self.x, self.a, self.b)
        self.x += 1
        time.sleep(1)
        return {"score": score}

analysis = tune.run(
    Trainable,
    num_samples=3,
    stop={"training_iteration": 20},
    config={
        "a":tune.uniform(-5, -1),
        "b": tune.grid_search([3,4,6,5,8])
    })

print('best config: ', analysis.get_best_config(metric="score", mode="max"))




Trial name,status,loc,a,b
Trainable_320a5_00000,RUNNING,,-1.33267,3


Result for Trainable_320a5_00000:
  date: 2021-01-14_22-31-09
  done: false
  experiment_id: 87ba97a450ae43448d52416fec552488
  hostname: nc6sv2
  iterations_since_restore: 1
  node_ip: 10.0.0.5
  pid: 22448
  score: 10.3
  time_since_restore: 1.0009114742279053
  time_this_iter_s: 1.0009114742279053
  time_total_s: 1.0009114742279053
  timestamp: 1610663469
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 320a5_00000
  
Result for Trainable_320a5_00003:
  date: 2021-01-14_22-31-09
  done: false
  experiment_id: b87f01a1951548d3bc70cee859e84bab
  hostname: nc6sv2
  iterations_since_restore: 1
  node_ip: 10.0.0.5
  pid: 22444
  score: 10.5
  time_since_restore: 1.0010316371917725
  time_this_iter_s: 1.0010316371917725
  time_total_s: 1.0010316371917725
  timestamp: 1610663469
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 320a5_00003
  
Result for Trainable_320a5_00001:
  date: 2021-01-14_22-31-09
  done: false
  experiment_id: 6f1800f5df3c4bb4a48dfa213

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00000,RUNNING,10.0.0.5:22448,-1.33267,3,5.0,5.00505,21.7164
Trainable_320a5_00001,RUNNING,10.0.0.5:22447,-4.66317,4,4.0,4.00298,-24.6658
Trainable_320a5_00002,RUNNING,10.0.0.5:22445,-4.36673,6,4.0,4.00243,-31.6561
Trainable_320a5_00003,RUNNING,10.0.0.5:22444,-2.5356,5,4.0,4.00416,42.285
Trainable_320a5_00004,RUNNING,10.0.0.5:22446,-4.67894,8,4.0,4.00326,-23.9721
Trainable_320a5_00005,PENDING,,-1.687,3,,,


Result for Trainable_320a5_00000:
  date: 2021-01-14_22-31-14
  done: false
  experiment_id: 87ba97a450ae43448d52416fec552488
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22448
  score: 30.27024535811851
  time_since_restore: 6.006099224090576
  time_this_iter_s: 1.0010449886322021
  time_total_s: 6.006099224090576
  timestamp: 1610663474
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00000
  
Result for Trainable_320a5_00003:
  date: 2021-01-14_22-31-14
  done: false
  experiment_id: b87f01a1951548d3bc70cee859e84bab
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22444
  score: -36.84143949835358
  time_since_restore: 6.0062034130096436
  time_this_iter_s: 1.001042366027832
  time_total_s: 6.0062034130096436
  timestamp: 1610663474
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00003
  
Result for Trainable_320a5_00001:
  date: 2021-01-14_22-31-14
  done: false
  experiment_id: 6

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00000,RUNNING,10.0.0.5:22448,-1.33267,3,10.0,10.0102,-49.8494
Trainable_320a5_00001,RUNNING,10.0.0.5:22447,-4.66317,4,9.0,9.00787,-3.26229
Trainable_320a5_00002,RUNNING,10.0.0.5:22445,-4.36673,6,9.0,9.00588,-3.41061
Trainable_320a5_00003,RUNNING,10.0.0.5:22444,-2.5356,5,9.0,9.0093,-9.2231
Trainable_320a5_00004,RUNNING,10.0.0.5:22446,-4.67894,8,9.0,9.00769,-2.84545
Trainable_320a5_00005,PENDING,,-1.687,3,,,


Result for Trainable_320a5_00000:
  date: 2021-01-14_22-31-19
  done: false
  experiment_id: 87ba97a450ae43448d52416fec552488
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22448
  score: -29.75968687993673
  time_since_restore: 11.011008024215698
  time_this_iter_s: 1.0007681846618652
  time_total_s: 11.011008024215698
  timestamp: 1610663479
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00000
  
Result for Trainable_320a5_00003:
  date: 2021-01-14_22-31-19
  done: false
  experiment_id: b87f01a1951548d3bc70cee859e84bab
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22444
  score: -6.0121210463203925
  time_since_restore: 11.01068377494812
  time_this_iter_s: 1.0010342597961426
  time_total_s: 11.01068377494812
  timestamp: 1610663479
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00003
  
Result for Trainable_320a5_00001:
  date: 2021-01-14_22-31-19
  done: false
  experimen

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00000,RUNNING,10.0.0.5:22448,-1.33267,3,15.0,15.015,-11.2508
Trainable_320a5_00001,RUNNING,10.0.0.5:22447,-4.66317,4,14.0,14.0115,-1.57546
Trainable_320a5_00002,RUNNING,10.0.0.5:22445,-4.36673,6,14.0,14.0106,-1.53824
Trainable_320a5_00003,RUNNING,10.0.0.5:22444,-2.5356,5,14.0,14.0138,-3.85487
Trainable_320a5_00004,RUNNING,10.0.0.5:22446,-4.67894,8,14.0,14.0115,-1.16749
Trainable_320a5_00005,PENDING,,-1.687,3,,,


Result for Trainable_320a5_00000:
  date: 2021-01-14_22-31-24
  done: false
  experiment_id: 87ba97a450ae43448d52416fec552488
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22448
  score: -9.709937927258268
  time_since_restore: 16.01514744758606
  time_this_iter_s: 1.0001513957977295
  time_total_s: 16.01514744758606
  timestamp: 1610663484
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00000
  
Result for Trainable_320a5_00003:
  date: 2021-01-14_22-31-24
  done: false
  experiment_id: b87f01a1951548d3bc70cee859e84bab
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22444
  score: -3.0671009296112475
  time_since_restore: 16.01585865020752
  time_this_iter_s: 1.001032829284668
  time_total_s: 16.01585865020752
  timestamp: 1610663484
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00003
  
Result for Trainable_320a5_00004:
  date: 2021-01-14_22-31-24
  done: false
  experiment_i

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00001,RUNNING,10.0.0.5:22447,-4.66317,4,19.0,19.0162,-0.952503
Trainable_320a5_00002,RUNNING,10.0.0.5:22445,-4.36673,6,19.0,19.0151,-0.857701
Trainable_320a5_00003,RUNNING,10.0.0.5:22444,-2.5356,5,19.0,19.019,-2.30578
Trainable_320a5_00004,RUNNING,10.0.0.5:22446,-4.67894,8,19.0,19.0153,-0.547331
Trainable_320a5_00005,PENDING,,-1.687,3,,,
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20.0,20.018,-6.22709


Result for Trainable_320a5_00004:
  date: 2021-01-14_22-31-28
  done: true
  experiment_id: 10d384370ecc445aab444f915ac69a42
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22446
  score: -0.4674308437047685
  time_since_restore: 20.016326189041138
  time_this_iter_s: 1.0010449886322021
  time_total_s: 20.016326189041138
  timestamp: 1610663488
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00004
  
Result for Trainable_320a5_00001:
  date: 2021-01-14_22-31-28
  done: true
  experiment_id: 6f1800f5df3c4bb4a48dfa21304eb042
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22447
  score: -0.872261823868634
  time_since_restore: 20.017203092575073
  time_this_iter_s: 1.0010313987731934
  time_total_s: 20.017203092575073
  timestamp: 1610663488
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00001
  
Result for Trainable_320a5_00003:
  date: 2021-01-14_22-31-28
  done: true
  experiment

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00005,RUNNING,10.0.0.5:22614,-1.687,3,4.0,4.00332,20.5471
Trainable_320a5_00006,RUNNING,10.0.0.5:22616,-3.8138,4,4.0,4.00316,-68.9765
Trainable_320a5_00007,RUNNING,10.0.0.5:22617,-1.2278,6,4.0,4.00412,16.4313
Trainable_320a5_00008,RUNNING,10.0.0.5:22619,-2.03872,5,4.0,4.00388,26.2477
Trainable_320a5_00009,RUNNING,10.0.0.5:22620,-3.89953,8,3.0,3.0031,46.2352
Trainable_320a5_00010,PENDING,,-1.42817,3,,,
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20.0,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20.0,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20.0,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20.0,20.0198,-2.11942


Result for Trainable_320a5_00005:
  date: 2021-01-14_22-31-35
  done: false
  experiment_id: fb96109e71f3493ba041a04981df0959
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22614
  score: 64.19874610647139
  time_since_restore: 6.005388498306274
  time_this_iter_s: 1.001033067703247
  time_total_s: 6.005388498306274
  timestamp: 1610663495
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00005
  
Result for Trainable_320a5_00006:
  date: 2021-01-14_22-31-35
  done: false
  experiment_id: 66a771af1551465ea2e56124b7c03f18
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22616
  score: -10.626552823478569
  time_since_restore: 6.005230903625488
  time_this_iter_s: 1.0010316371917725
  time_total_s: 6.005230903625488
  timestamp: 1610663495
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00006
  
Result for Trainable_320a5_00008:
  date: 2021-01-14_22-31-35
  done: false
  experiment_id: 31

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00005,RUNNING,10.0.0.5:22614,-1.687,3,9.0,9.00789,-28.3038
Trainable_320a5_00006,RUNNING,10.0.0.5:22616,-3.8138,4,9.0,9.00833,-4.47557
Trainable_320a5_00007,RUNNING,10.0.0.5:22617,-1.2278,6,9.0,9.00929,563.54
Trainable_320a5_00008,RUNNING,10.0.0.5:22619,-2.03872,5,9.0,9.00755,-15.3485
Trainable_320a5_00009,RUNNING,10.0.0.5:22620,-3.89953,8,8.0,8.00794,-4.98144
Trainable_320a5_00010,PENDING,,-1.42817,3,,,
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20.0,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20.0,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20.0,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20.0,20.0198,-2.11942


Result for Trainable_320a5_00005:
  date: 2021-01-14_22-31-40
  done: false
  experiment_id: fb96109e71f3493ba041a04981df0959
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22614
  score: -14.255938785015015
  time_since_restore: 11.009953498840332
  time_this_iter_s: 1.0010309219360352
  time_total_s: 11.009953498840332
  timestamp: 1610663500
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00005
  
Result for Trainable_320a5_00006:
  date: 2021-01-14_22-31-40
  done: false
  experiment_id: 66a771af1551465ea2e56124b7c03f18
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22616
  score: -3.1539084493893563
  time_since_restore: 11.010396718978882
  time_this_iter_s: 1.001032829284668
  time_total_s: 11.010396718978882
  timestamp: 1610663500
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00006
  
Result for Trainable_320a5_00008:
  date: 2021-01-14_22-31-40
  done: false
  experim

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00005,RUNNING,10.0.0.5:22614,-1.687,3,14.0,14.013,-8.08148
Trainable_320a5_00006,RUNNING,10.0.0.5:22616,-3.8138,4,14.0,14.0135,-2.12656
Trainable_320a5_00007,RUNNING,10.0.0.5:22617,-1.2278,6,14.0,14.0138,-16.1748
Trainable_320a5_00008,RUNNING,10.0.0.5:22619,-2.03872,5,14.0,14.0118,-5.55937
Trainable_320a5_00009,RUNNING,10.0.0.5:22620,-3.89953,8,14.0,14.013,-1.65737
Trainable_320a5_00010,PENDING,,-1.42817,3,,,
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20.0,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20.0,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20.0,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20.0,20.0198,-2.11942


Result for Trainable_320a5_00005:
  date: 2021-01-14_22-31-45
  done: false
  experiment_id: fb96109e71f3493ba041a04981df0959
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22614
  score: -6.233781660509167
  time_since_restore: 16.01508140563965
  time_this_iter_s: 1.0010325908660889
  time_total_s: 16.01508140563965
  timestamp: 1610663505
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00005
  
Result for Trainable_320a5_00006:
  date: 2021-01-14_22-31-45
  done: false
  experiment_id: 66a771af1551465ea2e56124b7c03f18
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22616
  score: -1.7183275592170446
  time_since_restore: 16.015156745910645
  time_this_iter_s: 1.001037836074829
  time_total_s: 16.015156745910645
  timestamp: 1610663505
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00006
  
Result for Trainable_320a5_00008:
  date: 2021-01-14_22-31-45
  done: false
  experiment

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00005,RUNNING,10.0.0.5:22614,-1.687,3,19.0,19.0178,-4.61012
Trainable_320a5_00006,RUNNING,10.0.0.5:22616,-3.8138,4,19.0,19.0183,-1.30507
Trainable_320a5_00007,RUNNING,10.0.0.5:22617,-1.2278,6,19.0,19.0177,-7.66425
Trainable_320a5_00008,RUNNING,10.0.0.5:22619,-2.03872,5,19.0,19.0167,-3.24575
Trainable_320a5_00009,RUNNING,10.0.0.5:22620,-3.89953,8,19.0,19.0182,-0.861362
Trainable_320a5_00010,PENDING,,-1.42817,3,,,
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20.0,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20.0,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20.0,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20.0,20.0198,-2.11942


Result for Trainable_320a5_00005:
  date: 2021-01-14_22-31-49
  done: true
  experiment_id: fb96109e71f3493ba041a04981df0959
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22614
  score: -4.234511646964095
  time_since_restore: 20.018298387527466
  time_this_iter_s: 1.0004913806915283
  time_total_s: 20.018298387527466
  timestamp: 1610663509
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00005
  
Result for Trainable_320a5_00006:
  date: 2021-01-14_22-31-49
  done: true
  experiment_id: 66a771af1551465ea2e56124b7c03f18
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22616
  score: -1.2009665657480024
  time_since_restore: 20.019296169281006
  time_this_iter_s: 1.0010337829589844
  time_total_s: 20.019296169281006
  timestamp: 1610663509
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00006
  
Result for Trainable_320a5_00008:
  date: 2021-01-14_22-31-49
  done: true
  experiment

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00010,RUNNING,10.0.0.5:22825,-1.42817,3,3,3.00311,14.2984
Trainable_320a5_00011,RUNNING,10.0.0.5:22823,-1.05757,4,3,3.0031,13.0825
Trainable_320a5_00012,RUNNING,10.0.0.5:22828,-1.30377,6,3,3.00298,14.1273
Trainable_320a5_00013,RUNNING,10.0.0.5:22826,-4.28788,5,3,3.00225,70.7125
Trainable_320a5_00014,RUNNING,10.0.0.5:22833,-1.68109,8,4,4.00236,20.9747
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20,20.0198,-2.11942
Trainable_320a5_00004,TERMINATED,,-4.67894,8,20,20.0163,-0.467431


Result for Trainable_320a5_00014:
  date: 2021-01-14_22-31-56
  done: false
  experiment_id: b7a1a310680948b9aaa906f4b79c7639
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22833
  score: 63.51459415074212
  time_since_restore: 6.004424810409546
  time_this_iter_s: 1.0010340213775635
  time_total_s: 6.004424810409546
  timestamp: 1610663516
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00014
  
Result for Trainable_320a5_00013:
  date: 2021-01-14_22-31-56
  done: false
  experiment_id: 8b1e64d0b0414ac0b423b265fbdee755
  hostname: nc6sv2
  iterations_since_restore: 6
  node_ip: 10.0.0.5
  pid: 22826
  score: -8.241730818324266
  time_since_restore: 6.0053486824035645
  time_this_iter_s: 1.0010337829589844
  time_total_s: 6.0053486824035645
  timestamp: 1610663516
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 320a5_00013
  
Result for Trainable_320a5_00011:
  date: 2021-01-14_22-31-56
  done: false
  experiment_id: 

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00010,RUNNING,10.0.0.5:22825,-1.42817,3,8,8.00737,35469.2
Trainable_320a5_00011,RUNNING,10.0.0.5:22823,-1.05757,4,8,8.00732,38.9063
Trainable_320a5_00012,RUNNING,10.0.0.5:22828,-1.30377,6,8,8.00815,115.064
Trainable_320a5_00013,RUNNING,10.0.0.5:22826,-4.28788,5,8,8.00661,-4.49622
Trainable_320a5_00014,RUNNING,10.0.0.5:22833,-1.68109,8,9,9.00716,-28.1959
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20,20.0198,-2.11942
Trainable_320a5_00004,TERMINATED,,-4.67894,8,20,20.0163,-0.467431


Result for Trainable_320a5_00014:
  date: 2021-01-14_22-32-01
  done: false
  experiment_id: b7a1a310680948b9aaa906f4b79c7639
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22833
  score: -13.88224043740622
  time_since_restore: 11.00862979888916
  time_this_iter_s: 1.0010337829589844
  time_total_s: 11.00862979888916
  timestamp: 1610663521
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00014
  
Result for Trainable_320a5_00013:
  date: 2021-01-14_22-32-01
  done: false
  experiment_id: 8b1e64d0b0414ac0b423b265fbdee755
  hostname: nc6sv2
  iterations_since_restore: 11
  node_ip: 10.0.0.5
  pid: 22826
  score: -2.541476824614537
  time_since_restore: 11.009364604949951
  time_this_iter_s: 1.0006844997406006
  time_total_s: 11.009364604949951
  timestamp: 1610663521
  timesteps_since_restore: 0
  training_iteration: 11
  trial_id: 320a5_00013
  
Result for Trainable_320a5_00011:
  date: 2021-01-14_22-32-01
  done: false
  experiment

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00010,RUNNING,10.0.0.5:22825,-1.42817,3,13,13.0125,-13.7095
Trainable_320a5_00011,RUNNING,10.0.0.5:22823,-1.05757,4,13,13.0125,-36.7624
Trainable_320a5_00012,RUNNING,10.0.0.5:22828,-1.30377,6,13,13.0117,-17.1142
Trainable_320a5_00013,RUNNING,10.0.0.5:22826,-4.28788,5,14,14.0117,-1.68616
Trainable_320a5_00014,RUNNING,10.0.0.5:22833,-1.68109,8,14,14.0117,-7.6358
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20,20.0198,-2.11942
Trainable_320a5_00004,TERMINATED,,-4.67894,8,20,20.0163,-0.467431


Result for Trainable_320a5_00014:
  date: 2021-01-14_22-32-06
  done: false
  experiment_id: b7a1a310680948b9aaa906f4b79c7639
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22833
  score: -5.771846126639087
  time_since_restore: 16.012977838516235
  time_this_iter_s: 1.000197172164917
  time_total_s: 16.012977838516235
  timestamp: 1610663526
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00014
  
Result for Trainable_320a5_00013:
  date: 2021-01-14_22-32-06
  done: false
  experiment_id: 8b1e64d0b0414ac0b423b265fbdee755
  hostname: nc6sv2
  iterations_since_restore: 16
  node_ip: 10.0.0.5
  pid: 22826
  score: -1.3410053768230852
  time_since_restore: 16.013778686523438
  time_this_iter_s: 1.0010390281677246
  time_total_s: 16.013778686523438
  timestamp: 1610663526
  timesteps_since_restore: 0
  training_iteration: 16
  trial_id: 320a5_00013
  
Result for Trainable_320a5_00011:
  date: 2021-01-14_22-32-06
  done: false
  experime

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00010,RUNNING,10.0.0.5:22825,-1.42817,3,18,18.0176,-6.70336
Trainable_320a5_00011,RUNNING,10.0.0.5:22823,-1.05757,4,18,18.0164,-12.1333
Trainable_320a5_00012,RUNNING,10.0.0.5:22828,-1.30377,6,18,18.0169,-7.62096
Trainable_320a5_00013,RUNNING,10.0.0.5:22826,-4.28788,5,19,19.0167,-0.988499
Trainable_320a5_00014,RUNNING,10.0.0.5:22833,-1.68109,8,19,19.0161,-4.1359
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20,20.0198,-2.11942
Trainable_320a5_00004,TERMINATED,,-4.67894,8,20,20.0163,-0.467431


Result for Trainable_320a5_00014:
  date: 2021-01-14_22-32-10
  done: true
  experiment_id: b7a1a310680948b9aaa906f4b79c7639
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22833
  score: -3.7577180622200403
  time_since_restore: 20.017137050628662
  time_this_iter_s: 1.0010409355163574
  time_total_s: 20.017137050628662
  timestamp: 1610663530
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00014
  
Result for Trainable_320a5_00013:
  date: 2021-01-14_22-32-10
  done: true
  experiment_id: 8b1e64d0b0414ac0b423b265fbdee755
  hostname: nc6sv2
  iterations_since_restore: 20
  node_ip: 10.0.0.5
  pid: 22826
  score: -0.899195275889022
  time_since_restore: 20.01769256591797
  time_this_iter_s: 1.001037359237671
  time_total_s: 20.01769256591797
  timestamp: 1610663530
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 320a5_00013
  
Result for Trainable_320a5_00011:
  date: 2021-01-14_22-32-10
  done: true
  experiment_id

Trial name,status,loc,a,b,iter,total time (s),score
Trainable_320a5_00000,TERMINATED,,-1.33267,3,20,20.018,-6.22709
Trainable_320a5_00001,TERMINATED,,-4.66317,4,20,20.0172,-0.872262
Trainable_320a5_00002,TERMINATED,,-4.36673,6,20,20.0161,-0.770466
Trainable_320a5_00003,TERMINATED,,-2.5356,5,20,20.0198,-2.11942
Trainable_320a5_00004,TERMINATED,,-4.67894,8,20,20.0163,-0.467431
Trainable_320a5_00005,TERMINATED,,-1.687,3,20,20.0183,-4.23451
Trainable_320a5_00006,TERMINATED,,-3.8138,4,20,20.0193,-1.20097
Trainable_320a5_00007,TERMINATED,,-1.2278,6,20,20.0187,-6.90294
Trainable_320a5_00008,TERMINATED,,-2.03872,5,20,20.0168,-2.98
Trainable_320a5_00009,TERMINATED,,-3.89953,8,20,20.0193,-0.760279


2021-01-14 22:32:10,602	INFO tune.py:449 -- Total run time: 62.77 seconds (62.74 seconds for the tuning loop).


best config:  {'a': -4.67893517764665, 'b': 8}


## Using standard RLlib

In [14]:
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.impala import ImpalaTrainer
ray.shutdown()
ray.init(num_cpus=5,num_gpus=1)

analysis = tune.run(PPOTrainer, config={"env": "CartPole-v0", "num_workers": 4,  "framework":"torch", "num_gpus":1,
                               "lr":1e-3},
                                                        stop={"training_iteration": 3},checkpoint_freq=3, checkpoint_at_end=True)  # "log_level": "INFO" for verbose,
                                                     # "framework": "tfe"/"tf2" for eager,
                                                     # "framework": "torch" for PyTorch
# print('best config: ', analysis.get_best_config(metric="episode_reward_mean", mode="max"))


2021-01-14 22:09:26,170	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc
PPO_CartPole-v0_2a94e_00000,RUNNING,


[2m[36m(pid=17805)[0m Instructions for updating:
[2m[36m(pid=17805)[0m non-resource variables are not supported in the long term
[2m[36m(pid=17805)[0m Instructions for updating:
[2m[36m(pid=17805)[0m non-resource variables are not supported in the long term
[2m[36m(pid=17805)[0m Instructions for updating:
[2m[36m(pid=17805)[0m non-resource variables are not supported in the long term
[2m[36m(pid=17805)[0m 2021-01-14 22:09:29,386	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=17805)[0m 2021-01-14 22:09:29,386	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=17805)[0m 2021-01-14 22:09:29,386	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=17807)[0m Instructions for u

Result for PPO_CartPole-v0_2a94e_00000:
  custom_metrics: {}
  date: 2021-01-14_22-09-42
  done: false
  episode_len_mean: 21.97752808988764
  episode_reward_max: 72.0
  episode_reward_mean: 21.97752808988764
  episode_reward_min: 9.0
  episodes_this_iter: 178
  episodes_total: 178
  experiment_id: eebbbbd3cd274a0991be2a6714ca669e
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.2
        cur_lr: 0.001
        entropy: 0.6538553964346647
        entropy_coeff: 0.0
        kl: 0.04015571379568428
        policy_loss: -0.05293832818279043
        total_loss: 76.64788365364075
        vf_explained_var: 0.31551337242126465
        vf_loss: 76.69279050827026
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 26.684615384615384
    ram_util_percent: 6.900000000000001
  pid: 17805
  policy_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_2a94e_00000,RUNNING,10.0.0.5:17805,1,8.50979,4000,21.9775,72,9,21.9775


Result for PPO_CartPole-v0_2a94e_00000:
  custom_metrics: {}
  date: 2021-01-14_22-09-51
  done: false
  episode_len_mean: 46.38
  episode_reward_max: 200.0
  episode_reward_mean: 46.38
  episode_reward_min: 9.0
  episodes_this_iter: 64
  episodes_total: 242
  experiment_id: eebbbbd3cd274a0991be2a6714ca669e
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.30000000000000004
        cur_lr: 0.001
        entropy: 0.5955502577126026
        entropy_coeff: 0.0
        kl: 0.021416230796603486
        policy_loss: -0.03505021892488003
        total_loss: 324.735689163208
        vf_explained_var: 0.2073364555835724
        vf_loss: 324.7643117904663
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 2
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.927272727272726
    ram_util_percent: 6.9
  pid: 17805
  policy_reward_max: {}
  policy_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_2a94e_00000,RUNNING,10.0.0.5:17805,2,16.7155,8000,46.38,200,9,46.38


Result for PPO_CartPole-v0_2a94e_00000:
  custom_metrics: {}
  date: 2021-01-14_22-09-59
  done: true
  episode_len_mean: 79.02
  episode_reward_max: 200.0
  episode_reward_mean: 79.02
  episode_reward_min: 11.0
  episodes_this_iter: 36
  episodes_total: 278
  experiment_id: eebbbbd3cd274a0991be2a6714ca669e
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.45000000000000007
        cur_lr: 0.001
        entropy: 0.5629450790584087
        entropy_coeff: 0.0
        kl: 0.010578483052086085
        policy_loss: -0.01883669220842421
        total_loss: 511.86048889160156
        vf_explained_var: 0.19831672310829163
        vf_loss: 511.87456798553467
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 3
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 25.166666666666668
    ram_util_percent: 6.900000000000001
  pid: 17805
  policy_rewa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_2a94e_00000,RUNNING,10.0.0.5:17805,3,24.9341,12000,79.02,200,11,79.02


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v0_2a94e_00000,TERMINATED,,3,24.9341,12000,79.02,200,11,79.02


2021-01-14 22:09:59,542	INFO tune.py:449 -- Total run time: 32.72 seconds (32.48 seconds for the tuning loop).


In [15]:
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")
checkpoint_path=checkpoints[0][0]
print(checkpoint_path)

/home/azureuser/ray_results/PPO_2021-01-14_22-09-26/PPO_CartPole-v0_2a94e_00000_0_2021-01-14_22-09-26/checkpoint_3/checkpoint-3


In [16]:
config={"env": "CartPole-v0", "num_workers": 4,  "num_gpus":1,"framework": "torch"}
agent = PPOTrainer(config=config)
agent.restore(checkpoint_path)


[2m[36m(pid=18040)[0m Instructions for updating:
[2m[36m(pid=18040)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18040)[0m Instructions for updating:
[2m[36m(pid=18040)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18040)[0m Instructions for updating:
[2m[36m(pid=18040)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18041)[0m Instructions for updating:
[2m[36m(pid=18041)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18041)[0m Instructions for updating:
[2m[36m(pid=18041)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18041)[0m Instructions for updating:
[2m[36m(pid=18041)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18042)[0m Instructions for updating:
[2m[36m(pid=18042)[0m non-resource variables are not supported in the long term
[2m[36m(pid=18042)[0m Instructions for updating:
[2

In [17]:
#Test play the restored agent
import gym

env = gym.make("CartPole-v0")

# run until episode ends
episode_reward = 0
done = False
obs = env.reset()
while not done:
    action = agent.compute_action(obs)
    print(action)
    obs, reward, done, info = env.step(action)
    episode_reward += reward


0
1
1
0
1
0
1
0
1
0
1
0
0
1
1
0
1
0
1
0
1
0
0
1
0
1
1
0
0
1
0
1
1
0
0
1
0
1
0
1
1
1
0
0
0
1
0
1
0
0
1
0
1
0
1
0
1
1
0
0
0
0
1
1
1
0
1
1
0
1
0
0
1
1
0
1
1
1
0
1
1
0
1
0
0
0
1
0
0
0
0
0
1
0
1
0
0
0
1
1
0
1
1
0
1
0
0
1
0
1
1
1
1
0
1
0
0
1
0
0
1
1
1
0
1
0
0
1
1
0
1
0
1
0
0
1
1
1
0
0
0
1
0
1
1
1
1
0
1
0
1
0
1
0
0
1
0
1
1
0
1
0
1
0
0
1
0
1
0
0
1
0
1
1
0
1
0
1
0
0
1
0
0
1
1
0
1
0
1
0
0
0
1
1
0
1
0
1
0
1


In [None]:
#Extract the preprocessor & policy from the trainer

In [18]:
from ray.rllib.models.preprocessors import get_preprocessor
prep = get_preprocessor(env.observation_space)(env.observation_space)
prep.transform(env.reset()).shape

(4,)

In [19]:
from ray.rllib.agents.ppo import PPOTrainer
import numpy as np
trainer = PPOTrainer(env="CartPole-v0", config={ "num_workers": 0})
policy = trainer.get_policy()
logits, _ = policy.model.from_batch({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})



In [20]:
logits

<tf.Tensor 'functional_1/fc_out/BiasAdd:0' shape=(1, 2) dtype=float32>

## Customizing RLlib to implement Connect4

### Customizing Environment

In [29]:
import gym, ray
from gym.spaces import Discrete, Box, Tuple, Dict
from copy import deepcopy

from kaggle_environments import make
import numpy as np
class CustomConnectXEnv(gym.Env):
    def __init__(self, env_config):
        #trainer environment with a random opponent
        self.env = make("connectx", debug=False).train([None, "random"])
        self.current_player=1
        self.action_space = Discrete(7)
        self.observation_space = Box(low=0,high=2, shape=(1,6,7), dtype=np.uint8)
        

      
    def reset(self):
        obs =self.env.reset()
        board =obs['board']
        board = np.array(board).reshape(1,6,7)
        self.current_player=obs['mark']
        board[0,:,:]=self.current_player
        return board

    def step(self, action):
        
        obs, reward, done, info = self.env.step(int(action))


        board =obs['board']
        if reward == None:
            reward = -1
        board = np.array(board).reshape(1,6,7)
        board[0,:,:]=self.current_player
        info ={"info":"empty"}
        return board, reward, done, info
    
    

In [30]:
ray.shutdown()
ray.init(num_cpus=5,num_gpus=1)
config={
    "framework":"torch",

    "env": CustomConnectXEnv,
    "num_gpus": 1,
    "num_workers": 4,
    "train_batch_size": 1000,
    "model": {"conv_filters": [[1, [1,1], 64], [64, [1,1], 12]]}

}
analysis= tune.run(
    run_or_experiment="PPO",
    config=config,
    stop={"training_iteration": 4},
    checkpoint_freq=1,
    local_dir='./logs4')

2021-01-14 22:21:08,536	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc
PPO_CustomConnectXEnv_cd4d5_00000,RUNNING,


[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m Instructions for updating:
[2m[36m(pid=20219)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20219)[0m 2021-01-14 22:21:12,074	INFO t

[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20219)[0m Loading environment football failed: No module named 'gfootball'


[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20221)[0m Instructions for updating:
[2m[36m(pid=20221)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20220)[0m Instructions for updating:
[2

[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20220)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20217)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20217)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20217)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20217)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(



Result for PPO_CustomConnectXEnv_cd4d5_00000:
  custom_metrics: {}
  date: 2021-01-14_22-21-22
  done: false
  episode_len_mean: 10.324675324675324
  episode_reward_max: 1.0
  episode_reward_mean: -0.05194805194805195
  episode_reward_min: -1.0
  episodes_this_iter: 154
  episodes_total: 154
  experiment_id: 66d20b5c4b2149e5a8498d1fa429285b
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.20000000000000004
        cur_lr: 5.000000000000001e-05
        entropy: 1.94129072702848
        entropy_coeff: 0.0
        kl: 0.0017312933475925373
        policy_loss: -0.005928801515927682
        total_loss: 0.8763057635380671
        vf_explained_var: 9.16994569166718e-09
        vf_loss: 0.8818883299827576
    num_steps_sampled: 1600
    num_steps_trained: 1600
  iterations_since_restore: 1
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 40.0125
    ram_util_percent: 7.0
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomConnectXEnv_cd4d5_00000,RUNNING,10.0.0.5:20219,1,5.41576,1600,-0.0519481,1,-1,10.3247


Result for PPO_CustomConnectXEnv_cd4d5_00000:
  custom_metrics: {}
  date: 2021-01-14_22-21-28
  done: false
  episode_len_mean: 10.944827586206896
  episode_reward_max: 1.0
  episode_reward_mean: -0.11724137931034483
  episode_reward_min: -1.0
  episodes_this_iter: 145
  episodes_total: 299
  experiment_id: 66d20b5c4b2149e5a8498d1fa429285b
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.10000000000000002
        cur_lr: 5.000000000000001e-05
        entropy: 1.9376580531780536
        entropy_coeff: 0.0
        kl: 0.0011254911525891377
        policy_loss: -0.004751701194506425
        total_loss: 0.8504464167814988
        vf_explained_var: 1.3754918093411561e-08
        vf_loss: 0.855085565493657
    num_steps_sampled: 3200
    num_steps_trained: 3200
  iterations_since_restore: 2
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 36.4
    ram_util_percent: 7.0
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomConnectXEnv_cd4d5_00000,RUNNING,10.0.0.5:20219,2,10.8244,3200,-0.117241,1,-1,10.9448


Result for PPO_CustomConnectXEnv_cd4d5_00000:
  custom_metrics: {}
  date: 2021-01-14_22-21-33
  done: false
  episode_len_mean: 10.282051282051283
  episode_reward_max: 1.0
  episode_reward_mean: -0.23076923076923078
  episode_reward_min: -1.0
  episodes_this_iter: 156
  episodes_total: 455
  experiment_id: 66d20b5c4b2149e5a8498d1fa429285b
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.05000000000000001
        cur_lr: 5.000000000000001e-05
        entropy: 1.9360699378527129
        entropy_coeff: 0.0
        kl: 0.0017538907973525615
        policy_loss: 0.0018004438338371424
        total_loss: 0.8093413664744451
        vf_explained_var: -1.3754918093411561e-08
        vf_loss: 0.8074532609719497
    num_steps_sampled: 4800
    num_steps_trained: 4800
  iterations_since_restore: 3
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 36.9375
    ram_util_percent: 7

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomConnectXEnv_cd4d5_00000,RUNNING,10.0.0.5:20219,3,16.2739,4800,-0.230769,1,-1,10.2821


Result for PPO_CustomConnectXEnv_cd4d5_00000:
  custom_metrics: {}
  date: 2021-01-14_22-21-39
  done: true
  episode_len_mean: 10.16025641025641
  episode_reward_max: 1.0
  episode_reward_mean: -0.16666666666666666
  episode_reward_min: -1.0
  episodes_this_iter: 156
  episodes_total: 611
  experiment_id: 66d20b5c4b2149e5a8498d1fa429285b
  hostname: nc6sv2
  info:
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_kl_coeff: 0.025000000000000005
        cur_lr: 5.000000000000001e-05
        entropy: 1.9199680548447828
        entropy_coeff: 0.0
        kl: 0.004040801181243016
        policy_loss: -0.003464007893433938
        total_loss: 0.8311652036813589
        vf_explained_var: -1.833989138333436e-08
        vf_loss: 0.8345281986089853
    num_steps_sampled: 6400
    num_steps_trained: 6400
  iterations_since_restore: 4
  node_ip: 10.0.0.5
  num_healthy_workers: 4
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 36.6625
    ram_util_percent: 7.0


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomConnectXEnv_cd4d5_00000,RUNNING,10.0.0.5:20219,4,21.7178,6400,-0.166667,1,-1,10.1603


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CustomConnectXEnv_cd4d5_00000,TERMINATED,,4,21.7178,6400,-0.166667,1,-1,10.1603


2021-01-14 22:21:40,175	INFO tune.py:449 -- Total run time: 30.95 seconds (30.31 seconds for the tuning loop).


In [31]:
checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")
checkpoint_path=checkpoints[0][0]
print(checkpoint_path)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/nc6sv2/code/ray/logs4/PPO/PPO_CustomConnectXEnv_cd4d5_00000_0_2021-01-14_22-21-09/checkpoint_1/checkpoint-1


In [32]:
connectx_agent = PPOTrainer(config=config)
connectx_agent.restore(checkpoint_path)

[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20440)[0m Instructions for updating:
[2m[36m(pid=20440)[0m non-resource variables are not supported in the long term
[2m[36m(pid=20438)[0m Instructions for updating:
[2

[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20440)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20438)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20438)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20438)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=20438)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(

2021-01-14 22:21:43,971	INFO trainable.py:329 -- Restored on 10.0.0.5 from checkpoint: /mnt/batch/tasks/shared/LS_root/mounts/clusters/nc6sv2/code/ray/logs4/PPO/PPO_CustomConnectXEnv_cd4d5_00000_0_2021-01-14_22-21-09/checkpoint_1/checkpoint-1
2021-01-14 22:21:43,972	INFO trainable.py:336 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 5.415760040283203, '_episodes_total': 154}


In [33]:
from kaggle_environments import make

env = make("connectx", debug=True)

# Training agent in first position (player 1) against the default random agent.
trainer = env.train([None, "random"])
done=False
obs = trainer.reset()

while (not done):
    board = obs['board']
    player= obs['mark']
#     print(player)
    board = np.array(board).reshape(1,6,7)
    board[0,:,:]=player
    action = int(connectx_agent.compute_action(board)) # Action for the agent being trained.
    obs, reward, done, info = trainer.step(action)

    if reward == None:
        print("none catched, obs is ", obs)
        
#     print("info", info)

    if done:
        print(reward)
#         print(obs)
        env.render(mode="ipython", width=500, height=450)

        obs = trainer.reset()

1


### Custom Algorithm with Alphago

In [37]:
from ray.rllib.contrib.alpha_zero.models.custom_torch_models import ConvNetModel
from ray.rllib.contrib.alpha_zero.environments.cartpole import CartPole
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.contrib.alpha_zero.core.alpha_zero_trainer import AlphaZeroTrainer

In [38]:
import gym, ray
from gym.spaces import Discrete, Box, Tuple, Dict
from copy import deepcopy

from kaggle_environments import make
import numpy as np


class MyEnv(gym.Env):
    def __init__(self, env_config):
        #trainer environment with a random opponent
        self.env_org = make("connectx", debug=False)
        self.trainer = self.env_org.train([None, "random"])
        self.current_player=1
        self.action_space = Discrete(7)
#         self.observation_space = Box(low=0,high=2, shape=(1,6,7), dtype=np.uint8)
        
        self.observation_space = Dict({
            "obs": Box(low=0,high=2, shape=(1,6,7), dtype=np.uint8),
            "action_mask": Box(low=0, high=1, shape=(self.action_space.n, ))
        })
        self.running_reward = 0

      
    def reset(self):
        board = self.env_org.reset()[0]['observation']['board']
        self.trainer.reset()
        self.running_reward = 0
        board = np.array(board).reshape(1,6,7)
        board[0,:,:]=self.current_player

        return {"obs":board, "action_mask":np.array([1,1,1,1,1,1,1])}
    def step(self, action):
        obs, reward, done, info = self.trainer.step(int(action))

        board =obs['board']
        if reward == None:
            reward = -1
        self.running_reward += reward
        board = np.array(board).reshape(1,6,7)
        board[0,:,:]=self.current_player
        info ={"info":"empty"}
        return {"obs":board, "action_mask":np.array([1,1,1,1,1,1,1])}, reward, done, info
    
    
    def set_state(self, state):
        self.running_reward = state[1]
        self.env_org = deepcopy(state[0])
        self.trainer = self.env_org.train([None, "random"])

        board =self.env_org.state[0]['observation']['board']
        board = np.array(board).reshape(1,6,7)
        board[0,:,:]=self.current_player
        return {"obs": board, "action_mask": np.array([1,1,1,1,1,1,1])}

    def get_state(self):
        return deepcopy(self.env_org), self.running_reward

### Custom NN Model

In [39]:
from abc import ABC
import numpy as np

from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_torch

torch, nn = try_import_torch()


def convert_to_tensor(arr):
    tensor = torch.from_numpy(np.asarray(arr))
    if tensor.dtype == torch.double:
        tensor = tensor.float()
    return tensor


class ActorCriticModel(TorchModelV2, nn.Module, ABC):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        self.preprocessor = get_preprocessor(obs_space.original_space)(
            obs_space.original_space)

        self.shared_layers = None
        self.actor_layers = None
        self.critic_layers = None

        self._value_out = None

    def forward(self, input_dict, state, seq_lens):
        x = input_dict["obs"]
        x = self.shared_layers(x)
        # actor outputs
        logits = self.actor_layers(x)

        # compute value
        self._value_out = self.critic_layers(x)
        return logits, None

    def value_function(self):
        return self._value_out

    def compute_priors_and_value(self, obs):
        obs = convert_to_tensor([self.preprocessor.transform(obs)])
        input_dict = restore_original_dimensions(obs, self.obs_space, "torch")

        with torch.no_grad():
            model_out = self.forward(input_dict, None, [1])
            logits, _ = model_out
            value = self.value_function()
            logits, value = torch.squeeze(logits), torch.squeeze(value)
            priors = nn.Softmax(dim=-1)(logits)

            priors = priors.cpu().numpy()
            value = value.cpu().numpy()

            return priors, value


class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)


class ConvNetModel(ActorCriticModel):
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        ActorCriticModel.__init__(self, obs_space, action_space, num_outputs,
                                  model_config, name)

        in_channels = model_config["custom_model_config"]["in_channels"]
        feature_dim = model_config["custom_model_config"]["feature_dim"]

        self.shared_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=2, stride=1),
            nn.Conv2d(32, 64, kernel_size=2, stride=1),
            nn.Conv2d(64, 64, kernel_size=2, stride=1), Flatten(),
            nn.Linear(768, feature_dim))

        self.actor_layers = nn.Sequential(
            nn.Linear(in_features=feature_dim, out_features=action_space.n))

        self.critic_layers = nn.Sequential(
            nn.Linear(in_features=feature_dim, out_features=1))

        self._value_out = None

In [41]:
ray.shutdown()
ray.init(num_cpus=5,num_gpus=1)

ModelCatalog.register_custom_model("ConvNetModel", ConvNetModel)



config={
    "env": MyEnv,
    "env_config": {},
    "num_workers": 4,
#         "num_gpus":1,
    "framework":"torch",

    "rollout_fragment_length": 20,
    "train_batch_size": 80,
    "sgd_minibatch_size": 64,
    "lr": 1e-4,
    "num_sgd_iter": 1,
    "mcts_config": {
        "puct_coefficient": 1.5,
        "num_simulations": 100,
        "temperature": 1.0,
        "dirichlet_epsilon": 0.20,
        "dirichlet_noise": 0.03,
        "argmax_tree_policy": False,
        "add_dirichlet_noise": True,
    },
    "ranked_rewards": {
        "enable": True,
    },
    "model": {
        "custom_model": "ConvNetModel",
        "custom_model_config":{"in_channels":1, "feature_dim":64}
    },
}

analysis = tune.run(
   AlphaZeroTrainer
    ,
    config=config,
    max_failures=0,
    stop={"episode_reward_mean": -0.2,"time_total_s": 3600},
    checkpoint_freq=2,
    local_dir='./logs_alphago')

2021-01-14 22:28:40,767	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Trial name,status,loc
AlphaZero_MyEnv_dacc4_00000,RUNNING,


[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2m[36m(pid=21917)[0m non-resource variables are not supported in the long term
[2m[36m(pid=21917)[0m Instructions for updating:
[2

[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21917)[0m Loading environment football failed: No module named 'gfootball'


[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:44,196	INFO trainer.py:651 -- Current 

[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21914)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(pid=21913)[0m Loading environment football failed: No module named 'gfootball'
[2m[36m(

[2m[36m(pid=21917)[0m 2021-01-14 22:28:54,603	INFO trainable.py:102 -- Trainable.setup took 10.407 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:54,603	INFO trainable.py:102 -- Trainable.setup took 10.407 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:54,603	INFO trainable.py:102 -- Trainable.setup took 10.407 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:54,603	INFO trainable.py:102 -- Trainable.setup took 10.407 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=21917)[0m 2021-01-14 22:28:54,603	INFO trainable.py:102 -- Trainable.setup took 10.407 second

KeyboardInterrupt: 

In [None]:
alphago_trainer = AlphaZeroTrainer(config= config)

checkpoints = analysis.get_trial_checkpoints_paths(
    trial=analysis.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean")
checkpoint_path=checkpoints[0][0]
print(checkpoint_path)
alphago_trainer.restore(checkpoint_path)