In [1]:
import numpy as np
import gym
from go_ai import policies, game, metrics, data
from go_ai.models import value_model
import os
import random
import torch

# Hyperparameters

In [2]:
# Board size: passed as `size` to the Go environment and to ValueNet, and
# embedded in the checkpoint file name.
BOARD_SIZE = 4

In [3]:
# Number of passes of the outer training loop.
ITERATIONS = 256
# Episode count passed to game.play_games when generating self-play data each iteration.
EPISODES_PER_ITERATION = 256
# Game count passed to game.play_games for evaluation matches.
NUM_EVAL_GAMES = 256

In [4]:
# Temperature schedule for the QTempPolicy instances; decay_temps applies it
# once at the end of every training iteration.
INIT_TEMP = 1  # temperature the current/checkpoint policies are constructed with
TEMP_DECAY = 3/4  # factor the temperature is multiplied by on each decay step
MIN_TEMP = 1/64  # floor: a decayed temperature below this is clamped up to it

In [5]:
# Batch size passed to curr_model.optimize in the training loop.
BATCH_SIZE = 32

In [6]:
# True: resume from the file at CHECKPOINT_PATH (it must already exist).
# False: overwrite CHECKPOINT_PATH with freshly initialized weights and start from those.
LOAD_SAVED_MODELS = False

# Data Parameters

In [7]:
# Directory for episode data.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
EPISODES_DIR = 'episodes/'

In [8]:
# Weights checkpoint file; the name encodes the board size (checkpoint_4x4.pt for BOARD_SIZE = 4).
CHECKPOINT_PATH = f'checkpoints/checkpoint_{BOARD_SIZE}x{BOARD_SIZE}.pt'

In [9]:
# Image path handed to metrics.gen_traj_fig for the sample-trajectory figure.
DEMO_TRAJECTORY_PATH = 'logs/a_trajectory.png'

# Go Environment
Train on a small board so that training is fast and debugging is efficient.

In [10]:
# Build the Go environment (`gym_go:go-v0`) with board size BOARD_SIZE.
# This single instance is reused by every game and plotting call in the notebook.
go_env = gym.make('gym_go:go-v0', size=BOARD_SIZE)

# Model

In [11]:
# Two networks with the same architecture: `curr_model` is the one being
# optimized, `checkpoint_model` holds the last accepted weights and is the
# opponent the current model has to beat in the training loop.
curr_model = value_model.ValueNet(BOARD_SIZE)
checkpoint_model = value_model.ValueNet(BOARD_SIZE)

if LOAD_SAVED_MODELS:
    assert os.path.exists(CHECKPOINT_PATH), f"No checkpoint found at {CHECKPOINT_PATH}"
    print("Starting from checkpoint")
else:
    # torch.save does not create missing parent directories, so make sure the
    # checkpoint folder exists before writing the freshly initialized weights
    # (otherwise a fresh clone fails on Restart & Run All).
    checkpoint_dir = os.path.dirname(CHECKPOINT_PATH)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(curr_model.state_dict(), CHECKPOINT_PATH)
    print("Initialized checkpoint")

# Both models start from the same on-disk weights.
curr_model.load_state_dict(torch.load(CHECKPOINT_PATH))
checkpoint_model.load_state_dict(torch.load(CHECKPOINT_PATH))

# Last expression of the cell: display the network architecture.
curr_model

Initialized checkpoint


ValueNet(
  (convs): Sequential(
    (0): Conv2d(6, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(128, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
  )
  (fcs): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=1, bias=True)
  )
  (criterion): BCEWithLogitsLoss()
)

# Policies

In [12]:
# Temperature-based policies wrapping the two value networks; both start at INIT_TEMP.
curr_policy = policies.QTempPolicy('Current', curr_model, INIT_TEMP)
checkpoint_policy = policies.QTempPolicy('Checkpoint', checkpoint_model, INIT_TEMP)

# Baselines used for evaluation in the training loop, plus the policy used for
# the human-vs-AI game at the end of the notebook.
random_policy = policies.RandomPolicy()
# Same policy class as above, but driven by policies.greedy_val_func at temperature 0.
greedy_policy = policies.QTempPolicy('Greedy', policies.greedy_val_func, temp=0)
human_policy = policies.HumanPolicy()

In [13]:
def decay_temps(policies, temp_decay, min_temp):
    """Scale each policy's temperature by ``temp_decay``, never going below ``min_temp``.

    Every policy must expose a ``temp`` attribute (enforced with an assert) and a
    ``name`` attribute, which is used in the message printed for each policy.
    """
    for pol in policies:
        assert hasattr(pol, 'temp')
        decayed = pol.temp * temp_decay
        # Clamp to the floor only when the decayed value falls strictly below it.
        pol.temp = min_temp if decayed < min_temp else decayed
        print(f"{pol.name} temp decayed to {pol.temp}")

# Demo and Time Games

Symmetries of a sample board state

In [14]:
%%time
go_env.reset()
action = (1, 1)
next_state, _, _, _ = go_env.step(action)
metrics.plot_symmetries(next_state, 'logs/symmetries.jpg')

CPU times: user 335 ms, sys: 29.9 ms, total: 365 ms
Wall time: 181 ms


One self-play game with replay memory (trajectory collection) enabled

In [15]:
%%time
# Time one game of curr_policy against itself with trajectory collection
# switched on (get_traj=True), starting from a freshly reset environment.
go_env.reset()
# Both returned values are discarded; this cell exists only for the timing.
_,_ = game.pit(go_env, curr_policy, curr_policy, get_traj=True)

CPU times: user 569 ms, sys: 60.7 ms, total: 630 ms
Wall time: 255 ms


In [16]:
%%time
# Time metrics.gen_traj_fig for the current policy; DEMO_TRAJECTORY_PATH is the
# image path it is given.
metrics.gen_traj_fig(go_env, curr_policy, DEMO_TRAJECTORY_PATH)

CPU times: user 3.46 s, sys: 187 ms, total: 3.65 s
Wall time: 3.28 s


# Train

In [None]:
# Training loop: self-play -> optimize -> evaluate against the last accepted
# checkpoint -> accept / keep training / roll back -> decay temperatures.
for iteration in range(ITERATIONS):
    print(f"Iteration {iteration}")

    # Generate self-play episode data: the current policy plays both sides.
    # The positional True is presumably play_games' trajectory flag (cf. get_traj
    # in game.pit) — the second return value is used as replay data only here.
    _, replay_data = game.play_games(go_env, curr_policy, curr_policy, True, EPISODES_PER_ITERATION)

    # Process the data
    random.shuffle(replay_data)
    replay_data = data.replaylist_to_numpy(replay_data)

    # Optimize
    curr_model.optimize(replay_data, BATCH_SIZE)

    # Evaluate against the checkpoint model. This is an evaluation match, so it
    # is sized by NUM_EVAL_GAMES like the baseline matches below, not by the
    # self-play episode count.
    opp_winrate, _ = game.play_games(go_env, curr_policy, checkpoint_policy, False, NUM_EVAL_GAMES)

    if opp_winrate > 0.6:
        # New parameters are significantly better. Accept them: persist the
        # weights and refresh the checkpoint opponent from the saved file.
        torch.save(curr_model.state_dict(), CHECKPOINT_PATH)
        checkpoint_model.load_state_dict(torch.load(CHECKPOINT_PATH))
        print(f"{100*opp_winrate:.1f}% Accepted new model")

        # Plot samples of states and response heatmaps
        metrics.gen_traj_fig(go_env, curr_policy, DEMO_TRAJECTORY_PATH)
        print("Plotted sample trajectory")

        # Baseline matches, run only for newly accepted models; the win rates
        # show up in the progress-bar output of play_games.
        rand_winrate, _ = game.play_games(go_env, curr_policy, random_policy, False, NUM_EVAL_GAMES)
        greed_winrate, _ = game.play_games(go_env, curr_policy, greedy_policy, False, NUM_EVAL_GAMES)

    elif opp_winrate >= 0.4:
        # Inconclusive result: keep training the current weights.
        print(f"{100*opp_winrate:.1f}% Continuing to train current weights")
    else:
        # New parameters are significantly worse. Reject them by restoring the
        # current model from the last accepted checkpoint.
        curr_model.load_state_dict(torch.load(CHECKPOINT_PATH))
        print(f"{100*opp_winrate:.1f}% Rejected new model")

    # Decay the temperatures of both learned policies (clamped at MIN_TEMP).
    decay_temps([curr_policy, checkpoint_policy], TEMP_DECAY, MIN_TEMP)

Current vs. Current:   1%|          | 1/128 [00:00<00:21,  5.90it/s, 0.0%]

Iteration 0


Current vs. Current: 100%|██████████| 128/128 [00:27<00:00,  4.66it/s, 59.8%]
Optimizing: 88it [00:01, 85.33it/s, 60.5%, 0.663L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:25<00:00,  4.93it/s, 60.9%]


60.9% Accepted new model


Current vs. Random:   0%|          | 0/128 [00:00<?, ?it/s]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:15<00:00,  8.05it/s, 68.4%]
Current vs. Greedy: 100%|██████████| 128/128 [00:26<00:00,  4.88it/s, 7.4%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.75
Checkpoint temp decayed to 0.75
Iteration 1


Current vs. Current: 100%|██████████| 128/128 [00:25<00:00,  4.99it/s, 45.7%]
Optimizing: 86it [00:01, 84.74it/s, 65.7%, 0.621L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.88it/s, 54.7%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

54.7% Continuing to train current weights
Current temp decayed to 0.5625
Checkpoint temp decayed to 0.5625
Iteration 2


Current vs. Current: 100%|██████████| 128/128 [00:27<00:00,  4.62it/s, 54.3%]
Optimizing: 99it [00:01, 87.28it/s, 63.3%, 0.649L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:25<00:00,  5.04it/s, 52.3%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

52.3% Continuing to train current weights
Current temp decayed to 0.421875
Checkpoint temp decayed to 0.421875
Iteration 3


Current vs. Current: 100%|██████████| 128/128 [00:26<00:00,  4.85it/s, 59.4%]
Optimizing: 81it [00:00, 84.50it/s, 67.5%, 0.615L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.86it/s, 55.5%]
Current vs. Current:   1%|          | 1/128 [00:00<00:15,  8.39it/s, 0.0%]

55.5% Continuing to train current weights
Current temp decayed to 0.31640625
Checkpoint temp decayed to 0.31640625
Iteration 4


Current vs. Current: 100%|██████████| 128/128 [00:25<00:00,  5.12it/s, 52.3%]
Optimizing: 81it [00:00, 85.38it/s, 68.4%, 0.603L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:27<00:00,  4.66it/s, 53.9%]
Current vs. Current:   1%|          | 1/128 [00:00<00:23,  5.39it/s, 0.0%]

53.9% Continuing to train current weights
Current temp decayed to 0.2373046875
Checkpoint temp decayed to 0.2373046875
Iteration 5


Current vs. Current: 100%|██████████| 128/128 [00:26<00:00,  4.80it/s, 59.0%]
Optimizing: 86it [00:00, 88.09it/s, 60.6%, 0.666L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:29<00:00,  4.37it/s, 59.4%]
Current vs. Current:   1%|          | 1/128 [00:00<00:21,  5.87it/s, 100.0%]

59.4% Continuing to train current weights
Current temp decayed to 0.177978515625
Checkpoint temp decayed to 0.177978515625
Iteration 6


Current vs. Current: 100%|██████████| 128/128 [00:28<00:00,  4.46it/s, 56.6%]
Optimizing: 93it [00:01, 88.52it/s, 62.9%, 0.646L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:25<00:00,  5.06it/s, 67.2%]


67.2% Accepted new model


Current vs. Random:   0%|          | 0/128 [00:00<?, ?it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:14<00:00,  8.89it/s, 85.5%]
Current vs. Greedy: 100%|██████████| 128/128 [00:27<00:00,  4.64it/s, 17.2%]
Current vs. Current:   1%|          | 1/128 [00:00<00:15,  7.98it/s, 100.0%]

Current temp decayed to 0.13348388671875
Checkpoint temp decayed to 0.13348388671875
Iteration 7


Current vs. Current: 100%|██████████| 128/128 [00:23<00:00,  5.50it/s, 59.8%]
Optimizing: 72it [00:00, 83.08it/s, 66.7%, 0.617L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.79it/s, 62.1%]


62.1% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:19,  6.62it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:14<00:00,  8.93it/s, 89.1%]
Current vs. Greedy: 100%|██████████| 128/128 [00:30<00:00,  4.21it/s, 25.0%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.1001129150390625
Checkpoint temp decayed to 0.1001129150390625
Iteration 8


Current vs. Current: 100%|██████████| 128/128 [00:28<00:00,  4.43it/s, 53.5%]
Optimizing: 104it [00:01, 71.81it/s, 61.5%, 0.650L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.88it/s, 55.1%]
Current vs. Current:   1%|          | 1/128 [00:00<00:22,  5.58it/s, 100.0%]

55.1% Continuing to train current weights
Current temp decayed to 0.07508468627929688
Checkpoint temp decayed to 0.07508468627929688
Iteration 9


Current vs. Current: 100%|██████████| 128/128 [00:23<00:00,  5.35it/s, 63.3%]
Optimizing: 77it [00:01, 71.79it/s, 70.6%, 0.557L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:30<00:00,  4.21it/s, 60.9%]


60.9% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:13,  9.60it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:14<00:00,  8.76it/s, 93.0%]
Current vs. Greedy: 100%|██████████| 128/128 [00:31<00:00,  4.10it/s, 38.7%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.056313514709472656
Checkpoint temp decayed to 0.056313514709472656
Iteration 10


Current vs. Current: 100%|██████████| 128/128 [00:30<00:00,  4.21it/s, 52.7%]
Optimizing: 109it [00:01, 72.68it/s, 64.2%, 0.636L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:28<00:00,  4.49it/s, 35.9%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

35.9% Rejected new model
Current temp decayed to 0.04223513603210449
Checkpoint temp decayed to 0.04223513603210449
Iteration 11


Current vs. Current: 100%|██████████| 128/128 [00:32<00:00,  3.98it/s, 58.2%]
Optimizing: 112it [00:01, 72.51it/s, 61.9%, 0.667L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:28<00:00,  4.42it/s, 42.2%]
Current vs. Current:   1%|          | 1/128 [00:00<00:23,  5.45it/s, 100.0%]

42.2% Continuing to train current weights
Current temp decayed to 0.03167635202407837
Checkpoint temp decayed to 0.03167635202407837
Iteration 12


Current vs. Current: 100%|██████████| 128/128 [00:24<00:00,  5.22it/s, 57.4%]
Optimizing: 81it [00:01, 71.78it/s, 67.7%, 0.596L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:29<00:00,  4.33it/s, 57.8%]
Current vs. Current:   1%|          | 1/128 [00:00<00:22,  5.69it/s, 0.0%]

57.8% Continuing to train current weights
Current temp decayed to 0.023757264018058777
Checkpoint temp decayed to 0.023757264018058777
Iteration 13


Current vs. Current: 100%|██████████| 128/128 [00:25<00:00,  5.10it/s, 62.5%]
Optimizing: 78it [00:01, 72.32it/s, 78.1%, 0.456L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:31<00:00,  4.07it/s, 56.2%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

56.2% Continuing to train current weights
Current temp decayed to 0.017817948013544083
Checkpoint temp decayed to 0.017817948013544083
Iteration 14


Current vs. Current: 100%|██████████| 128/128 [00:30<00:00,  4.19it/s, 52.7%]
Optimizing: 108it [00:01, 72.81it/s, 58.4%, 0.695L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:29<00:00,  4.27it/s, 43.0%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

43.0% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 15


Current vs. Current: 100%|██████████| 128/128 [00:28<00:00,  4.46it/s, 60.2%]
Optimizing: 80it [00:01, 66.70it/s, 71.9%, 0.556L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:31<00:00,  4.11it/s, 77.7%]


77.7% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:19,  6.58it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:14<00:00,  8.93it/s, 96.9%]
Current vs. Greedy: 100%|██████████| 128/128 [00:28<00:00,  4.45it/s, 71.1%]
Current vs. Current:   1%|          | 1/128 [00:00<00:18,  6.93it/s, 100.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 16


Current vs. Current: 100%|██████████| 128/128 [00:28<00:00,  4.53it/s, 84.0%]
Optimizing: 88it [00:01, 72.15it/s, 83.3%, 0.446L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:30<00:00,  4.26it/s, 72.7%]


72.7% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:18,  6.85it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:15<00:00,  8.16it/s, 93.8%]
Current vs. Greedy: 100%|██████████| 128/128 [00:30<00:00,  4.26it/s, 50.4%]
Current vs. Current:   1%|          | 1/128 [00:00<00:21,  5.90it/s, 100.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 17


Current vs. Current: 100%|██████████| 128/128 [00:30<00:00,  4.18it/s, 66.8%]
Optimizing: 94it [00:01, 63.72it/s, 62.3%, 0.672L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:29<00:00,  4.40it/s, 63.3%]


63.3% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:16,  7.60it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:13<00:00,  9.59it/s, 100.0%]
Current vs. Greedy: 100%|██████████| 128/128 [00:27<00:00,  4.60it/s, 64.1%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 18


Current vs. Current: 100%|██████████| 128/128 [00:26<00:00,  4.88it/s, 57.0%]
Optimizing: 85it [00:01, 71.33it/s, 69.4%, 0.572L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:25<00:00,  4.96it/s, 75.8%]


75.8% Accepted new model


Current vs. Random:   2%|▏         | 2/128 [00:00<00:09, 13.55it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:12<00:00, 10.19it/s, 99.2%]
Current vs. Greedy: 100%|██████████| 128/128 [00:28<00:00,  4.49it/s, 85.9%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 19


Current vs. Current: 100%|██████████| 128/128 [00:23<00:00,  5.38it/s, 83.6%]
Optimizing: 75it [00:01, 71.66it/s, 83.1%, 0.425L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.81it/s, 52.0%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

52.0% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 20


Current vs. Current: 100%|██████████| 128/128 [00:29<00:00,  4.39it/s, 64.5%]
Optimizing: 104it [00:01, 69.08it/s, 66.0%, 0.640L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:24<00:00,  5.15it/s, 64.8%]


64.8% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:14,  8.78it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:12<00:00, 10.08it/s, 100.0%]
Current vs. Greedy: 100%|██████████| 128/128 [00:26<00:00,  4.90it/s, 80.5%]
Current vs. Current:   1%|          | 1/128 [00:00<00:20,  6.27it/s, 0.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 21


Current vs. Current: 100%|██████████| 128/128 [00:25<00:00,  5.03it/s, 67.2%]
Optimizing: 82it [00:01, 70.47it/s, 75.4%, 0.472L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:27<00:00,  4.63it/s, 58.2%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

58.2% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 22


Current vs. Current: 100%|██████████| 128/128 [00:29<00:00,  4.35it/s, 61.3%]
Optimizing: 100it [00:01, 70.66it/s, 62.3%, 0.676L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:26<00:00,  4.80it/s, 53.1%]
Current vs. Current:   1%|          | 1/128 [00:00<00:24,  5.24it/s, 100.0%]

53.1% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 23


Current vs. Current: 100%|██████████| 128/128 [00:27<00:00,  4.65it/s, 60.2%]
Optimizing: 82it [00:01, 69.28it/s, 73.4%, 0.515L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:28<00:00,  4.45it/s, 47.7%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

47.7% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 24


Current vs. Current: 100%|██████████| 128/128 [00:28<00:00,  4.51it/s, 35.2%]
Optimizing: 80it [00:01, 70.75it/s, 63.5%, 0.522L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:30<00:00,  4.16it/s, 57.0%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

57.0% Continuing to train current weights
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 25


Current vs. Current: 100%|██████████| 128/128 [00:34<00:00,  3.73it/s, 67.2%]
Optimizing: 105it [00:01, 72.13it/s, 67.1%, 0.652L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:27<00:00,  4.59it/s, 65.6%]


65.6% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:14,  8.89it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:14<00:00,  8.85it/s, 99.2%] 
Current vs. Greedy: 100%|██████████| 128/128 [00:28<00:00,  4.43it/s, 89.1%]
Current vs. Current:   1%|          | 1/128 [00:00<00:19,  6.64it/s, 100.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 26


Current vs. Current: 100%|██████████| 128/128 [00:27<00:00,  4.69it/s, 89.1%]
Optimizing: 68it [00:01, 57.59it/s, 88.4%, 0.318L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:32<00:00,  3.94it/s, 66.4%]


66.4% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:22,  5.61it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:15<00:00,  8.12it/s, 95.3%]
Current vs. Greedy: 100%|██████████| 128/128 [00:31<00:00,  4.12it/s, 46.9%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 27


Current vs. Current: 100%|██████████| 128/128 [00:35<00:00,  3.64it/s, 71.9%]
Optimizing: 106it [00:01, 69.63it/s, 64.2%, 0.704L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:31<00:00,  4.09it/s, 69.5%]


69.5% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:13,  9.42it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:13<00:00,  9.23it/s, 98.4%]
Current vs. Greedy: 100%|██████████| 128/128 [00:28<00:00,  4.45it/s, 80.5%]
Current vs. Current:   1%|          | 1/128 [00:00<00:21,  6.05it/s, 0.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 28


Current vs. Current: 100%|██████████| 128/128 [00:27<00:00,  4.70it/s, 64.1%]
Optimizing: 78it [00:01, 68.94it/s, 73.5%, 0.525L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:30<00:00,  4.19it/s, 68.0%]


68.0% Accepted new model


Current vs. Random:   2%|▏         | 2/128 [00:00<00:11, 10.56it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:15<00:00,  8.14it/s, 94.5%]
Current vs. Greedy: 100%|██████████| 128/128 [00:33<00:00,  3.82it/s, 51.2%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 29


Current vs. Current: 100%|██████████| 128/128 [00:35<00:00,  3.66it/s, 68.0%]
Optimizing: 112it [00:01, 67.95it/s, 64.4%, 0.657L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:23<00:00,  5.39it/s, 20.7%]
Current vs. Current:   0%|          | 0/128 [00:00<?, ?it/s]

20.7% Rejected new model
Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 30


Current vs. Current: 100%|██████████| 128/128 [00:31<00:00,  4.07it/s, 64.8%]
Optimizing: 111it [00:01, 71.93it/s, 66.6%, 0.627L]
Current vs. Checkpoint: 100%|██████████| 128/128 [00:31<00:00,  4.08it/s, 67.2%]


67.2% Accepted new model


Current vs. Random:   1%|          | 1/128 [00:00<00:12,  9.93it/s, 100.0%]

Plotted sample trajectory


Current vs. Random: 100%|██████████| 128/128 [00:13<00:00,  9.78it/s, 100.0%]
Current vs. Greedy: 100%|██████████| 128/128 [00:28<00:00,  4.54it/s, 90.6%]
Current vs. Current:   1%|          | 1/128 [00:00<00:23,  5.48it/s, 100.0%]

Current temp decayed to 0.015625
Checkpoint temp decayed to 0.015625
Iteration 31


Current vs. Current:  80%|███████▉  | 102/128 [00:22<00:05,  4.73it/s, 84.3%]

# Evaluate

Play against our AI

In [None]:
def set_temps(policy_list, temp):
    """Set the ``temp`` attribute of every policy in ``policy_list`` to ``temp``.

    Defined here because the notebook calls ``set_temps`` without defining or
    importing it, which raises NameError on a fresh kernel.
    """
    for policy in policy_list:
        assert hasattr(policy, 'temp')
        policy.temp = temp


# Temperature 0 is the setting the 'Greedy' baseline policy is constructed with.
# NOTE(review): presumably this makes move selection deterministic/greedy for the
# human game — confirm against QTempPolicy.
set_temps([curr_policy, checkpoint_policy], 0)

In [None]:
# Start from a fresh board, as the earlier game.pit cell does: the training cell
# may have been interrupted in the middle of a game, leaving the shared
# environment mid-episode.
go_env.reset()
game.pit(go_env, human_policy, checkpoint_policy, False)