In [1]:
from __future__ import print_function
from collections import deque
from tensorflow.python.framework import ops
from rl_reinforce import REINFORCEothello
from othello_net import *
from othello_rules import *
from feature_extractor import *
import os, random, sys, gym
import tensorflow as tf
import numpy as np
from datetime import datetime
from example_states import *
from training_utils import *
# Print NumPy floats with two decimals so dumped probability tables stay readable.
np.set_printoptions(precision=2)
# Clear TensorFlow's default graph (helps when this cell is re-run in a live kernel).
ops.reset_default_graph()

In [2]:
def sample_action(prediction):
    """Sample one legal move for the current position from a network output.

    The function reads the module-level globals ``board``, ``player``,
    ``prev_move`` and ``moves``; they must describe the position that
    ``prediction`` was computed for.

    Parameters
    ----------
    prediction : array-like
        Network output for a single position. ``prediction[0][..., 1]`` is
        used as the per-square move scores (the same slice the previous
        double-transpose selected).
        NOTE(review): presumably shaped (1, 8, 8, 2) -- confirm against
        ``create_othello_net``.

    Returns
    -------
    str
        The sampled entry of ``moves``.

    Raises
    ------
    ValueError
        If no positive, finite probability mass remains once
        ``zero_illegal_moves`` has been applied, i.e. there is nothing to
        sample from.
    """
    # Keep only index 1 of the last axis; identical to
    # np.transpose(np.transpose(prediction[0])[1]).
    scores = np.asarray(prediction[0])[..., 1]

    legal_moves = find_legal_moves(board, player)
    cleaned_predictions = zero_illegal_moves(scores, legal_moves)

    # Normalise in float64: np.random.multinomial casts pvals to double and
    # rejects vectors whose sum exceeds 1, which float32 rounding can cause.
    p = np.asarray(cleaned_predictions, dtype=np.float64).flatten()
    total = np.sum(p)
    if not np.isfinite(total) or total <= 0:
        raise ValueError(
            "sample_action: no probability mass left on legal moves "
            "(sum of cleaned predictions is %r)" % (total,))
    p = p / total

    # A single multinomial draw is a one-hot vector; its argmax is the sample.
    sample_index = int(np.argmax(np.random.multinomial(1, p)))
    sampled_move = moves[sample_index]

    if prev_move == sampled_move:
        # Repeating the previous move is treated as suspicious by the caller,
        # so dump the distributions that produced it for debugging.
        print("action probabilities")
        print(p)
        print(scores)
        print(cleaned_predictions)
    return sampled_move

# Lookup table from a flattened board index (row * 8 + col, both 0-based) to
# the move string "<row+1><col+1>", i.e. '11' for the first square and '88'
# for the last one.
moves = [
    str(10 * row + col)
    for row in range(1, 9)
    for col in range(1, 9)
]
        

# --- Training schedule ---------------------------------------------------
batches = 1000          # number of iterations of the outer training loop
games_per_batch = 10    # initial value; the training loop reassigns it every batch
test_game_batch = 20    # every test_game_batch-th batch is an evaluation batch

# --- Checkpoints ---------------------------------------------------------
initial_a_model = "models/rl-p-a/a_player_0.ckpt"
initial_b_model = "models/rl-p-b/b_player_0.ckpt"

# Index of the next checkpoint to write; the learner resumes from cnt - 1.
cnt = 90

# --- Rollout buffers -----------------------------------------------------
# Moves collected since the last model update, kept in three parallel lists.
action_buffer, state_buffer, reward_buffer = [], [], []
# REINFORCE self-play training loop.
#
# Each batch:
#   1. builds the main learner (net 1) and restores checkpoint cnt - 1,
#   2. builds an opponent (net 2): the fixed test model on evaluation batches,
#      otherwise a random checkpoint from the opponent pool,
#   3. plays N games between them, swapping sides after every game,
#   4. prints both win rates,
#   5. on non-evaluation batches: updates and saves the learner, then trains
#      the newest pool member on the same rollouts and saves it too.
#
# The game loop must stay at module level: sample_action() reads `board`,
# `player` and `prev_move` as globals.
print("starting rl policy network")
for batch in range(batches):
    # TODO: Choose the initial b model as the opponent once in a while to see how
    # well our learner does against it now
    is_test_batch = batch % test_game_batch == 0

    ops.reset_default_graph()
    # Define the variables for our main learner
    graph_1, img_data_1, train_step_1, optimizer_1, ground_truths_1, loss_1, pred_up_1, keep_prob_1, learn_rate_1, score_out_1 = create_othello_net()
    sess_1 = tf.Session(graph=graph_1)
    saver_1 = tf.train.Saver()
    init_op_1 = tf.initialize_all_variables()
    sess_1.run(init_op_1)
    model_1 = "models/rl-p-a/a_player_" + str(cnt-1) + ".ckpt"
    if os.path.isfile(model_1):
        saver_1.restore(sess_1, model_1)
    rl_reinforce_1 = REINFORCEothello(sess_1,
                                      optimizer_1,
                                      learn_rate_1,
                                      keep_prob_1,
                                      loss_1,
                                      score_out_1,
                                      ground_truths_1,
                                      img_data_1)

    # Define the variables for the opponent. It only plays during the games;
    # no REINFORCE wrapper is attached to it here.
    graph_2, img_data_2, train_step_2, optimizer_2, ground_truths_2, loss_2, pred_up_2, keep_prob_2, learn_rate_2, score_out_2 = create_othello_net()
    sess_2 = tf.Session(graph=graph_2)
    saver_2 = tf.train.Saver()
    init_op_2 = tf.initialize_all_variables()
    sess_2.run(init_op_2)
    if is_test_batch:
        # Evaluation batch: more games against the fixed reference model.
        model_2 = 'models/tiny-rl-j.ckpt'
        games_per_batch = 100
    else:
        # Training batch: random opponent from the pool of saved b-players.
        all_files = os.listdir("models/rl-p-b/")
        no_meta = [name for name in all_files if not ('meta' in name or 'checkpoint' in name)]
        model_2 = 'models/rl-p-b/' + random.choice(no_meta)
        games_per_batch = 10
    if os.path.isfile(model_2):
        saver_2.restore(sess_2, model_2)

    N = games_per_batch
    # From here on graph_1 / graph_2 no longer hold tf.Graph objects: they are
    # reused as the colour (-1 or 1) each network plays in the current game.
    graph_1 = -1
    graph_2 = 1
    graph_1_wins = 0
    avg_vector_1 = []
    graph_2_wins = 0
    avg_vector_2 = []
    temp_reset = True
    for n in range(N):
        #TODO: Every once in a while play games against an independent
        # othello player and see how the standing is - this is the true
        # measure of the success of the RL part
        #TODO: Make a schedule as a function of current batch
        # then I have to let the REINFORCEothello take the learning rate
        # as a parameter
        board = initialize_game()
        player = -1
        prev_move = '00'
        # Reset per game so an aborted game can never reuse the previous
        # game's winner (or hit an undefined name on the very first game).
        winner = None
        input_batch = []
        label_batch = []
        mover_batch = []  # colour that played each stored move
        while True:
            legal_moves = find_legal_moves(board, player)
            if len(legal_moves) == 0:
                # The player to move cannot move; the pass check at the end
                # of the previous turn already ruled out a simple pass.
                winner = get_winner(board, 1, -1)
                if winner == graph_1:
                    graph_1_wins += 1
                if winner == graph_2:
                    graph_2_wins += 1
                break
            features = board_to_input(board, player)
            if player == graph_1:
                prediction = sess_1.run(pred_up_1, feed_dict={img_data_1:[features], keep_prob_1:1.0})
            else:
                prediction = sess_2.run(pred_up_2, feed_dict={img_data_2:[features], keep_prob_2:1.0})

            sampled_move = sample_action(prediction)
            if prev_move == sampled_move:
                # Same square twice in a row: treated as an illegal sample and
                # the game is abandoned without a winner.
                print("illegal action sampled:")
                print(sampled_move)
                print("in state")
                print(board)
                break
            prev_move = sampled_move
            label = prepare_data(move_to_label(sampled_move))
            input_batch.append(features)
            label_batch.append(label)
            mover_batch.append(player)

            board = make_move(board, sampled_move, player, debug=True)

            # Hand the turn to the other colour; if it has no legal move it
            # passes and the same colour moves again.
            player = -player
            legal_moves = find_legal_moves(board, player)
            if len(legal_moves) == 0:
                player = -player

        # Evaluation batches never train, so their games are not stored.
        if winner and not is_test_batch:
            # TODO: Write a prepare_training_batch function that also
            # does flips, use that here and in the supervised-learning script
            # but make sure that the rewards match the amount of augmentation!
            # append the same reward four times before flipping, or something
            # like this. Also remember to divide the learning rate then by 4.
            for state, action, mover in zip(input_batch, label_batch, mover_batch):
                # Reward by who actually made the move. Move-index parity is
                # not reliable because a pass lets one colour move twice.
                reward = 1 if mover == winner else -0.1
                rl_reinforce_1.storeRollout(state, action, reward)
                action_buffer.append(action)
                state_buffer.append(state)
                reward_buffer.append(reward)

        # Swap sides for the next game.
        graph_1 = graph_1 * (-1)
        graph_2 = graph_2 * (-1)

    # ---- end of batch: report, then (on training batches) update and save ----
    print('%s  %s wr: %.2f, %s wr: %.2f' % (datetime.now().strftime("%d. %H:%M:%S"), model_1[14:], graph_1_wins/float(N), model_2[14:], graph_2_wins/float(N)))
    graph_1_wins = 0
    graph_2_wins = 0

    if not is_test_batch:
        rl_reinforce_1.updateModel()
        save_path = saver_1.save(sess_1, "models/rl-p-a/a_player_" + str(cnt) + ".ckpt")

        # The play sessions are finished; release them before rebuilding.
        sess_1.close()
        sess_2.close()

        ops.reset_default_graph()
        # Define these variables for the tensorflow name scope
        # NOTE(review): net 1 is rebuilt but never restored here; presumably
        # only so that net 2 gets the same variable names as during play and
        # its checkpoints stay loadable -- confirm.
        graph_1, img_data_1, train_step_1, optimizer_1, ground_truths_1, loss_1, pred_up_1, keep_prob_1, learn_rate_1, score_out_1 = create_othello_net()
        sess_1 = tf.Session(graph=graph_1)
        saver_1 = tf.train.Saver()
        init_op_1 = tf.initialize_all_variables()
        sess_1.run(init_op_1)
        # Now define the graph for the newest addition into our opponent pool
        # we let him watch all the games that were played between the randomly chosen
        # opponent and our main learner, update and save as a new model
        graph_2, img_data_2, train_step_2, optimizer_2, ground_truths_2, loss_2, pred_up_2, keep_prob_2, learn_rate_2, score_out_2 = create_othello_net()
        sess_2 = tf.Session(graph=graph_2)
        saver_2 = tf.train.Saver()
        init_op_2 = tf.initialize_all_variables()
        sess_2.run(init_op_2)
        # XXX: "not finding the model for some reason" happens after every test
        # should rather have a cnt rather than let it go by batch
        model_2 = "models/rl-p-b/b_player_" + str(cnt-1) + ".ckpt"
        if os.path.isfile(model_2):
            saver_2.restore(sess_2, model_2)
        else:
            print("not finding the model for some reason")
        rl_reinforce_2 = REINFORCEothello(sess_2,
                                          optimizer_2,
                                          learn_rate_2,
                                          keep_prob_2,
                                          loss_2,
                                          score_out_2,
                                          ground_truths_2,
                                          img_data_2)
        for buff in range(len(action_buffer)):
            rl_reinforce_2.storeRollout(state_buffer[buff], action_buffer[buff], reward_buffer[buff])
        rl_reinforce_2.updateModel()
        save_path = saver_2.save(sess_2, "models/rl-p-b/b_player_" + str(cnt) + ".ckpt")
        cnt += 1
        action_buffer = []
        state_buffer = []
        reward_buffer = []

    # Sessions are rebuilt from scratch at the top of the next batch; close
    # the current ones so their resources are not leaked.
    sess_1.close()
    sess_2.close()

12. 12:19:54  a_player_89.ckpt wr: 0.38, -j.ckpt wr: 0.59
12. 12:20:07  a_player_89.ckpt wr: 0.20, b_player_45.ckpt wr: 0.80
12. 12:20:25  a_player_90.ckpt wr: 0.40, b_player_44.ckpt wr: 0.60
12. 12:20:43  a_player_91.ckpt wr: 0.40, b_player_53.ckpt wr: 0.60
12. 12:21:02  a_player_92.ckpt wr: 0.40, b_player_34.ckpt wr: 0.60
12. 12:21:19  a_player_93.ckpt wr: 0.60, b_player_47.ckpt wr: 0.40
12. 12:21:42  a_player_94.ckpt wr: 0.20, b_player_62.ckpt wr: 0.70
12. 12:22:02  a_player_95.ckpt wr: 0.40, b_player_28.ckpt wr: 0.60
12. 12:22:19  a_player_96.ckpt wr: 0.50, b_player_31.ckpt wr: 0.50
12. 12:22:37  a_player_97.ckpt wr: 0.30, b_player_35.ckpt wr: 0.70
12. 12:22:54  a_player_98.ckpt wr: 0.40, b_player_27.ckpt wr: 0.60
12. 12:23:11  a_player_99.ckpt wr: 0.40, b_player_97.ckpt wr: 0.60
12. 12:23:32  a_player_100.ckpt wr: 0.60, b_player_35.ckpt wr: 0.40
12. 12:23:50  a_player_101.ckpt wr: 0.40, b_player_84.ckpt wr: 0.60
12. 12:24:11  a_player_102.ckpt wr: 0.20, b_player_79.ckpt wr: 0.80
1

KeyboardInterrupt: 

In [3]:
#a_player_0 seems to win 40-49% of games against j (the test opponent models/tiny-rl-j.ckpt)

#trained 1-67    with 5e-5
#trained 68-81   with 2e-5
#12. 11:52:10  a_player_66.ckpt wr: 0.44, -j.ckpt wr: 0.50
#12. 11:56:59  a_player_70.ckpt wr: 0.42, -j.ckpt wr: 0.54
#12. 12:02:07  a_player_74.ckpt wr: 0.38, -j.ckpt wr: 0.54
#12. 12:06:43  a_player_78.ckpt wr: 0.36, -j.ckpt wr: 0.62
#trained 81-     with 3e-5 but 1/5 of the games per opponent