In [1]:
from __future__ import print_function
from collections import deque
from tensorflow.python.framework import ops
from rl_reinforce import REINFORCEothello
from othello_net import *
from othello_rules import *
from feature_extractor import *
import os
import tensorflow as tf
import numpy as np
import sys
import gym
from datetime import datetime
from example_states import *
from training_utils import *
# Show numpy arrays with two decimal places when printed (affects every
# array print in this notebook, e.g. the probability dumps in sample_action).
np.set_printoptions(precision=2)
# Discard whatever is in TensorFlow's default graph so the networks built
# below start from an empty graph, even when the kernel is reused.
ops.reset_default_graph()

In [2]:
def sample_action(prediction):
    """Sample one move for the current player from the network output.

    Reads the module-level globals ``board``, ``player``, ``moves`` and
    ``prev_move``; the caller must have set them before calling.

    Args:
        prediction: Array returned by ``sess.run(pred_up, ...)`` for a batch
            of one position. Only ``prediction[0]`` is used, and from it the
            slice at index 1 along its last axis.
            NOTE(review): presumably that slice is the 8x8 move-probability
            map -- confirm against ``create_othello_net``.

    Returns:
        The entry of ``moves`` whose flat index was drawn.

    Raises:
        ValueError: If no positive, finite probability mass is left after
            the illegal moves have been zeroed out.
    """
    # Drop the batch axis, then select index 1 along the last axis:
    # transpose -> index the (now leading) axis -> transpose back.
    prediction = np.transpose(prediction[0])
    prediction = np.transpose(prediction[1])
    legal_moves = find_legal_moves(board, player)
    cleaned_predictions = zero_illegal_moves(prediction, legal_moves)

    # Normalise in float64: np.random.multinomial checks sum(pvals[:-1]) in
    # double precision, and lower-precision probabilities can exceed 1.0 by
    # rounding error and be rejected.
    p = np.asarray(cleaned_predictions, dtype=np.float64).flatten()
    total = np.sum(p)
    if not np.isfinite(total) or total <= 0:
        raise ValueError(
            "cannot sample a move: no probability mass on legal moves "
            "(sum of cleaned predictions is %r)" % (total,))
    p = p / total

    # One draw from the categorical distribution p; the single non-zero
    # entry of the one-hot result is the flat board index.
    sample_index = np.flatnonzero(np.random.multinomial(1, p, 1))[0]
    sampled_move = moves[sample_index]
    if prev_move == sampled_move:
        # Same move as last time: dump the distributions for debugging.
        print("action probabilities")
        print(p)
        print(prediction)
        print(cleaned_predictions)
    return sampled_move

# Lookup table from flat board index (row-major, 0..63) to the move string
# made of the 1-based row digit followed by the 1-based column digit,
# e.g. index 0 -> '11', index 63 -> '88'.
moves = [str(10 * row + col) for row in range(1, 9) for col in range(1, 9)]

# Build the two policy networks that play against each other, open a session
# for each, and load saved weights when a checkpoint file is present.
ops.reset_default_graph()

# --- Network 1: the one that is trained (it is handed to REINFORCEothello) ---
graph_1, img_data_1, train_step_1, optimizer_1, ground_truths_1, loss_1, pred_up_1, keep_prob_1, learn_rate_1, score_out_1 = create_othello_net()
sess_1 = tf.Session(graph=graph_1)
# NOTE(review): Saver() and initialize_all_variables() are created against the
# current default graph, not explicitly against graph_1 -- this presumably
# relies on create_othello_net() building into the default graph; confirm.
# NOTE(review): tf.initialize_all_variables is deprecated in later TF 1.x in
# favour of tf.global_variables_initializer.
saver_1 = tf.train.Saver()
init_op_1 = tf.initialize_all_variables()
sess_1.run(init_op_1)
model_1 = "models/rl-p-a/tiny-rl-m.ckpt"
# Restore only when a file with exactly this name exists; otherwise the
# freshly initialised weights are kept and nothing is reported.
if os.path.isfile(model_1):
    saver_1.restore(sess_1, model_1)
# REINFORCE trainer wired to network 1's session, optimizer and tensors.
rl_reinforce = REINFORCEothello(sess_1,
                                       optimizer_1,
                                       learn_rate_1,
                                       keep_prob_1,
                                       loss_1,
                                       score_out_1,
                                       ground_truths_1,
                                       img_data_1)

# --- Network 2: the opponent; it is only used for inference below ---
graph_2, img_data_2, train_step_2, optimizer_2, ground_truths_2, loss_2, pred_up_2, keep_prob_2, learn_rate_2, score_out_2 = create_othello_net()
sess_2 = tf.Session(graph=graph_2)
# NOTE(review): created after the second create_othello_net() call, so if both
# networks live in the same default graph this Saver/initializer presumably
# covers the variables of both networks -- confirm.
saver_2 = tf.train.Saver()
init_op_2 = tf.initialize_all_variables()
sess_2.run(init_op_2)
model_2 = "models/rl-p-b/tiny-rl-n.ckpt"
# Same silent fallback as for network 1 when the checkpoint file is missing.
if os.path.isfile(model_2):
    saver_2.restore(sess_2, model_2)
    
# ---------------------------------------------------------------------------
# Self-play: network 1 plays N games against network 2, swapping colours after
# every game. Every position/move pair is stored in rl_reinforce, and once the
# warm-up games are over network 1 is updated after each decided game.
# ---------------------------------------------------------------------------
N = 601                 # number of self-play games
REPORT_INTERVAL = 100   # games per statistics window / checkpoint
LEARNING_START = 100    # updateModel() is only called for games with n > this
WIN_REWARD = 1          # reward for every move made by the eventual winner
LOSS_REWARD = -0.1      # reward for every move made by the eventual loser

# From here on graph_1 / graph_2 hold the colour (-1 or 1) that network 1 /
# network 2 plays in the current game; they no longer refer to the TF graphs.
graph_1 = -1
graph_2 = 1
graph_1_wins = 0
avg_vector_1 = []       # per-window win counts of network 1
graph_2_wins = 0
avg_vector_2 = []       # per-window win counts of network 2
temp_reset = True       # True until the pre-learning baseline has been printed
print("starting self play")
for n in range(N):
    # Use == for integer comparisons throughout: `is` tests object identity
    # and only happens to work for small ints in CPython.
    if n % REPORT_INTERVAL == 0 and n > 0:
        print('%s, Step %d: first = %s, second = %s' % (datetime.now().strftime("%d. %H:%M:%S"), n,
                                              graph_1_wins, graph_2_wins))
        save_path = saver_1.save(sess_1, "models/tiny-selfplay-1-1.ckpt")
        avg_vector_1.append(graph_1_wins)
        avg_vector_2.append(graph_2_wins)

        graph_1_wins = 0
        graph_2_wins = 0

    board = initialize_game()
    player = -1             # colour -1 always makes the first move
    prev_move = '00'        # read by sample_action() through the global
    # Reset per game so that a game aborted below is neither scored nor
    # trained on with the previous game's winner.
    winner = None
    input_batch = []        # network inputs, one per move played
    label_batch = []        # matching move labels
    mover_batch = []        # colour that made each stored move
    while True:
        legal_moves = find_legal_moves(board, player)
        if len(legal_moves) == 0:
            # The player to move cannot move (a pass was already tried after
            # the previous move), so the game is over.
            winner = get_winner(board, 1, -1)
            if winner == graph_1:
                graph_1_wins += 1
            if winner == graph_2:
                graph_2_wins += 1
            break

        features = board_to_input(board, player)
        # keep_prob = 1.0 disables dropout for inference.
        if player == graph_1:
            prediction = sess_1.run(pred_up_1, feed_dict={img_data_1:[features], keep_prob_1:1.0})
        else:
            prediction = sess_2.run(pred_up_2, feed_dict={img_data_2:[features], keep_prob_2:1.0})

        sampled_move = sample_action(prediction)
        if prev_move == sampled_move:
            # Repeating the previous move is treated as an illegal sample:
            # dump the state and abandon this game.
            print("illegal action sampled:")
            print(sampled_move)
            print("in state")
            print(board)
            print("current player is " + str(player))
            print("legal moves for this player are")
            print(find_legal_moves(board, player))
            break
        prev_move = sampled_move

        label = prepare_data(move_to_label(sampled_move))
        input_batch.append(features)
        label_batch.append(label)
        mover_batch.append(player)

        board = make_move(board, sampled_move, player, debug=True)

        # Hand the turn to the opponent; if the opponent has no legal move it
        # must pass, so the turn comes straight back.
        player = -player
        legal_moves = find_legal_moves(board, player)
        if len(legal_moves) == 0:
            player = -player

    # Falsy winner (None after an aborted game, or a falsy result from
    # get_winner) means there is nothing to learn from this game.
    if winner:
        for state, action, mover in zip(input_batch, label_batch, mover_batch):
            # Reward by who actually made the move. Move-index parity is not
            # reliable because a pass lets the same colour move twice in a row.
            reward = WIN_REWARD if mover == winner else LOSS_REWARD
            rl_reinforce.storeRollout(state, action, reward)
        if n > LEARNING_START:
            if temp_reset:
                # First learning step: report the baseline win rates gathered
                # so far and start the statistics afresh.
                print('%s, Results before learning : first wr = %.2f, second wr = %.2f' % (datetime.now().strftime("%d. %H:%M:%S"), 
                                      np.mean(avg_vector_1)/float(REPORT_INTERVAL), np.mean(avg_vector_2)/float(REPORT_INTERVAL)))
                avg_vector_1 = []
                avg_vector_2 = []
                temp_reset = False
            rl_reinforce.updateModel()
            # TODO: rather name the players "a" and "b" (or similar), and
            # consider periodically saving network 1 back to model_1.

    # Swap colours so each network alternates between moving first and second.
    graph_1 = -graph_1
    graph_2 = -graph_2
print('%s, Results after learning : first wr = %.2f, second wr = %.2f' % (datetime.now().strftime("%d. %H:%M:%S"), 
                      np.mean(avg_vector_1)/float(REPORT_INTERVAL), np.mean(avg_vector_2)/float(REPORT_INTERVAL)))

starting self play
11. 20:15:01, Step 100: first = 25, second = 73
11. 20:15:03, Results before learning : first wr = 0.25, second wr = 0.73
11. 20:16:37, Step 200: first = 35, second = 62
11. 20:17:59, Step 300: first = 43, second = 53
11. 20:19:25, Step 400: first = 48, second = 47
11. 20:20:55, Step 500: first = 44, second = 54
11. 20:22:35, Step 600: first = 45, second = 54
11. 20:22:37, Results after learning : first wr = 0.43, second wr = 0.54
