In [None]:
from __future__ import print_function
from collections import deque
from tensorflow.python.framework import ops
from rl_reinforce import REINFORCEothello
from othello_net import *
from othello_rules import *
from feature_extractor import *
import os
import tensorflow as tf
import numpy as np
import sys
import gym
from datetime import datetime
from example_states import *
from training_utils import *
# Print NumPy arrays with two digits of precision for the rest of the notebook.
np.set_printoptions(precision=2)
# Reset TensorFlow's default graph so the network built in a later cell does
# not pile up on top of ops left over from a previous run of the notebook.
ops.reset_default_graph()
def avg_error(data_path, sess):
    """Return the mean network loss over the recorded moves of a few matches.

    At most the first five matches returned by ``get_all_matches(data_path)``
    are replayed move by move. For every move the loss of the network is
    evaluated on a batch of one (features, label) pair, taken from the
    ``cache/validation`` ``.npy`` files when both exist and computed from the
    current board otherwise.

    The function reads the module-level names ``loss``, ``img_data``,
    ``ground_truths`` and ``keep_prob``, which the notebook creates together
    with the network.

    Args:
        data_path: location handed to ``get_all_matches``.
        sess: object whose ``run`` method evaluates ``loss`` (a
            ``tf.Session`` in this notebook).

    Returns:
        Sum of the per-move losses divided by their number, or NaN when no
        move was evaluated (no matches, or every match starts with move 0).
    """
    errors = []
    validation_matches = get_all_matches(data_path)
    #XXX: Delete this line when testing is faster
    validation_matches = validation_matches[0:5]
    for i, match in enumerate(validation_matches):
        # The first 8 entries of a record are skipped -- presumably a header;
        # the remainder is decoded as 60 'b' fields, one per possible move.
        raw_match_movelist = match[8:]
        unpacked_movelist = unpack('b'*60, raw_match_movelist)
        board = initialize_game()
        player = -1
        for move in unpacked_movelist:
            # A zero entry terminates the move list.
            if move == 0:
                break
            feature_path = 'cache/validation/features/features_' + str(i) + "_" + str(move) + ".npy"
            label_path = 'cache/validation/labels/labels_' + str(i) + "_" + str(move) + ".npy"
            if os.path.isfile(feature_path) and os.path.isfile(label_path):
                features = np.load(feature_path)
                label = np.load(label_path)
            else:
                features = board_to_input(board, player)
                label = prepare_data(move_to_label(move))

            # keep_prob is fed as 1.0 for evaluation.
            error = sess.run(loss, feed_dict={img_data: [features],
                                              ground_truths: [label],
                                              keep_prob: 1.0})
            errors.append(error)

            board = make_move(board, move, player)
            # Hand the turn to the opponent; compare with ==, not `is`, because
            # identity of ints is an implementation detail.
            player = -1 if player == 1 else 1
            # If the opponent has no legal move the turn goes straight back.
            if len(find_legal_moves(board, player)) == 0:
                player = -1 if player == 1 else 1

    if not errors:
        # Nothing was evaluated: return NaN explicitly instead of relying on
        # a 0/0 division (which also emitted a RuntimeWarning).
        return float('nan')
    return np.sum(errors) / len(errors)
def prediction_accuracy(data_path, len_games=5):
    """Return the average number of correctly predicted moves per match.

    Up to ``len_games`` matches returned by ``get_all_matches(data_path)`` are
    replayed. For every recorded move the network output ``pred_up`` is
    evaluated, illegal squares are zeroed and the arg-max square is compared
    with the recorded move. When the board equals one of its mirrored
    versions, the correspondingly mirrored move is accepted as well. Each
    move contributes at most one success.

    The function reads the module-level names ``sess``, ``pred_up``,
    ``img_data``, ``ground_truths`` and ``keep_prob``, which the notebook
    creates together with the network.

    Args:
        data_path: location handed to ``get_all_matches``.
        len_games: maximum number of matches to evaluate.

    Returns:
        Mean success count over the evaluated matches (not normalised by the
        match length), or NaN when no match was available.
    """
    successes = []
    validation_matches = get_all_matches(data_path)
    #XXX: Delete this line when testing is faster
    validation_matches = validation_matches[0:len_games]
    # Iterate over what is actually there; indexing with range(len_games)
    # raised IndexError when fewer matches were available.
    for game_index, test_match in enumerate(validation_matches):
        board = initialize_game()
        player = -1
        success = 0
        raw_match_movelist = test_match[8:]
        unpacked_movelist = unpack('b'*60, raw_match_movelist)
        for move in unpacked_movelist:
            # A zero entry terminates the move list.
            if move == 0:
                break

            feature_path = 'cache/validation/features/features_' + str(game_index) + "_" + str(move) + ".npy"
            label_path = 'cache/validation/labels/labels_' + str(game_index) + "_" + str(move) + ".npy"
            if os.path.isfile(feature_path) and os.path.isfile(label_path):
                features = np.load(feature_path)
                label = np.load(label_path)
            else:
                # NOTE(review): `training_stability` is not defined in this
                # notebook and avg_error calls board_to_input with two
                # arguments -- confirm which signature is intended.
                features = board_to_input(board, player, training_stability)
                label = prepare_data(move_to_label(move))

            prediction = sess.run(pred_up, feed_dict={img_data: [features],
                                                      ground_truths: [label],
                                                      keep_prob: 1.0})
            # Take the single batch element, then plane 1 of its transposed
            # output, and transpose that plane back.
            prediction = np.transpose(prediction[0])
            prediction = np.transpose(prediction[1])
            legal_moves = find_legal_moves(board, player)
            cleaned_predictions = zero_illegal_moves(prediction, legal_moves)
            # Dedicated names: unpacking into `i` used to clobber the game
            # index and corrupt the cache paths of all following moves.
            row, col = np.unravel_index(cleaned_predictions.argmax(),
                                        cleaned_predictions.shape)
            # Squares are written as two digits, 1-based row then column.
            move_argmax = str((row+1) * 10 + (col+1))

            hit = str(move) == move_argmax
            if not hit:
                # On a board that equals a mirrored copy of itself, the
                # mirrored move is equivalent, so accept it too.
                original_board = np.array(board)
                board_upright = np.transpose(original_board)
                board_upleft = np.rot90(np.rot90(board_upright))
                board_both_flips = np.transpose(board_upleft)
                hit = (
                    (np.array_equal(board, board_upright)
                     and move_argmax == flip_move_upright(move))
                    or (np.array_equal(board, board_upleft)
                        and move_argmax == flip_move_upleft(move))
                    or (np.array_equal(board, board_both_flips)
                        and move_argmax == flip_move_upright(flip_move_upleft(move)))
                )
            if hit:
                # Count once per move, even if several checks agree.
                success += 1

            board = make_move(board, move, player)
            # Hand the turn to the opponent (== rather than `is` for ints).
            player = -1 if player == 1 else 1
            # If the opponent has no legal move the turn goes straight back.
            if len(find_legal_moves(board, player)) == 0:
                player = -1 if player == 1 else 1
        successes.append(success)

    if not successes:
        # No match evaluated: return NaN explicitly instead of letting
        # np.mean([]) warn about an empty slice.
        return float('nan')
    return np.mean(successes)

In [None]:

# --- TensorFlow session, Othello network and REINFORCE wrapper --------------
sess = tf.Session()
(graph, img_data, train_step, optimizer, ground_truths,
 loss, pred_up, keep_prob, learn_rate, score_out) = create_othello_net()

# Arguments are positional; the order below is the order the wrapper is
# called with throughout this notebook.
rl_reinforce = REINFORCEothello(
    sess, optimizer, learn_rate, keep_prob,
    loss, score_out, ground_truths, img_data)

# Created after the network so its variables already exist; used only by the
# commented-out checkpoint line in the training loop.
saver = tf.train.Saver()

# --- Data locations and run settings ----------------------------------------
validation_path = "validation/"          # passed to avg_error / prediction_accuracy
matches = get_all_matches('training/')   # one record per training match
lenmatches = len(matches)
iterations = 20000                       # number of training-loop steps
prev_stop = 0                            # first step index of the loop
probs = 0.5                              # not read anywhere in this cell
# REINFORCE training: every step replays one recorded match, stores each
# (state, action, reward) triple and then updates the model once.
# The step index doubles as the match index, so stop at the end of the data
# instead of raising IndexError once `i` reaches len(matches).
last_step = min(prev_stop + iterations, lenmatches)
for i in range(prev_stop, last_step):
    # Report validation metrics every 100 steps.
    if i % 100 == 0:
        print('%s: Step %d: Validation error = %.2f' % (datetime.now(), i,
                                                      avg_error(validation_path, sess)))
        # prediction_accuracy returns successes per match; divide by 60, the
        # number of move slots in a record, to print a ratio.
        print('%s: Step %d: Prediction accuracy = %.3f' % (datetime.now(), i,
                                                      prediction_accuracy(validation_path)/float(60)))
        #save_path = saver.save(sess, current_model)
    current_match = matches[i]
    raw_match_movelist = current_match[8:]
    unpacked_movelist = unpack('b'*60, raw_match_movelist)

    board = initialize_game()
    player = -1

    # One training batch is all the data from one match
    input_batch = []
    label_batch = []
    mover_batch = []  # which player made each stored move
    for move in unpacked_movelist:
        # A zero entry terminates the move list.
        if move == 0:
            break
        feature_path = 'cache/training/features/features_' + str(i) + "_" + str(move) + ".npy"
        label_path = 'cache/training/labels/labels_' + str(i) + "_" + str(move) + ".npy"
        if os.path.isfile(feature_path) and os.path.isfile(label_path):
            try:
                features = np.load(feature_path)
                label = np.load(label_path)
            except Exception:
                # Unreadable cache file: recompute from the board. `Exception`
                # rather than a bare except so Ctrl-C still stops the run.
                print("data corruption in match " + str(i))
                features = board_to_input(board, player)
                label = prepare_data(move_to_label(move))
        else:
            features = board_to_input(board, player)
            label = prepare_data(move_to_label(move))
        input_batch.append(features)
        label_batch.append(label)
        mover_batch.append(player)

        board = make_move(board, move, player)
        # Hand the turn to the opponent (== rather than `is` for ints).
        player = -1 if player == 1 else 1
        # If the opponent has no legal move the turn goes straight back.
        legal_moves = find_legal_moves(board, player)
        if len(legal_moves) == 0:
            player = -1 if player == 1 else 1

    winner = get_winner(board, 1, -1)
    # A falsy winner (presumably a draw) yields no training signal.
    if winner:
        # Any truthy result other than -1 is treated as a win for player 1.
        winning_player = -1 if winner == -1 else 1
        for state, action, mover in zip(input_batch, label_batch, mover_batch):
            # Reward by the player who actually moved: move-index parity is
            # wrong as soon as somebody had to pass during the match.
            reward = 1 if mover == winning_player else -0.01
            rl_reinforce.storeRollout(state, action, reward)

        rl_reinforce.updateModel()


        

2016-10-09 22:52:35.412165: Step 0: Validation error = 36.43
2016-10-09 22:52:35.940237: Step 0: Prediction accuracy = 0.167


In [None]:
# Re-declare the data locations and reload the training matches so this cell
# does not depend on the assignments made in the training cell.
validation_path = "validation/"
matches = get_all_matches('training/')
lenmatches = len(matches)
print(lenmatches)

# Let's start the training
# NOTE(review): `sess` is only created in the training cell, so on a top-to-bottom
# run this prints the error after that cell's loop, not before any training.
print("starting error:" + str(avg_error(validation_path, sess)))

## 