In [171]:
from tictactoe import TicTacToeEnv
import random
import gym

from keras.models import Sequential
from keras import layers
from keras import losses
from keras import optimizers
from keras import initializers
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt

In [206]:
# Shared tic-tac-toe environment used by every cell below.
# NOTE(review): the 'tictactoe-v0' id is presumably registered as a side effect
# of the `tictactoe` import at the top of the notebook - confirm.
env = gym.make('tictactoe-v0')

def create_agent_model(learning_rate):
    """Build and compile the Q-value network for one tic-tac-toe agent.

    The network is a three-layer fully connected model with nine outputs,
    one estimated Q-value per board cell. It is compiled with the Adam
    optimizer and a mean-squared-error loss.

    Args:
        learning_rate: Step size handed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    def small_normal():
        # A fresh initializer per layer; small weights keep the initial
        # Q-value estimates close to zero.
        return initializers.RandomNormal(stddev=0.01)

    model = Sequential()

    # input = one hot encoding of the board state (3*9 = 27 inputs)
    # TODO: Use 2-d inputs so the encoding & relation between cells does not have to be learned by the model
    # The hidden layers need a non-linearity: without one, the stacked Dense
    # layers collapse into a single linear map of the input.
    model.add(layers.Dense(27, activation="relu", kernel_initializer=small_normal()))
    model.add(layers.Dense(27, activation="relu", kernel_initializer=small_normal()))
    # The output layer stays linear because Q-values are unbounded
    # regression targets (rewards can be negative).
    model.add(layers.Dense(9, kernel_initializer=small_normal()))

    adam = optimizers.Adam(learning_rate=learning_rate)
    loss = losses.MeanSquaredError()
    model.compile(adam, loss)

    return model

class Agent:
    """Epsilon-greedy player that chooses moves from a Q-value model."""

    def __init__(self, model, environment=None):
        """Create an agent around ``model``.

        Args:
            model: Object whose ``predict`` returns one row of Q-values per
                input board.
            environment: Optional default environment, used only when
                ``_get_best_action`` is called without one. When omitted the
                notebook-level ``env`` is used, as before.
        """
        self.model = model
        # Fallback environment; get_action() always prefers the one it is given.
        self.env = environment if environment is not None else env
        # Probability of playing a uniformly random move instead of the greedy one.
        self.random_rate = 0

    def set_random_rate(self, rate):
        """Set the probability (0..1) of choosing a random move."""
        self.random_rate = rate

    def get_action(self, env):
        """Return the cell index (0-8) to play on ``env``.

        With probability ``random_rate`` a uniformly random cell is returned
        (which may be occupied); otherwise the cell with the highest predicted
        Q-value for the current board of ``env``.
        """
        if (self.random_rate > 0) and (random.random() < self.random_rate):
            return self._get_random_action()
        # Use the environment the caller passed in, not a module-level global.
        return self._get_best_action(env)

    def _get_best_action(self, env=None):
        """Return the index of the largest predicted Q-value.

        Args:
            env: Environment whose board is evaluated; defaults to ``self.env``.
        """
        board_source = self.env if env is None else env
        q_values = self.model.predict(np.array([board_source._one_hot_board()]))
        # predict() yields one row per board; a single board was passed in.
        return int(np.argmax(q_values, axis=1)[0])

    def _get_random_action(self):
        """Return a uniformly random cell index; it may be an invalid move."""
        # truly random, so could yield invalid moves
        return random.randint(0, 8)

    #TODO: get random *valid* action and get best *valid* action (to avoid invalid moves when not training)

In [207]:
def record_experience(env, agent1, agent2):
    """Play one full game and return its transitions.

    The environment is reset, then the agent selected by
    ``env.current_player`` (0 -> ``agent1``, anything else -> ``agent2``)
    moves until ``env.step`` reports the game as done.

    Returns:
        A list of ``(state, action, reward, next_state, done)`` tuples, one
        per move, in the order the moves were played.
    """
    env.reset()
    transitions = []
    finished = False
    while not finished:
        mover = agent2 if env.current_player != 0 else agent1

        chosen = mover.get_action(env)
        # Snapshot the board before the move is applied.
        before = env._one_hot_board()

        after, reward, finished, _info = env.step(chosen)
        transitions.append((before, chosen, reward, after, finished))
    return transitions

In [208]:
# One independent network per player, both compiled with learning rate 0.01.
model1 = create_agent_model(0.01)
model2 = create_agent_model(0.01)
# agent1 is used when env.current_player == 0, agent2 otherwise (see record_experience).
agent1 = Agent(model1)
agent2 = Agent(model2)

In [209]:
#  work in progress

def split_experiences(experiences):
    """Split a game's transitions into the two players' shares.

    Entries at even positions (0, 2, 4, ...) go to the first result, entries
    at odd positions to the second; the callers treat these as the moves of
    agent1 and agent2 respectively.
    """
    even_positions = experiences[0::2]
    odd_positions = experiences[1::2]
    return even_positions, odd_positions
    
def train_model(experiences, model, verbose=False, gamma=0.95, epochs=10):
    """Fit ``model`` towards one-step Q-learning targets.

    For every transition the Q-value of the action actually taken is replaced
    by ``reward + gamma * max(Q(next_state))``; for terminal transitions the
    bootstrap term is dropped and the target is just ``reward``. All other
    Q-values keep the model's current prediction, so they contribute no error.

    Args:
        experiences: Sequence of ``(state, action, reward, next_state, done)``
            tuples. An empty sequence is a no-op.
        model: Object providing Keras-style ``predict`` and ``fit``.
        verbose: Forwarded to ``model.fit``.
        gamma: Discount factor applied to the next state's best Q-value.
        epochs: Number of epochs passed to ``model.fit``.

    Returns:
        None.
    """
    if len(experiences) == 0:
        # Nothing to learn from; predicting on an empty batch would fail.
        return

    start_states = np.array([e[0] for e in experiences])
    actions = np.array([e[1] for e in experiences], dtype=np.intp)
    rewards = np.array([e[2] for e in experiences], dtype=float)
    next_states = np.array([e[3] for e in experiences])
    # Only use the next Q value if the game is not done
    use_next = np.array([0.0 if e[4] else 1.0 for e in experiences])

    # Copy so the targets can be edited in place whatever predict() returns.
    Q = np.array(model.predict(start_states))
    nextQ = np.asarray(model.predict(next_states))

    # NOTE(review): next_state is the board right after this agent's move, so
    # presumably it is the opponent's turn there - confirm that bootstrapping
    # from max(Q(next_state)) is the intended target.
    targets = rewards + gamma * use_next * nextQ.max(axis=1)

    # Pairing a row index with each action updates exactly one cell per row;
    # Q[:, actions] would instead select whole columns.
    Q[np.arange(len(Q)), actions] = targets

    model.fit(x=start_states, y=Q, epochs=epochs, verbose=verbose)
    
def train_game_per_game(env, agent1, agent2, num_batches = 100, batch_size = 100):
    """Alternate between self-play and training for ``num_batches`` rounds.

    Each round plays ``batch_size`` games with ``record_experience``, splits
    every game's transitions between the two agents, trains each agent's
    model once on its share of the whole batch, and prints a one-line summary.

    Args:
        env: Environment exposing ``get_winner()`` and ``current_player``.
        agent1: Player moving when ``env.current_player == 0``.
        agent2: The other player.
        num_batches: Number of play-then-train rounds.
        batch_size: Games played per round.

    Returns:
        None; progress is reported via ``print``.
    """
    for batch_num in range(num_batches):
        # All statistics are per batch, so they are (re)initialised here only.
        moves = 0
        agent1_wins = 0
        agent2_wins = 0
        agent1_fail = 0
        agent2_fail = 0
        draws = 0

        agent1_experience_batch = []
        agent2_experience_batch = []
        for _ in range(batch_size):
            experience = record_experience(env, agent1, agent2)
            final_reward = experience[-1][2]

            #print actions when it was a draw (0 score)
            if final_reward == 0:
                print("actions in game", [e[1] for e in experience])

            winner = env.get_winner()
            if winner == 0:
                agent1_wins += 1
            elif winner == 1:
                agent2_wins += 1
            elif final_reward == -3:
                # A final reward of -3 is counted as a failed (invalid) move.
                # NOTE(review): assumes env.current_player still identifies
                # the offending player once the game has ended - confirm.
                if env.current_player == 0:
                    agent1_fail += 1
                else:
                    agent2_fail += 1
            else:
                draws += 1

            moves += len(experience)

            e1, e2 = split_experiences(experience)
            agent1_experience_batch += e1
            agent2_experience_batch += e2

        # Train each model once per batch; skip an agent that collected no
        # transitions (e.g. batch_size == 0).
        if agent1_experience_batch:
            train_model(agent1_experience_batch, agent1.model)
        if agent2_experience_batch:
            train_model(agent2_experience_batch, agent2.model)

        print("batch %3d / %4d moves / %4d games / %4d a1 wins / %4d a1 fails / %4d a2 wins / %4d a2 fails / %4d draws "%(
              batch_num, moves, batch_size, agent1_wins, agent1_fail, agent2_wins, agent2_fail, draws))
    
# Let both agents play a random move 10% of the time while training,
# then run 100 batches of 10 games each.
agent1.set_random_rate(0.1)
agent2.set_random_rate(0.1)
train_game_per_game(env, agent1, agent2, 100, 10)


batch   0 /   49 moves /   10 games /    0 a1 wins /    9 a1 fails /    0 a2 wins /    1 a2 fails /    0 draws 
batch   1 /   30 moves /   10 games /    0 a1 wins /   10 a1 fails /    0 a2 wins /    0 a2 fails /    0 draws 
batch   2 /   34 moves /   10 games /    0 a1 wins /   10 a1 fails /    0 a2 wins /    0 a2 fails /    0 draws 
batch   3 /   47 moves /   10 games /    0 a1 wins /    9 a1 fails /    0 a2 wins /    1 a2 fails /    0 draws 
batch   4 /   37 moves /   10 games /    0 a1 wins /    7 a1 fails /    0 a2 wins /    3 a2 fails /    0 draws 
batch   5 /   47 moves /   10 games /    0 a1 wins /    7 a1 fails /    0 a2 wins /    3 a2 fails /    0 draws 
batch   6 /   31 moves /   10 games /    0 a1 wins /    9 a1 fails /    0 a2 wins /    1 a2 fails /    0 draws 
batch   7 /   30 moves /   10 games /    0 a1 wins /    8 a1 fails /    0 a2 wins /    2 a2 fails /    0 draws 
batch   8 /   32 moves /   10 games /    0 a1 wins /    8 a1 fails /    0 a2 wins /    2 a2 fails /    0

KeyboardInterrupt: 

In [None]:
def play_game(env, agent1, agent2, correct_invalid_actions = False):
    """Play one game between the two agents, rendering every position.

    Args:
        env: Environment exposing ``reset``, ``render``, ``step``,
            ``current_player``, ``_one_hot_board`` and a ``board`` sequence
            in which empty cells are ``None``.
        agent1: Player moving when ``env.current_player == 0``.
        agent2: The other player.
        correct_invalid_actions: When true, a move onto an occupied cell is
            replaced by a random free cell before it reaches the environment.
            When false (the default) the move is passed through unchanged.

    Returns:
        None; the game is shown via ``env.render()`` and ``print``.
    """
    done = False
    env.reset()
    env.render()
    while not done:
        agent = agent1 if env.current_player == 0 else agent2
        action = agent.get_action(env)
        print("prediction", agent.model.predict(np.array([env._one_hot_board()])))
        print("action:", action)
        if correct_invalid_actions and env.board[action] is not None:
            # Free cells are the ones still holding None.
            valid_actions = [cell for cell, owner in enumerate(env.board) if owner is None]
            if valid_actions:
                action = random.choice(valid_actions)
                print("Agent picked invalid action. Adjusting with random value")
        obs,reward,done,info = env.step(action)
        env.render()

In [None]:
# Switch off random moves so both agents always play their highest-valued
# action, then play and render a single game.
agent1.set_random_rate(0)
agent2.set_random_rate(0)
play_game(env, agent1, agent2)

In [30]:
# Scratch: toy next-state values (2 rows x 3 columns) and a 0/1 flag per row,
# used below to try out masking.
nextQ = np.array([[1,2,3], [4,5,6]])
use_next = np.array([1,0])
use_next

array([1, 0])

In [145]:
# Scratch: small 2x3 array standing in for a batch of Q-values.
Q = np.array([[1,2,3], [4,5,6]])
Q

array([[1, 2, 3],
       [4, 5, 6]])

In [137]:
# Scratch: redefine the toy next-state values (still 2 rows x 3 columns).
nextQ  = np.array([[2,3,4],[5,6,7]])
nextQ

array([[2, 3, 4],
       [5, 6, 7]])

In [138]:
# Turn the per-row flag into a column vector so it broadcasts across the
# columns: rows whose flag is 0 are zeroed out.
nextQ *  use_next[:, np.newaxis]

array([[2, 3, 4],
       [0, 0, 0]])

In [157]:
# Scratch: toy action indices (presumably meant as one column index per row of Q).
actions = np.array([1, 2])
actions

array([1, 2])

In [142]:
# Scratch: toy rewards. Note the double brackets give shape (1, 2), not (2,).
rewards = np.array([[2, 0]])

In [154]:
# Indexing with a single integer array selects along axis 0 (rows), so the
# value 2 is out of range for a 2-row array. This is not a per-row column
# lookup; that needs a row index paired with each action.
Q[actions]

IndexError: index 2 is out of bounds for axis 0 with size 2

In [158]:
# Broadcasting `actions` against the identity keeps each action value only on
# the diagonal.
np.eye(2)  *  actions

array([[1., 0.],
       [0., 2.]])

In [160]:
# NOTE: numpy has no top-level `rand`; the function is np.random.rand and it
# takes the dimensions as separate arguments (see the next cell).
np.rand([4,5])

AttributeError: module 'numpy' has no attribute 'rand'

In [170]:
# Scratch: random 3x4 array standing in for Q-values.
Q  =  np.random.rand(3,4)
Q

array([[0.52275479, 0.64401731, 0.27550146, 0.15703939],
       [0.25376878, 0.51209294, 0.63056784, 0.99384943],
       [0.40274882, 0.16589087, 0.65885759, 0.09049816]])

In [166]:
# Scratch: toy list of four column indices.
actions = [2,3,0,1]

In [169]:
# Q[:, actions] picks the listed columns for every row; multiplying by the
# identity then keeps only the diagonal entries.
# NOTE(review): the recorded 4x4 output implies Q had 4 rows when this cell
# ran (its execution count precedes the cell that builds the 3x4 Q) - confirm.
np.eye(4) * Q[:, actions]

array([[0.59155955, 0.        , 0.        , 0.        ],
       [0.        , 0.4837838 , 0.        , 0.        ],
       [0.        , 0.        , 0.63966218, 0.        ],
       [0.        , 0.        , 0.        , 0.44093435]])

In [178]:
# np.ix_ only accepts 1-D sequences; passing the 2-D array Q triggers the
# "Cross index must be 1 dimensional" error recorded below.
np.ix_(np.array(actions), Q)

ValueError: Cross index must be 1 dimensional

In [190]:
# np.ix_ builds an open mesh: a column vector from the first sequence and a
# row vector from the second, which together index a sub-block.
np.ix_([0,1],[1,2])

(array([[0],
        [1]]),
 array([[1, 2]]))

In [197]:
# Pair every position with its action index, then hand each (position, action)
# pair to np.ix_ as a separate 1-D sequence.
k = list(enumerate(actions))
k
a = np.ix_(*k)

(array([[[[0]]],
 
 
        [[[2]]]]),
 array([[[[1]],
 
         [[3]]]]),
 array([[[[2],
          [0]]]]),
 array([[[[3, 1]]]]))

In [198]:
# NOTE(review): `a` is only assigned in the np.ix_(*k) cell; the recorded
# NameError suggests that cell had not been run in this kernel session.
Q[a]

NameError: name 'a' is not defined