In [32]:
import numpy as np
import random

class State():
    def __init__(self, string_or_array):
        if type(string_or_array) == str:
            self.string = string_or_array
            self.array = np.array(list(string_or_array)).reshape(3,3)
        elif type(string_or_array) == np.ndarray:
            self.array = string_or_array
            self.string = ''.join(string_or_array.flatten())
    
    def board(self):
        return " {0} | {1} | {2} \n-----------\n {3} | {4} | {5} \n-----------\n {6} | {7} | {8} ".format(*self.array.flatten())
    
    def player(self):
        if (self.array=='x').sum() > (self.array=='o').sum():
            return 'o'
        else:
            return 'x'
    
    def winner(self):
        x = (self.array == 'x')
        o = (self.array == 'o')
    
        x_triples = x.prod(axis=0).sum() + x.prod(axis=1).sum() + np.diag(x).prod() + np.diag(x.transpose()).prod()
        o_triples = o.prod(axis=0).sum() + o.prod(axis=1).sum() + np.diag(o).prod() + np.diag(o.transpose()).prod()
        
        if x_triples > 0 and o_triples == 0:
            return 'x'
        elif o_triples > 0 and x_triples == 0:
            return 'o'
        else:
            return False
    
    def actions(self):
        if not self.winner():
            return zip(np.where(self.array == ' ')[0], np.where(self.array == ' ')[1])
        else:
            return []
    
    def is_terminal(self):
        return len(self.actions())==0
    
    def transitions(self):
        return [{'player': self.player(), 'action': action} for action in self.actions()]
    
    def result(self,transition):
        newstate = self.array.copy()
        newstate[transition['action']] = transition['player']
        return newstate
    
class Tree():
    def __init__(self):
        nodes = {0: ['         ']}
        edges = {}
        
        for i in range(1,10):
            newnodes = []
            newedges = []
            for istring, string in enumerate(nodes[i-1]):
                #start_time = time.clock()
                state = State(string)
                newedges.append([])
                for itransition, transition in enumerate(state.transitions()):
                    newstate = State(state.result(transition))
                    newstring = newstate.string
                    try:
                        newedges[istring].append(newnodes.index(newstring))
                    except:
                        newnodes.append(newstring)
                        newedges[istring].append(itransition)
                nodes[i] = newnodes
                edges[i-1] = newedges
                #end_time = time.clock()
            
            #print '   level {0} constructed | duration: {1} | N: {2}'.format(*[i,end_time-start_time,len(newnodes)])
            
            self.nodes = nodes
            self.edges = edges
    
    def index(self, node):
        for i in range(10):
            try:
                index = {'level':i,'node_number': self.nodes[i].index(node)}
            except:
                pass
        return index
    
class Agent():
    def __init__(self, name, random_frequency, learning_rate, node = '         '):
        self.name = name
        self.random_frequency = random_frequency
        self.learning_rate = learning_rate
        self.game_node = '         '
        self.tree = Tree()
        
        self.node1 = None
        self.node0 = None
        self.value0 = None
        
        values = {}
        for level, nodes in self.tree.nodes.iteritems():
            if level>0 and ((name=="x" and level%2==1) or (name=="o" and level%2==0)):
                newvalues = []
                for inode, node in enumerate(nodes):
                    if State(node).is_terminal():
                        if State(node).winner() == name:
                            newvalues.append(1.)
                        else:
                            newvalues.append(0.)
                    else:
                        newvalues.append(np.random.rand())  
                values[level] = newvalues
        self.values = values
    
    def prospects(self):
        game_index = self.tree.index(self.game_node)
        i, n = game_index['level'], game_index['node_number']
        prospective_node_numbers = self.tree.edges[i][n]
        prospective_nodes_values = {self.tree.nodes[i+1][node_num]: self.values[i+1][node_num] for node_num in prospective_node_numbers}
        return prospective_nodes_values

    def greedy_node(self):
        d = self.prospects()
        return max(d.iterkeys(), key=(lambda key: d[key]))
    
    def random_node(self):
        return random.choice(self.prospects().keys())
    
    def make_move(self):
        self.node0 = self.node1
        if np.random.rand() < self.random_frequency:
            self.node1 = self.random_node()
        else:
            self.node1 = self.greedy_node()
        self.game_node = self.node1
    
    def update_values(self):
        index0 = self.tree.index(self.node0)
        index1 = self.tree.index(self.node1)
        
        value0 = self.values[index0['level']][index0['node_number']]
        value1 = self.values[index1['level']][index1['node_number']]
        print 'index0: {0} | value0: {1}'.format(*[index0, value0])
        print 'index1: {0} | value1: {1}'.format(*[index1, value1])
        print 'learning_rate: ',self.learning_rate
        print 'self.learning_rate*(value1 - value0) =',self.learning_rate*(value1 - value0)
        value0_updated = value0 + self.learning_rate*(value1 - value0)
        print 'value0_updated: {0}'.format(*[value0_updated])
        print "self.values[index0['level']][index0['node_number']] -- before assign:",self.values[index0['level']][index0['node_number']]
        self.values[index0['level']][index0['node_number']] = value0_updated
        print "self.values[index0['level']][index0['node_number']] -- after assign:",self.values[index0['level']][index0['node_number']]

        
    def reset_nodelog(self):
        self.game_node = '         '
        self.node1 = None
        self.node0 = None
        self.value0 = None
        
class Game():
    def __init__(self, agents):
        self.state = State('         ')
        self.agents = agents
        self.score = {k:0 for k,v in agents.iteritems()}
    
    def is_over(self):
        return self.state.is_terminal()
    
    def play_turn(self):
        player = self.state.player()
        self.agents[player].make_move()
        self.state = State(self.agents[player].game_node)
        self.agents['x'].game_node = self.state.string
        self.agents['o'].game_node = self.state.string
        
        if self.agents[player].node0:
            self.agents[player].update_values()
    
    def play_game(self):
        while not self.state.is_terminal():
            self.play_turn()
        self.score[self.state.winner()] += 1
    
    def reset_state(self):
        self.state = State('         ')
        
    def play_n_games(self, n):
        for _ in range(n):
            self.reset_state()
            self.agents['x'].reset_nodelog()
            self.agents['o'].reset_nodelog()
            self.play_game()

In [33]:
x = Agent('x', random_frequency = 0.1, learning_rate = 0)
o = Agent('o', random_frequency = 0.1, learning_rate = 0.1)
o_values_initial = o.values
game = Game({'x':x, 'o':o})
#game.play_n_games(1000)
#game.score

In [37]:
game.play_turn()
o_values_final = o.values
# figure out why new value isn't being assigned

index0: {'node_number': 2, 'level': 2} | value0: 0.847069782863
index1: {'node_number': 1, 'level': 4} | value1: 0.95587220064
learning_rate:  0.1
self.learning_rate*(value1 - value0) = 0.0108802417776
value0_updated: 0.857950024641
self.values[index0['level']][index0['node_number']] -- before assign: 0.847069782863
self.values[index0['level']][index0['node_number']] -- after assign: 0.857950024641


In [31]:
for k,v in o_values_initial.iteritems():
    print k, (o_values_initial[2] == o_values_final[2])

8 True
2 True
4 True
6 True


In [None]:
x.make_move()

In [None]:
game.agents[game.state.player()].make_move()

In [None]:
game.play_turn()

In [None]:
import random
from copy import copy, deepcopy
import csv
import matplotlib.pyplot as plt

EMPTY = 0
PLAYER_X = 1
PLAYER_O = 2
DRAW = 3

def emptystate():
    return [[EMPTY,EMPTY,EMPTY],[EMPTY,EMPTY,EMPTY],[EMPTY,EMPTY,EMPTY]]

def gameover(state):
    for i in range(3):
        if state[i][0] != EMPTY and state[i][0] == state[i][1] and state[i][0] == state[i][2]:
            return state[i][0]
        if state[0][i] != EMPTY and state[0][i] == state[1][i] and state[0][i] == state[2][i]:
            return state[0][i]
    if state[0][0] != EMPTY and state[0][0] == state[1][1] and state[0][0] == state[2][2]:
        return state[0][0]
    if state[0][2] != EMPTY and state[0][2] == state[1][1] and state[0][2] == state[2][0]:
        return state[0][2]
    for i in range(3):
        for j in range(3):
            if state[i][j] == EMPTY:
                return EMPTY
    return DRAW

def last_to_act(state):
    countx = 0
    counto = 0
    for i in range(3):
        for j in range(3):
            if state[i][j] == PLAYER_X:
                countx += 1
            elif state[i][j] == PLAYER_O:
                counto += 1
    if countx == counto:
        return PLAYER_O
    if countx == (counto + 1):
        return PLAYER_X
    return -1


def enumstates(state, idx, agent):
    if idx > 8:
        player = last_to_act(state)
        if player == agent.player:
            agent.add(state)
    else:
        winner = gameover(state)
        if winner != EMPTY:
            return
        i = idx / 3
        j = idx % 3
        for val in range(3):
            state[i][j] = val
            enumstates(state, idx+1, agent)

class Agent(object):
    def __init__(self, player, verbose = False, lossval = 0, learning = True):
        self.values = {}
        self.player = player
        self.verbose = verbose
        self.lossval = lossval
        self.learning = learning
        self.epsilon = 0.1
        self.alpha = 0.99
        self.prevstate = None
        self.prevscore = 0
        self.count = 0
        enumstates(emptystate(), 0, self)

    def episode_over(self, winner):
        self.backup(self.winnerval(winner))
        self.prevstate = None
        self.prevscore = 0

    def action(self, state):
        r = random.random()
        if r < self.epsilon:
            move = self.random(state)
            self.log('>>>>>>> Exploratory action: ' + str(move))
        else:
            move = self.greedy(state)
            self.log('>>>>>>> Best action: ' + str(move))
        state[move[0]][move[1]] = self.player
        self.prevstate = self.statetuple(state)
        self.prevscore = self.lookup(state)
        state[move[0]][move[1]] = EMPTY
        return move

    def random(self, state):
        available = []
        for i in range(3):
            for j in range(3):
                if state[i][j] == EMPTY:
                    available.append((i,j))
        return random.choice(available)

    def greedy(self, state):
        maxval = -50000
        maxmove = None
        if self.verbose:
            cells = []
        for i in range(3):
            for j in range(3):
                if state[i][j] == EMPTY:
                    state[i][j] = self.player
                    val = self.lookup(state)
                    state[i][j] = EMPTY
                    if val > maxval:
                        maxval = val
                        maxmove = (i, j)
                    if self.verbose:
                        cells.append('{0:.3f}'.format(val).center(6))
                elif self.verbose:
                    cells.append(NAMES[state[i][j]].center(6))
        if self.verbose:
            print BOARD_FORMAT.format(*cells)
        self.backup(maxval)
        return maxmove

    def backup(self, nextval):
        if self.prevstate != None and self.learning:
            self.values[self.prevstate] += self.alpha * (nextval - self.prevscore)

    def lookup(self, state):
        key = self.statetuple(state)
        if not key in self.values:
            self.add(key)
        return self.values[key]

    def add(self, state):
        winner = gameover(state)
        tup = self.statetuple(state)
        self.values[tup] = self.winnerval(winner)

    def winnerval(self, winner):
        if winner == self.player:
            return 1
        elif winner == EMPTY:
            return 0.5
        elif winner == DRAW:
            return 0
        else:
            return self.lossval

    def printvalues(self):
        vals = deepcopy(self.values)
        for key in vals:
            state = [list(key[0]),list(key[1]),list(key[2])]
            cells = []
            for i in range(3):
                for j in range(3):
                    if state[i][j] == EMPTY:
                        state[i][j] = self.player
                        cells.append(str(self.lookup(state)).center(3))
                        state[i][j] = EMPTY
                    else:
                        cells.append(NAMES[state[i][j]].center(3))
            print BOARD_FORMAT.format(*cells)

    def statetuple(self, state):
        return (tuple(state[0]),tuple(state[1]),tuple(state[2]))

    def log(self, s):
        if self.verbose:
            print s

class Human(object):
    def __init__(self, player):
        self.player = player

    def action(self, state):
        printboard(state)
        action = raw_input('Your move? ')
        return (int(action.split(',')[0]),int(action.split(',')[1]))

    def episode_over(self, winner):
        if winner == DRAW:
            print 'Game over! It was a draw.'
        else:
            print 'Game over! Winner: Player {0}'.format(winner)

def play(agent1, agent2):
    state = emptystate()
    for i in range(9):
        if i % 2 == 0:
            move = agent1.action(state)
        else:
            move = agent2.action(state)
        state[move[0]][move[1]] = (i % 2) + 1
        winner = gameover(state)
        if winner != EMPTY:
            return winner
    return winner

def measure_performance_vs_random(agent1, agent2):
    epsilon1 = agent1.epsilon
    epsilon2 = agent2.epsilon
    agent1.epsilon = 0
    agent2.epsilon = 0
    agent1.learning = False
    agent2.learning = False
    r1 = Agent(1)
    r2 = Agent(2)
    r1.epsilon = 1
    r2.epsilon = 1
    probs = [0,0,0,0,0,0]
    games = 100
    for i in range(games):
        winner = play(agent1, r2)
        if winner == PLAYER_X:
            probs[0] += 1.0 / games
        elif winner == PLAYER_O:
            probs[1] += 1.0 / games
        else:
            probs[2] += 1.0 / games
    for i in range(games):
        winner = play(r1, agent2)
        if winner == PLAYER_O:
            probs[3] += 1.0 / games
        elif winner == PLAYER_X:
            probs[4] += 1.0 / games
        else:
            probs[5] += 1.0 / games
    agent1.epsilon = epsilon1
    agent2.epsilon = epsilon2
    agent1.learning = True
    agent2.learning = True
    return probs

def measure_performance_vs_each_other(agent1, agent2):
    #epsilon1 = agent1.epsilon
    #epsilon2 = agent2.epsilon
    #agent1.epsilon = 0
    #agent2.epsilon = 0
    #agent1.learning = False
    #agent2.learning = False
    probs = [0,0,0]
    games = 100
    for i in range(games):
        winner = play(agent1, agent2)
        if winner == PLAYER_X:
            probs[0] += 1.0 / games
        elif winner == PLAYER_O:
            probs[1] += 1.0 / games
        else:
            probs[2] += 1.0 / games
    #agent1.epsilon = epsilon1
    #agent2.epsilon = epsilon2
    #agent1.learning = True
    #agent2.learning = True
    return probs

'''
if __name__ == "__main__":
    p1 = Agent(1, lossval = -1)
    p2 = Agent(2, lossval = -1)
    r1 = Agent(1, learning = False)
    r2 = Agent(2, learning = False)
    r1.epsilon = 1
    r2.epsilon = 1
    series = ['P1-Win','P1-Lose','P1-Draw','P2-Win','P2-Lose','P2-Draw']
    #series = ['P1-Win', 'P2-Win', 'Draw']
    colors = ['r','b','g','c','m','b']
    markers = ['+', '.', 'o', '*', '^', 's']
    f = open('results.csv', 'wb')
    writer = csv.writer(f)    
    writer.writerow(series)
    perf = [[] for _ in range(len(series) + 1)]
    for i in range(10000):
        if i % 10 == 0:
            print 'Game: {0}'.format(i)
            probs = measure_performance_vs_random(p1, p2)
            writer.writerow(probs)
            f.flush()
            perf[0].append(i)
            for idx,x in enumerate(probs):
                perf[idx+1].append(x)
        winner = play(p1,p2)
        p1.episode_over(winner)
        #winner = play(r1,p2)
        p2.episode_over(winner)
    f.close()
    for i in range(1,len(perf)):
        plt.plot(perf[0], perf[i], label=series[i-1], color=colors[i-1])
    plt.xlabel('Episodes')
    plt.ylabel('Probability')
    plt.title('RL Agent Performance vs. Random Agent\n({0} loss value, self-play)'.format(p1.lossval))
    #plt.title('P1 Loss={0} vs. P2 Loss={1}'.format(p1.lossval, p2.lossval))
    plt.legend()
    #plt.show()
    #plt.savefig('p1loss{0}vsp2loss{1}.png'.format(p1.lossval, p2.lossval))
    plt.savefig('selfplay_random_{0}loss.png'.format(p1.lossval))
    while True:
        p2.verbose = True
        p1 = Human(1)
        winner = play(p1,p2)
        p1.episode_over(winner)
        p2.episode_over(winner)
'''