In [5]:
import copy
import numpy as np
import ast
import matplotlib.pyplot as plt
import random

#game classes and helper classes
import TTT as T
import Mancala as M

#for testing
import winsound
import time as time

## Files
These methods are used for dealing with file writing and reading
* readFile takes a file name and creates the Q tables.  This way we can train the Q tables over time and use them later.
* writeFile takes the Q tables and a file name.  It writes the Q tables to the file.  It is the counterpart of readFile.
* collectMetaData takes the Q tables, a file name, and the number of games played.  It appends the information to the file, so that we can collect information about the Q tables' training each cycle.  The method writes the number of games played, the number of states in the Q tables, the number of states that have not been reinforced, and the ratio of those two numbers expressed as a percent.


In [6]:
def readFile(fileName):
    """Load the two Q tables from a "$"-delimited text file.

    Each row has the form ``<label> $ <key> $ <value>`` where ``label`` is
    ``Q1`` or ``Q2`` and ``key`` / ``value`` are Python literals (parsed with
    ``ast.literal_eval``).  Rows with any other label are ignored.

    Parameters:
        fileName: path of the file to read (the format written by writeFile).

    Returns:
        (Q1, Q2): two dictionaries mapping the parsed keys to parsed values.
    """
    Q1, Q2 = {}, {}
    data = np.genfromtxt(fileName, dtype=str, delimiter="$", autostrip=True)

    # An empty file yields an empty array: nothing to load.
    if data.size == 0:
        return Q1, Q2

    # genfromtxt squeezes a one-row file down to a 1-D array; promote it back
    # to 2-D so the row/column indexing below works for any number of rows.
    data = np.atleast_2d(data)

    tables = {"Q1": Q1, "Q2": Q2}
    for row in data:
        table = tables.get(str(row[0]))
        if table is not None:
            table[ast.literal_eval(row[1])] = ast.literal_eval(row[2])
    return Q1, Q2

def writeFile(Q1, Q2,fileName):
    """Write both Q tables to ``fileName``, overwriting any existing content.

    One line is written per entry in the form ``<label> $ <key> $ <value>``,
    with ``Q1`` entries first and ``Q2`` entries after them.

    Parameters:
        Q1, Q2: dictionaries to persist.
        fileName: path of the file to (over)write.
    """
    # The context manager closes the file even if a write raises.
    with open(fileName, 'w') as writer:
        for label, table in (("Q1", Q1), ("Q2", Q2)):
            for key in table:
                writer.write(label+" $ " +str(key) +" $ "+str(table[key])+'\n')

def collectMetaData(Q1, Q2,fileName,gamesPlayed):
    """Append one summary row about the Q tables to ``fileName``.

    The row is ``<gamesPlayed> $ <count> $ <flagged> $ <percent>`` where
    ``count`` is the total number of entries in both tables, ``flagged`` is
    the number of entries whose value equals -0.5, and ``percent`` is
    ``flagged`` as a percentage of ``count`` rounded to two decimals.

    Parameters:
        Q1, Q2: the Q-table dictionaries to summarise.
        fileName: path of the file to append to.
        gamesPlayed: number recorded in the first column of the row.
    """
    count = len(Q1) + len(Q2)
    flagged = 0
    for table in (Q1, Q2):
        for key in table:
            if table[key] == -0.5:
                flagged += 1

    # Two empty tables would otherwise divide by zero; report 0.0 percent.
    percent = round(flagged/(count/100),2) if count else 0.0

    # The context manager closes the file even if the write raises.
    with open(fileName, 'a') as writer:
        writer.write(str(gamesPlayed)  +" $ " +str(count) +" $ "+str(flagged)+" $ "+str(percent)+'\n')


## Training
* *trainQ* takes the number of repetitions, learningRate, epsilonDecayFactor, the game you are playing, and the Q tables.  It plays the game repeatedly, each time lowering the epsilon by multiplying it by the epsilonDecayFactor.  It calls runEarlyPlay once for each player, then runPlay until the game is over; lastly, it calls finalReinforcement.

* *runPlay* takes the game, the current player's Q table, a copy of the old board, the last move, the epsilon, and the learning rate.  This method is how each player takes their turn.  The method calls getMoves for a list of moves, then calls makeMove to make the move.  It checks whether this state is in the Q table and whether the game is over; then the Q table's old board/move state is reinforced by this state.  The boardOld and moveOld values are assigned and returned along with a boolean.

* *runEarlyPlay* is only called for each player's first turn.  It takes the game we are playing, the current player's Q table, the epsilon, and the learning rate.  It makes sure that the first state is in the Q table.  The boardOld and moveOld values are assigned and returned along with a boolean.

* *finalReinforcement* is only called when the game is over.  It provides reinforcement for both players' penultimate states.  It takes both players' old boards, old moves, and Q tables.  It checks which player won and reinforces the Q tables accordingly.

* *epsilonGreedy* is called on the game and takes epsilon and the current player's Q table.  It gets a list of valid moves, then checks whether a random number is less than epsilon.  If the random number is less than epsilon, the method returns a random move; otherwise it returns the best move based on the Q table.

In [7]:
def epsilonGreedy(self, epsilon, Q):
    """Choose a move for the game ``self`` using an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random valid move is returned;
    otherwise the valid move with the highest Q value is returned, treating
    entries missing from ``Q`` as -1 (ties go to the earliest move).

    NOTE(review): the lookup key pairs each move with the board as it is
    *before* the move is made, whereas runPlay/runEarlyPlay store entries
    keyed by the board *after* makeMove -- confirm this is intended.
    """
    valid = self.getMoves()

    shouldExplore = np.random.uniform() < epsilon
    if shouldExplore:
        pick = np.random.choice(len(valid))
        return valid[pick]

    scores = []
    for move in valid:
        scores.append(Q.get(stateMoveTuple(self.getBoard(), move), -1))
    return valid[np.argmax(np.array(scores))]
        
        

def finalReinforcement(boardOld1, moveOld1, Q1, boardOld2, moveOld2, Q2, game):
    """Overwrite each player's last recorded state/move value once the game ends.

    The winner's entry is set to -1.01 and the loser's to -10.0; when
    ``game.winner()`` is neither 1 nor 2 both entries are set to -5.0.
    Both Q tables are modified in place; nothing is returned.
    """
    if game.winner() == 1:
        value1, value2 = -1.01, -10.0
    elif game.winner() == 2:
        value1, value2 = -10.0, -1.01
    else:
        value1, value2 = -5.0, -5.0

    Q1[stateMoveTuple(boardOld1, moveOld1)] = value1
    Q2[stateMoveTuple(boardOld2, moveOld2)] = value2
        
        
def stateMoveTuple(board, move):
    """Build the hashable Q-table key ``(tuple(board), move)``."""
    frozenBoard = tuple(board)
    return frozenBoard, move
    
    
    
def runEarlyPlay(game, Q,  epsilon, learningRate):
    """Play one move without reinforcing any earlier state.

    A move is picked with epsilonGreedy and applied with ``game.makeMove``.
    The resulting (board, move) key is added to ``Q`` with value 0 if it is
    not already present.  ``learningRate`` is accepted but not used here.

    Returns:
        (board copy, move copy, value returned by ``game.makeMove``).
    """
    chosen = epsilonGreedy(game, epsilon, Q)
    changePlayer = game.makeMove(chosen)

    # Register the new state/move pair, leaving an existing value untouched.
    Q.setdefault(stateMoveTuple(game.getBoard(), chosen), 0)

    boardSnapshot = copy.deepcopy(game.getBoard())
    moveSnapshot = copy.deepcopy(chosen)
    return boardSnapshot, moveSnapshot, changePlayer
    

    
def runPlay(game, Q, boardOld, moveOld, epsilon, learningRate):
    """Play one move and reinforce the player's previous state/move entry.

    A move is picked with epsilonGreedy and applied with ``game.makeMove``.
    The new (board, move) key is added to ``Q`` with value 0 when missing,
    and forced to -1 when the move ended the game.  The entry for
    (boardOld, moveOld) is then moved towards ``-1 + Q[new]`` by a fraction
    ``learningRate`` of the difference.

    Returns:
        (board copy, move copy, value returned by ``game.makeMove``).
    """
    chosen = epsilonGreedy(game, epsilon, Q)
    changePlayer = game.makeMove(chosen)

    newKey = stateMoveTuple(game.getBoard(), chosen)
    Q.setdefault(newKey, 0)
    if game.isOver():
        Q[newKey] = -1

    # Temporal-difference step with a constant cost of -1 per move.
    prevKey = stateMoveTuple(boardOld, moveOld)
    Q[prevKey] += learningRate * (-1 + Q[newKey] - Q[prevKey])

    return copy.deepcopy(game.getBoard()), copy.deepcopy(chosen), changePlayer
    

def trainQ(nRepetitions, learningRate, epsilonDecayFactor, game,  Q1, Q2):
    """Train both Q tables in place by self-play for ``nRepetitions`` games.

    Epsilon starts at 1 and is multiplied by ``epsilonDecayFactor`` before
    every game.  Until player 2 has made a first move, turns are played with
    runEarlyPlay; afterwards every turn uses runPlay.  When a move ends the
    game, finalReinforcement is applied to both players' last entries.

    Returns:
        (Q1, Q2): the same dictionaries that were passed in.
    """
    epsilon = 1
    for _ in range(nRepetitions):
        epsilon *= epsilonDecayFactor
        game.reset()
        openingDone = False

        while not game.isOver():
            playerOneToMove = game.player == 1

            if openingDone and playerOneToMove:
                prevBoard1, prevMove1, switchSides = runPlay(
                    game, Q1, prevBoard1, prevMove1, epsilon, learningRate)
            elif openingDone:
                prevBoard2, prevMove2, switchSides = runPlay(
                    game, Q2, prevBoard2, prevMove2, epsilon, learningRate)
            elif playerOneToMove:
                prevBoard1, prevMove1, switchSides = runEarlyPlay(
                    game, Q1, epsilon, learningRate)
            else:
                prevBoard2, prevMove2, switchSides = runEarlyPlay(
                    game, Q2, epsilon, learningRate)
                # Player 2 has now moved once, so the opening phase is over.
                openingDone = True

            if game.isOver():
                finalReinforcement(prevBoard1, prevMove1, Q1, prevBoard2, prevMove2, Q2, game)
            if switchSides:
                game.changePlayer()

    return Q1, Q2






In [8]:

def Random(game,moves,Q1,Q2):
    """Strategy: play a uniformly random move from ``moves``.

    ``Q1`` and ``Q2`` are accepted for signature compatibility with the other
    strategies and are not used.  Returns whatever ``game.makeMove`` returns.
    """
    return game.makeMove(random.choice(moves))

def callMinMax(game,moves,Q1,Q2):
    """Strategy: play the move suggested by a depth-4 minimax search.

    Falls back to a random choice from ``moves`` when the search reports no
    move.  Returns whatever ``game.makeMove`` returns.
    """
    searchResult = minimax(game, (0,0), Q1, Q2, 4)
    chosen = searchResult[1]
    if chosen is None:
        chosen = random.choice(moves)
    return game.makeMove(chosen)


def QLookUp(game,moves,Q1,Q2):
    """Strategy: play the move whose resulting state scores highest in ``Q1``.

    Every candidate move is tried on the live game, the resulting
    (board, move) key is looked up in ``Q1``, and ``game.board`` is restored
    from a deep copy afterwards.  Only entries scoring above -99.99 count.
    If no candidate is found in ``Q1`` a random move is played instead.
    ``Q2`` is not used.  Returns whatever ``game.makeMove`` returns.
    """
    topScore, topMove = -99.99, None

    for candidate in moves:
        snapshot = copy.deepcopy(game.board)
        game.makeMove(candidate)  # trial move; its return value is not needed
        key = stateMoveTuple(game.board, candidate)
        if key in Q1 and Q1[key] > topScore:
            topMove, topScore = candidate, Q1[key]
        game.board = copy.deepcopy(snapshot)

    if topMove is None:
        topMove = random.choice(moves)
    return game.makeMove(topMove)



def minimax(game, lastMove, QP, QO, depthLeft):
    """Depth-limited search over moves whose resulting states are in the Q tables.

    Parameters:
        game: game object; its ``board`` is mutated during the search and
            restored from a deep copy after each candidate move.
        lastMove: move that produced the current board (used at depth 0).
        QP: table used to decide whether a candidate move is explored.
        QO: table read for the leaf value at depth 0; the two tables swap
            roles on every recursive call.
        depthLeft: remaining depth; odd levels take the maximum child value,
            even levels the minimum.

    Returns:
        ``[-1.0]`` if the game is over, ``[value]`` at depth 0, otherwise
        ``[bestValue, bestMove]`` (both None when no candidate was explored).
    """
    if game.isOver():
        return [-1.0]

    if depthLeft == 0:
        return [QO[stateMoveTuple(game.board, lastMove)]]

    scored = []
    for candidate in game.getMoves():
        snapshot = copy.deepcopy(game.board)
        game.makeMove(candidate)

        # Only follow moves whose resulting state is known to QP.
        if stateMoveTuple(game.board, candidate) in QP:
            game.changePlayer()
            outcome = minimax(game, candidate, QO, QP, depthLeft - 1)
            game.changePlayer()
            if outcome[0] is not None:
                scored.append([outcome[0], candidate])

        game.board = copy.deepcopy(snapshot)

    if not scored:
        return [None, None]

    pick = max if depthLeft % 2 == 1 else min
    bestValue, bestMove = pick(scored, key=lambda pair: pair[0])
    return [bestValue, bestMove]

      
def runGames(game, player1Move, Q1, player2Move, Q2):
    """Play one full game between two strategies and return ``game.winner()``.

    The game is reset, then the strategy of the player to move is called as
    ``strategy(game, moves, ownTable, otherTable)``.  When the strategy
    returns a truthy value, ``game.changePlayer()`` is called.
    """
    game.reset()
    while not game.isOver():
        legalMoves = game.getMoves()
        if game.player == 1:
            strategy, ownTable, otherTable = player1Move, Q1, Q2
        else:
            strategy, ownTable, otherTable = player2Move, Q2, Q1

        if strategy(game, legalMoves, ownTable, otherTable):
            game.changePlayer()
    return game.winner()


def tournament(game, strategy1, Q1, strategy2, Q2, rounds):
    """Play ``rounds`` games between two strategies and print a summary.

    Prints the two strategies, the percentage of games won by each player,
    the percentage of ties, and the elapsed wall-clock time.  A winner value
    other than 1 or 2 is counted as a tie.  Nothing is returned.

    Parameters:
        game: game object handed to runGames.
        strategy1, strategy2: move-selection callables for players 1 and 2.
        Q1, Q2: Q tables handed to the strategies.
        rounds: number of games to play; 0 prints 0.0 for every percentage.
    """
    P1, P2, Tie = 0,0,0
    start = time.time()
    for _ in range(rounds):
        winner = runGames(game, strategy1, Q1, strategy2, Q2)
        if winner == 1:
            P1 += 1
        elif winner == 2:
            P2 += 1
        else:
            Tie += 1

    total = P1 + P2 + Tie

    def percent(wins):
        # No games played would otherwise divide by zero.
        return round(wins/total*100,2) if total else 0.0

    print("P1 ", strategy1," vs P2 ",strategy2)
    print("player 1 wins ",percent(P1) ,"% of the time")
    print("player 2 wins ",percent(P2) ,"% of the time")
    print("the game Ties ",percent(Tie) ,"% of the time")
    print("Time to play ",rounds," games: ",time.time()- start )
    print()
    

This is where the training happens.  You will need to make the CSV files and save them in the same folder as this notebook.  The first time you train the tables you will use empty dictionaries.  All other times you will read in the Q tables with readFile.  The plus zero in collectMetaData is there because we are appending to that file.  The second time you run the loop, the 0 should be replaced with the number of games your Q table has played so far.  The computer will beep when it is done training.

In [8]:
# Training driver for Mancala: resumes from the tables stored in
# 'M Qtable.csv' and keeps training until the game counter reaches 90000000.
game = M.Mancala()
Q1, Q2 = readFile('M Qtable.csv')
i = 0
# Each pass plays 50000 + 25000 + 25000 = 100000 games, matching the
# i * 100000 step.  70000000 is presumably the number of games the stored
# tables had already played when this cell was last run -- confirm before
# re-running.
while((i * 100000 + 70000000)< 90000000 ):
    # Fresh, untrained opponent tables; they are rebuilt on every pass and
    # never written to disk.
    R1, R2 = {}, {}
    i += 1
    # Self-play: both persistent tables learn against each other.
    trainQ(50000, .5, .999, game,  Q1, Q2)
    writeFile(Q1, Q2,'M Qtable.csv')
    # Q2 plays second against the throw-away table R1.
    trainQ(25000, .5, .999, game,  R1, Q2)
    writeFile(Q1, Q2,'M Qtable.csv')
    # Q1 plays first against the throw-away table R2.
    trainQ(25000, .5, .999, game,  Q1, R2)
    writeFile(Q1, Q2,'M Qtable.csv')
    # Append one progress row (games played, table size, ...) per pass.
    collectMetaData(Q1, Q2, 'M rawData.csv', i * 100000 + 70000000)
# Audible "done" signal (2750 Hz for 1000 ms); winsound is only available on
# Windows.
winsound.Beep(2750,1000)

In [None]:
# Evaluation driver for tic-tac-toe: loads the stored tables and plays every
# pairing of the three strategies (Random, QLookUp, callMinMax) for 200 games.
# tournament() prints the win/tie percentages and the elapsed time per pairing.
game = T.TTT()
Q1, Q2 = readFile('T Qtable.csv')

# Player 1 uses Random.
tournament(game, Random, Q1, Random, Q2, 200)
tournament(game, Random, Q1, QLookUp, Q2, 200)
tournament(game, Random, Q1, callMinMax, Q2, 200)

# Player 1 uses QLookUp.
tournament(game, QLookUp, Q1, Random, Q2, 200)
tournament(game, QLookUp, Q1, QLookUp, Q2, 200)
tournament(game, QLookUp, Q1, callMinMax, Q2, 200)
# Player 1 uses callMinMax.
tournament(game, callMinMax, Q1, Random, Q2, 200)
tournament(game, callMinMax, Q1, QLookUp, Q2, 200)
tournament(game, callMinMax, Q1, callMinMax, Q2, 200)
# Audible "done" signal (2750 Hz for 1000 ms); winsound is only available on
# Windows.
winsound.Beep(2750,1000)

P1  <function Random at 0x000001E0EF80BE18>  vs P2  <function Random at 0x000001E0EF80BE18>
player 1 wins  58.5 % of the time
player 2 wins  31.0 % of the time
the game Ties  10.5 % of the time
Time to play  200  games:  0.14063596725463867

P1  <function Random at 0x000001E0EF80BE18>  vs P2  <function QLookUp at 0x000001E0EF80BF28>
player 1 wins  27.0 % of the time
player 2 wins  61.5 % of the time
the game Ties  11.5 % of the time
Time to play  200  games:  0.5160458087921143

P1  <function Random at 0x000001E0EF80BE18>  vs P2  <function callMinMax at 0x000001E0EF80BEA0>
player 1 wins  48.0 % of the time
player 2 wins  34.5 % of the time
the game Ties  17.5 % of the time
Time to play  200  games:  88.95846152305603

P1  <function QLookUp at 0x000001E0EF80BF28>  vs P2  <function Random at 0x000001E0EF80BE18>
player 1 wins  89.5 % of the time
player 2 wins  4.0 % of the time
the game Ties  6.5 % of the time
Time to play  200  games:  0.456768274307251

P1  <function QLookUp at 0x000001

In [35]:
# One-off maintenance cell: adds 20000000 to the first column (games played)
# of every row in "new 4.csv" and rewrites the file in place.
# WARNING: this is not idempotent -- every run adds another 20000000.
data = np.genfromtxt("new 4.csv", dtype=str, delimiter="$", autostrip = True )
# genfromtxt returns a 1-D array for a one-row file; restore the row axis so
# the [row, column] indexing below works.
if data.ndim == 1 and data.size > 0:
    data = data.reshape(1, -1)
# Switch to object dtype before storing the sums: the array read from disk has
# a fixed string width, so writing a longer number back into it would be
# silently truncated (e.g. when the sum gains a digit).
data = data.astype(object)
for i in range(data.shape[0]):
    data[i,0] = ast.literal_eval(data[i,0])+ 20000000
# The context manager closes the file even if a write raises.
with open("new 4.csv", 'w') as writer:
    for i in range(data.shape[0]):
        writer.write(str(data[i,0])  +" $ " +data[i,1] +" $ "+data[i,2]+" $ "+data[i,3]+'\n')
