In [110]:
import random

In [111]:
##Returns the adjusted strategy after an iteration
def getStrategy(regretSum,strategySum):
    ''' 
    Define each variable and give example values:
    - regretSum = [-2, -5, 2] ---> cumulative regret
    - strategy = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333] or [0.0, 0.0, 1.0] or [0.3, 0.4, 0.3] ---> policy yielded by regret matching for next RPS showdown
    - strategySum = [1.0, 0.3333333333333333, 3.6666666666666665] ---> sum of probabilities in strategy array over all iterations
    - normalizingSum = 37 ---> used to get strategy array probabilities ---> strategy[i] = strategy[i]/normalizingSum ---> sum of positive regrets only
    '''
    actions = 3
    normalizingSum = 0
    strategy = [0,0,0]
    #Normalizingsum is the sum of positive regrets. 
    #This ensures do not 'over-adjust' and converge to equilibrium
    for i in range(0,actions):
        if regretSum[i] > 0:
            strategy[i] = regretSum[i]
        else:
            strategy[i] = 0
        normalizingSum += strategy[i]
    ##This loop normalizes our updated strategy
    for i in range(0,actions):
        if normalizingSum > 0:
            strategy[i] = strategy[i]/normalizingSum
        else:
            #Default to 33%
            strategy[i] = 1.0 / actions
        strategySum[i] += strategy[i]
    return (strategy, strategySum)

In [112]:
#Returns a random action according to the strategy
def getAction(strategy):
    ''' 
    As an example, use strategy = [0.3, 0.4, 0.3]
    This function returns 'ROCK' (0) if a random uniform generated value falls between 0-0.3
    This function returns 'PAPER' (2) if a random uniform generated value falls between 0.3-0.7 
    This function returns 'SCISSORS' (2) if a random uniform generated value falls between 0.7-1 
    '''
    r = random.uniform(0,1)
    if r >= 0 and r < strategy[0]:
        return 0
    elif r >= strategy[0] and r < strategy[0] + strategy[1]:
        return 1
    elif r >= strategy[0] + strategy[1] and r < sum(strategy):
        return 2
    else:
        return 0

In [113]:
def train(iterations,regretSum,oppStrategy):
    ''' 
    Define each variable and give example values:
    - regretSum = [-2, -5, 2] ---> cumulative regret
    - oppStrategy = [0.3, 0.4, 0.3]
    - strategy = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333] or [0.0, 0.0, 1.0] or [0.3, 0.4, 0.3] ---> policy yielded by regret matching for next RPS showdown
    - strategySum = [1.0, 0.3333333333333333, 3.6666666666666665] ---> sum of probabilities in strategy array over all iterations
    - myaction = 0, 1 or 2 ---> 'ROCK' 'PAPER' or 'SCISSORS'
    - otherAction = 0, 1 or 2 ---> 'ROCK' 'PAPER' or 'SCISSORS'
    - actionUtility = [-1, 1, -1] or [-1, 0, 1] ---> Hapiness you would have gotten from each of RPS had you chosen them against a fixed opponent choice like 'ROCK'. 
                      First the 3 actionUtilities are calculated for the opponent, then the action utility for myaction and opponent action pair is calculated. 
                      Then they're all used to get the regret and then regretSum
    '''
    actionUtility = [0,0,0]
    strategySum = [0,0,0]
    actions = 3
    for i in range(0,iterations):
        ##Retrieve Actions
        t = getStrategy(regretSum,strategySum)
        strategy = t[0]
        strategySum = t[1]
        myaction = getAction(strategy)
        #Define an arbitrary opponent strategy from which to adjust
        otherAction = getAction(oppStrategy)   
        #Opponent Chooses scissors
        if otherAction == actions - 1: # FALSE: 3-1 = 2, WE HAVE otherAction == 1 
            #Utility(Rock) = 1
            actionUtility[0] = 1
            #Utility(Paper) = -1
            actionUtility[1] = -1
        #Opponent Chooses Rock
        elif otherAction == 0: # FALSE: WE HAVE otherAction == 1 
            #Utility(Scissors) = -1
            actionUtility[actions - 1] = -1
            #Utility(Paper) = 1
            actionUtility[1] = 1
        #Opopnent Chooses Paper
        else: # TRUE: WE HAVE otherAction == 1 
            #Utility(Rock) = -1
            actionUtility[0] = -1
            #Utility(Scissors) = 1
            actionUtility[2] = 1
                    
        #Add the regrets from this decision
        for i in range(0,actions):
            regretSum[i] += actionUtility[i] - actionUtility[myaction]
    return strategySum

In [114]:
def getAverageStrategy(iterations,oppStrategy):
    ''' 
    Define each variable and give example values:
    - avgStrategy = [0, 0, 0] ---> a.k.a.: 'Maximally Exploitative Strategy' ---> Strategy to use in next round, considering info from all previous rounds put together
    - strategySum = [1.0, 0.3333333333333333, 3.6666666666666665] ---> sum of probabilities in strategy array over all iterations
    - oppStrategy = [0.3, 0.4, 0.3]
    - normalizingSum = 37 ---> used to get strategy array probabilities ---> strategy[i] = strategy[i]/normalizingSum ---> sum of positive regrets only
    '''
    actions = 3
    strategySum = train(iterations,[0,0,0],oppStrategy)
    avgStrategy = [0,0,0]
    normalizingSum = 0
    for i in range(0,actions):
        normalizingSum += strategySum[i]
    for i in range(0,actions):
        if normalizingSum > 0:
            avgStrategy[i] = strategySum[i] / normalizingSum
        else:
            avgStrategy[i] = 1.0 / actions
    print('avgStrategy: ', avgStrategy)
    return avgStrategy

In [119]:
oppStrategy = [.34,.3,.3]
print("Opponent's Strategy",oppStrategy)
print("Maximally Exploitative Strat", getAverageStrategy(5000, oppStrategy))

Opponent's Strategy [0.34, 0.3, 0.3]
avgStrategy:  [0.004613333333333333, 0.9951866666666667, 0.0002]
Maximally Exploitative Strat [0.004613333333333333, 0.9951866666666667, 0.0002]
