**Example of Multi-bandit problem**

In [1]:
import numpy as np # Allow us to work with arrays

In [2]:
# Rules for the email process:
# 1 represents a click in the email. The probability of occurrence is 50%
# 0 represents an email ignored (not click). The probability of occurrence is 50%

# Action 1: Sending email with Format 1
def action_0():
    return np.random.choice([1,0], p=[0.5, 0.5]) # It returns 1 with 50% probability and 0 with 50% probability.

# Action 2: Sending email with Format 2
def action_1():
    return np.random.choice([1,0], p=[0.6, 0.4]) # It returns 1 with 60% probability and 0 with 40% probability.

# Action 3: Sending email with Format 3
def action_2():
    return np.random.choice([1,0], p=[0.2, 0.8]) # It returns 1 with 20% probability and 0 with 80% probability.

# Rewards
rewards = [action_0, action_1, action_2] # This variable contains all the rewards values for the 3 actions

# Verifying the functions are working
print ('''The Manager sends 1 email with format 1. 
After 24 hours, he discovered the following outcome:
%s'''%(rewards[0]())) # gives the reward value of action 0

print('''\nIf reward value is 1, it means that the client clicked on the email with format 1.
If reward value is 0, then it means that client ignored the email with format 1''')

# np.random.choice     This function is used to get random samples of one dimensional array. 
#                      Then return the random samples as numpy array
# np.random.choice(p=) This parameter is used to assign a probability value associated with each entry in the array. 
#                      If p() is not given, the sample assumes a uniform distribution over all entries in the array.

The Manager sends 1 email with format 1. 
After 24 hours, he discovered the following outcome:
1

If reward value is 1, it means that the client clicked on the email with format 1.
If reward value is 0, then it means that client ignored the email with format 1


In [3]:
pulls = 100000  # This variable contains the number of email sent by the Manager (sample)

action_value = []  # Create a new-empty list to store the calculated action values
i = 1              # i is initially 1
for reward in rewards:                        # For-loop iterates over the range [rewards]
    value = [reward() for _ in range(pulls)]  # This for-loop performs the reward value calculation 100000 times for each of the 3 actions.
    action_value.append(value)                # For each reward calculation, add the reward value of each action to the existing list "action_value"
    print ('\n\nResults of sending email with format %s:\n'%(i))
    print (action_value) # Output: Calculated rewards after sending 100,000 emails
    i += 1                                    # Output: i = [i + 1](increments 1 on each loop)
    
print('\nprocess is done')

# for _ in range ()   The underscore _ is used for ignoring specific values. 
#                     In other words, it is used when i is not required to be store or where a variable is 
#                     syntactically required but won't be used



Results of sending email with format 1:

[[0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0



Results of sending email with format 2:

[[0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0



Results of sending email with format 3:

[[0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0

In [4]:
# Calculating the expected reward value for each of the 3 actions

print('Action-values:\n')
for action, value in enumerate(action_value):             # For-loop-enumerate iterates over different emails (actions) and rewards (value) of a list containing all the calculated reward values after perfoming the experiment 100,000 times
    print('''The expected reward for taking Action %d is: 
Q(a_%d) = %.2f\n''' %(action, action, np.mean(value)))    # Output: Expected reward value for each action
    
# Action value is the expected reward for taking an action
# np.mean          This function is used to calculate the mean value

Action-values:

The expected reward for taking Action 0 is: 
Q(a_0) = 0.50

The expected reward for taking Action 1 is: 
Q(a_1) = 0.60

The expected reward for taking Action 2 is: 
Q(a_2) = 0.20



In [5]:
# Defining the policy of the actions
p0 = 0.33       # This is the probability of sending email with format 1
p1 = 0.34       # This is the probability of sending email with format 2
p2 = 0.33       # This is the probability of sending email with format 3

def policy():
    return np.random.choice([0,1,2], p=[p0, p1, p2])   # It returns 0 with 33.33% probability, 1 with 33.33% probability and 2 with 33.33% probability

print('\nfunction has been created succesfully')


function has been created succesfully


In [6]:
# Calculating Total reward of the actions (automated)

total_reward = 0                      # total_reward is initially 0
for pull in range(pulls):             # For-loop iterates over the range [pulls](0 to 99,999)
    action = policy()                 # This variable contains the output of the function policy(), which could be 0, 1 or 2. (This is repeated 100,000 times)
    total_reward += rewards[action]() # Output: total_reward = [total_reward + rewards[action]](increments according to the reward obtained given the action taken. (This is repeated 100,000 times)

print('Total reward is', total_reward) # Output: Total reward value
print('''\nAverage reward is:
V = ''', total_reward/pulls)   # Output: Average of the total reward value

Total reward is 43746

Average reward is:
V =  0.43746


In [7]:
# Calculating Total reward of the actions (manually)

V = np.mean(action_value[0])*p0 + np.mean(action_value[1])*p1 + np.mean(action_value[2])*p2
print('''\nAverage reward is:
V = ''', V)   # Output: Average of the total reward value

# Calculating Total reward of the actions (with the formula).
v1 = 0.5*p0 + 0.6*p1 + 0.2*p2

print('''\nAverage reward is:
V = ''', v1)   # Output: Average of the total reward value

# np.mean          This function is used to calculate the mean value


Average reward is:
V =  0.4357664

Average reward is:
V =  0.435


In [40]:
# Calculating Total regret of the actions (automated)

v_star = max([np.mean(value) for value in action_value]) # Get the mean of all the reward values calculated for each action. (This is repeated 100,000 times). Then get the highest value.
print('''Optimal policy is:
%.2f''' %(v_star))   # Output: Optimal policy (Best action to take)

total_regret = 0                                    # total_regret is initially 0
for pull in range (pulls):                          # For-loop iterates over the range [pulls](0 to 99,999)
    total_regret += (v_star - rewards[policy()]())  # Output: total_regret = [total_regret + (v_star - rewards[policy()]())]. 
                                                    # Increments according to the regret value obtained, which is the substraction of the maximum possible reward and the reward obtained given the policy followed. (This is repeated 100,000 times)

print('''\nAverage regret is:
I_T = %.2f''' %(total_regret/pulls))   # Output: Average of the total regret value   

# max()       This function is used to get the item with the highest value, or the item with the highest value in an iterable.

Optimal policy is:
0.60

Average regret is:
I_T = 0.16


In [41]:
# Calculating Total regret of the actions (manually)

I = (v_star - 0.5)*p0 + (v_star - 0.6)*p1 + (v_star - 0.2)*p2
print('''\nAverage Regret is:
V = ''', I)   # Output: Average of the total regret value

# Calculating total regret (Cumulative regret) of all the actions
print('''\nTotal regret is:
V = ''', total_regret)   # Output: Total regret of all the actions


Average Regret is:
V =  0.16541

Total regret is:
V =  16344.999999956073
