In [1]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import __version__ as matplotlib_version


# Build the testbed: 2000 independent bandit problems with 10 levers each.
np.random.seed(10)  # fixed seed so every run draws the same testbed
n_bandit = 2000     # number of bandit problems
n_lever = 10        # levers per bandit problem
steps = 1000        # time steps per run

# One value per (bandit, lever), drawn from N(0, 1); used later as reward means.
testbed = np.random.normal(loc=0, scale=1, size=(n_bandit, n_lever))
# One noisy sample per lever, drawn from N(testbed value, 1).
initial_reward_estimates = np.random.normal(loc=testbed, scale=1)
import multiprocessing


def rewards_per_step_mp(steps, n_bandit, n_lever, testbed, initial_reward_estimates, epsilon,global_reward_list):
    """Run epsilon-greedy on every bandit problem and record the average reward per step.

    At each step, for each bandit problem, the lever with the highest current
    estimate is pulled (ties broken uniformly at random) unless a uniform draw
    is <= ``epsilon``, in which case a lever is picked uniformly at random.
    The reward is drawn from N(testbed[problem][lever], 1) and the lever's
    estimate is updated as a running sample average.

    Parameters
    ----------
    steps : int
        Number of entries in the resulting series (for ``steps >= 2``).
    n_bandit : int
        Number of bandit problems (rows of ``testbed``).
    n_lever : int
        Number of levers per problem (columns of ``testbed``).
    testbed : 2-D indexable of float
        ``testbed[b][a]`` is the mean of the reward distribution of lever ``a``
        in problem ``b``.
    initial_reward_estimates : array-like
        Only its overall mean is used, as the step-1 entry of the series.
    epsilon : float
        Exploration probability.
    global_reward_list : list-like
        Output container; the per-step series (a Python list) is appended to it.
        Any object with ``append`` works, e.g. a ``multiprocessing.Manager().list()``.

    Returns
    -------
    None
        The result is delivered through ``global_reward_list``.

    Notes
    -----
    Randomness comes from NumPy's global ``np.random`` state.
    """
    # action_count[b, a] is the 1-based index of the *next* pull of lever a.
    # Starting at 1 makes the first real pull overwrite the zero placeholder:
    # Q = (R + 0 * Q_old) / 1 = R.
    action_count = np.ones((n_bandit, n_lever))
    reward_estimates = np.zeros((n_bandit, n_lever))

    # Entry 0 is a placeholder; entry 1 is the mean of the supplied initial samples.
    avg_rewards_per_step = [0, np.mean(initial_reward_estimates)]

    for step in range(2, steps):
        reward_sum_over_all_bandits_per_step = 0
        for problem_index in range(n_bandit):
            estimates = reward_estimates[problem_index]  # view into the row

            if np.random.random() > epsilon:
                # Exploit: choose among all levers tied for the best estimate.
                best_levers = np.flatnonzero(estimates == np.amax(estimates))
                lever = np.random.choice(best_levers)
            else:
                # Explore: any lever, uniformly.
                lever = np.random.randint(n_lever)

            # Incremental sample-average update: Q_n = (R_n + (n - 1) * Q_{n-1}) / n
            Rn = np.random.normal(testbed[problem_index][lever], 1)
            n = action_count[problem_index, lever]
            Qn = estimates[lever]
            reward_estimates[problem_index, lever] = (Rn + (n - 1) * Qn) / n
            action_count[problem_index, lever] = n + 1

            # Accumulate the reward actually received, not the updated estimate,
            # so the series really is the average reward per step.
            reward_sum_over_all_bandits_per_step += Rn

        avg_rewards_per_step.append(reward_sum_over_all_bandits_per_step / n_bandit)

    global_reward_list.append(avg_rewards_per_step)
    return

In [None]:
%%time
# Run one worker process per epsilon, then plot each average-reward curve.
epsilon_list= [0.1]
color_list = ['r']

with multiprocessing.Manager() as manager:
    # One manager-backed list per epsilon: workers append their series to it,
    # and keeping them separate preserves the epsilon -> result ordering no
    # matter which worker finishes first.
    result_slots = [manager.list() for _ in epsilon_list]
    mp_objects = []

    # NOTE(review): workers draw from NumPy's global RNG; whether children share
    # the parent's RNG state depends on the process start method -- confirm
    # before comparing several epsilons.
    for epsilon, slot in zip(epsilon_list, result_slots):
        worker = multiprocessing.Process(
            target=rewards_per_step_mp,
            args=(steps, n_bandit, n_lever, testbed, initial_reward_estimates, epsilon, slot),
        )
        worker.start()
        mp_objects.append(worker)

    for worker in mp_objects:
        worker.join()

    # Fail loudly instead of plotting partial data if a worker died.
    for epsilon, worker, slot in zip(epsilon_list, mp_objects, result_slots):
        if worker.exitcode != 0 or len(slot) == 0:
            raise RuntimeError(
                'worker for epsilon={} produced no result (exit code {})'.format(epsilon, worker.exitcode)
            )

    # Copy out of the proxies before the manager shuts down.
    global_reward_list = [list(slot[0]) for slot in result_slots]

plt.figure(figsize=(10,10))
plt.xlabel('Steps', fontsize=18)
plt.ylabel('Average Reward', fontsize=16)
for index, avg_rewards in enumerate(global_reward_list):
    plt.plot(np.arange(len(avg_rewards)), avg_rewards, label=epsilon_list[index], color=color_list[index])

plt.legend(loc=2, ncol=2)

plt.show()