# Exercise 7: Angelie Kraft and Anton Wiehe

In [None]:
import numpy as np
class BanditEnv:
    """Multi-armed bandit whose arms pay out Gaussian rewards.

    The mean of every arm is sampled once from a standard normal when the
    environment is created. Pulling an arm then yields a sample from a
    unit-variance normal centred on that arm's mean.
    """

    def __init__(self, num_bandits=10):
        """Sample the arm means and record the index of the best arm.

        Args:
            num_bandits: Number of arms to create.
        """
        # True mean reward of each arm, drawn from N(0, 1).
        self.bandits = np.random.normal(0, 1, num_bandits)
        # Index of the arm with the largest true mean.
        self.optimal_action = self.bandits.argmax()

    def step(self, action):
        """Pull the arm with index ``action``.

        Returns:
            A length-1 array holding one sample from a normal distribution
            with the arm's mean and standard deviation 1.
        """
        arm_mean = self.bandits[action]
        return np.random.normal(arm_mean, 1, 1)

    def is_optimal_action(self, action):
        """Return whether ``action`` is the arm with the largest true mean."""
        best_arm = self.optimal_action
        return action == best_arm

In [None]:
class SimpleBanditAgent:
    """Epsilon-greedy agent that tracks sample-average action values."""

    def __init__(self, epsilon, num_actions):
        """Start with zero value estimates and zero visit counts.

        Args:
            epsilon: Exploration threshold; a uniform draw from [0, 1) that
                does not exceed it triggers a uniformly random action.
            num_actions: Number of actions the agent can choose from.
        """
        # Current estimate of the mean reward of each action.
        self.q_vals = np.zeros(num_actions)
        # How often each action has been updated so far.
        self.counts = np.zeros(num_actions)
        self.epsilon = epsilon

    def act(self):
        """Choose an action epsilon-greedily.

        Returns:
            The index of the highest-valued action (first index on ties)
            when the uniform draw exceeds ``epsilon``; otherwise a
            uniformly random action index.
        """
        if np.random.random() > self.epsilon:
            return np.argmax(self.q_vals)
        return np.random.randint(len(self.q_vals))

    def update(self, action, reward):
        """Fold ``reward`` into the running mean estimate of ``action``.

        Args:
            action: Index of the action that was taken.
            reward: Observed reward, either a plain number or an array
                holding exactly one element.

        Raises:
            ValueError: If ``reward`` is an array with more than one element.
        """
        # Unwrap size-1 arrays so the arithmetic below stays scalar; storing
        # a size-1 array into a single array element is deprecated in NumPy.
        reward = np.asarray(reward).item()
        self.counts[action] += 1
        # Incremental mean: Q <- Q + (R - Q) / N.
        self.q_vals[action] += (reward - self.q_vals[action]) / self.counts[action]
    

In [None]:
def run(epsilon, steps):
    """Let one fresh agent interact with one fresh bandit for ``steps`` steps.

    Args:
        epsilon: Exploration parameter handed to ``SimpleBanditAgent``.
        steps: Number of act/step/update iterations to perform.

    Returns:
        A tuple ``(rewards, optimal_choices)`` of two float arrays of length
        ``steps``: the reward received at each step, and 1.0 or 0.0
        depending on whether the chosen action was the environment's
        optimal action.
    """
    env = BanditEnv()
    agent = SimpleBanditAgent(epsilon, len(env.bandits))

    rewards = np.empty(steps)
    optimal_choices = np.empty(steps)
    for i in range(steps):
        # main algorithm
        action = agent.act()
        # env.step returns a length-1 array; unwrap it once so that both the
        # agent and the bookkeeping below work with a plain scalar instead of
        # relying on NumPy's deprecated size-1-array-to-scalar conversion.
        reward = np.asarray(env.step(action)).item()
        agent.update(action, reward)

        # track performance
        rewards[i] = reward
        optimal_choices[i] = env.is_optimal_action(action)

    return rewards, optimal_choices
        

In [None]:
def run_experiment(epsilon, trials, steps):
    """Repeat ``run`` several times and average the results per step.

    Args:
        epsilon: Exploration parameter forwarded to every run.
        trials: Number of independent runs to average over.
        steps: Number of steps in each run.

    Returns:
        A tuple ``(mean_rewards, mean_optimal_choices)`` of two arrays of
        length ``steps``, each the mean over all trials of the per-step
        values returned by ``run``.
    """
    shape = (trials, steps)
    all_rewards = np.empty(shape)
    all_optimal = np.empty(shape)

    # One row per trial.
    for trial in range(trials):
        all_rewards[trial], all_optimal[trial] = run(epsilon, steps)

    # Collapse the trial axis to get one averaged curve per quantity.
    return all_rewards.mean(axis=0), all_optimal.mean(axis=0)
    

In [None]:
import matplotlib.pyplot as plt
def meanSmoothing(x, N):
    """Smooth ``x`` with a moving average over a window of ``N`` samples.

    For odd ``N`` the window is centred on each position. For even ``N`` it
    holds one more sample to the right of the position than to the left.
    Near the ends the window is clipped to the available samples, so fewer
    values are averaged there.

    Args:
        x: Sequence or array to smooth along its first axis.
        N: Window length in samples.

    Returns:
        A float64 array with the same shape as ``x``.
    """
    values = np.array(x)
    length = values.shape[0]
    smoothed = np.zeros_like(values, dtype=np.float64)

    # Samples taken to the left of the current position.
    reach = (N - 1) // 2
    # Exclusive right offset; even windows get one extra sample on the right.
    right_extent = reach + (2 if N % 2 == 0 else 1)

    for pos in range(length):
        # Clip the window to the valid index range.
        lo = max(0, pos - reach)
        hi = min(length, pos + right_extent)
        smoothed[pos] = np.mean(values[lo:hi])
    return smoothed


def plot(exp_rewards, exp_choices, epsilons):
    """Draw smoothed reward and optimal-action curves and save them as PDFs.

    One curve per entry of ``epsilons`` is drawn on each of two figures,
    after smoothing with ``meanSmoothing`` using a window of 15. The figures
    are written to ``fig_rewards.pdf`` and ``fig_choices.pdf``.

    Args:
        exp_rewards: Indexable collection; item ``i`` is the per-step reward
            curve belonging to ``epsilons[i]``.
        exp_choices: Indexable collection; item ``i`` is the per-step
            optimal-action curve belonging to ``epsilons[i]``.
        epsilons: Epsilon values, used for the legend labels.
    """
    window = 15
    fig_rewards, ax_rewards = plt.subplots()
    fig_choices, ax_choices = plt.subplots()

    for row, eps in enumerate(epsilons):
        label = f"epsilon = {eps!s}"
        ax_rewards.plot(meanSmoothing(exp_rewards[row], window), label=label)
        ax_choices.plot(meanSmoothing(exp_choices[row], window), label=label)

    panels = (
        (fig_rewards, ax_rewards, 'Mean reward per step'),
        (fig_choices, ax_choices,
         'Mean number of optimal action selection per step'),
    )
    for fig, ax, y_label in panels:
        ax.set_xlabel('Step')
        ax.set_ylabel(y_label)
        # Leave room below the axes for the figure-level legend.
        fig.subplots_adjust(bottom=0.15)
        fig.legend(loc="lower center", ncol=3)

    fig_rewards.savefig('fig_rewards.pdf')
    fig_choices.savefig('fig_choices.pdf')

In [None]:
def start(trials=50, steps=1000, epsilons=None):
    """Run the experiment for several epsilon values and plot the results.

    Args:
        trials: Number of independent runs averaged per epsilon value.
        steps: Number of steps in each run.
        epsilons: Iterable of epsilon values to compare. Defaults to
            ``[0.1, 0.01, 0.009]`` when omitted.

    Returns:
        A tuple ``(exp_rewards, exp_choices)`` of two arrays with shape
        ``(len(epsilons), steps)`` holding, per epsilon value, the averaged
        curves returned by ``run_experiment``.
    """
    # None sentinel instead of a mutable default; copy into a list so that
    # any iterable can be sized and indexed.
    epsilons = [0.1, 0.01, 0.009] if epsilons is None else list(epsilons)

    exp_rewards = np.empty((len(epsilons), steps))
    exp_choices = np.empty((len(epsilons), steps))
    for i, epsilon in enumerate(epsilons):
        exp_rewards[i], exp_choices[i] = run_experiment(epsilon, trials, steps)
    plot(exp_rewards, exp_choices, epsilons)
    return exp_rewards, exp_choices
    

In [None]:
# Run the experiment with start()'s default arguments.
start()