# Imports

In [None]:
import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

<font size=6>__Define Classes__</font>

## The first step is to define a `KArmedBandit` class. It needs to keep track of:
### 1] How many arms it has, $k$

### 2] The mean reward $\mu_a$ for pulling each arm 
><font size=3>_Begin by choosing these means from a normal distribution._</font>
### 3] The standard deviation $\sigma_a$ of rewards for pulling each arm
><font size=3>_To begin with, this can be set to 1 for all arms, but feel free to experiment later!_</font>

## The `KArmedBandit` should also have:
### 1] A function that pulls a specified lever and returns a reward drawn from the correct distribution
### 2] A function that returns the index of the optimal arm to pull

In [None]:
class KArmedBandit:
    """A stationary k-armed bandit whose rewards are normally distributed.

    Attributes:
        k: Number of arms (levers).
        means: Array of length ``k`` holding the true mean reward of each arm.
        stdevs: Array of length ``k`` holding the reward standard deviation
            of each arm.
    """

    def __init__(self, k):
        """Create a bandit with ``k`` arms.

        Args:
            k: Number of arms; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"a bandit needs at least one arm, got k={k}")
        self.k = k
        # True mean reward of every arm, drawn once from a standard normal.
        self.means = np.random.normal(loc=0.0, scale=1.0, size=k)
        # Every arm starts with unit reward noise; overwrite to experiment.
        self.stdevs = np.ones(k)

    def pull_lever(self, arm):
        """Pull one lever and return a sampled reward.

        Args:
            arm: Index of the arm to pull, in ``range(k)``.

        Returns:
            A float drawn from a normal distribution with mean
            ``means[arm]`` and standard deviation ``stdevs[arm]``.
        """
        return float(np.random.normal(loc=self.means[arm],
                                      scale=self.stdevs[arm]))

    def optimal_arm(self):
        """Return the index of the arm with the highest true mean reward."""
        return int(np.argmax(self.means))
        

## Next, define an `Agent` class. The agent should keep track of:
### 1] Its exploration rate $\epsilon$
### 2] Its current estimate of the true action-value function, $Q_t(a)$
### 3] The number of times it has pulled each lever, $n_a$
### 4] The number of times it has pulled the optimal lever
### 5] A record of rewards received per time step
### 6] A record of the percentage of total pulls that were optimal as a function of time

## The `Agent` should also have:

### 1] A function to choose a lever $\epsilon$-greedily
### 2] An `act` function, which pulls a lever, receives a reward, and updates tracking of rewards and optimal pull %
### 3] An `update_Q` function, which updates our estimated action-value function $Q_t(a)$ given a reward and which lever was pulled
><font size=4>_The simplest way to do this is to keep track of the rewards assigned to each lever, but there is a more elegant solution_</font>
### 4] A `run_trial` function, which performs `act` $n_{steps}$ times



### The agent should have an `act` function, which chooses an action $\epsilon$-greedily, receives a reward, and updates the reward and optimal pull % trajectories

### The agent should also have a function to update its action-value estimate $Q_t$ upon pulling a lever and receiving a reward

In [None]:
class Agent:
    """Epsilon-greedy agent that learns action values for a k-armed bandit.

    Attributes:
        bandit: The bandit being played. It must expose ``k``,
            ``pull_lever(arm)`` and ``optimal_arm()``.
        epsilon: Probability of picking a lever uniformly at random instead
            of the one with the highest current estimate.
        num_optimal_pulls: Number of pulls so far that hit the optimal arm.
        reward_trajectory: Reward received at every time step.
        optimal_trajectory: Percentage (0-100) of all pulls so far that were
            optimal, recorded after every time step.
        n: Number of times each lever has been pulled.
        Q: Current estimate of the mean reward of each lever.
    """

    def __init__(self, bandit, epsilon):
        """Set up an agent with all value estimates and counters at zero.

        Args:
            bandit: The bandit to play.
            epsilon: Exploration rate, between 0 and 1.
        """
        self.bandit = bandit
        self.epsilon = epsilon
        self.num_optimal_pulls = 0
        self.reward_trajectory = []
        self.optimal_trajectory = []
        self.n = [0]*self.bandit.k

        # Start every estimate at zero; raise these values to try
        # optimistic initialisation.
        self.Q = np.zeros(self.bandit.k)

    def choose_e_greedy_action(self):
        """Pick a lever epsilon-greedily.

        Returns:
            The index of the selected lever as an ``int``.
        """
        if np.random.random() < self.epsilon:
            # Explore: every lever is equally likely.
            return int(np.random.randint(self.bandit.k))

        # Exploit: break ties at random so that equal estimates (e.g. the
        # all-zero start) do not always favour the lowest index.
        best_levers = np.flatnonzero(self.Q == np.max(self.Q))
        return int(np.random.choice(best_levers))

    def act(self):
        """Pull one lever, record the outcome and update the estimates."""
        lever = self.choose_e_greedy_action()

        # The count is incremented first because update_Q divides by it.
        self.n[lever] += 1

        reward = self.bandit.pull_lever(lever)

        if lever == self.bandit.optimal_arm():
            self.num_optimal_pulls += 1

        self.reward_trajectory.append(reward)
        total_pulls = len(self.reward_trajectory)
        self.optimal_trajectory.append(
            100.0 * self.num_optimal_pulls / total_pulls
        )

        self.update_Q(lever, reward)

    def update_Q(self, lever, reward):
        """Fold one reward into the running-average estimate for a lever.

        Uses the incremental form ``Q += (reward - Q) / n`` so that
        individual rewards never have to be stored.

        Args:
            lever: Index of the lever that was pulled.
            reward: Reward received for that pull.

        Raises:
            ValueError: If ``n[lever]`` has not been incremented for this
                pull yet (it would be a division by zero).
        """
        if self.n[lever] < 1:
            raise ValueError(
                f"lever {lever} has no recorded pulls; "
                "increment n[lever] before calling update_Q"
            )
        self.Q[lever] += (reward - self.Q[lever]) / self.n[lever]

    def run_trial(self, n_steps):
        """Call ``act`` ``n_steps`` times in a row.

        Args:
            n_steps: Number of pulls to perform.
        """
        for _ in range(n_steps):
            self.act()

# Define Hyperparameters

In [None]:
# Number of arms on each bandit.
k = 10
# Number of pulls (time steps) in one trial.
num_steps = 1000

# Define Bandit & Agent

In [None]:
# A single k-armed bandit for the one-trial demonstration below.
bandit = KArmedBandit(k = k)

In [None]:
# An agent playing that bandit with an exploration rate of 0.1.
agent = Agent(bandit, epsilon = 0.1)

# Run a single trial and plot the results!

In [None]:
# Let the agent act num_steps times; this fills its trajectories.
agent.run_trial(num_steps)

In [None]:
# Reward received at every step of the single trial.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(agent.reward_trajectory)
ax.set_xlabel('Steps', fontsize=16)
ax.set_ylabel('Reward received', fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

In [None]:
# Running share of optimal pulls over the single trial.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(agent.optimal_trajectory)
ax.set_xlabel('Steps', fontsize=16)
ax.set_ylabel('% of time we pulled the optimal lever', fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

## Compare the bandit means to your best estimates!

In [None]:
# Display the bandit's true mean reward for each arm.
bandit.means

In [None]:
# Display the agent's estimates, to compare with the true means.
agent.Q

# Run Trials of different epsilons

In [None]:
# Number of independent trials averaged for each epsilon.
num_trials = 1000
# Exploration rates to compare, one averaged curve per value.
epsilons_to_test = [0.01, 0.1, 1]

In [None]:
# One averaged trajectory per epsilon, in the order of epsilons_to_test.
mean_reward_trajectory_array = []
mean_optimal_trajectory_array = []

for eps in epsilons_to_test:

    print(f'Testing epsilon = {eps}')
    # Per-trial trajectories collected for this epsilon
    reward_trajectory_array = []
    optimal_trajectory_array = []

    # Run num_trials independent trials, each on a freshly created bandit
    for _ in tqdm_notebook(range(num_trials)):
        # Loop-local names keep the notebook-level `bandit` and `agent`
        # intact, so the "compare means to estimates" cells stay valid
        # after this sweep has run.
        trial_bandit = KArmedBandit(k=k)
        trial_agent = Agent(trial_bandit, eps)

        trial_agent.run_trial(num_steps)

        # After each trial, keep its reward and optimal % trajectories
        reward_trajectory_array.append(trial_agent.reward_trajectory)
        optimal_trajectory_array.append(trial_agent.optimal_trajectory)

    # Average across trials (axis 0) to get one value per time step
    mean_reward_trajectory_array.append(
        np.mean(np.array(reward_trajectory_array), axis=0)
    )
    mean_optimal_trajectory_array.append(
        np.mean(np.array(optimal_trajectory_array), axis=0)
    )
    
    
    

# Plot your results

In [None]:
# Mean reward per step, one curve per exploration rate.
fig, ax = plt.subplots(figsize=(12, 8))
for eps, mean_rewards in zip(epsilons_to_test, mean_reward_trajectory_array):
    ax.plot(mean_rewards, label=str(eps))
ax.legend(fontsize=16)
ax.set_xlabel('Steps', fontsize=16)
ax.set_ylabel('Mean reward', fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

In [None]:
# Mean optimal-pull percentage per step, one curve per exploration rate.
fig, ax = plt.subplots(figsize=(12, 8))
for eps, mean_optimal in zip(epsilons_to_test, mean_optimal_trajectory_array):
    ax.plot(mean_optimal, label=str(eps))
ax.legend(fontsize=16)
ax.set_xlabel('Steps', fontsize=16)
ax.set_ylabel('Optimal lever %', fontsize=16)
ax.tick_params(labelsize=16)
plt.show()

# Extensions:

### 1. How do your estimates Q_t of the true action-value function q* converge over time? As a function of epsilon?

### 2. Optimistic initialisation: what happens when you initialise your Q values at 5 instead of 0? Why?

### 3. Non-stationary q*(a): try adding a function that slightly modifies your bandit means each time the agent acts. What happens now? (Hint: your Q_n update function should weight recent rewards more heavily than older ones. How can we achieve this?)