#### Exploration/Exploitation in Reinforcement Learning
- Reviewing different strategies to avoid the exploration-exploitation dilemma

In [11]:
%pip install numpy matplotlib ipywidgets
import numpy as np
import random
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


In [12]:
def plot_distribution(strategy, params, n=1000):
    actions = [strategy(*params) for _ in range(n)]
    counter = Counter(actions)
    plt.bar(counter.keys(), counter.values())
    plt.xlabel('Actions')
    plt.ylabel('Frequency')
    plt.title('Distribution of actions selected by epsilon-greedy')
    plt.show()

In [13]:
def epsilon_greedy(Q, state, epsilon=0.1):
    if random.uniform(0, 1) < epsilon:
        return random.choice(range(len(Q[state])))  # Explore
    else:
        return np.argmax(Q[state])  # Exploit

In [14]:
# Testing the epsilon_greedy function
Q = [1.0, 0.5, 0.2]  # Example Q-values for a given state
epsilon = 0.1  # 10% chance to explore

# Plot the distribution of actions selected by epsilon-greedy
plot_distribution(epsilon_greedy, (Q, 0, epsilon))

TypeError: object of type 'float' has no len()

In [None]:
def softmax_exploration(Q, state, tau=1.0):
    q_exp = np.exp(Q[state] / tau)
    action_probabilities = q_exp / np.sum(q_exp)
    return np.random.choice(range(len(Q[state])), p=action_probabilities)

In [None]:
def ucb(Q, state, counts, total_count, c=1):
    upper_bound = Q[state] + c * np.sqrt(np.log(total_count) / (counts[state] + 1e-5))
    return np.argmax(upper_bound)


In [None]:
# Assuming a Bernoulli Bandit problem for simplicity
def thompson_sampling(successes, failures):
    beta_samples = [np.random.beta(s+1, f+1) for s, f in zip(successes, failures)]
    return np.argmax(beta_samples)