In [1]:
import random
import math
import matplotlib.pyplot as plt

In [None]:
class KArmBandit:
    """Agent for a k-armed bandit problem using sample-average value estimates.

    Two action-selection strategies are supported:

    * ``"epsilon-greedy"``: with probability ``epsilon`` pick a uniformly random
      arm, otherwise pick an arm with the highest estimated value.
    * ``"ucb"``: pick an arm maximising ``Q(a) + ucb_c * sqrt(ln(t) / N(a))``.
      Arms that have never been tried are treated as having an infinite upper
      confidence bound, so each arm is tried before any confidence bonus is
      compared.

    Ties are always broken uniformly at random via the ``random`` module.

    Args:
        k: Number of arms; must be at least 1.
        aligned_articles: Stored on the instance unchanged; not used by the
            agent's own logic.
        strategy: ``"epsilon-greedy"`` or ``"ucb"``. Checked when
            ``choose_action`` is called, not at construction time.
        epsilon: Exploration probability for the epsilon-greedy strategy.
        ucb_c: Multiplier on the confidence bonus for the UCB strategy.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k, aligned_articles, strategy="epsilon-greedy", epsilon=0.1, ucb_c=1.0):
        # Fail at construction instead of later inside max()/randrange() on an
        # empty list of arms.
        if k < 1:
            raise ValueError("k must be at least 1.")
        self.k = k
        self.aligned_articles = aligned_articles
        self.strategy = strategy
        self.epsilon = epsilon
        self.ucb_c = ucb_c
        self.q_values = [0.0] * k      # running mean reward per arm
        self.action_counts = [0] * k   # number of updates received per arm
        self.time_step = 0             # number of UCB selections made so far

    def choose_action(self):
        """Return the index of the arm selected by the configured strategy.

        Raises:
            ValueError: If ``self.strategy`` is not a supported strategy name.
        """
        if self.strategy == "epsilon-greedy":
            return self._choose_action_epsilon_greedy()
        elif self.strategy == "ucb":
            return self._choose_action_ucb()
        else:
            raise ValueError("Invalid strategy. Choose 'epsilon-greedy' or 'ucb'.")

    def _choose_action_epsilon_greedy(self):
        """Explore uniformly with probability ``epsilon``, otherwise act greedily."""
        if random.random() < self.epsilon:
            return random.randrange(self.k)
        max_q = max(self.q_values)
        return random.choice([i for i, q in enumerate(self.q_values) if q == max_q])

    def _choose_action_ucb(self):
        """Select an arm by upper confidence bound.

        Increments ``self.time_step`` on every call. Untried arms get an
        infinite bound rather than a large finite one, so they are selected
        before any tried arm regardless of the scale of the value estimates.
        """
        self.time_step += 1
        log_t = math.log(self.time_step)  # same for every arm; compute once
        ucb_values = [
            q + self.ucb_c * math.sqrt(log_t / n) if n > 0 else math.inf
            for q, n in zip(self.q_values, self.action_counts)
        ]

        max_ucb = max(ucb_values)
        return random.choice([i for i, ucb in enumerate(ucb_values) if ucb == max_ucb])

    def update(self, action, reward):
        """Fold ``reward`` into the running mean estimate for ``action``.

        Args:
            action: Index of the arm that was played.
            reward: Reward observed for that play.
        """
        self.action_counts[action] += 1
        # Incremental sample average: Q <- Q + (R - Q) / N
        self.q_values[action] += (reward - self.q_values[action]) / self.action_counts[action]

In [3]:
# Experiment configuration.
SEED = 42                  # fixed seed so a fresh Restart & Run All reproduces the same numbers
k = 5                      # number of arms
aligned_articles = [0, 1]  # arm indices counted as "aligned" when tallying results
trials = 1000              # number of simulated rounds

# Both agents and the reward generator draw from the stdlib `random` module,
# so seeding it once here makes the whole experiment deterministic.
random.seed(SEED)

# One agent per strategy, facing the same number of arms.
bandit_epsilon = KArmBandit(k, aligned_articles, strategy="epsilon-greedy", epsilon=0.1)
bandit_ucb = KArmBandit(k, aligned_articles, strategy="ucb", ucb_c=2)

In [None]:
# Per-strategy tally of how often an aligned article was the chosen action.
# Each strategy gets its own counter dict so the tallies stay independent.
results = {
    strategy_label: {"aligned_shown": 0}
    for strategy_label in ("epsilon-greedy", "ucb")
}

In [5]:
# Pair each agent with its key in `results`. The order is significant because
# both agents consume the same shared `random` stream.
agents = (("epsilon-greedy", bandit_epsilon), ("ucb", bandit_ucb))

for t in range(trials):
    # Draw this trial's reward for every arm up front so both agents face the
    # same environment: N(5, 1) for aligned articles, N(1, 1) for the rest.
    rewards = []
    for arm in range(k):
        mean_reward = 5 if arm in aligned_articles else 1
        rewards.append(random.gauss(mean_reward, 1))

    for strategy_name, bandit in agents:
        action = bandit.choose_action()
        bandit.update(action, rewards[action])
        if action not in aligned_articles:
            continue
        results[strategy_name]["aligned_shown"] += 1

In [None]:
# Look up which agent's value estimates belong to each results key.
agents_by_name = {"epsilon-greedy": bandit_epsilon, "ucb": bandit_ucb}

for strategy_name in results:
    data = results[strategy_name]
    print(f"\n{strategy_name.title()}:")
    print(f"Aligned Articles Shown: {data['aligned_shown']}/{trials} times.")
    # Strategies without a known agent simply get no Q-value line.
    agent = agents_by_name.get(strategy_name)
    if agent is not None:
        print("Estimated Q-values:", agent.q_values)


Epsilon-Greedy:
Aligned Articles Shown: 932/1000 times.
Estimated Q-values: [4.930448547986353, 4.979424881548845, 1.1081998479653834, 1.0287491708191792, 0.8795194219889996]

Ucb:
Aligned Articles Shown: 993/1000 times.
Estimated Q-values: [4.952334224840985, 5.041216586956518, 1.1803944968684354, 1.690795998714935, 1.2343324464738585]
