## Recommendation system using K-armed bandits

In [1]:
import numpy as np
import random

class KArmBandit:
    """Epsilon-greedy agent for a k-armed bandit.

    The agent keeps a running sample-average estimate of the reward of each
    arm. On every selection it explores (picks a uniformly random arm) with
    probability ``epsilon`` and otherwise exploits (picks the arm with the
    highest current estimate).

    Attributes:
        k: Number of arms.
        epsilon: Probability of exploring on a given selection.
        counts: Integer array of length ``k``; how often each arm was updated.
        values: Float array of length ``k``; current reward estimate per arm.
    """

    def __init__(self, k, epsilon=0.1, seed=None):
        """Create an agent with all counts and estimates set to zero.

        Args:
            k: Number of arms; must be an integer >= 1.
            epsilon: Exploration rate in the closed interval [0, 1].
            seed: Optional seed for a private random generator. When left as
                ``None`` the agent draws from the global ``random`` module,
                so ``random.seed(...)`` still controls it.

        Raises:
            ValueError: If ``k`` is not a positive integer or ``epsilon`` is
                outside [0, 1].
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must be in [0, 1], got {epsilon!r}")

        self.k = int(k)  # number of arms
        self.epsilon = epsilon  # rate of exploration
        self.counts = np.zeros(self.k, dtype=int)  # updates seen per arm
        self.values = np.zeros(self.k)  # estimated reward per arm
        # The `random` module and `random.Random` instances both expose
        # random() and randint(), so either can serve as the generator.
        self._rng = random.Random(seed) if seed is not None else random

    def select_arm(self):
        """Pick an arm index using the epsilon-greedy rule.

        Returns:
            int: An index in ``range(self.k)``. When exploiting, ties between
            equal estimates resolve to the lowest index (``np.argmax``).
        """
        # random() lies in [0, 1), so `<` never explores when epsilon == 0
        # and always explores when epsilon == 1.
        if self._rng.random() < self.epsilon:
            return self._rng.randint(0, self.k - 1)  # exploration
        return int(np.argmax(self.values))  # exploitation

    def update(self, chosen_arm, reward):
        """Fold an observed reward into the estimate of ``chosen_arm``.

        Args:
            chosen_arm: Index of the arm that was played.
            reward: Numeric reward observed for that play.

        Raises:
            IndexError: If ``chosen_arm`` is not in ``range(self.k)``.
        """
        # Reject negative indices explicitly: NumPy would otherwise wrap them
        # around and silently update a different arm.
        if not 0 <= chosen_arm < self.k:
            raise IndexError(
                f"chosen_arm must be in [0, {self.k - 1}], got {chosen_arm!r}"
            )
        self.counts[chosen_arm] += 1
        n = self.counts[chosen_arm]
        # Incremental form of the sample mean: Q <- Q + (r - Q) / n
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / n

    def recommend(self):
        """Return the index of the next arm to recommend (see ``select_arm``)."""
        return self.select_arm()

In [2]:
# Seed the global `random` module so the simulation below gives the same
# result on every Restart & Run All.
SEED = 42
random.seed(SEED)

k = 5  # number of items (arms) that can be recommended
bandit = KArmBandit(k, epsilon=0.1)

n_rounds = 1000  # total number of recommendation rounds
# Per-item probability that a recommendation earns a reward of 1.
true_rewards = [0.1, 0.5, 0.8, 0.6, 0.3]
assert len(true_rewards) == k, "need exactly one reward probability per arm"

In [3]:
# Simulate user feedback: each round the bandit recommends an item, the item
# pays out 1 with its true reward probability (0 otherwise), and the bandit
# learns from that outcome.
for i in range(n_rounds):
    chosen_arm = bandit.recommend()
    reward_probability = true_rewards[chosen_arm]
    reward = int(random.random() < reward_probability)  # Bernoulli draw: 1 or 0
    bandit.update(chosen_arm, reward)

In [4]:
# Report the item whose learned reward estimate is highest.
best_arm = bandit.values.argmax()
best_value = bandit.values[best_arm]
print(f"The best recommendation based on learned rewards is item {best_arm} with estimated reward {best_value}")

The best recommendation based on learned rewards is item 2 with estimated reward 0.8187919463087249
