<a href="https://colab.research.google.com/github/issacridhin/Reinforcement_learning/blob/main/2348546_Lab1_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random

class EpsilonGreedyBandit:
    """Epsilon-greedy multi-armed bandit over a fixed set of arms (ad slots).

    With probability ``epsilon`` a uniformly random arm is explored; otherwise
    the arm with the highest estimated value is exploited. Estimates are the
    running mean of the rewards observed for each arm.

    Attributes:
        n_arms: Number of arms (ad slots).
        epsilon: Probability of exploring instead of exploiting.
        counts: Array with the number of times each arm has been updated.
        values: Array with the current estimated reward (CTR) of each arm.
    """

    def __init__(self, n_arms, epsilon=0.1):
        """Create a bandit whose estimates all start at zero.

        Args:
            n_arms: Number of arms; must be at least 1.
            epsilon: Exploration probability in the range [0, 1].

        Raises:
            ValueError: If ``n_arms`` is smaller than 1 or ``epsilon`` lies
                outside [0, 1].
        """
        if n_arms < 1:
            raise ValueError("n_arms must be at least 1")
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError("epsilon must be in the range [0, 1]")
        self.n_arms = n_arms  # Number of ad slots (arms)
        self.epsilon = epsilon  # Probability of exploration
        self.counts = np.zeros(n_arms)  # Times each slot has been selected
        self.values = np.zeros(n_arms)  # Estimated CTR for each slot

    def select_arm(self):
        """Pick the next arm using the epsilon-greedy rule.

        Returns:
            The index of the chosen arm as a plain ``int``.
        """
        if random.random() > self.epsilon:
            # Exploit. np.argmax alone always returns the lowest index among
            # tied maxima, which (with all-zero initial estimates) pins the
            # policy to arm 0. Break ties uniformly at random instead.
            best_arms = np.flatnonzero(self.values == self.values.max())
            if best_arms.size <= 1:
                # Unique maximum, or no equality match at all (e.g. NaN
                # estimates): defer to argmax.
                return int(np.argmax(self.values))
            return int(np.random.choice(best_arms))
        # Explore: every arm is equally likely.
        return int(np.random.randint(0, self.n_arms))

    def update(self, chosen_arm, reward):
        """Fold an observed reward into the chosen arm's running mean.

        Args:
            chosen_arm: Index of the arm that was played.
            reward: Reward observed for that play (1 for a click, 0 otherwise).
        """
        self.counts[chosen_arm] += 1
        n = self.counts[chosen_arm]
        # Incremental mean: new = old + (reward - old) / n. Algebraically the
        # same as ((n - 1) / n) * old + reward / n, with one fewer rounding step.
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / n

def simulate_bandit(bandit, n_impressions, true_ctr, log_every=100):
    """Simulate a bandit policy over a series of ad impressions.

    For every impression the bandit chooses an arm, a click is drawn from a
    Bernoulli distribution with that arm's true CTR, and the bandit is updated
    with the outcome.

    Args:
        bandit: Object exposing ``n_arms``, ``select_arm()`` and
            ``update(chosen_arm, reward)``.
        n_impressions: Number of impressions to simulate.
        true_ctr: Sequence of true click probabilities, indexed by arm.
        log_every: Print a progress line every this many impressions. A falsy
            value (``0`` or ``None``) disables progress output.

    Returns:
        A tuple ``(total_rewards, rewards_per_arm)``: the total number of
        clicks and a NumPy array with the clicks collected by each arm.

    Raises:
        ValueError: If ``true_ctr`` has fewer entries than the bandit has arms.
    """
    # Fail up front rather than with an IndexError midway through the run,
    # whenever an uncovered arm happens to be selected.
    if len(true_ctr) < bandit.n_arms:
        raise ValueError("true_ctr must provide a CTR for every arm")

    total_rewards = 0
    rewards_per_arm = np.zeros(bandit.n_arms)
    for impression in range(1, n_impressions + 1):
        chosen_arm = bandit.select_arm()

        # One Bernoulli trial: 1 is a click, 0 is no click.
        reward = np.random.binomial(1, true_ctr[chosen_arm])
        bandit.update(chosen_arm, reward)

        total_rewards += reward
        rewards_per_arm[chosen_arm] += reward

        if log_every and impression % log_every == 0:
            print(f"Impressions: {impression}, Total Clicks: {total_rewards}")

    return total_rewards, rewards_per_arm

# Ad placements under test, paired index-for-index with their true click-through rates
ad_slots = ['Top Banner', 'Sidebar', 'Footer', 'Pop-up']
true_ctr = [0.03, 0.05, 0.02, 0.07]

# One arm per ad slot; explore on 10% of impressions (epsilon = 0.1)
n_arms = len(ad_slots)
epsilon = 0.1
bandit = EpsilonGreedyBandit(n_arms, epsilon)

# Run the simulation for 1000 ad impressions
n_impressions = 1000
total_clicks, clicks_per_arm = simulate_bandit(bandit, n_impressions, true_ctr)

# Report the overall click total, then a per-slot comparison of estimated vs. true CTR
print("\nSimulation complete.")
print(f"Total Clicks after {n_impressions} impressions: {total_clicks}")
for i, slot_name in enumerate(ad_slots):
    print(f"Ad Slot: {slot_name}, Clicks: {clicks_per_arm[i]}, Estimated CTR: {bandit.values[i]:.4f}, True CTR: {true_ctr[i]:.4f}")


Impressions: 100, Total Clicks: 4
Impressions: 200, Total Clicks: 6
Impressions: 300, Total Clicks: 9
Impressions: 400, Total Clicks: 11
Impressions: 500, Total Clicks: 14
Impressions: 600, Total Clicks: 21
Impressions: 700, Total Clicks: 26
Impressions: 800, Total Clicks: 30
Impressions: 900, Total Clicks: 33
Impressions: 1000, Total Clicks: 34

Simulation complete.
Total Clicks after 1000 impressions: 34
Ad Slot: Top Banner, Clicks: 33.0, Estimated CTR: 0.0361, True CTR: 0.0300
Ad Slot: Sidebar, Clicks: 0.0, Estimated CTR: 0.0000, True CTR: 0.0500
Ad Slot: Footer, Clicks: 1.0, Estimated CTR: 0.0263, True CTR: 0.0200
Ad Slot: Pop-up, Clicks: 0.0, Estimated CTR: 0.0000, True CTR: 0.0700
