In [None]:
import numpy as np               # To handle numbers, arrays, and math
import matplotlib.pyplot as plt  # To plot graphs (reward over time)


# ---- Problem size -----------------------------------------------------------
n_arms = 2   # number of promos to choose between (A and B)
d = 3        # features per user, e.g. age, gender and "clicked before"
steps = 200  # number of simulated users (one user per round)

# ---- Hidden ground truth ----------------------------------------------------
# One weight vector per promo describing which kind of user likes it.
# These only exist because this is a simulation; with a real dataset the true
# weights are unknown and never written down like this.
true_weights = [
    np.array([0.2, 0.4, 0.1]),  # promo A: the second feature matters most
    np.array([0.5, 0.1, 0.3]),  # promo B: the first feature matters most
]

# ---- Exploration strength ---------------------------------------------------
alpha = 0.1  # multiplier on the uncertainty bonus; higher = more exploration

# ---- Per-promo memory -------------------------------------------------------
# A[a]: d x d matrix, starts as the identity and accumulates context @ context.T
#       for every user who was shown promo a.
# b[a]: d x 1 vector, starts at zero and accumulates reward * context
#       for every user who was shown promo a.
A = [np.eye(d) for _ in range(n_arms)]
b = [np.zeros((d, 1)) for _ in range(n_arms)]

# ---- Performance tracking ---------------------------------------------------
total_rewards = 0      # clicks collected so far
average_rewards = []   # running average reward after each step

# Simulate users arriving one at a time. Each round: draw a user, score every
# promo with LinUCB, show the best-scoring promo, observe click / no click,
# and update that promo's statistics.
for step in range(steps):
    # Random user: a d x 1 column of features drawn uniformly from [0, 1)
    context = np.random.rand(d, 1)

    p = []  # UCB score of each arm for this user

    for a in range(n_arms):
        # Estimated weights for this arm: theta = A^-1 @ b (regularised
        # least squares, since A started as the identity). It answers
        # "what kinds of people tend to click this promo?"
        A_inv = np.linalg.inv(A[a])
        theta = A_inv @ b[a]

        # UCB score = predicted reward + exploration bonus.
        # Both products are 1x1 arrays; .item() pulls out the plain scalar.
        mean = (theta.T @ context).item()                              # estimated reward
        bonus = alpha * np.sqrt((context.T @ A_inv @ context).item())  # uncertainty
        ucb = mean + bonus
        p.append(ucb)

    # Show the promo with the highest UCB score (first one wins on a tie).
    # Cast to a plain int because it is used to index Python lists.
    action = int(np.argmax(p))

    # Simulate the user's response from the hidden true weights.
    # true_prob is extracted as a scalar so the comparison below yields a
    # plain bool rather than a 1x1 boolean array.
    true_prob = (context.T @ true_weights[action].reshape(-1, 1)).item()
    reward = 1 if np.random.rand() < true_prob else 0  # click (1) or not (0)

    # Update the statistics of the promo that was shown.
    # context (d x 1) @ context.T (1 x d) is a d x d outer product.
    A[action] += context @ context.T
    b[action] += reward * context

    # Track performance
    total_rewards += reward
    average_rewards.append(total_rewards / (step + 1))


# Final tally: total clicks, and the click rate over all simulated users
print("Total reward collected:", total_rewards)
print("Average reward:", total_rewards / steps)

# Running average reward after each step, to visualise the learning progress
plt.plot(average_rewards)
plt.title("Contextual Bandit with LinUCB")
plt.ylabel("Average Reward")
plt.xlabel("Steps")
plt.show()

A:
Each time a promo is shown, we add context @ context.T to that promo's matrix in A. Each context @ context.T is a d x d matrix (the outer product of the user's feature vector with itself) that records how strongly each feature, and each pair of features, showed up for that one user.

b:
If people who are female (feature 2) tend to click on Promo A,
the second value of b[0] will grow bigger over time.
In the simulation I'm running now, the code already knows whether the user clicked, because we invented the “truth” ourselves using true_weights and np.random.rand().

Each time a promo is shown, we update b[action] like this:

b[action] += reward * context

reward is either 1 (clicked) or 0 (not clicked),
so we either add the user's features to b or leave b unchanged.
b is therefore the cumulative reward per feature for this promo.