In [1]:
# Import statements
import numpy as np

In [8]:
def epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs):
  Q = np.zeros(num_arms)
  N = np.zeros(num_arms)
  rewards = []

  def pull_arm(arm):
    return np.random.rand() < true_reward_probs[arm]

  for t in range(num_plays):
    if np.random.rand() < epsilon:
      arm = np.random.choice(num_arms)
    else:
      arm = np.argmax(Q)

    reward = pull_arm(arm)
    rewards.append(reward)

    N[arm] += 1
    Q[arm] += (reward - Q[arm]) / N[arm]

  return Q, N, rewards



In [9]:
# Defining the parameters
num_arms = 3
num_plays = 1000
epsilon = 0.1
true_reward_probs = [0.2, 0.5, 0.7]
num_simulations = 1000

In [10]:
# Running multiple simulations
all_Q = np.zeros((num_simulations, num_arms))
all_N = np.zeros((num_simulations, num_arms))
all_rewards = []

for i in range(num_simulations):
  Q, N, rewards = epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs)
  all_Q[i] = all_N[i] = N
  all_rewards.append(sum(rewards))

# Getting the average results
avg_Q = np.mean(all_Q, axis=0)
avg_N = np.mean(all_N, axis=0)
avg_cumulative_reward = np.mean(all_rewards)
best_arm_reward = max(true_reward_probs) * num_plays
avg_regret = best_arm_reward - avg_cumulative_reward

# Printting the results
print("Average Estimated values:", avg_Q)
print("Average Counts :", avg_N)
print("Average Cumulative Reward:", avg_cumulative_reward)
print("Average  Regret:", avg_regret)

Average Estimated values: [ 57.453  78.752 863.795]
Average Counts : [ 57.453  78.752 863.795]
Average Cumulative Reward: 655.546
Average  Regret: 44.45399999999995


This code implements the **epsilon-greedy algorithm** for the multi-armed bandit (MAB) problem, simulating and analyzing its performance over multiple runs. Here's a detailed explanation of the code:

---

### **Step-by-Step Explanation**

#### **1. Import Statements**
```python
import numpy as np
```
- **`numpy`** is used for numerical operations, including random number generation and array manipulations.

---

#### **2. Function: `epsilon_greedy`**
```python
def epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs):
    Q = np.zeros(num_arms)
    N = np.zeros(num_arms)
    rewards = []
```
- **Inputs:**
  - `num_arms`: Number of arms (actions or slot machines).
  - `num_plays`: Total number of times we interact with the arms.
  - `epsilon`: The probability of exploration (choosing a random arm).
  - `true_reward_probs`: The actual probabilities of getting a reward for each arm.
- **Variables:**
  - `Q`: Estimated values of each arm (initialized to 0).
  - `N`: Number of times each arm has been pulled (initialized to 0).
  - `rewards`: List to track all rewards obtained during the plays.

```python
    def pull_arm(arm):
        return np.random.rand() < true_reward_probs[arm]
```
- **`pull_arm(arm)`**:
  - Simulates pulling an arm.
  - Generates a random number between 0 and 1 (`np.random.rand()`).
  - Returns `True` (reward) if the number is less than the true probability of the arm.

---

#### **3. Main Loop: Playing the Game**
```python
    for t in range(num_plays):
        if np.random.rand() < epsilon:
            arm = np.random.choice(num_arms)
        else:
            arm = np.argmax(Q)
```
- At each play `t`:
  - With probability `epsilon`, **explore** by randomly selecting an arm (`np.random.choice(num_arms)`).
  - Otherwise, **exploit** by selecting the arm with the highest estimated value (`np.argmax(Q)`).

```python
        reward = pull_arm(arm)
        rewards.append(reward)
```
- Simulate pulling the selected arm and record the obtained reward.

```python
        N[arm] += 1
        Q[arm] += (reward - Q[arm]) / N[arm]
```
- **Update the statistics**:
  - Increment the count of pulls for the chosen arm (`N[arm]`).
  - Update the estimated value `Q[arm]` using the formula for the running mean:
    \[
    Q[arm] = Q[arm] + \frac{(\text{reward} - Q[arm])}{N[arm]}
    \]

---

#### **4. Returning Results**
```python
    return Q, N, rewards
```
- Returns:
  - `Q`: Final estimated values of each arm.
  - `N`: Total number of times each arm was pulled.
  - `rewards`: List of all rewards obtained.

---

### **5. Simulating the Algorithm**
```python
num_arms = 3
num_plays = 1000
epsilon = 0.1
true_reward_probs = [0.2, 0.5, 0.7]
num_simulations = 1000
```
- Parameters:
  - 3 arms with reward probabilities `[0.2, 0.5, 0.7]`.
  - Play each simulation for 1000 rounds.
  - Use `epsilon = 0.1` (10% exploration).
  - Run the algorithm for `num_simulations = 1000` independent simulations.

```python
all_Q = np.zeros((num_simulations, num_arms))
all_N = np.zeros((num_simulations, num_arms))
all_rewards = []
```
- Arrays to store results of all simulations:
  - `all_Q`: Stores estimated values (`Q`) for each simulation.
  - `all_N`: Stores counts of arm pulls for each simulation.
  - `all_rewards`: Stores total rewards for each simulation.

```python
for i in range(num_simulations):
    Q, N, rewards = epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs)
    all_Q[i] = Q
    all_N[i] = N
    all_rewards.append(sum(rewards))
```
- Run `epsilon_greedy` for each simulation.
- Record:
  - Estimated values (`Q`).
  - Number of pulls (`N`).
  - Total rewards (`sum(rewards)`).

---

### **6. Analyzing Results**
```python
avg_Q = np.mean(all_Q, axis=0)
avg_N = np.mean(all_N, axis=0)
avg_cumulative_reward = np.mean(all_rewards)
```
- Compute averages across simulations:
  - `avg_Q`: Average estimated values of arms.
  - `avg_N`: Average number of times each arm was pulled.
  - `avg_cumulative_reward`: Average cumulative reward across all simulations.

```python
best_arm_reward = max(true_reward_probs) * num_plays
avg_regret = best_arm_reward - avg_cumulative_reward
```
- **Best possible reward**: If the best arm (with the highest reward probability) was always selected:
  \[
  \text{best\_arm\_reward} = \text{max(true\_reward\_probs)} \times \text{num\_plays}
  \]
- **Regret**: Difference between the best possible reward and the actual average cumulative reward.

---

### **7. Printing Results**
```python
print("Average Estimated values:", avg_Q)
print("Average Counts :", avg_N)
print("Average Cumulative Reward:", avg_cumulative_reward)
print("Average  Regret:", avg_regret)
```
- Outputs key metrics for analysis:
  - Average estimated values for each arm (`avg_Q`).
  - Average number of times each arm was pulled (`avg_N`).
  - Average total reward across all simulations.
  - Average regret (loss due to not always selecting the best arm).

---

### **Key Insights**
- The **epsilon-greedy algorithm** balances exploration and exploitation.
- Over time, it learns to focus on the best arm while occasionally exploring other arms.
- Regret decreases as the algorithm gains more information about the arms' true reward probabilities.

In [12]:
# # Import statements
# import numpy as np

# def epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs):
#   Q = np.zeros(num_arms)
#   N = np.zeros(num_arms)
#   rewards = []

#   def pull_arm(arm):
#     return np.random.rand() < true_reward_probs[arm]

#   for t in range(num_plays):
#     if np.random.rand() < epsilon:
#       arm = np.random.choice(num_arms)
#     else:
#       arm = np.argmax(Q)

#     reward = pull_arm(arm)
#     rewards.append(reward)

#     N[arm] += 1
#     Q[arm] += (reward - Q[arm]) / N[arm]

#   return Q, N, rewards

# # Defining the parameters
# num_arms = 3
# num_plays = 1000
# epsilon = 0.1
# true_reward_probs = [0.2, 0.5, 0.7]
# num_simulations = 1000

# # Running multiple simulations
# all_Q = np.zeros((num_simulations, num_arms))
# all_N = np.zeros((num_simulations, num_arms))
# all_rewards = []

# for i in range(num_simulations):
#   Q, N, rewards = epsilon_greedy(num_arms, num_plays, epsilon, true_reward_probs)
#   all_Q[i] = all_N[i] = N
#   all_rewards.append(sum(rewards))

# # Getting the average results
# avg_Q = np.mean(all_Q, axis=0)
# avg_N = np.mean(all_N, axis=0)
# avg_cumulative_reward = np.mean(all_rewards)
# best_arm_reward = max(true_reward_probs) * num_plays
# avg_regret = best_arm_reward - avg_cumulative_reward

# # Printting the results
# print("Average Estimated values:", avg_Q)
# print("Average Counts :", avg_N)
# print("Average Cumulative Reward:", avg_cumulative_reward)
# print("Average  Regret:", avg_regret)


Other way of implementing it :

In [13]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [11]:
import numpy as np

# Define number of arms and number of trials
n_arms = 5
n_trials = 1000
epsilon = 0.1  # Exploration probability

# Initialize estimates and action counts
Q = np.zeros(n_arms)  # Estimated rewards for each arm
N = np.zeros(n_arms)  # Number of times each arm is pulled

# True reward probabilities for each arm (unknown to the agent)
true_probs = np.random.rand(n_arms)

# Simulation of trials
for t in range(1, n_trials + 1):
    # Choose action using epsilon-greedy
    if np.random.rand() < epsilon:
        action = np.random.randint(0, n_arms)  # Explore
    else:
        action = np.argmax(Q)  # Exploit

    # Simulate pulling the arm and receiving reward
    reward = np.random.rand() < true_probs[action]

    # Update counts and action value estimates
    N[action] += 1
    Q[action] += (reward - Q[action]) / N[action]  # Incremental update

# Results
print("Estimated action values (Q):", Q)
print("True probabilities of arms:", true_probs)
print("Number of times each arm was pulled:", N)


Estimated action values (Q): [0.77826564 0.5625     0.36363636 0.06666667 0.66666667]
True probabilities of arms: [0.77150968 0.54069643 0.38195034 0.15768095 0.42852121]
Number of times each arm was pulled: [911.  16.  22.  30.  21.]
