In [17]:
import numpy as np

# True reward probabilities: one row per context, one column per action
true_rewards = np.array([
    [0.9, 0.1],
    [0.2, 0.8],
    [0.5, 0.5],
])

# Number of contexts and actions, read off the reward table's shape
num_contexts, num_actions = true_rewards.shape

class ContextualBandit:
    """Contextual bandit that pays out 0/1 rewards from a probability table.

    The constructor stores its arguments unchanged:
      num_contexts -- number of contexts
      num_actions  -- number of actions
      true_rewards -- table indexed as [context, action]; each entry is the
                      threshold a uniform draw must fall below to earn a reward
    """

    def __init__(self, num_contexts, num_actions, true_rewards):
        self.true_rewards = true_rewards
        self.num_actions = num_actions
        self.num_contexts = num_contexts

    def get_reward(self, context, action):
        """Draw a reward for taking `action` in `context`.

        Returns 1 when a fresh sample from np.random.rand() is strictly below
        true_rewards[context, action], and 0 otherwise.
        """
        success_prob = self.true_rewards[context, action]
        return int(np.random.rand() < success_prob)

class QLearningAgent:
    """Tabular epsilon-greedy agent with one value estimate per (context, action)."""

    def __init__(self, num_contexts, num_actions, learning_rate=0.1, epsilon=0.1):
        self.num_contexts, self.num_actions = num_contexts, num_actions
        self.learning_rate, self.epsilon = learning_rate, epsilon
        # Value estimates start at zero; rows are contexts, columns are actions.
        self.q_table = np.zeros((num_contexts, num_actions))

    def select_action(self, context, greedy=False):
        """Pick an action for `context`.

        With greedy=False, a uniform draw below `epsilon` triggers a random
        action from range(num_actions). In every other case the action with
        the highest current estimate for `context` is returned (np.argmax, so
        ties resolve to the lowest index).
        """
        # No random number is drawn at all when greedy=True (short-circuit).
        explore = (not greedy) and np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(self.num_actions)
        return np.argmax(self.q_table[context])

    def update_q_values(self, context, action, reward):
        """Move the estimate for (context, action) toward `reward`.

        The step is learning_rate * (reward - current estimate); there is no
        bootstrapped next-state term.
        """
        current = self.q_table[context, action]
        self.q_table[context, action] = current + self.learning_rate * (reward - current)

def train_agent(agent, bandit, num_episodes=100, steps_per_episode=100, verbose=True):
    """Train `agent` by interacting with `bandit`, then return the same agent.

    Each step samples a context uniformly from range(bandit.num_contexts),
    asks the agent for an action, draws a reward from the bandit, and feeds
    the (context, action, reward) triple to agent.update_q_values.

    Parameters:
        agent: object providing select_action(context) and
            update_q_values(context, action, reward); it is updated in place.
        bandit: object providing a num_contexts attribute and
            get_reward(context, action).
        num_episodes: number of episodes to run.
        steps_per_episode: number of interaction steps in each episode.
        verbose: when True (the default), print the total reward collected
            in each episode.

    Returns:
        The `agent` object that was passed in.
    """
    for episode in range(num_episodes):
        total_reward = 0
        for _ in range(steps_per_episode):
            # Take the context count from the bandit being trained on, not
            # from a module-level variable, so any bandit size works.
            context = np.random.choice(bandit.num_contexts)
            action = agent.select_action(context)
            reward = bandit.get_reward(context, action)
            agent.update_q_values(context, action, reward)
            total_reward += reward

        if verbose:
            print(f'Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}')

    return agent

# Seed NumPy's global RNG so the training log, the evaluation score and the
# learned Q-table are reproducible when the notebook is re-run from scratch
np.random.seed(42)

# Initialize bandit and agent
bandit = ContextualBandit(num_contexts, num_actions, true_rewards)
agent = QLearningAgent(num_contexts, num_actions)

# Train the agent (train_agent updates `agent` in place and returns that same
# object, so `trained_agent` and `agent` refer to one agent)
trained_agent = train_agent(agent, bandit)

# Evaluate the trained agent
def evaluate_agent(agent, bandit, num_episodes=100, steps_per_episode=100):
    """Run `agent` greedily on `bandit` and return the mean reward per episode.

    Each step samples a context uniformly from range(bandit.num_contexts),
    asks the agent for its greedy action (no exploration), and adds the
    bandit's reward to a running total. The agent is not updated.

    Parameters:
        agent: object providing select_action(context, greedy=True).
        bandit: object providing a num_contexts attribute and
            get_reward(context, action).
        num_episodes: number of evaluation episodes.
        steps_per_episode: number of steps in each episode.

    Returns:
        Total reward over all steps divided by num_episodes, i.e. the average
        total reward of one episode (not the average reward of one step).

    Raises:
        ZeroDivisionError: if num_episodes is 0.
    """
    total_reward = 0
    for _ in range(num_episodes):
        for _ in range(steps_per_episode):
            # Take the context count from the bandit being evaluated, not
            # from a module-level variable, so any bandit size works.
            context = np.random.choice(bandit.num_contexts)
            action = agent.select_action(context, greedy=True)
            total_reward += bandit.get_reward(context, action)

    return total_reward / num_episodes

# Score the trained agent with greedy action selection and report the result
average_reward = evaluate_agent(agent=trained_agent, bandit=bandit)
print(f'Average reward over evaluation episodes: {average_reward:.2f}')


Episode 1/100, Total Reward: 72
Episode 2/100, Total Reward: 71
Episode 3/100, Total Reward: 68
Episode 4/100, Total Reward: 76
Episode 5/100, Total Reward: 66
Episode 6/100, Total Reward: 69
Episode 7/100, Total Reward: 65
Episode 8/100, Total Reward: 78
Episode 9/100, Total Reward: 75
Episode 10/100, Total Reward: 77
Episode 11/100, Total Reward: 67
Episode 12/100, Total Reward: 72
Episode 13/100, Total Reward: 64
Episode 14/100, Total Reward: 69
Episode 15/100, Total Reward: 77
Episode 16/100, Total Reward: 77
Episode 17/100, Total Reward: 74
Episode 18/100, Total Reward: 62
Episode 19/100, Total Reward: 68
Episode 20/100, Total Reward: 77
Episode 21/100, Total Reward: 69
Episode 22/100, Total Reward: 70
Episode 23/100, Total Reward: 75
Episode 24/100, Total Reward: 70
Episode 25/100, Total Reward: 67
Episode 26/100, Total Reward: 73
Episode 27/100, Total Reward: 73
Episode 28/100, Total Reward: 67
Episode 29/100, Total Reward: 69
Episode 30/100, Total Reward: 59
Episode 31/100, Tot

In [18]:
# Display the learned value estimates: one row per context, one column per action
agent.q_table

array([[0.91642892, 0.0390233 ],
       [0.17823553, 0.93568616],
       [0.38541945, 0.49716237]])