In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define the number of contexts and actions
num_contexts = 3
num_actions = 2

# Define the true reward probabilities for each context-action pair
true_rewards = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]])

class ContextualBandit:
    def __init__(self, num_contexts, num_actions, true_rewards):
        self.num_contexts = num_contexts
        self.num_actions = num_actions
        self.true_rewards = true_rewards

    def get_reward(self, context, action):
        reward_probability = self.true_rewards[context, action]
        reward = 1 if np.random.rand() < reward_probability else 0
        return reward

class Agent(nn.Module):
    def __init__(self, num_contexts, num_actions):
        super(Agent, self).__init__()
        self.linear = nn.Linear(num_contexts, num_actions)

    def forward(self, x):
        return self.linear(x)

def train_agent(agent, bandit, num_episodes=1000, learning_rate=0.01):
    optimizer = optim.SGD(agent.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    for episode in range(num_episodes):
        context = torch.eye(num_contexts)[np.random.choice(num_contexts)]
        context_index = torch.argmax(context).item()
        
        predicted_rewards = agent(context)
        action = torch.argmax(predicted_rewards).item()
        
        reward = bandit.get_reward(context_index, action)
        target = predicted_rewards.clone()
        target[action] = reward
        
        loss = loss_fn(predicted_rewards, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return agent

# Initialize bandit and agent
bandit = ContextualBandit(num_contexts, num_actions, true_rewards)
agent = Agent(num_contexts, num_actions)

# Train the agent
trained_agent = train_agent(agent, bandit)

# Evaluate the trained agent
def evaluate_agent(agent, bandit, num_episodes=100):
    total_reward = 0
    for _ in range(num_episodes):
        context = torch.eye(num_contexts)[np.random.choice(num_contexts)]
        context_index = torch.argmax(context).item()
        
        with torch.no_grad():
            predicted_rewards = agent(context)
        action = torch.argmax(predicted_rewards).item()
        
        reward = bandit.get_reward(context_index, action)
        total_reward += reward

    average_reward = total_reward / num_episodes
    return average_reward

average_reward = evaluate_agent(trained_agent, bandit)

print(f'Average reward over {100} episodes: {average_reward:.2f}')


Average reward over 100 episodes: 0.25
