## Simple contextual bandit with replay buffer

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import random
from collections import deque

# Set up device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize parameters
embedding_dim = 20  # Embedding dimension for entities and relations
K = 50  # Total number of entities (arms)
N_episodes = 1000  # Number of episodes
M = 100  # Number of steps per episode
batch_size = 32  # Batch size for replay buffer sampling
buffer_size = 1000  # Size of the replay buffer
noise_std = 0.1  # Standard deviation for the noise
seed = 42  # Seed for the synthetic problem instance

# Seed the RNGs BEFORE drawing the problem instance. Without this the
# embeddings and the head -> tail mapping below change on every run, so two
# runs never solve the same bandit problem.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Generate entity and relation embeddings (move to device)
# entity_embeddings: (K, embedding_dim); relation_embedding: (embedding_dim,)
entity_embeddings = torch.tensor(
    np.random.randn(K, embedding_dim), dtype=torch.float32, device=device
)
relation_embedding = torch.tensor(
    np.random.randn(embedding_dim), dtype=torch.float32, device=device
)  # Only one relation "atLocation"

# True tail entities for each head entity
# true_tail_entities[h] is the index of the correct tail for head h
# (randomly generated for now, so a head may map to itself)
true_tail_entities = np.random.randint(0, K, size=K)


# Define the MLP Model with more complexity
class LinkPredictionMLP(nn.Module):
    """Feed-forward scorer: input_size -> 64 -> 32 -> 1 with a sigmoid output.

    The two hidden layers use ReLU; the final sigmoid keeps every output in
    (0, 1) so it can be read as a reward probability.
    """

    def __init__(self, input_size):
        super().__init__()
        wide, narrow = 64, 32  # hidden layer widths
        self.fc1 = nn.Linear(input_size, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, 1)  # single logit per input row

    def forward(self, x):
        """Map a batch of shape (..., input_size) to probabilities (..., 1)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).sigmoid()


# Build the scorer; each input row is the concatenation [head, relation, tail],
# hence three embeddings wide.
model = LinkPredictionMLP(input_size=3 * embedding_dim)
model = model.to(device)

# Rewards are 0/1 and the model already outputs probabilities, so plain
# binary cross-entropy (not the logits variant) is the matching loss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Bounded FIFO of (input_vector, reward) experiences; once buffer_size items
# are stored, appending drops the oldest one.
replay_buffer = deque(maxlen=buffer_size)


# Function to generate binary reward based on whether the predicted tail is correct
def generate_reward(head_entity, predicted_tail):
    """Return 1 if predicted_tail is the true tail stored for head_entity, else 0.

    The ground truth is looked up in the module-level true_tail_entities array.
    """
    is_hit = predicted_tail == true_tail_entities[head_entity]
    if is_hit:
        return 1
    return 0


# Function to select an action (tail entity) based on MLP predictions (epsilon-greedy)
def select_action(
    model, head_embedding, relation_embedding, tail_entities, epsilon=0.1
):
    """Pick a candidate tail with an epsilon-greedy rule.

    Args:
        model: callable scoring rows of concatenated [head, relation, tail].
        head_embedding: 2-D tensor with one row; repeated once per candidate.
        relation_embedding: 2-D tensor with one row; repeated once per candidate.
        tail_entities: 2-D tensor, one candidate tail embedding per row.
        epsilon: probability of returning a uniformly random candidate.

    Returns:
        int: position of the chosen candidate within tail_entities.
    """
    n_candidates = tail_entities.shape[0]

    # Score candidates in a random order so that ties in argmax are not
    # always resolved in favour of the lowest original position. The shuffle
    # is drawn before the explore/exploit decision, on both branches.
    shuffle = torch.randperm(n_candidates)
    shuffled_tails = tail_entities[shuffle]

    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(0, n_candidates)

    # Exploit: score every candidate in one batched forward pass.
    with torch.no_grad():
        head_rows = head_embedding.repeat(n_candidates, 1)
        relation_rows = relation_embedding.repeat(n_candidates, 1)
        scores = model(torch.cat([head_rows, relation_rows, shuffled_tails], dim=1))

    best_in_shuffle = torch.argmax(scores).item()

    # shuffled_tails[i] is tail_entities[shuffle[i]], so shuffle[i] is the
    # position in the caller's ordering.
    return shuffle[best_in_shuffle].item()


# Function to train the MLP model using a batch of experiences from the replay buffer
def train_model_batch(model, optimizer, replay_buffer):
    """Run one optimizer step on a random mini-batch from the replay buffer.

    Does nothing while the buffer holds fewer than batch_size experiences.
    Each experience is an (input_vector, reward) pair. The batch size, device
    and loss function come from the module-level batch_size, device and
    criterion.
    """
    # Not enough experiences for a full batch yet
    if len(replay_buffer) < batch_size:
        return

    sampled = random.sample(replay_buffer, batch_size)

    # Separate contexts ([head, relation, tail] vectors) from their rewards
    contexts = [item[0] for item in sampled]
    outcomes = [item[1] for item in sampled]

    inputs = torch.stack(contexts).to(device)
    # Rewards become a column so they line up with the model's (batch, 1) output
    targets = torch.tensor(outcomes, dtype=torch.float32, device=device).unsqueeze(1)

    model.train()
    batch_loss = criterion(model(inputs), targets)

    # Standard gradient step
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()


# Initialize episode rewards storage
episode_rewards = []

# Epsilon parameters for decay
epsilon = 1.0  # Start with high exploration
epsilon_min = 0.01  # Minimum exploration rate
# Per-episode multiplicative decay. NOTE: 0.999 ** 1000 is roughly 0.37, so in
# a 1000-episode run epsilon stays far above epsilon_min.
epsilon_decay = 0.999

# Simulate N episodes
# Seed every RNG the simulation draws from: torch (noise, candidate shuffling),
# NumPy (head/candidate sampling, exploration) and the stdlib `random` module,
# which train_model_batch uses to sample the replay buffer.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# The single relation is identical at every step; build its (1, embedding_dim)
# row once instead of on every iteration.
relation_row = relation_embedding.unsqueeze(0)

for episode in tqdm(range(N_episodes)):
    total_reward = 0  # Total reward for the episode

    for step in range(M):
        # Randomly select a head entity (e.g., Bob)
        head_entity = np.random.randint(0, K)
        head_embedding = entity_embeddings[head_entity].unsqueeze(
            0
        )  # Shape (1, embedding_dim)

        # Add noise to head_embedding
        head_embedding_noisy = head_embedding + noise_std * torch.randn_like(
            head_embedding
        )

        # Randomly select a non-empty set of distinct candidate tail entities
        # (e.g., Kitchen, Bathroom, etc.); the true tail may or may not be in it
        tail_entity_indices = np.random.choice(
            K, np.random.randint(1, K + 1), replace=False
        )
        tail_entity_embeddings = entity_embeddings[tail_entity_indices]

        # Add noise to each tail entity embedding
        tail_entity_embeddings_noisy = (
            tail_entity_embeddings
            + noise_std * torch.randn_like(tail_entity_embeddings)
        )

        # Select the best tail entity using epsilon-greedy strategy
        chosen_tail_index = select_action(
            model,
            head_embedding_noisy,
            relation_row,
            tail_entity_embeddings_noisy,
            epsilon,
        )
        # chosen_tail_index is a position in the candidate list; map it back
        # to the global entity id
        chosen_tail_entity = tail_entity_indices[chosen_tail_index]

        # Generate binary reward (1 if correct, 0 if wrong)
        reward = generate_reward(head_entity, chosen_tail_entity)

        # Store the experience in the replay buffer as [head, relation, tail].
        # Use the same noisy tail embedding the model actually scored, so the
        # training context matches the observed context (the head is already
        # stored in its noisy form).
        chosen_tail_embedding = tail_entity_embeddings_noisy[
            chosen_tail_index
        ].unsqueeze(0)
        input_vector = torch.cat(
            [head_embedding_noisy, relation_row, chosen_tail_embedding],
            dim=1,
        ).squeeze(0)
        replay_buffer.append((input_vector, reward))

        # Accumulate the reward for this episode
        total_reward += reward

        # Train the MLP model using a batch of experiences from the replay buffer
        train_model_batch(model, optimizer, replay_buffer)

    # Store the total reward for the current episode
    episode_rewards.append(total_reward)

    # Decay epsilon after each episode
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Plot the episode rewards to see if the model is learning over time
plt.plot(episode_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Total Reward per Episode over Time")
plt.show()

## Simple contextual bandit without replay buffer

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Set up device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize parameters
embedding_dim = 20  # Embedding dimension for entities and relations
K = 50  # Total number of entities (arms)
N_episodes = 1000  # Number of episodes
M = 100  # Number of steps per episode
noise_std = 0.1  # Standard deviation for the noise
seed = 42  # Seed for the synthetic problem instance

# Seed the RNGs BEFORE drawing the problem instance. Without this the
# embeddings and the head -> tail mapping below change on every run, so two
# runs never solve the same bandit problem.
np.random.seed(seed)
torch.manual_seed(seed)

# Generate entity and relation embeddings (move to device)
# entity_embeddings: (K, embedding_dim); relation_embedding: (embedding_dim,)
entity_embeddings = torch.tensor(
    np.random.randn(K, embedding_dim), dtype=torch.float32, device=device
)
relation_embedding = torch.tensor(
    np.random.randn(embedding_dim), dtype=torch.float32, device=device
)  # Only one relation "atLocation"

# True tail entities for each head entity
# true_tail_entities[h] is the index of the correct tail for head h
# (randomly generated for now, so a head may map to itself)
true_tail_entities = np.random.randint(0, K, size=K)


# Define the MLP Model with more complexity
class LinkPredictionMLP(nn.Module):
    """Feed-forward scorer: input_size -> 64 -> 32 -> 1 with a sigmoid output.

    The two hidden layers use ReLU; the final sigmoid keeps every output in
    (0, 1) so it can be read as a reward probability.
    """

    def __init__(self, input_size):
        super().__init__()
        wide, narrow = 64, 32  # hidden layer widths
        self.fc1 = nn.Linear(input_size, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, 1)  # single logit per input row

    def forward(self, x):
        """Map a batch of shape (..., input_size) to probabilities (..., 1)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden).sigmoid()


# Build the scorer; each input row is the concatenation [head, relation, tail],
# hence three embeddings wide.
model = LinkPredictionMLP(input_size=3 * embedding_dim)
model = model.to(device)

# Rewards are 0/1 and the model already outputs probabilities, so plain
# binary cross-entropy (not the logits variant) is the matching loss.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# Function to generate binary reward based on whether the predicted tail is correct
def generate_reward(head_entity, predicted_tail):
    """Return 1 if predicted_tail is the true tail stored for head_entity, else 0.

    The ground truth is looked up in the module-level true_tail_entities array.
    """
    is_hit = predicted_tail == true_tail_entities[head_entity]
    if is_hit:
        return 1
    return 0


# Function to select an action (tail entity) based on MLP predictions (epsilon-greedy)
def select_action(
    model, head_embedding, relation_embedding, tail_entities, epsilon=0.1
):
    """Pick a candidate tail with an epsilon-greedy rule.

    Args:
        model: callable scoring rows of concatenated [head, relation, tail].
        head_embedding: 2-D tensor with one row; repeated once per candidate.
        relation_embedding: 2-D tensor with one row; repeated once per candidate.
        tail_entities: 2-D tensor, one candidate tail embedding per row.
        epsilon: probability of returning a uniformly random candidate.

    Returns:
        int: position of the chosen candidate within tail_entities.
    """
    n_candidates = tail_entities.shape[0]

    # Score candidates in a random order so that ties in argmax are not
    # always resolved in favour of the lowest original position. The shuffle
    # is drawn before the explore/exploit decision, on both branches.
    shuffle = torch.randperm(n_candidates)
    shuffled_tails = tail_entities[shuffle]

    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(0, n_candidates)

    # Exploit: score every candidate in one batched forward pass.
    with torch.no_grad():
        head_rows = head_embedding.repeat(n_candidates, 1)
        relation_rows = relation_embedding.repeat(n_candidates, 1)
        scores = model(torch.cat([head_rows, relation_rows, shuffled_tails], dim=1))

    best_in_shuffle = torch.argmax(scores).item()

    # shuffled_tails[i] is tail_entities[shuffle[i]], so shuffle[i] is the
    # position in the caller's ordering.
    return shuffle[best_in_shuffle].item()


# Function to train the MLP model online (update after each experience)
def train_model_online(model, optimizer, input_vector, reward):
    """Take one gradient step on a single (input_vector, reward) experience.

    input_vector is given a leading batch dimension before the forward pass;
    the target is a (1, 1) float tensor holding reward. The loss function and
    device come from the module-level criterion and device.
    """
    model.train()

    # Target shaped (1, 1) to line up with the model output for a batch of one
    target = torch.tensor([[reward]], dtype=torch.float32, device=device)
    prediction = model(input_vector.unsqueeze(0))
    step_loss = criterion(prediction, target)

    # Standard gradient step
    optimizer.zero_grad()
    step_loss.backward()
    optimizer.step()


# Initialize episode rewards storage
episode_rewards = []

# Epsilon parameters for decay
epsilon = 1.0  # Start with high exploration
epsilon_min = 0.01  # Minimum exploration rate
# Per-episode multiplicative decay. NOTE: 0.999 ** 1000 is roughly 0.37, so in
# a 1000-episode run epsilon stays far above epsilon_min.
epsilon_decay = 0.999

# Simulate N episodes
# Seed the RNGs the simulation draws from: torch (noise, candidate shuffling)
# and NumPy (head/candidate sampling, exploration).
torch.manual_seed(42)
np.random.seed(42)

# The single relation is identical at every step; build its (1, embedding_dim)
# row once instead of on every iteration.
relation_row = relation_embedding.unsqueeze(0)

for episode in tqdm(range(N_episodes)):
    total_reward = 0  # Total reward for the episode

    for step in range(M):
        # Randomly select a head entity (e.g., Bob)
        head_entity = np.random.randint(0, K)
        head_embedding = entity_embeddings[head_entity].unsqueeze(
            0
        )  # Shape (1, embedding_dim)

        # Add noise to head_embedding
        head_embedding_noisy = head_embedding + noise_std * torch.randn_like(
            head_embedding
        )

        # Randomly select a non-empty set of distinct candidate tail entities
        # (e.g., Kitchen, Bathroom, etc.); the true tail may or may not be in it
        tail_entity_indices = np.random.choice(
            K, np.random.randint(1, K + 1), replace=False
        )
        tail_entity_embeddings = entity_embeddings[tail_entity_indices]

        # Add noise to each tail entity embedding
        tail_entity_embeddings_noisy = (
            tail_entity_embeddings
            + noise_std * torch.randn_like(tail_entity_embeddings)
        )

        # Select the best tail entity using epsilon-greedy strategy
        chosen_tail_index = select_action(
            model,
            head_embedding_noisy,
            relation_row,
            tail_entity_embeddings_noisy,
            epsilon,
        )
        # chosen_tail_index is a position in the candidate list; map it back
        # to the global entity id
        chosen_tail_entity = tail_entity_indices[chosen_tail_index]

        # Generate binary reward (1 if correct, 0 if wrong)
        reward = generate_reward(head_entity, chosen_tail_entity)

        # Prepare the model input as [head, relation, tail]. Use the same
        # noisy tail embedding the model actually scored, so the training
        # context matches the observed context (the head is already used in
        # its noisy form).
        chosen_tail_embedding = tail_entity_embeddings_noisy[
            chosen_tail_index
        ].unsqueeze(0)
        input_vector = torch.cat(
            [head_embedding_noisy, relation_row, chosen_tail_embedding],
            dim=1,
        ).squeeze(0)

        # Online training: update the model with this single experience
        train_model_online(model, optimizer, input_vector, reward)

        # Accumulate the reward for this episode
        total_reward += reward

    # Store the total reward for the current episode
    episode_rewards.append(total_reward)

    # Decay epsilon after each episode
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Plot the episode rewards to see if the model is learning over time
plt.plot(episode_rewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.title("Total Reward per Episode Over Time")
plt.show()

In [4]:
import torch


# Small 2x3 integer tensor used by the reshape experiments in the next cells
foo = torch.tensor(((1, 2, 3), (4, 5, 6)))
foo

tensor([[1, 2, 3],
        [4, 5, 6]])

In [5]:
# -1 lets reshape infer the row count: the 6 elements become a (6, 1) column
foo.reshape(-1, 1)

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])

In [7]:
# -1 lets reshape infer the column count: the 6 elements become a (1, 6) row
foo.reshape(1, -1)

tensor([[1, 2, 3, 4, 5, 6]])

In [8]:
?torch.reshape

Docstring:
reshape(input, shape) -> Tensor

Returns a tensor with the same data and number of elements as :attr:`input`,
but with the specified shape. When possible, the returned tensor will be a view
of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs
with compatible strides can be reshaped without copying, but you should not
depend on the copying vs. viewing behavior.

See :meth:`torch.Tensor.view` on when it is possible to return a view.

A single dimension may be -1, in which case it's inferred from the remaining
dimensions and the number of elements in :attr:`input`.

Args:
    input (Tensor): the tensor to be reshaped
    shape (tuple of int): the new shape

Example::

    >>> a = torch.arange(4.)
    >>> torch.reshape(a, (2, 2))
    tensor([[ 0.,  1.],
            [ 2.,  3.]])
    >>> b = torch.tensor([[0, 1], [2, 3]])
    >>> torch.reshape(b, (-1,))
    tensor([ 0,  1,  2,  3])
Type:      builtin_function_or_method