In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random, numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from google.colab import files

# -------------------
# Parameters
# -------------------
# Problem size
d = 60  # dimensionality of every input vector
N = 10  # vectors per context (sequence length fed to the model)
K = 2  # number of tasks trained one after another

# Noise scale of the Gaussian perturbation, 1/sqrt(d)
noise_std = 1 / d**0.5

# Optimisation
epochs = 120  # full-batch gradient steps spent on each task
lr = 0.01  # SGD step size

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



class HingeLoss(nn.Module):
    """Mean hinge loss: ``max(0, 1 - preds * labels)`` averaged over all elements."""

    def forward(self, preds, labels):
        """Return the scalar mean hinge loss.

        preds:  real-valued scores.
        labels: targets multiplied element-wise with ``preds``
                (presumably in {-1, +1} -- confirm against the caller).
        """
        margins = preds * labels
        return F.relu(1 - margins).mean()
# -------------------
# Data generation
# -------------------
def generate_task_data(N, m, d, noise_std, w):
    """Sample ``m`` labelled contexts for one task.

    For each context a label in {-1, +1} is drawn uniformly, then an (N, d)
    matrix ``label * w + noise_std * gaussian`` is built, so every row of a
    context shares the same label. Tensors are created on the module-level
    ``device``.

    Returns:
        (X_full, y_tasks): the stacked contexts (shape (m, N, d), assuming
        ``w`` broadcasts against (N, d)) and the m integer labels.
    """
    contexts = []
    labels = []
    for _ in range(m):
        # Draw the label first, then the noise, so the RNG stream is consumed
        # in a fixed order for a given seed.
        sign = 2 * torch.randint(0, 2, (1,), device=device) - 1
        noise = torch.randn(N, d, device=device)
        contexts.append(sign * w + noise_std * noise)
        labels.append(sign.item())
    return torch.stack(contexts), torch.tensor(labels, device=device)

# -------------------
# Transformer model (attention encoder + MLP head)
# -------------------


class SimpleTransformer(nn.Module):
    """Transformer encoder, mean pooling over the sequence, then an MLP that emits one score."""

    def __init__(self, input_dim, num_heads=1, num_layers=1, hidden_dim=30):
        super().__init__()

        # Layer handed to nn.TransformerEncoder as the building block of the stack.
        # batch_first=True selects the (batch, sequence, feature) input layout.
        layer = nn.TransformerEncoderLayer(
            input_dim,
            num_heads,
            dim_feedforward=2 * hidden_dim,  # width of the layer's internal feed-forward block
            activation="relu",
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers)

        # Read-out: input_dim -> hidden_dim -> 1, applied to the pooled features.
        read_out = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.mlp_head = nn.Sequential(*read_out)

    def forward(self, x):
        """Score every sequence in the batch.

        x: (batch_size, seq_len=N, d_model=input_dim)
        Returns the MLP output with its trailing singleton dimension removed.
        """
        encoded = self.transformer_encoder(x)  # same shape as x
        pooled = encoded.mean(dim=1)  # average over the sequence positions
        scores = self.mlp_head(pooled)
        return scores.squeeze(-1)

# Example Usage:
# model = SimpleTransformer(input_dim=128, num_heads=4, num_layers=2, hidden_dim=300)
# input_tensor = torch.randn(32, 50, 128) # (Batch Size, Sequence Length, Input Dim)
# output = model(input_tensor)
# print(output.shape) # Expected: torch.Size([32])

# -------------------
# One experiment
# -------------------
def run_experiment(seed):
    """Train one model on K tasks in sequence and log the loss after every step.

    Task ``i`` places its signal on input coordinate ``i``: the class mean is
    ``+/- w`` with ``w[i] = 1/sqrt(2d)``. Task 0 gets 50 samples, every later
    task 1000. The model is trained with full-batch SGD for ``epochs`` steps on
    each task in turn (one optimizer shared across tasks). After every step the
    hinge loss on task 0 and on the task currently being trained is recorded.

    Reads the module-level settings ``d``, ``N``, ``K``, ``noise_std``,
    ``epochs``, ``lr`` and ``device``.

    Args:
        seed: seed applied to the torch, numpy and ``random`` generators.

    Returns:
        (first_task_losses, per_task_losses, logits_current) where
        ``first_task_losses`` has one entry per gradient step over all tasks,
        ``per_task_losses[t]`` has one entry per step spent on task ``t``, and
        ``logits_current`` holds the model outputs on the last task after the
        final step (``None`` if no step was run).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model = SimpleTransformer(input_dim=d).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr)
    criterion = HingeLoss()

    per_task_losses = [[] for _ in range(K)]
    first_task_losses = []
    logits_current = None  # stays None only when no training step is executed

    # Generate tasks: task i has its signal on coordinate i.
    tasks_data = []
    for task_idx in range(K):
        w = torch.zeros(d, device=device)
        w[task_idx] = 1/(2*d)**0.5
        m = 50 if task_idx == 0 else 1000
        tasks_data.append(generate_task_data(N, m, d, noise_std, w))

    X_first, y_first = tasks_data[0]

    # Train sequentially
    for task_id, (X_full, y_query) in enumerate(tasks_data):
        for _ in range(epochs):
            model.train()
            optimizer.zero_grad()
            loss = criterion(model(X_full), y_query)
            loss.backward()
            optimizer.step()

            # Record losses in eval mode. torch.no_grad() only disables
            # autograd; the encoder layer's dropout stays active unless the
            # module is switched to eval(), which would make the logged
            # losses noisy and inflated.
            model.eval()
            with torch.no_grad():
                logits_first = model(X_first)
                first_task_losses.append(criterion(logits_first, y_first).item())

                logits_current = model(X_full)
                per_task_losses[task_id].append(criterion(logits_current, y_query).item())

    return first_task_losses, per_task_losses, logits_current

# -------------------
# Run 10 experiments
# -------------------
# Repeat the experiment with seeds 0..n_experiments-1 and collect the loss curves.
n_experiments = 10
all_first_losses, all_per_task_losses = [], []

for exp in range(n_experiments):
    print("expm:", exp)
    first_losses, per_task_losses, logits_current = run_experiment(seed=exp)
    all_first_losses.append(first_losses)
    all_per_task_losses.append(per_task_losses)

# Step-wise mean over the runs: one curve for task 0, then one curve per task.
avg_first_losses = np.array(all_first_losses).mean(axis=0)
avg_per_task_losses = [
    np.array([run[t] for run in all_per_task_losses]).mean(axis=0)
    for t in range(K)
]

#print("Average first task loss (last epoch):", avg_first_losses[-1])
#for t in range(K):
 #   print(f"Average final loss for task {t+1}:", avg_per_task_losses[t][-1])


expm: 0




expm: 1
expm: 2
expm: 3
expm: 4


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random, numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from google.colab import files

# -------------------
# Parameters
# -------------------
# Problem size
d = 60  # dimensionality of every input vector
N = 10  # vectors per context (sequence length fed to the model)
K = 2  # number of tasks trained one after another

# Noise scale of the Gaussian perturbation, 1/sqrt(d)
noise_std = 1 / d**0.5

# Optimisation
epochs = 120  # full-batch gradient steps spent on each task
lr = 0.01  # SGD step size

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# -------------------
# Logistic Loss
# -------------------
class logloss(nn.Module):
    """Mean logistic loss: ``log(1 + exp(-preds * labels))`` averaged over all elements."""

    def __init__(self):
        super().__init__()

    def forward(self, preds, labels):
        """Return the scalar mean logistic loss.

        preds:  real-valued scores.
        labels: targets multiplied element-wise with ``preds``
                (presumably in {-1, +1} -- confirm against the caller).
        """
        # softplus(z) = log(1 + exp(z)), evaluated without forming exp(z):
        # the literal formula overflows to inf once -preds*labels is large
        # (exp exceeds the float32 range), which also poisons the gradient.
        return F.softplus(-preds * labels).mean()
class HingeLoss(nn.Module):
    """Mean hinge loss: ``max(0, 1 - preds * labels)`` averaged over all elements."""

    def forward(self, preds, labels):
        """Return the scalar mean hinge loss.

        preds:  real-valued scores.
        labels: targets multiplied element-wise with ``preds``
                (presumably in {-1, +1} -- confirm against the caller).
        """
        margins = preds * labels
        return F.relu(1 - margins).mean()
# -------------------
# Data generation
# -------------------
def generate_task_data(N, m, d, noise_std, w):
    """Sample ``m`` labelled contexts for one task.

    For each context a label in {-1, +1} is drawn uniformly, then an (N, d)
    matrix ``label * w + noise_std * gaussian`` is built, so every row of a
    context shares the same label. Tensors are created on the module-level
    ``device``.

    Returns:
        (X_full, y_tasks): the stacked contexts (shape (m, N, d), assuming
        ``w`` broadcasts against (N, d)) and the m integer labels.
    """
    contexts = []
    labels = []
    for _ in range(m):
        # Draw the label first, then the noise, so the RNG stream is consumed
        # in a fixed order for a given seed.
        sign = 2 * torch.randint(0, 2, (1,), device=device) - 1
        noise = torch.randn(N, d, device=device)
        contexts.append(sign * w + noise_std * noise)
        labels.append(sign.item())
    return torch.stack(contexts), torch.tensor(labels, device=device)

# -------------------
# Transformer model (attention encoder + MLP head)
# -------------------


class SimpleTransformer(nn.Module):
    """Transformer encoder, mean pooling over the sequence, then an MLP that emits one score."""

    def __init__(self, input_dim, num_heads=1, num_layers=1, hidden_dim=10):
        super().__init__()

        # Layer handed to nn.TransformerEncoder as the building block of the stack.
        # batch_first=True selects the (batch, sequence, feature) input layout.
        layer = nn.TransformerEncoderLayer(
            input_dim,
            num_heads,
            dim_feedforward=hidden_dim,  # width of the layer's internal feed-forward block
            activation="relu",
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers)

        # Read-out: input_dim -> hidden_dim -> 1, applied to the pooled features.
        read_out = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.mlp_head = nn.Sequential(*read_out)

    def forward(self, x):
        """Score every sequence in the batch.

        x: (batch_size, seq_len=N, d_model=input_dim)
        Returns the MLP output with its trailing singleton dimension removed.
        """
        encoded = self.transformer_encoder(x)  # same shape as x
        pooled = encoded.mean(dim=1)  # average over the sequence positions
        scores = self.mlp_head(pooled)
        return scores.squeeze(-1)

# Example Usage:
# model = SimpleTransformer(input_dim=128, num_heads=4, num_layers=2, hidden_dim=300)
# input_tensor = torch.randn(32, 50, 128) # (Batch Size, Sequence Length, Input Dim)
# output = model(input_tensor)
# print(output.shape) # Expected: torch.Size([32])

# -------------------
# One experiment
# -------------------
def run_experiment(seed,samplesize):
    """Train one model on K tasks in sequence and log the loss after every step.

    Task ``i`` places its signal on input coordinate ``i``: the class mean is
    ``+/- w`` with ``w[i] = 1/sqrt(2d)``. Task 0 gets 50 samples, every later
    task ``samplesize``. The model is trained with full-batch SGD for
    ``epochs`` steps on each task in turn (one optimizer shared across tasks).
    After every step the hinge loss on task 0 and on the task currently being
    trained is recorded.

    Reads the module-level settings ``d``, ``N``, ``K``, ``noise_std``,
    ``epochs``, ``lr`` and ``device``.

    Args:
        seed: seed applied to the torch, numpy and ``random`` generators.
        samplesize: number of contexts generated for every task after the first.

    Returns:
        (first_task_losses, per_task_losses, logits_current) where
        ``first_task_losses`` has one entry per gradient step over all tasks,
        ``per_task_losses[t]`` has one entry per step spent on task ``t``, and
        ``logits_current`` holds the model outputs on the last task after the
        final step (``None`` if no step was run).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model = SimpleTransformer(input_dim=d).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr)
    criterion = HingeLoss()

    per_task_losses = [[] for _ in range(K)]
    first_task_losses = []
    logits_current = None  # stays None only when no training step is executed

    # Generate tasks: task i has its signal on coordinate i.
    tasks_data = []
    for task_idx in range(K):
        w = torch.zeros(d, device=device)
        w[task_idx] = 1/(2*d)**0.5
        m = 50 if task_idx == 0 else samplesize
        tasks_data.append(generate_task_data(N, m, d, noise_std, w))

    X_first, y_first = tasks_data[0]

    # Train sequentially
    for task_id, (X_full, y_query) in enumerate(tasks_data):
        for _ in range(epochs):
            model.train()
            optimizer.zero_grad()
            loss = criterion(model(X_full), y_query)
            loss.backward()
            optimizer.step()

            # Record losses in eval mode. torch.no_grad() only disables
            # autograd; the encoder layer's dropout stays active unless the
            # module is switched to eval(), which would make the logged
            # losses noisy and inflated.
            model.eval()
            with torch.no_grad():
                logits_first = model(X_first)
                first_task_losses.append(criterion(logits_first, y_first).item())

                logits_current = model(X_full)
                per_task_losses[task_id].append(criterion(logits_current, y_query).item())

    return first_task_losses, per_task_losses, logits_current

# -------------------
# Run 10 experiments
# -------------------
# Sweep the sample size of the later tasks; for each size, average the loss
# curves over seeds 0..n_experiments-1.
n_experiments = 10
avg_first_losses = []
avg_per_task_losses = []
for mm in (20, 50, 200, 500, 1000, 2000):
    all_first_losses, all_per_task_losses = [], []
    for exp in range(n_experiments):
        print("expm:", exp)
        print("m:", mm)

        first_losses, per_task_losses, logits_current = run_experiment(seed=exp, samplesize=mm)
        all_first_losses.append(first_losses)
        all_per_task_losses.append(per_task_losses)

    # Step-wise mean over the runs for this sample size.
    avg_first_losses.append(np.array(all_first_losses).mean(axis=0))
    avg_per_task_losses.append([
        np.array([run[t] for run in all_per_task_losses]).mean(axis=0)
        for t in range(K)
    ])

#print("Average first task loss (last epoch):", avg_first_losses[-1])
#for t in range(K):
 #   print(f"Average final loss for task {t+1}:", avg_per_task_losses[t][-1])


expm: 0
m: 20




expm: 1
m: 20
expm: 2
m: 20
expm: 3
m: 20
expm: 4
m: 20
expm: 5
m: 20
expm: 6
m: 20
expm: 7
m: 20
expm: 8
m: 20
expm: 9
m: 20
expm: 0
m: 50
expm: 1
m: 50
expm: 2
m: 50
expm: 3
m: 50
expm: 4
m: 50
expm: 5
m: 50
expm: 6
m: 50
expm: 7
m: 50
expm: 8
m: 50
expm: 9
m: 50
expm: 0
m: 200
expm: 1
m: 200
expm: 2
m: 200
expm: 3
m: 200
expm: 4
m: 200
expm: 5
m: 200
expm: 6
m: 200
expm: 7
m: 200
expm: 8
m: 200
expm: 9
m: 200
expm: 0
m: 500
expm: 1
m: 500
expm: 2
m: 500
expm: 3
m: 500
expm: 4
m: 500
expm: 5
m: 500
expm: 6
m: 500
expm: 7
m: 500
expm: 8
m: 500
expm: 9
m: 500
expm: 0
m: 1000
expm: 1
m: 1000
expm: 2
m: 1000
expm: 3
m: 1000
expm: 4
m: 1000
expm: 5
m: 1000
expm: 6
m: 1000
expm: 7
m: 1000
expm: 8
m: 1000
expm: 9
m: 1000
expm: 0
m: 2000
expm: 1
m: 2000
expm: 2
m: 2000
expm: 3
m: 2000
expm: 4
m: 2000
expm: 5
m: 2000
expm: 6
m: 2000
expm: 7
m: 2000
expm: 8
m: 2000
expm: 9
m: 2000
