In [1]:
import torch, os, evaluate, sys
import torch.nn as nn
import pandas as pd

import dataclasses, sys, pickle, json
import torch.nn.functional as F

from typing import List, Optional

import matplotlib.pyplot as plt, random, numpy as np

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from torch.utils.data import DataLoader
from transformers import AdamW,get_scheduler
from datasets import load_metric
from tqdm.auto import tqdm

from torch.nn import DataParallel

sys.path.append("/home/pritam.k/research/data-moe")


# from src.utils.helper import CustomModel, ModelArgs, MoeArgs
from src.utils.helper import ConfiguredMetric

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoE(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_experts=4, num_difficulties=3, capacity_factor=1.5):
        """
        Mixture of Experts (MoE) Layer with Difficulty-Aware Gating.

        Args:
            input_dim (int): Input feature dimension.
            hidden_dim (int): Hidden feature dimension for each expert.
            num_experts (int): Number of experts.
            num_difficulties (int): Number of difficulty categories.
            capacity_factor (float): Determines the capacity per instance.
        """
        super(MoE, self).__init__()
        self.num_experts = num_experts
        self.num_difficulties = num_difficulties
        self.capacity = int(capacity_factor)

        # Define experts
        self.experts = nn.ModuleList([nn.Linear(input_dim, hidden_dim) for _ in range(num_experts)])

        # Define difficulty embeddings
        self.difficulty_embedding = nn.Embedding(num_difficulties, input_dim)

        # Define gating network (conditioned on input and difficulty)
        self.gate = nn.Linear(input_dim * 2, num_experts)

    def forward(self, x, difficulty_labels=None):
        """
        Forward pass for MoE layer.

        Args:
            x (Tensor): Input tensor of shape (batch_size, input_dim).
            difficulty_labels (Tensor): Tensor of difficulty labels of shape (batch_size,).

        Returns:
            Tensor: Output tensor of shape (batch_size, hidden_dim).
            Tensor: Routing information tensor of shape (batch_size, topk).
        """
        print(x.shape)
        # batch_size = x.size(0)
        # seq_length = x.size(1)
        # hidden_dim = x.size(2)
        #print(difficulty_labels)
        batch_size, input_dim, _ = x.size()

        print(difficulty_labels)
        difficulty_embeds = self.difficulty_embedding(difficulty_labels)
        # if difficulty_labels is not None:
        #     difficulty_embeds = self.difficulty_embedding(difficulty_labels)
        # else:
        #     difficulty_embeds = torch.zeros((batch_size, input_dim)).to(x.device)  # or some other default value

        # Get difficulty embeddings
        #difficulty_embeds = self.difficulty_embedding(difficulty_labels)  # (batch_size, input_dim)

        # Concatenate input with difficulty embeddings
        gate_input = torch.cat([x, difficulty_embeds], dim=1)  # (batch_size, 2 * input_dim)


        # Compute gate logits
        gate_logits = self.gate(gate_input)  # (batch_size, num_experts)
        gate_probs = F.softmax(gate_logits, dim=1)  # (batch_size, num_experts)

        # Select top-k experts
        topk = 2  # Number of experts to select
        topk_probs, topk_indices = gate_probs.topk(topk, dim=1)  # Each row has top-k probabilities and indices

        # Initialize output
        output = torch.zeros(batch_size, self.experts[0].out_features).to(x.device)

        # Routing info for analysis
        routing_info = topk_indices  # (batch_size, topk)

        # Iterate over experts
        for i in range(self.num_experts):
            # Find instances assigned to expert i
            mask = (topk_indices == i).any(dim=1)  # (batch_size,)
            if mask.sum() == 0:
                continue
            selected_instances = x[mask]  # (num_selected, input_dim)
            selected_probs = topk_probs[mask]  # (num_selected, topk)

            # Pass through expert
            expert_output = self.experts[i](selected_instances)  # (num_selected, hidden_dim)

            # Weight by gate probabilities
            # Sum the probabilities for expert i (since it can be selected in top-k)
            # Find where expert i was selected in top-k
            expert_mask = (topk_indices[mask] == i)  # (num_selected, topk)
            weight = topk_probs[mask][expert_mask]  # (num_selected * occurrences,)

            # Sum weights per instance (if multiple occurrences, sum them)
            # Assuming top-k=2, and each expert can be selected at most once per instance
            # So, weight.sum(dim=1) is equivalent to weight.squeeze(1) if topk=2
            weight = expert_mask.float() * topk_probs[mask]  # (num_selected, topk)
            weight = weight.sum(dim=1, keepdim=True)  # (num_selected, 1)

            output[mask] += expert_output * weight  # Weighted sum

        return output, routing_info


In [131]:
# from transformers import RobertaModel, RobertaConfig
# import torch
# import torch.nn as nn

# class MoE(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_experts, num_difficulties):
#         """
#         MoE Layer constructor.

#         Args:
#             input_dim (int): Input dimension.
#             hidden_dim (int): Hidden dimension of each expert.
#             num_experts (int): Number of experts.
#             num_difficulties (int): Number of difficulty levels.
#         """
#         super(MoE, self).__init__()
#         self.experts = nn.ModuleList([nn.Linear(input_dim, hidden_dim) for _ in range(num_experts)])
#         self.gating_network = nn.Linear(input_dim, num_experts)
#         self.difficulty_embeddings = nn.Embedding(num_difficulties, input_dim)  # Embedding for difficulties

#     def forward(self, x, difficulties=None):
#         """
#         Forward pass for MoE layer.

#         Args:
#             x (Tensor): Input tensor.
#             difficulties (Tensor): Difficulty levels for each input in the batch.

#         Returns:
#             Tuple: (MoE output, routing information).
#         """
#         # Gating network outputs expert probabilities
#         gating_logits = self.gating_network(x)  # (batch_size, num_experts)
#         expert_weights = torch.softmax(gating_logits, dim=-1)  # Normalize over experts

#         # Apply difficulty embeddings
#         difficulty_embeds = self.difficulty_embeddings(difficulties)  # (batch_size, input_dim)
#         x = x + difficulty_embeds  # Add difficulty information to the input

#         # Weighted sum of expert outputs
#         expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # (batch_size, num_experts, hidden_dim)
#         weighted_output = torch.einsum('be,bem->bm', expert_weights, expert_outputs)  # (batch_size, hidden_dim)

#         return weighted_output, expert_weights

# class RobertaWithMoE(nn.Module):
#     def __init__(self, model_name='roberta-base', num_experts=4, hidden_dim=3072, num_difficulties=3, num_labels=2):
#         """
#         Custom RoBERTa model integrated with MoE layers.

#         Args:
#             model_name (str): Pre-trained RoBERTa model name.
#             num_experts (int): Number of experts in MoE layers.
#             hidden_dim (int): Hidden dimension for each expert.
#             num_difficulties (int): Number of difficulty categories.
#             num_labels (int): Number of classification labels.
#         """
#         super(RobertaWithMoE, self).__init__()
#         self.roberta = RobertaModel.from_pretrained(model_name)
#         config = self.roberta.config

#         # Replace the intermediate dense layer in each Transformer block with MoE
#         for layer in self.roberta.encoder.layer:
#             # Original intermediate layer
#             original_fc = layer.intermediate.dense

#             # New MoE layer
#             moe_layer = MoE(
#                 input_dim=original_fc.in_features,
#                 hidden_dim=hidden_dim,
#                 num_experts=num_experts,
#                 num_difficulties=num_difficulties
#             )
#             layer.intermediate = moe_layer

#         # Classification head
#         self.classifier = nn.Linear(config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask=None, labels=None, difficulties=None):
#         """
#         Forward pass for the custom RoBERTa model with MoE.

#         Args:
#             input_ids (Tensor): Input token IDs.
#             attention_mask (Tensor): Attention masks.
#             labels (Tensor): Labels for classification.
#             difficulties (Tensor): Difficulty labels.

#         Returns:
#             Tuple: (loss, logits) if labels are provided; otherwise, (logits, routing_info).
#         """
#         # Get RoBERTa outputs
#         outputs = self.roberta(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#         )
#         hidden_states = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

#         # Ensure difficulties are passed correctly
#         if difficulties is None:
#             raise ValueError("Difficulties tensor must be provided.")

#         # Pass through transformer layers with MoE
#         for layer in self.roberta.encoder.layer:
#             moe_output, routing_info = layer.intermediate(hidden_states[:, 0, :], difficulties)  # Using [CLS] token

#             # Continue with the output dense layer and other components
#             intermediate_output = moe_output  # (batch_size, hidden_dim)
#             layer_output = layer.output.dense(intermediate_output)  # (batch_size, hidden_size)
#             layer_output = layer.output.dropout(layer_output)
#             layer_output = layer.output.LayerNorm(layer_output + outputs.last_hidden_state[:, 0, :])  # Residual connection
#             hidden_states[:, 0, :] = layer_output  # Update [CLS] token

#         # Pooling (use [CLS] token representation)
#         pooled_output = hidden_states[:, 0, :]  # (batch_size, hidden_size)

#         # Classification
#         logits = self.classifier(pooled_output)  # (batch_size, num_labels)

#         loss = None
#         if labels is not None:
#             loss_fct = nn.CrossEntropyLoss()
#             loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

#         return (loss, logits) if loss is not None else (logits, routing_info)


In [3]:
# from transformers import RobertaModel, RobertaConfig
# import torch
# import torch.nn as nn

# class MoE(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_experts, num_difficulties):
#         """
#         MoE Layer constructor.

#         Args:
#             input_dim (int): Input dimension.
#             hidden_dim (int): Hidden dimension of each expert.
#             num_experts (int): Number of experts.
#             num_difficulties (int): Number of difficulty levels.
#         """
#         super(MoE, self).__init__()
#         self.experts = nn.ModuleList([nn.Linear(input_dim, hidden_dim) for _ in range(num_experts)])
#         self.gating_network = nn.Linear(input_dim, num_experts)
#         self.difficulty_embeddings = nn.Embedding(num_difficulties, input_dim)  # Embedding for difficulties

#     def forward(self, x, difficulties=None):
#         """
#         Forward pass for MoE layer.

#         Args:
#             x (Tensor): Input tensor.
#             difficulties (Tensor): Difficulty levels for each input in the batch.

#         Returns:
#             Tuple: (MoE output, routing information).
#         """
#         # Gating network outputs expert probabilities
#         gating_logits = self.gating_network(x)  # (batch_size, num_experts)
#         expert_weights = torch.softmax(gating_logits, dim=-1)  # Normalize over experts

#         print(difficulties)
#         # Apply difficulty embeddings
#         difficulty_embeds = self.difficulty_embeddings(difficulties)  # (batch_size, input_dim)
#         x = x + difficulty_embeds  # Add difficulty information to the input

#         # Weighted sum of expert outputs
#         expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)  # (batch_size, num_experts, hidden_dim)
#         weighted_output = torch.einsum('be,bem->bm', expert_weights, expert_outputs)  # (batch_size, hidden_dim)

#         return weighted_output, expert_weights

In [39]:
from transformers import RobertaModel, RobertaConfig
import torch
import torch.nn as nn

class RobertaWithMoE(nn.Module):
    def __init__(self, model_name='roberta-base', num_experts=4, hidden_dim=3072, num_difficulties=3, num_labels=2):
        """
        Custom RoBERTa model integrated with MoE layers.

        Args:
            model_name (str): Pre-trained RoBERTa model name.
            num_experts (int): Number of experts in MoE layers.
            hidden_dim (int): Hidden dimension for each expert.
            num_difficulties (int): Number of difficulty categories.
            num_labels (int): Number of classification labels.
        """
        super(RobertaWithMoE, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        config = self.roberta.config

        # Replace the intermediate dense layer in each Transformer block with MoE
        for layer in self.roberta.encoder.layer:
            # Original intermediate layer
            original_fc = layer.intermediate.dense

            # New MoE layer
            moe_layer = MoE(
                input_dim=original_fc.in_features,
                hidden_dim=hidden_dim,
                num_experts=num_experts,
                num_difficulties=num_difficulties
            )
            layer.intermediate = moe_layer

        # Classification head
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None, difficulties=None):
        """
        Forward pass for the custom RoBERTa model with MoE.

        Args:
            input_ids (Tensor): Input token IDs.
            attention_mask (Tensor): Attention masks.
            labels (Tensor): Labels for classification.
            difficulties (Tensor): Difficulty labels.

        Returns:
            Tuple: (loss, logits) if labels are provided; otherwise, (logits, routing_info).
        """
        # Get RoBERTa outputs
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        hidden_states = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)

        # Ensure difficulties are passed correctly
        if difficulties is None:
            raise ValueError("Difficulties tensor must be provided.")

        # Pass through transformer layers with MoE
        for layer in self.roberta.encoder.layer:
            moe_output, routing_info = layer.intermediate(hidden_states[:, 0, :], difficulties)  # Using [CLS] token

            # Continue with the output dense layer and other components
            intermediate_output = moe_output  # (batch_size, hidden_dim)
            layer_output = layer.output.dense(intermediate_output)  # (batch_size, hidden_size)
            layer_output = layer.output.dropout(layer_output)
            layer_output = layer.output.LayerNorm(layer_output + outputs.last_hidden_state[:, 0, :])  # Residual connection
            hidden_states[:, 0, :] = layer_output  # Update [CLS] token

        # Pooling (use [CLS] token representation)
        pooled_output = hidden_states[:, 0, :]  # (batch_size, hidden_size)

        # Classification
        logits = self.classifier(pooled_output)  # (batch_size, num_labels)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return (loss, logits) if loss is not None else (logits, routing_info)

In [41]:
# import torch

# # Set random seed for reproducibility
# torch.manual_seed(42)

# # Initialize the MoE layer
# input_dim = 768  # Example input dimension
# hidden_dim = 3072  # Example hidden dimension for each expert
# num_experts = 4  # Number of experts
# num_difficulties = 3  # Number of difficulty levels

# moe_layer = MoE(input_dim=input_dim, hidden_dim=hidden_dim, num_experts=num_experts, num_difficulties=num_difficulties)

# # Create some dummy data for testing
# batch_size = 2  # Example batch size
# x = torch.randn(batch_size, input_dim)  # Random input tensor with shape (batch_size, input_dim)

# # Dummy difficulty levels (each value should be between 0 and num_difficulties-1)
# difficulties = torch.tensor([0, 2])  # Example difficulty labels for each input in the batch

# # Forward pass through the MoE layer
# output, routing_info = moe_layer(x, difficulties)

# # Print the outputs
# print("MoE Output (weighted sum of expert outputs):")
# print(output)

# print("\nRouting Information (expert weights):")
# print(routing_info)


In [16]:
# import torch
# from transformers import RobertaTokenizer

# # Initialize the RobertaWithMoE model
# model = RobertaWithMoE(model_name='roberta-base', num_experts=4, hidden_dim=3072, num_difficulties=3, num_labels=2)

# # Dummy input setup
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# # Example sentences
# sentences = ["This is a test sentence.", "Another sentence for testing."]

# # Tokenize the sentences (this will return input_ids and attention_mask)
# inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=10)

# # Generate dummy difficulties for each input (e.g., difficulties range from 0 to num_difficulties-1)
# difficulties = torch.tensor([0, 2])  # Example difficulty levels for each sentence
# difficulties = difficulties.long()
# # Generate dummy labels for classification (binary classification in this case)
# labels = torch.tensor([1, 0])  # Example labels (batch_size = 2)

# # Perform a forward pass through the model
# outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels, difficulties=difficulties)

# # Extract loss and logits (or logits and routing info if labels are None)
# if isinstance(outputs, tuple):
#     loss, logits = outputs
#     print("Loss:", loss.item())
# else:
#     logits, routing_info = outputs

# print("\nLogits (classification outputs):")
# print(logits)

# print("\nRouting Information (expert weights for MoE):")
# print(routing_info)


In [117]:
# from transformers import RobertaModel, RobertaConfig
# import torch.nn as nn

# class RobertaWithMoE(nn.Module):
#     def __init__(self, model_name='roberta-base', num_experts=4, hidden_dim=3072, num_difficulties=3, num_labels=2):
#         """
#         Custom RoBERTa model integrated with MoE layers.

#         Args:
#             model_name (str): Pre-trained RoBERTa model name.
#             num_experts (int): Number of experts in MoE layers.
#             hidden_dim (int): Hidden dimension for each expert.
#             num_difficulties (int): Number of difficulty categories.
#             num_labels (int): Number of classification labels.
#         """
#         super(RobertaWithMoE, self).__init__()
#         self.roberta = RobertaModel.from_pretrained(model_name)
#         config = self.roberta.config

#         # Replace the intermediate dense layer in each Transformer block with MoE
#         for layer in self.roberta.encoder.layer:
#             # Original intermediate layer
#             original_fc = layer.intermediate.dense

#             # New MoE layer
#             moe_layer = MoE(
#                 input_dim=original_fc.in_features,
#                 hidden_dim=hidden_dim,
#                 num_experts=num_experts,
#                 num_difficulties=num_difficulties
#             )
#             layer.intermediate = moe_layer

#             # Output layer remains unchanged

#         # Classification head
#         self.classifier = nn.Linear(config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask=None, labels=None, difficulties=None):
#         """
#         Forward pass for the custom RoBERTa model with MoE.

#         Args:
#             input_ids (Tensor): Input token IDs.
#             attention_mask (Tensor): Attention masks.
#             labels (Tensor): Labels for classification.
#             difficulties (Tensor): Difficulty labels.

#         Returns:
#             Tuple: (loss, logits) if labels are provided; otherwise, (logits, routing_info).
#         """
#         # Get RoBERTa outputs
#         outputs = self.roberta(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#         )
#         hidden_states = outputs.last_hidden_state  # (batch_size, seq_length, hidden_size)
        
#         if difficulties is None:
#             raise ValueError("Difficulties tensor must be provided.")
#         # Pass through transformer layers with MoE
#         for layer in self.roberta.encoder.layer:
#             # Apply MoE layer (intermediate)
#             # print(difficulties)
#             moe_output, routing_info = layer.intermediate(hidden_states[:, 0, :], difficulties)  # Assuming classification uses [CLS] token

#             # Continue with the output dense layer and other components
#             # Original RoBERTa uses: intermediate -> output.dense -> output.dropout -> output.LayerNorm + residual
#             intermediate_output = moe_output  # (batch_size, hidden_dim)
#             layer_output = layer.output.dense(intermediate_output)  # (batch_size, hidden_size)
#             layer_output = layer.output.dropout(layer_output)
#             layer_output = layer.output.LayerNorm(layer_output + outputs.last_hidden_state[:, 0, :])  # Residual connection
#             hidden_states[:, 0, :] = layer_output  # Update [CLS] token

#         # Pooling (use [CLS] token representation)
#         pooled_output = hidden_states[:, 0, :]  # (batch_size, hidden_size)

#         # Classification
#         logits = self.classifier(pooled_output)  # (batch_size, num_labels)

#         loss = None
#         if labels is not None:
#             loss_fct = nn.CrossEntropyLoss()
#             loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

#         return (loss, logits) if loss is not None else (logits, routing_info)


In [42]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset
import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, difficulties, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.difficulties = difficulties
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        assert len(self.texts) == len(self.labels) == len(self.difficulties), "Dataset length mismatch!"

    def __len__(self):
        return len(self.texts)  # This should return the size of your dataset

    def __getitem__(self, idx):
        if idx >= len(self.texts):
            raise IndexError(f"Index {idx} out of bounds for dataset size {len(self.texts)}")
        
        text = self.texts[idx]
        label = self.labels[idx]
        difficulty = self.difficulties[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
            'difficulties': torch.tensor(difficulty, dtype=torch.long)
        }


In [43]:
# Example data
texts = [
    "I love this product!",
    "This product was terrible.",
    "It's okay, nothing special.",
    "Amazing service, highly recommend.",
    "Not my cup of tea.",
    "The experience was ambiguous and unclear.",
    "The functionality is hard to use.",
    "Could be better.",
    "Outstanding performance!",
    "Mediocre at best."
]
labels = [1, 0, 0, 1, 0, 1, 0, 0, 1, 0]  # 1: Positive, 0: Negative
difficulties = [0, 0, 0, 0, 0, 1, 2, 1, 0, 1]  # 0: easy, 1: ambiguous, 2: hard


In [21]:
labels

[1, 0, 0, 1, 0, 1, 0, 0, 1, 0]

In [125]:
len(texts), len(labels), len(difficulties)

(10, 10, 10)

In [44]:
from torch.utils.data import DataLoader
import torch.optim as optim
from collections import defaultdict
import matplotlib.pyplot as plt

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create Dataset and DataLoader
dataset = CustomDataset(texts, labels, difficulties, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Initialize Model
model = RobertaWithMoE(
    model_name='roberta-base',
    num_experts=4,
    hidden_dim=3072,
    num_difficulties=3,
    num_labels=2
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Define Optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Initialize Expert Usage Counters
expert_usage = defaultdict(lambda: defaultdict(int))  # expert_usage[difficulty][expert] = count


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [110]:
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[    0, 14944,  8190,   819,   328,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  

In [127]:
for batch in dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    difficulties = batch['difficulties']

In [128]:
len(input_ids),len(attention_mask),len(labels),len(difficulties),

(2, 2, 2, 2)

In [14]:
model

RobertaWithMoE(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [45]:

# Training Loop
epochs = 3
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        difficulties = batch['difficulties'].to(device)

        print(input_ids, attention_mask, labels, difficulties)
        optimizer.zero_grad()
        (loss, logits), routing_info = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            difficulties=difficulties
        )
        loss.backward()
        optimizer.step()

        # Collect routing info
        # routing_info shape: (batch_size, topk)
        routing_info = routing_info.cpu().numpy()
        difficulties_np = difficulties.cpu().numpy()

        for d, experts in zip(difficulties_np, routing_info):
            for e in experts:
                expert_usage[d][e] += 1

        print(f"Loss: {loss.item()}")

    # Print expert usage after each epoch
    print(f"Expert usage after epoch {epoch + 1}:")
    for d in range(3):
        print(f"  Difficulty {d}:")
        for e in range(model.roberta.encoder.layer[0].intermediate.num_experts):
            print(f"    Expert {e}: {expert_usage[d][e]}")
    print()


Epoch 1/3
tensor([[    0,  7199,   127,  4946,     9,  6845,     4,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not NoneType

In [17]:
dataset

<__main__.CustomDataset at 0x7fadb3d680a0>

In [15]:
temp_model=AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
temp_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [12]:
model

RobertaWithMoE(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye