In [1]:
pip install torch transformers datasets



In [2]:
!pip uninstall torch -y
!pip install torch==2.4.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Found existing installation: torch 2.4.1+cu121
Uninstalling torch-2.4.1+cu121:
  Successfully uninstalled torch-2.4.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.4.1+cu121
  Using cached https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl (798.9 MB)
Installing collected packages: torch
Successfully installed torch-2.4.1+cu121


In [3]:
from transformers import AlbertModel, AlbertTokenizer
import torch

In [4]:
# load original Albert-v2
model_name = "albert-base-v2"
model = AlbertModel.from_pretrained(model_name)
tokenizer = AlbertTokenizer.from_pretrained(model_name)
print("Model Architecture:")
print(model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model Architecture:
AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, i

In [11]:
from transformers import AlbertModel
import torch.nn as nn

# return trainable parameter of model
def count_parameters(module):
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

def split_model_comprehensive(model, num_clients=4):
    # Comprehensive list of all modules in the ALBERT-v2 model
    modules = [
        ('embeddings.word_embeddings', model.embeddings.word_embeddings),
        ('embeddings.position_embeddings', model.embeddings.position_embeddings),
        ('embeddings.token_type_embeddings', model.embeddings.token_type_embeddings),
        ('embeddings.LayerNorm', model.embeddings.LayerNorm),
        ('encoder.embedding_hidden_mapping_in', model.encoder.embedding_hidden_mapping_in),
        ('encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm',
         model.encoder.albert_layer_groups[0].albert_layers[0].full_layer_layer_norm),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.query',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.query),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.key',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.key),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.value',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.value),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.dense',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.dense),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.LayerNorm),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.attention_dropout),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.output_dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.output_dropout),
        ('encoder.albert_layer_groups.0.albert_layers.0.ffn',
         model.encoder.albert_layer_groups[0].albert_layers[0].ffn),
        ('encoder.albert_layer_groups.0.albert_layers.0.ffn_output',
         model.encoder.albert_layer_groups[0].albert_layers[0].ffn_output),
        ('encoder.albert_layer_groups.0.albert_layers.0.activation',
         model.encoder.albert_layer_groups[0].albert_layers[0].activation),
        ('encoder.albert_layer_groups.0.albert_layers.0.dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].dropout),
        ('pooler', model.pooler),
        ('pooler_activation', model.pooler_activation)
    ]

    # Calculate total parameters
    total_params = sum(count_parameters(module) for _, module in modules)
    print(f'total params of list modules is ${total_params}')
    target_params_per_client = total_params // num_clients
    print(f'expected params of one client ${target_params_per_client}')

    client_modules = [[] for _ in range(num_clients)]
    current_client = 0
    current_client_params = 0

    for name, module in modules:
        client_modules[current_client].append((name, module))
        module_params = count_parameters(module)
        current_client_params += module_params
        print(f"name :${name} , current client:${current_client} , module parameter :${module_params} , currentParameterClient:${current_client_params},t:{current_client_params + module_params} ")
        # Check if adding this module exceeds the target per client and we haven't reached the last client
        if current_client_params > target_params_per_client  and current_client < num_clients - 1:
            current_client += 1
            current_client_params = 0

        # Assign the module to the current client

    return client_modules

# Load ALBERT model
model = AlbertModel.from_pretrained("albert-base-v2")

# Split the model between 4 clients
client_models = split_model_comprehensive(model, num_clients=4)




total params of list modules is $11683584
expected params of one client $2920896
name :$embeddings.word_embeddings , current client:$0 , module parameter :$3840000 , currentParameterClient:$3840000,t:7680000 
name :$embeddings.position_embeddings , current client:$1 , module parameter :$65536 , currentParameterClient:$65536,t:131072 
name :$embeddings.token_type_embeddings , current client:$1 , module parameter :$256 , currentParameterClient:$65792,t:66048 
name :$embeddings.LayerNorm , current client:$1 , module parameter :$256 , currentParameterClient:$66048,t:66304 
name :$encoder.embedding_hidden_mapping_in , current client:$1 , module parameter :$99072 , currentParameterClient:$165120,t:264192 
name :$encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm , current client:$1 , module parameter :$1536 , currentParameterClient:$166656,t:168192 
name :$encoder.albert_layer_groups.0.albert_layers.0.attention.query , current client:$1 , module parameter :$590592 , currentP

In [6]:
client_models[1]

[('embeddings.position_embeddings', Embedding(512, 128)),
 ('embeddings.token_type_embeddings', Embedding(2, 128)),
 ('embeddings.LayerNorm',
  LayerNorm((128,), eps=1e-12, elementwise_affine=True)),
 ('encoder.embedding_hidden_mapping_in',
  Linear(in_features=128, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm',
  LayerNorm((768,), eps=1e-12, elementwise_affine=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.query',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.key',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.value',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.dense',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm',
  Laye

In [7]:
# Display splitting information
for i, parts in enumerate(client_models):
    print(f"Client {i+1}:")
    client_total_params = 0
    for name, module in parts:
        num_params = count_parameters(module)
        client_total_params += num_params
        print(f"  - {name}: {num_params:,} parameters")
    print(f"  Total client parameters: {client_total_params:,}")
    print()

# Calculate total parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total model parameters: {total_params:,}")

Client 1:
  - embeddings.word_embeddings: 3,840,000 parameters
  Total client parameters: 3,840,000

Client 2:
  - embeddings.position_embeddings: 65,536 parameters
  - embeddings.token_type_embeddings: 256 parameters
  - embeddings.LayerNorm: 256 parameters
  - encoder.embedding_hidden_mapping_in: 99,072 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm: 1,536 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.query: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.key: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.value: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.dense: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm: 1,536 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.dropout: 0 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.output_dropout: 0 pa

In [8]:
print(client_models[1])

[('embeddings.position_embeddings', Embedding(512, 128)), ('embeddings.token_type_embeddings', Embedding(2, 128)), ('embeddings.LayerNorm', LayerNorm((128,), eps=1e-12, elementwise_affine=True)), ('encoder.embedding_hidden_mapping_in', Linear(in_features=128, out_features=768, bias=True)), ('encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm', LayerNorm((768,), eps=1e-12, elementwise_affine=True)), ('encoder.albert_layer_groups.0.albert_layers.0.attention.query', Linear(in_features=768, out_features=768, bias=True)), ('encoder.albert_layer_groups.0.albert_layers.0.attention.key', Linear(in_features=768, out_features=768, bias=True)), ('encoder.albert_layer_groups.0.albert_layers.0.attention.value', Linear(in_features=768, out_features=768, bias=True)), ('encoder.albert_layer_groups.0.albert_layers.0.attention.dense', Linear(in_features=768, out_features=768, bias=True)), ('encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm', LayerNorm((768,), eps=1e-12, 

In [9]:
from datasets import load_dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch

# Load the GLUE MRPC dataset
dataset = load_dataset("glue", "mrpc")

# Load the tokenizer and model
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset with padding and truncation
def preprocess_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Use the tokenized datasets for training and evaluation
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import torch
from torch.utils.data import DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import load_dataset




Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        ...,

        [[-0.5872, -0.6527,  1.4885,  ...,  1.3733,  2.1835, -4.2784],
         [-0.5869, -0.6527,  1.4873,  ...,  1.3742,  2.1839, -4.2787],
         [-0.5870, -0.6527,  1.4876,  ...,  1.3741,  2.1837, -4.2785],
         ...,
         [-0.5869, -0.6526,  1.4878,  ...,  1.3741,  2.1835, -4.2785],
         [-0.5869, -0.6526,  1.4879,  ...,  1.3741,  2.1835, -4.2785],
         [-0.5868, -0.6525,  1.4880,  ...,  1.3741,  2.1835, -4.2785]],

        [[-0.5848, -0.6502,  1.4870,  ...,  1.3697,  2.1805, -4.2784],
         [-0.5844, -0.6501,  1.4856,  ...,  1.3709,  2.1808, -4.2788],
         [-0.5846, -0.6500,  1.4859,  ...,  1.3706,  2.1808, -4.2786],
         ...,
         [-0.5844, -0.6500,  1.4862,  ...,  1.3706,  2.1805, -4.2785],
         [-0.5844, -0.6500,  1.4863,  ...,  1.3706,  2.1805, -4.2785],
         [-0.5844, -0.6500,  1.4864,  ...,  1.3707,  2.1805, -4.2786]],

        [[-0.5891, -0.6533,  1.4885, 

In [15]:
import torch
from torch.utils.data import DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import load_dataset
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Load the GLUE MRPC dataset
dataset = load_dataset("glue", "mrpc")

# Load the tokenizer and model
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset with padding and truncation
def preprocess_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2'])

# Create DataLoader with batch size of 16
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']

# Custom collate function to ensure correct batching
def collate_fn(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    token_type_ids = torch.tensor([item['token_type_ids'] for item in batch])
    labels = torch.tensor([item['label'] for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'labels': labels
    }

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Device configuration (for multiple devices)
device1 = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device2 = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device1
device3 = torch.device("cuda:2") if torch.cuda.device_count() > 2 else device1

# Split the model into clients
model.albert.embeddings.to(device1)
model.albert.encoder.to(device2)
model.classifier.to(device3)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()

# Function to process data through client 1
def forward_client_1(batch):
    input_ids = batch['input_ids'].to(device1)
    attention_mask = batch['attention_mask'].to(device1)
    token_type_ids = batch['token_type_ids'].to(device1)

    # Get the embedding output
    embedding_output = model.albert.embeddings(
        input_ids=input_ids,
        token_type_ids=token_type_ids
    )

    print(f"Client 1 - Output Shape: {embedding_output.shape}, Device: {embedding_output.device}")
    return embedding_output

# Function to process data through client 2
def forward_client_2(embedding_output, attention_mask):
    # Move embedding_output to device2
    embedding_output = embedding_output.to(device2)
    attention_mask = attention_mask.to(device2)

    # Reshape attention_mask to (batch_size, 1, 1, sequence_length)
    attention_mask = attention_mask[:, None, None, :]

    # Continue processing from the encoder layers of client 2
    encoder_output = model.albert.encoder(
        embedding_output,
        attention_mask=attention_mask
    )

    print(f"Client 2 - Output Shape: {encoder_output.last_hidden_state.shape}, Device: {encoder_output.last_hidden_state.device}")
    return encoder_output

# Function to process data through client 3 (classification head)
def forward_client_3(encoder_output, labels):
    # Move encoder_output to device3
    encoder_output = encoder_output.last_hidden_state[:, 0, :].to(device3)  # [CLS] token for classification

    # Process through classification head
    logits = model.classifier(encoder_output)
    labels = labels.to(device3)

    print(f"Client 3 - Output Shape: {logits.shape}, Device: {logits.device}")

    # Calculate the loss
    loss = criterion(logits, labels)
    print(f"Client 3 - Loss: {loss.item()}")

    return loss

# Loop over the dataloader and process batches
model.train()  # Set model to training mode
for batch in train_dataloader:
    # Process through Client 1
    output_client_1 = forward_client_1(batch)

    # Process through Client 2
    output_client_2 = forward_client_2(output_client_1, batch['attention_mask'])

    # Process through Client 3
    loss = forward_client_3(output_client_2, batch['labels'])

    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    break  # Process one batch for verification


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Client 1 - Output Shape: torch.Size([16, 128, 128]), Device: cuda:0
Client 2 - Output Shape: torch.Size([16, 128, 768]), Device: cuda:0
Client 3 - Output Shape: torch.Size([16, 2]), Device: cuda:0
Client 3 - Loss: 0.6434246897697449
