<a href="https://colab.research.google.com/github/hassanSattariNia/FederatedLearning/blob/main/working_backpropagation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers datasets



In [2]:
# !pip uninstall torch -y
!pip install torch==2.4.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121


In [3]:
from transformers import AlbertModel, AlbertTokenizer
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification


In [4]:
# Load the GLUE MRPC dataset
dataset = load_dataset("glue", "mrpc")

# Load the tokenizer and model
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import AlbertModel
import torch.nn as nn

# return trainable parameter of model
def count_parameters(module):
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

def split_model_comprehensive(model, num_clients=4):
    # Comprehensive list of all modules in the ALBERT-v2 model
    modules = [
        ('embeddings.word_embeddings', model.embeddings.word_embeddings),
        ('embeddings.position_embeddings', model.embeddings.position_embeddings),
        ('embeddings.token_type_embeddings', model.embeddings.token_type_embeddings),
        ('embeddings.LayerNorm', model.embeddings.LayerNorm),
        ('encoder.embedding_hidden_mapping_in', model.encoder.embedding_hidden_mapping_in),
        ('encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm',
         model.encoder.albert_layer_groups[0].albert_layers[0].full_layer_layer_norm),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.query',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.query),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.key',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.key),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.value',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.value),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.dense',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.dense),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.LayerNorm),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.attention_dropout),
        ('encoder.albert_layer_groups.0.albert_layers.0.attention.output_dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].attention.output_dropout),
        ('encoder.albert_layer_groups.0.albert_layers.0.ffn',
         model.encoder.albert_layer_groups[0].albert_layers[0].ffn),
        ('encoder.albert_layer_groups.0.albert_layers.0.ffn_output',
         model.encoder.albert_layer_groups[0].albert_layers[0].ffn_output),
        ('encoder.albert_layer_groups.0.albert_layers.0.activation',
         model.encoder.albert_layer_groups[0].albert_layers[0].activation),
        ('encoder.albert_layer_groups.0.albert_layers.0.dropout',
         model.encoder.albert_layer_groups[0].albert_layers[0].dropout),
        ('pooler', model.pooler),
        ('pooler_activation', model.pooler_activation)
    ]

    # Calculate total parameters
    total_params = sum(count_parameters(module) for _, module in modules)
    print(f'total params of list modules is ${total_params}')
    target_params_per_client = total_params // num_clients
    print(f'expected params of one client ${target_params_per_client}')

    client_modules = [[] for _ in range(num_clients)]
    current_client = 0
    current_client_params = 0

    for name, module in modules:
        client_modules[current_client].append((name, module))
        module_params = count_parameters(module)
        current_client_params += module_params
        print(f"name :${name} , current client:${current_client} , module parameter :${module_params} , currentParameterClient:${current_client_params},t:{current_client_params + module_params} ")
        # Check if adding this module exceeds the target per client and we haven't reached the last client
        if current_client_params > target_params_per_client  and current_client < num_clients - 1:
            current_client += 1
            current_client_params = 0

        # Assign the module to the current client

    return client_modules

# Load ALBERT model
model = AlbertModel.from_pretrained("albert-base-v2")

# Split the model between 4 clients
client_models = split_model_comprehensive(model, num_clients=4)

total params of list modules is $11683584
expected params of one client $2920896
name :$embeddings.word_embeddings , current client:$0 , module parameter :$3840000 , currentParameterClient:$3840000,t:7680000 
name :$embeddings.position_embeddings , current client:$1 , module parameter :$65536 , currentParameterClient:$65536,t:131072 
name :$embeddings.token_type_embeddings , current client:$1 , module parameter :$256 , currentParameterClient:$65792,t:66048 
name :$embeddings.LayerNorm , current client:$1 , module parameter :$256 , currentParameterClient:$66048,t:66304 
name :$encoder.embedding_hidden_mapping_in , current client:$1 , module parameter :$99072 , currentParameterClient:$165120,t:264192 
name :$encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm , current client:$1 , module parameter :$1536 , currentParameterClient:$166656,t:168192 
name :$encoder.albert_layer_groups.0.albert_layers.0.attention.query , current client:$1 , module parameter :$590592 , currentP

In [6]:
client_models[0]

[('embeddings.word_embeddings', Embedding(30000, 128, padding_idx=0))]

In [7]:
client_models[1]

[('embeddings.position_embeddings', Embedding(512, 128)),
 ('embeddings.token_type_embeddings', Embedding(2, 128)),
 ('embeddings.LayerNorm',
  LayerNorm((128,), eps=1e-12, elementwise_affine=True)),
 ('encoder.embedding_hidden_mapping_in',
  Linear(in_features=128, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm',
  LayerNorm((768,), eps=1e-12, elementwise_affine=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.query',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.key',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.value',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.dense',
  Linear(in_features=768, out_features=768, bias=True)),
 ('encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm',
  Laye

In [8]:
# Display splitting information
for i, parts in enumerate(client_models):
    print(f"Client {i+1}:")
    client_total_params = 0
    for name, module in parts:
        num_params = count_parameters(module)
        client_total_params += num_params
        print(f"  - {name}: {num_params:,} parameters")
    print(f"  Total client parameters: {client_total_params:,}")
    print()

# Calculate total parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total model parameters: {total_params:,}")

Client 1:
  - embeddings.word_embeddings: 3,840,000 parameters
  Total client parameters: 3,840,000

Client 2:
  - embeddings.position_embeddings: 65,536 parameters
  - embeddings.token_type_embeddings: 256 parameters
  - embeddings.LayerNorm: 256 parameters
  - encoder.embedding_hidden_mapping_in: 99,072 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm: 1,536 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.query: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.key: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.value: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.dense: 590,592 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm: 1,536 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.dropout: 0 parameters
  - encoder.albert_layer_groups.0.albert_layers.0.attention.output_dropout: 0 pa

In [9]:
# Load the GLUE MRPC dataset
dataset = load_dataset("glue", "mrpc")

# Load the tokenizer and model
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset with padding and truncation
def preprocess_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length', max_length=128)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
print(dataset['train'][0])
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [11]:
print(tokenized_datasets['train'][0])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0, 'input_ids': [2, 589, 661, 2553, 4125, 33, 655, 13, 15, 1368, 24, 227, 13, 7, 14, 6165, 13, 7, 13, 15, 16, 10155, 1460, 2153, 1203, 33, 1445, 13, 9, 3, 7378, 20, 61, 28, 104, 13, 7, 14, 6165, 13, 7, 13, 15, 589, 661, 2553, 4125, 33, 655, 16, 10155, 1460, 2153, 1203, 33, 1445, 13, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2'])

# Create DataLoader with batch size of 16
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']

# Custom collate function to ensure correct batching
# convert individual data to batch of data
def collate_fn(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    token_type_ids = torch.tensor([item['token_type_ids'] for item in batch])
    labels = torch.tensor([item['label'] for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'labels': labels
    }

In [13]:
# Device configuration (for multiple devices)
device1 = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device2 = torch.device("cuda:1") if torch.cuda.device_count() > 1 else device1
device3 = torch.device("cuda:2") if torch.cuda.device_count() > 2 else device1

print(device1 , device2 , device3)

cuda:0 cuda:0 cuda:0


In [15]:
from datasets import load_dataset
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score  # Add this line

In [19]:
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn import CrossEntropyLoss
import torch
from sklearn.metrics import accuracy_score

# DataLoader initialization
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Model split into "clients" (no pooler in ALBERT)
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

# Function to process data through client 1 (embeddings)
def forward_client_1(batch):
    # Move input data to cuda
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    token_type_ids = batch['token_type_ids'].to('cuda')

    # Get the embedding output
    embedding_output = model.albert.embeddings(
        input_ids=input_ids,
        token_type_ids=token_type_ids
    )

    # Store embedding output in a helper variable
    helper_embedding_output = embedding_output
    return helper_embedding_output, attention_mask

# Function to process data through client 2 (encoder)
def forward_client_2(helper_embedding_output, helper_attention_mask):
    # Reshape attention_mask to (batch_size, 1, 1, sequence_length) and move to cuda
    helper_attention_mask = helper_attention_mask[:, None, None, :].to('cuda')
    helper_embedding_output = helper_embedding_output.to('cuda')

    # Continue processing from the encoder layers of client 2
    encoder_output = model.albert.encoder(
        helper_embedding_output,
        attention_mask=helper_attention_mask
    )

    # Store encoder output in another helper variable
    helper_encoder_output = encoder_output
    return helper_encoder_output

# Function to process data through client 3 (classification head)
def forward_client_3(helper_encoder_output, labels):
    # Extract the [CLS] token for classification and move to cuda
    cls_output = helper_encoder_output.last_hidden_state[:, 0, :].to('cuda')
    labels = labels.to('cuda')

    # Process through classification head
    logits = model.classifier(cls_output)

    # Calculate the loss
    loss = criterion(logits, labels)
    return logits, loss

# Function to train and track metrics
def train_model(epochs):
    list_accuracies = []
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        total_loss = 0
        all_preds = []
        all_labels = []

        print(len(train_dataloader))
        i = 0
        for batch in train_dataloader:
            i += 1
            print(f' {i}', end="")

            # Process through Client 1 (Embeddings)
            output_client_1, helper_attention_mask = forward_client_1(batch)

            # Process through Client 2 (Encoder)
            output_client_2 = forward_client_2(output_client_1, helper_attention_mask)

            # Process through Client 3 (Classification and loss calculation)
            logits, loss = forward_client_3(output_client_2, batch['labels'])

            # Move logits and labels back to CPU for metric calculation
            all_preds.append(logits.detach().cpu())
            all_labels.append(batch['labels'].cpu())

            # Accumulate the loss
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Calculate metrics at the end of each epoch
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        epoch_accuracy = calculate_accuracy(all_preds, all_labels)
        average_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs}")
        list_accuracies.append(epoch_accuracy)
        print(f"Loss: {average_loss:.4f} | Accuracy: {epoch_accuracy:.4f}")

# Train for specified number of epochs
train_model(epochs=10)


230
 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230Epoch 1/10
Loss: 0.6315 | Accuracy: 0.6655
230
 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 4

KeyboardInterrupt: 

In [26]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn import CrossEntropyLoss, Dropout  # Import Dropout
from sklearn.metrics import accuracy_score

# DataLoader initialization
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Define optimizer, loss function, and a standalone dropout layer
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()
dropout_layer = Dropout(p=0.1)  # Define a dropout layer with a dropout rate of 0.1

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

# Function to process data through client 1 (embeddings)
def forward_client_1(batch):
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    token_type_ids = batch['token_type_ids'].to('cuda')

    embedding_output = model.albert.embeddings(
        input_ids=input_ids,
        token_type_ids=token_type_ids
    )
    return embedding_output, attention_mask

# Function to process data through client 2 (encoder)
def forward_client_2(helper_embedding_output, helper_attention_mask):
    helper_attention_mask = helper_attention_mask[:, None, None, :].to('cuda')
    helper_embedding_output = helper_embedding_output.to('cuda')

    encoder_output = model.albert.encoder(
        helper_embedding_output,
        attention_mask=helper_attention_mask
    )
    return encoder_output

# Function to process data through client 3 (classification head with dropout)
def forward_client_3(helper_encoder_output, labels):
    # Apply the standalone dropout layer after the encoder
    cls_output = dropout_layer(helper_encoder_output.last_hidden_state[:, 0, :].to('cuda'))

    # Process through classification head
    logits = model.classifier(cls_output)
    loss = criterion(logits, labels.to('cuda'))
    return logits, loss

# Function to train and track metrics
def train_model(epochs):
    list_accuracies = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_dataloader:
            output_client_1, helper_attention_mask = forward_client_1(batch)
            output_client_2 = forward_client_2(output_client_1, helper_attention_mask)
            logits, loss = forward_client_3(output_client_2, batch['labels'])

            all_preds.append(logits.detach().cpu())
            all_labels.append(batch['labels'].cpu())
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Calculate metrics at the end of each epoch
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        epoch_accuracy = calculate_accuracy(all_preds, all_labels)
        average_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs} | Loss: {average_loss:.4f} | Accuracy: {epoch_accuracy:.4f}")
        list_accuracies.append(epoch_accuracy)

# Train model
train_model(epochs=10)


Epoch 1/10 | Loss: 0.6402 | Accuracy: 0.6742
Epoch 2/10 | Loss: 0.6320 | Accuracy: 0.6745
Epoch 3/10 | Loss: 0.6347 | Accuracy: 0.6726
Epoch 4/10 | Loss: 0.6292 | Accuracy: 0.6668
Epoch 5/10 | Loss: 0.6360 | Accuracy: 0.6745
Epoch 6/10 | Loss: 0.6265 | Accuracy: 0.6707
Epoch 7/10 | Loss: 0.5913 | Accuracy: 0.6835
Epoch 8/10 | Loss: 0.5188 | Accuracy: 0.7574
Epoch 9/10 | Loss: 0.4965 | Accuracy: 0.7756
Epoch 10/10 | Loss: 0.4568 | Accuracy: 0.8111


## working Handle custom backpropagation

In [24]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score

# DataLoader initialization
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

# Function to simulate parameter sending/receiving
def send_to_device(data, device):
    return data.to(device)

# Forward functions for each client with parameter transfer simulation
def forward_client_1(batch):
    input_ids = send_to_device(batch['input_ids'], 'cuda')
    attention_mask = send_to_device(batch['attention_mask'], 'cuda')
    token_type_ids = send_to_device(batch['token_type_ids'], 'cuda')

    embedding_output = model.albert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
    return embedding_output, attention_mask

def forward_client_2(helper_embedding_output, helper_attention_mask):
    helper_attention_mask = helper_attention_mask[:, None, None, :].to('cuda')
    encoder_output = model.albert.encoder(helper_embedding_output, attention_mask=helper_attention_mask)
    return encoder_output

def forward_client_3(helper_encoder_output, labels):
    cls_output = helper_encoder_output.last_hidden_state[:, 0, :].to('cuda')
    logits = model.classifier(cls_output)
    loss = criterion(logits, labels.to('cuda'))
    return logits, loss

# Custom backward propagation
def custom_backward(output_client_3, output_client_2, output_client_1):
    logits, loss = output_client_3  # Unpack the logits and loss from output_client_3

    # Convert loss to a scalar by taking the mean
    scalar_loss = loss.mean()  # Ensures we get a scalar output for backprop
    grad_logits = torch.autograd.grad(outputs=scalar_loss, inputs=logits, retain_graph=True, allow_unused=True)[0]

    # Backpropagate through client 3
    cls_output = output_client_2.last_hidden_state[:, 0, :]  # Extract CLS token
    grad_cls_output = torch.autograd.grad(outputs=logits, inputs=cls_output, grad_outputs=grad_logits, retain_graph=True, allow_unused=True)[0]

    # Check if `grad_cls_output` exists and backpropagate through client 2 if valid
    if grad_cls_output is not None:
        grad_encoder_output = torch.autograd.grad(outputs=cls_output, inputs=output_client_1, grad_outputs=grad_cls_output, retain_graph=True, allow_unused=True)[0]

        # If `grad_encoder_output` is valid, continue backpropagation through client 1
        if grad_encoder_output is not None:
            torch.autograd.backward(output_client_1, grad_encoder_output)

# Training loop with custom backward
def train_model(epochs):
    list_accuracies = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_dataloader:
            # Forward pass
            output_client_1, helper_attention_mask = forward_client_1(batch)
            output_client_2 = forward_client_2(output_client_1, helper_attention_mask)
            logits, loss = forward_client_3(output_client_2, batch['labels'])

            # Store predictions and labels
            all_preds.append(logits.detach().cpu())
            all_labels.append(batch['labels'].cpu())
            total_loss += loss.item()

            # Custom backward pass
            custom_backward((logits, loss), output_client_2, output_client_1)

            # Optimizer step
            optimizer.step()
            optimizer.zero_grad()

        # Calculate metrics
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        epoch_accuracy = calculate_accuracy(all_preds, all_labels)
        average_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs} | Loss: {average_loss:.4f} | Accuracy: {epoch_accuracy:.4f}")
        list_accuracies.append(epoch_accuracy)

# Train model
train_model(epochs=10)


Epoch 1/10 | Loss: 0.6332 | Accuracy: 0.6745
Epoch 2/10 | Loss: 0.6327 | Accuracy: 0.6745
Epoch 3/10 | Loss: 0.6332 | Accuracy: 0.6745
Epoch 4/10 | Loss: 0.6332 | Accuracy: 0.6745
Epoch 5/10 | Loss: 0.6341 | Accuracy: 0.6745
Epoch 6/10 | Loss: 0.6332 | Accuracy: 0.6745
Epoch 7/10 | Loss: 0.6327 | Accuracy: 0.6745
Epoch 8/10 | Loss: 0.6327 | Accuracy: 0.6745


KeyboardInterrupt: 

## drop out + custom backpropagation

In [27]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn import CrossEntropyLoss, Dropout
from sklearn.metrics import accuracy_score

# DataLoader initialization
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()
dropout_layer = Dropout(p=0.1)  # Define a dropout layer with a dropout rate of 0.1

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

# Function to simulate parameter sending/receiving
def send_to_device(data, device):
    return data.to(device)

# Forward functions for each client with parameter transfer simulation
def forward_client_1(batch):
    input_ids = send_to_device(batch['input_ids'], 'cuda')
    attention_mask = send_to_device(batch['attention_mask'], 'cuda')
    token_type_ids = send_to_device(batch['token_type_ids'], 'cuda')

    embedding_output = model.albert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
    return embedding_output, attention_mask

def forward_client_2(helper_embedding_output, helper_attention_mask):
    helper_attention_mask = helper_attention_mask[:, None, None, :].to('cuda')
    encoder_output = model.albert.encoder(helper_embedding_output, attention_mask=helper_attention_mask)
    return encoder_output

def forward_client_3(helper_encoder_output, labels):
    # Apply the standalone dropout layer after the encoder
    cls_output = dropout_layer(helper_encoder_output.last_hidden_state[:, 0, :].to('cuda'))

    # Process through classification head
    logits = model.classifier(cls_output)
    loss = criterion(logits, labels.to('cuda'))
    return logits, loss

# Custom backward propagation
def custom_backward(output_client_3, output_client_2, output_client_1):
    logits, loss = output_client_3  # Unpack the logits and loss from output_client_3

    # Convert loss to a scalar by taking the mean
    scalar_loss = loss.mean()  # Ensures we get a scalar output for backprop
    grad_logits = torch.autograd.grad(outputs=scalar_loss, inputs=logits, retain_graph=True, allow_unused=True)[0]

    # Backpropagate through client 3
    cls_output = output_client_2.last_hidden_state[:, 0, :]  # Extract CLS token
    grad_cls_output = torch.autograd.grad(outputs=logits, inputs=cls_output, grad_outputs=grad_logits, retain_graph=True, allow_unused=True)[0]

    # Check if `grad_cls_output` exists and backpropagate through client 2 if valid
    if grad_cls_output is not None:
        grad_encoder_output = torch.autograd.grad(outputs=cls_output, inputs=output_client_1, grad_outputs=grad_cls_output, retain_graph=True, allow_unused=True)[0]

        # If `grad_encoder_output` is valid, continue backpropagation through client 1
        if grad_encoder_output is not None:
            torch.autograd.backward(output_client_1, grad_encoder_output)

# Training loop with custom backward
def train_model(epochs):
    list_accuracies = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_dataloader:
            # Forward pass
            output_client_1, helper_attention_mask = forward_client_1(batch)
            output_client_2 = forward_client_2(output_client_1, helper_attention_mask)
            logits, loss = forward_client_3(output_client_2, batch['labels'])

            # Store predictions and labels
            all_preds.append(logits.detach().cpu())
            all_labels.append(batch['labels'].cpu())
            total_loss += loss.item()

            # Custom backward pass
            custom_backward((logits, loss), output_client_2, output_client_1)

            # Optimizer step
            optimizer.step()
            optimizer.zero_grad()

        # Calculate metrics
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        epoch_accuracy = calculate_accuracy(all_preds, all_labels)
        average_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs} | Loss: {average_loss:.4f} | Accuracy: {epoch_accuracy:.4f}")
        list_accuracies.append(epoch_accuracy)

# Train model
train_model(epochs=10)


Epoch 1/10 | Loss: 0.4117 | Accuracy: 0.8064
Epoch 2/10 | Loss: 0.4116 | Accuracy: 0.8040
Epoch 3/10 | Loss: 0.4112 | Accuracy: 0.8078
Epoch 4/10 | Loss: 0.4115 | Accuracy: 0.8075
Epoch 5/10 | Loss: 0.4126 | Accuracy: 0.8053
Epoch 6/10 | Loss: 0.4131 | Accuracy: 0.8045
Epoch 7/10 | Loss: 0.4118 | Accuracy: 0.8056


KeyboardInterrupt: 

In [35]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from torch.nn import CrossEntropyLoss, Dropout
from sklearn.metrics import accuracy_score

# DataLoader initialization
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()
dropout_layer = Dropout(p=0.1)  # Define a standalone dropout layer

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    preds = torch.argmax(preds, dim=1)
    return accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

# Function to simulate parameter sending/receiving
def send_to_device(data, device):
    return data.to(device)

# Forward function for client 1 (embeddings)
def forward_client_1(batch):
    input_ids = send_to_device(batch['input_ids'], 'cuda')
    attention_mask = send_to_device(batch['attention_mask'], 'cuda')
    token_type_ids = send_to_device(batch['token_type_ids'], 'cuda')

    embedding_output = model.albert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
    return embedding_output, attention_mask

# Forward functions for divided Encoder clients
def forward_client_encoder_part(helper_output, attention_mask, start_group, end_group, head_mask=None):
    # Check if `albert_layer_groups` exists
    if not hasattr(model.albert.encoder, 'albert_layer_groups'):
        print("Error: 'albert_layer_groups' not found in the model. Check model structure.")
        return None

    # Initialize head_mask if None
    if head_mask is None:
        head_mask = [None] * len(model.albert.encoder.albert_layer_groups)

    # Process a subset of encoder layers
    for i in range(start_group, end_group):
        layer_group = model.albert.encoder.albert_layer_groups[i]
        helper_output = layer_group(helper_output, attention_mask=attention_mask, head_mask=head_mask[i])[0]

    return helper_output

# Function to process data through classification head with dropout
def forward_client_classification(helper_encoder_output, labels):
    cls_output = dropout_layer(helper_encoder_output[:, 0, :].to('cuda'))
    logits = model.classifier(cls_output)
    loss = criterion(logits, labels.to('cuda'))
    return logits, loss

# Custom backward propagation
def custom_backward(output_client_3, outputs_encoder_parts, output_client_1):
    logits, loss = output_client_3
    scalar_loss = loss.mean()  # Ensure scalar for backprop
    grad_logits = torch.autograd.grad(outputs=scalar_loss, inputs=logits, retain_graph=True, allow_unused=True)[0]

    # Backpropagate through classification layer
    cls_output = outputs_encoder_parts[-1][:, 0, :]
    grad_cls_output = torch.autograd.grad(outputs=logits, inputs=cls_output, grad_outputs=grad_logits, retain_graph=True, allow_unused=True)[0]

    # Backpropagate through encoder parts in reverse order
    grad_output = grad_cls_output
    for i in reversed(range(len(outputs_encoder_parts) - 1)):
        grad_output = torch.autograd.grad(outputs=outputs_encoder_parts[i + 1], inputs=outputs_encoder_parts[i], grad_outputs=grad_output, retain_graph=True, allow_unused=True)[0]

    # Backpropagate to embeddings (client 1)
    torch.autograd.backward(output_client_1, grad_output)

# Training loop with custom backward
def train_model(epochs):
    list_accuracies = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []

        for batch in train_dataloader:
            # Forward pass through Client 1 (Embeddings)
            output_client_1, helper_attention_mask = forward_client_1(batch)

            # Forward pass through divided Encoder clients
            output_encoder_part_1 = forward_client_encoder_part(output_client_1, helper_attention_mask, 0, 1)
            output_encoder_part_2 = forward_client_encoder_part(output_encoder_part_1, helper_attention_mask, 1, 2)
            output_encoder_part_3 = forward_client_encoder_part(output_encoder_part_2, helper_attention_mask, 2, 3)
            output_encoder_part_4 = forward_client_encoder_part(output_encoder_part_3, helper_attention_mask, 3, 4)
            outputs_encoder_parts = [output_encoder_part_1, output_encoder_part_2, output_encoder_part_3, output_encoder_part_4]

            # Forward pass through classification head
            logits, loss = forward_client_classification(output_encoder_part_4, batch['labels'])

            # Store predictions and labels
            all_preds.append(logits.detach().cpu())
            all_labels.append(batch['labels'].cpu())
            total_loss += loss.item()

            # Custom backward pass
            custom_backward((logits, loss), outputs_encoder_parts, output_client_1)

            # Optimizer step
            optimizer.step()
            optimizer.zero_grad()

        # Calculate metrics
        all_preds = torch.cat(all_preds, dim=0)
        all_labels = torch.cat(all_labels, dim=0)
        epoch_accuracy = calculate_accuracy(all_preds, all_labels)
        average_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs} | Loss: {average_loss:.4f} | Accuracy: {epoch_accuracy:.4f}")
        list_accuracies.append(epoch_accuracy)

# Train model
train_model(epochs=10)


TypeError: 'NoneType' object is not subscriptable