## Test Code

In [1]:
import torch
import copy
import numpy as np
import pickle
# import evaluate

from utils.GLUE_partitaon_dataset import load_partition_glue_data
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from transformers import RobertaForSequenceClassification
from transformers import DataCollatorWithPadding
from peft import LoraConfig, get_peft_model


In [3]:
client_num = 10
alpha = 1.0
data = "mrpc"
client_datasets, tokenized_datasets = load_partition_glue_data(data, client_num, alpha)

client 0 
 train(330) | 1 : 296, 0 : 34 |, valid(271) | 1 : 237, 0 : 34 |, test(313) | 1 : 279, 0 : 34 |
client 1 
 train(330) | 1 : 184, 0 : 146 |, valid(314) | 1 : 184, 0 : 130 |, test(313) | 1 : 184, 0 : 129 |
client 2 
 train(330) | 1 : 50, 0 : 280 |, valid(180) | 1 : 50, 0 : 130 |, test(179) | 1 : 50, 0 : 129 |
client 3 
 train(330) | 1 : 330, 0 : 0 |, valid(237) | 1 : 237, 0 : 0 |, test(279) | 1 : 279, 0 : 0 |
client 4 
 train(330) | 1 : 113, 0 : 217 |, valid(243) | 1 : 113, 0 : 130 |, test(242) | 1 : 113, 0 : 129 |
client 5 
 train(330) | 1 : 330, 0 : 0 |, valid(237) | 1 : 237, 0 : 0 |, test(279) | 1 : 279, 0 : 0 |
client 6 
 train(330) | 1 : 2, 0 : 328 |, valid(132) | 1 : 2, 0 : 130 |, test(131) | 1 : 2, 0 : 129 |
client 7 
 train(330) | 1 : 271, 0 : 59 |, valid(296) | 1 : 237, 0 : 59 |, test(330) | 1 : 271, 0 : 59 |
client 8 
 train(330) | 1 : 330, 0 : 0 |, valid(237) | 1 : 237, 0 : 0 |, test(279) | 1 : 279, 0 : 0 |
client 9 
 train(331) | 1 : 331, 0 : 0 |, valid(237) | 1 : 23

In [64]:
save_dir = "./llm_models/FL/"
# 設置 LoRA 配置
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# 訓練參數設置
training_args = TrainingArguments(
    output_dir=save_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
)



In [65]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id
peft_model = get_peft_model(model, lora_config)
client_models = [copy.deepcopy(peft_model) for idx in range(client_num)]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# 初始化 GPT-2 的 tokenizer 並設置 padding token
tokenizer = AutoTokenizer.from_pretrained("./llm_models/roberta_base")
# tokenizer = AutoTokenizer.from_pretrained("roberta-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [68]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=client_datasets[0]["train"],
    eval_dataset=client_datasets[0]["valid"],
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log
4,No log,No log
5,No log,No log
6,No log,No log
7,No log,No log
8,No log,No log
9,No log,No log
10,No log,No log


TrainOutput(global_step=110, training_loss=0.667858193137429, metrics={'train_runtime': 31.6469, 'train_samples_per_second': 104.276, 'train_steps_per_second': 3.476, 'total_flos': 132764709990264.0, 'train_loss': 0.667858193137429, 'epoch': 10.0})

In [69]:
def predict_acc(trainer, test_data):
    
    accuracy_metric = load_metric("accuracy")
    predictions = trainer.predict(test_data)
    pred = np.argmax(predictions.predictions[1], axis=-1)
    accuracy = accuracy_metric.compute(predictions=pred, references=test_data["labels"])
    return accuracy, pred

In [70]:
acc, pred = predict_acc(trainer, client_datasets[0]["test"])
print(acc)

{'accuracy': 0.8913738019169329}


## test2

In [14]:
from datasets import load_dataset, DatasetDict
import numpy as np

# 加载数据集
dataset = load_dataset("glue", "mrpc")

# 检查数据集是否有 'id' 列
if 'id' not in dataset['train'].column_names:
    raise ValueError("Dataset does not contain an 'id' column")

# 自定义 ID
custom_ids = list(dataset['train']['id'])
np.random.shuffle(custom_ids)

ValueError: Dataset does not contain an 'id' column

In [None]:
# 将数据集分成 5 份
n_splits = 5
split_datasets = []
split_size = len(custom_ids) // n_splits

for i in range(n_splits):
    split_ids = custom_ids[i * split_size: (i + 1) * split_size]
    split_dataset = dataset['train'].filter(lambda example: example['id'] in split_ids)
    split_datasets.append(split_dataset)

# 将分割的数据集转换为 DatasetDict
split_dataset_dict = DatasetDict({
    f'train_split_{i}': split_datasets[i] for i in range(n_splits)
})

# 打印每个分割的数据集大小
for i in range(n_splits):
    print(f"train_split_{i} size: {len(split_dataset_dict[f'train_split_{i}'])}")

# 示例：训练第一个分割的数据集
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# 加载 tokenizer 和模型
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Tokenize 数据集
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding="max_length", truncation=True)

tokenized_datasets = split_dataset_dict.map(tokenize_function, batched=True)



In [None]:
# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 定义 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train_split_0'],
    eval_dataset=tokenized_datasets['train_split_1']
)

# 训练模型
trainer.train()