In [1]:
import os, sys
import torch
import datasets
from transformers import (
    AutoTokenizer,
    AlbertTokenizer, FlaubertTokenizer,
    AutoModelForSequenceClassification,
    DistilBertForSequenceClassification, DistilBertTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model
from accelerate import Accelerator

# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

  from .autonotebook import tqdm as notebook_tqdm


[2024-06-03 18:01:07,432] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Local checkpoint directory for every supported model key.
# To switch models change `model_name` only; the path is looked up here so the
# name and the path can no longer drift apart.
# NOTE(review): the name -> path pairing follows the order of the alternatives
# that used to be toggled by (un)commenting lines; confirm each entry.
# NOTE(review): 'flaubert' is handled by the branches below but no checkpoint
# path for it was recorded; add one here before selecting it.
MODEL_PATHS = {
    'roberta_sentiment': '/root/Literature/twitter-roberta-base-sentiment-latest/',
    'llama3': '/root/Literature/llama3',
    'roberta_base': '/root/Literature/roberta-base',
    'camembert': '/root/Literature/camembert-base-toxic-fr-user-prompts',
    'distillbert': '/root/Literature/distillbert',
    'distill_roberta': '/root/Literature/distillroberta',
    'bertweet': '/root/Literature/bertweet',
    'bert': '/root/Literature/BERT',
    'albert': '/root/Literature/Albert',
    'xlnet': '/root/Literature/XLNet',
    'deberta': '/root/Literature/deberta',
}

model_name = 'bert'

if model_name not in MODEL_PATHS:
    raise ValueError(
        "Unknown model_name {!r}; expected one of {}".format(model_name, sorted(MODEL_PATHS))
    )
model_path = MODEL_PATHS[model_name]

# Checkpoints and the final adapter for this run are written under this directory.
model_save_path = '/root/Literature/models/{}_FT_models'.format(model_name)

# ALBERT and FlauBERT are loaded through their dedicated tokenizer classes;
# every other checkpoint goes through AutoTokenizer.
if model_name == 'albert':
    tokenizer = AlbertTokenizer.from_pretrained(model_path)
elif model_name == 'flaubert':
    tokenizer = FlaubertTokenizer.from_pretrained(model_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_path)

# Only invent a pad token when the tokenizer genuinely has none.
# Overwriting an existing pad token with EOS makes the tokenizer's pad id
# disagree with the pad id the model was built with (its config `pad_token_id`
# and the embedding `padding_idx`).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ImportError: 
DebertaV2Converter requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Load the sequence-classification model for the selected checkpoint.
#
# The previous `device_map = {'':0, '':1, ...}` literal repeated the same key,
# so Python kept only the last entry and the whole model was pinned to a single
# GPU (one that `max_memory` budgeted at 0GiB in the 4-bit branch). "auto" lets
# the loader place the model across devices within the `max_memory` budgets.
device_map = 'auto'

if model_name == 'llama3':
    print('4bit')
    # 4-bit NF4 quantisation with float16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
    )
    # Per-device memory budget; devices set to 0GiB receive no weights.
    max_memory = {0:'15GiB',1:'15GiB',2:'15GiB',3:'15GiB',4:'15GiB',5:'0GiB',6:'0GiB'}
    extra_load_kwargs = {'quantization_config': bnb_config}
else:
    print('32bit')
    max_memory = {0:'24GiB',1:'24GiB',2:'24GiB',3:'24GiB',4:'24GiB',5:'24GiB',6:'0GiB'}
    extra_load_kwargs = {}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device_map,
    max_memory=max_memory,
    num_labels=3,
    **extra_load_kwargs
)

# Checkpoints without a configured pad id need one that matches the tokenizer
# before padded batches are fed to the classification head.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id
# else:
#     print('8bit')
#     # device_map = {'':torch.cuda.current_device()}
#     # device_map = 'auto'
#     device_map = {'':0}
#     bnb_config = BitsAndBytesConfig(
#         load_in_8bit=True
#     )
#     max_memory = {0:'24GiB',1:'24GiB',2:'24GiB',3:'24GiB',4:'24GiB'}
    
#     model = AutoModelForSequenceClassification.from_pretrained(
#         model_path,
#         trust_remote_code=True,
#         quantization_config=bnb_config,
#         device_map=device_map,
#         max_memory=max_memory,
#         num_labels=3
#     ) 

32bit


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Print and return the number of trainable parameters of ``model``.

    Walks ``model.named_parameters()``, summing ``numel()`` over all parameters
    and, separately, over those with ``requires_grad`` set. Both totals and
    their ratio are printed.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: the number of trainable parameter elements.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # A model without parameters would otherwise divide by zero; report 0.0.
    ratio = trainable_model_params / all_model_params if all_model_params else 0.0
    print(f"all params num: {all_model_params}, trainable param num: {trainable_model_params}")
    print(f'ratio: {ratio}')
    return trainable_model_params

# Trainable-parameter count of the freshly loaded model, kept in `ori_p`
# (the same helper is called again after the LoRA adapter is attached).
ori_p = print_number_of_trainable_model_parameters(model)

all params num: 82120707, trainable param num: 82120707
ratio: 1.0


In [None]:
# Display the loaded model's module tree; the sub-module names shown here are
# what LoRA `target_modules` entries are matched against.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
# Gradient checkpointing is disabled for these architectures and enabled otherwise.
gradient_checkpointing = model_name not in ('albert', 'xlnet', 'flaubert')

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)

# LoRA hyper-parameters:
# - r: rank (inner dimension) of the low-rank update matrices.
# - lora_alpha: scaling factor; the update is scaled by lora_alpha / r, so a
#   higher value gives more weight to the LoRA activations.
# - target_modules: names of the sub-modules that receive adapters.
# - bias: start with "none", then try "lora_only", before trying "all".

# Attention/projection layer names differ per architecture, so the adapter
# targets are chosen by model family.
if model_name in ('roberta_sentiment', 'camembert', 'roberta_base',
                  'distill_roberta', 'bertweet', 'bert', 'albert'):
    target_modules = ["query", "key", "value", "dense"]
elif model_name in ('distillbert', 'flaubert'):
    target_modules = ["q_lin", "k_lin", "v_lin"]
elif model_name == 'xlnet':
    target_modules = ['layer_1', 'layer_2']
elif model_name == 'llama3':
    # Previously left unbound (NameError below). None makes PEFT fall back to
    # its built-in default target modules for the architecture.
    target_modules = None
else:
    # Fail with a clear message instead of a NameError at LoraConfig.
    raise ValueError(
        "No LoRA target_modules configured for model_name {!r}".format(model_name)
    )

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=target_modules,
    bias="none",
    task_type="SEQ_CLS"
)
model = get_peft_model(model, peft_config)

### compare trainable parameters
peft_p = print_number_of_trainable_model_parameters(model)

all params num: 83401734, trainable param num: 1873926
ratio: 0.02246866953629525


In [None]:
# Read the "train" split of the dataset stored in the local books directory.
dataset = datasets.load_dataset("/root/Literature/books", split="train")

In [None]:
# Token budget per example: 128 for the two models listed, 512 for all others.
max_length = 128 if model_name in ('bertweet', 'roberta_base') else 512


def tokenize_function(examples):
    """Tokenize the "context" field, padding and truncating to ``max_length`` tokens."""
    texts = examples["context"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


# Batched map: the tokenizer receives lists of contexts rather than single rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder  # no longer used in this cell; import kept so other cells relying on it keep working
import numpy as np


def _to_one_hot(split, num_classes=3):
    """Return ``split`` with its 'label' column replaced by one-hot vectors.

    A fixed ``num_classes`` keeps the vector width identical for every split,
    even when a split happens to miss a class (fitting a separate encoder per
    split could produce vectors of different widths).

    Assumes labels are integer class ids in ``range(num_classes)`` -- TODO
    confirm against the dataset; any other value raises here rather than
    being silently re-indexed.
    """
    class_ids = np.asarray(split['label'], dtype=int)
    one_hot = np.eye(num_classes)[class_ids]
    return split.remove_columns(['label']).add_column('label', one_hot.tolist())


# 90/10 train/test split, then 80/20 train/validation split of the training part.
train_test_sp = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True, seed=42)
cols = ["instruction", "category", 'context']
train_val_sp = train_test_sp['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)

# `tokenized_datasets` already carries the tokenizer outputs, so the splits only
# need the raw text columns dropped and 'response' renamed to 'label';
# tokenizing them a second time would recompute identical features.
train_data = train_val_sp["train"].shuffle(seed=42).remove_columns(cols).rename_column('response', 'label')
val_data = train_val_sp["test"].shuffle(seed=42).remove_columns(cols).rename_column('response', 'label')
# The test split keeps its raw columns; evaluation reads 'context' and 'response' from it.
test_data = train_test_sp["test"].shuffle(seed=42)

# sometimes label should be in an one-hot form
if model_name == 'distillbert':
    train_data = _to_one_hot(train_data)
    val_data = _to_one_hot(val_data)

# if model_name == 'roberta_squad':
#     train_test_sp = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True, seed=42)
#     cols = ["instruction", "category", 'context']
#     train_val_sp = train_test_sp['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
#     train_data = train_val_sp["train"].shuffle().map(tokenize_function, remove_columns=cols).rename_column('response', 'label')
#     val_data = train_val_sp["test"].shuffle().map(tokenize_function, remove_columns=cols).rename_column('response', 'label')
#     test_data = train_test_sp["test"].shuffle()
# elif model_name == 'roberta_irony':
# print('irony')
# train_test_sp = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True, seed=42)
# cols = ["instruction", "category", 'context']
# train_val_sp = train_test_sp['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)

# train_data = train_val_sp["train"].shuffle().map(tokenize_function, remove_columns=cols).rename_column('response', 'label')
# new_dataset = train_data.remove_columns(['label'])
# train_data = new_dataset.add_column('label', train_data['label'])

# val_data = train_val_sp["test"].shuffle().map(tokenize_function, remove_columns=cols).rename_column('response', 'label')
# new_dataset = val_data.remove_columns(['label'])
# val_data = new_dataset.add_column('label', val_data['label'])

# test_data = train_test_sp["test"].shuffle()



Map:  14%|█▍        | 225/1636 [00:00<00:00, 2210.49 examples/s]

Map: 100%|██████████| 1636/1636 [00:00<00:00, 1838.90 examples/s]


# Fine-tuning (FT)

In [None]:
import transformers
transformers.logging.set_verbosity_info()

# Effective batch size = per-device micro batch x gradient-accumulation steps.
batch_size = 64
micro_batch_size = 16
gradient_accumulation_steps = batch_size // micro_batch_size

# Gradient checkpointing is disabled for these architectures and enabled otherwise.
gradient_checkpointing = model_name not in ('albert', 'xlnet', 'flaubert')

args = TrainingArguments(
    output_dir=model_save_path,
    # Training length is governed by max_steps alone. A num_train_epochs value
    # used to be passed as well, but the Trainer ignores it whenever max_steps
    # is set, so it was removed to avoid implying a 20-epoch run.
    max_steps=100,
    # fp16=True,
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=False,
    logging_steps=5,
    # NOTE(review): no evaluation strategy is configured, so `eval_dataset` is
    # presumably never evaluated during training and this metric is not used
    # to pick a checkpoint -- confirm and enable evaluation if that is wanted.
    metric_for_best_model='eval_loss',
    save_steps=10,
    save_total_limit=10,
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=args,
    data_collator=DataCollatorWithPadding(
      tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)

# silence the warnings. re-enable for inference!
model.config.use_cache = False

trainer.train()
# Save the final adapter next to the intermediate checkpoints.
model.save_pretrained(os.path.join(model_save_path, model_name))
print('model train is finished')

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1,636
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulati

Step,Training Loss
5,5.725
10,3.9879
15,2.078
20,1.212
25,1.1459
30,1.0922
35,0.9561
40,0.9865
45,0.892
50,0.8735


Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-10
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-20
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-30
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-40
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-50
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-60
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-70
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-80
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-90
Saving model checkpoint to /root/Literature/models/distill_roberta_FT_models/checkpoint-100


Training completed. Do not forget to share your model on huggingface.co/models =)




model train is finished


# Evaluation

Before FT TC

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Baseline: the pretrained checkpoint with a 3-way classification head and no
# adapter, loaded with default placement.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3
)
model.eval()

preds = []
with torch.no_grad():
    for sentence in test_data['context']:
        # Truncate by tokens to the same `max_length` used for training.
        # Without this, long contexts can exceed the model's position limit,
        # and the former character slice for 'bertweet' did not bound the
        # token count.
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=max_length)
        output = model(**inputs)
        # First (and only) row of the logits -> index of the highest-scoring class.
        scores = output.logits[0].detach().numpy()
        preds.append(int(np.argmax(scores)))

# Ground-truth class ids come from the untouched 'response' column of the test split.
trues = test_data['response']

accuracy = accuracy_score(trues, preds)
precision = precision_score(trues, preds, average='weighted')
recall = recall_score(trues, preds, average='weighted')
f1 = f1_score(trues, preds, average='weighted')
conf_matrix = confusion_matrix(trues, preds)

print('accuray', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)
print(conf_matrix)

loading configuration file /root/Literature/distillroberta/config.json
Model config RobertaConfig {
  "_name_or_path": "/root/Literature/distillroberta",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 1,
 

accuray 0.2543859649122807
precision 0.4399624765478424
recall 0.2543859649122807
f1 0.15438903585545508
[[  8 126   8]
 [  1  50   2]
 [  4  29   0]]


After FT TC

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Rebuild the base classifier, then attach the adapter from the last training checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3
)

# The checkpoint number must match a step at which the trainer saved (step 100 here).
peft_path = f'{model_save_path}/checkpoint-100'

model = PeftModel.from_pretrained(
    model,
    peft_path,
    # torch_dtype=torch.float16,
)
# NOTE(review): the metrics recorded in this notebook after fine-tuning are
# identical to the ones recorded before it; verify that the adapter weights
# from `peft_path` are really applied and that the saved outputs are not stale.
model.eval()

preds = []
with torch.no_grad():
    for sentence in test_data['context']:
        # Truncate by tokens to the same `max_length` used for training.
        # Without this, long contexts can exceed the model's position limit,
        # and the former character slice for 'bertweet' did not bound the
        # token count.
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=max_length)
        output = model(**inputs)
        # First (and only) row of the logits -> index of the highest-scoring class.
        scores = output.logits[0].detach().numpy()
        preds.append(int(np.argmax(scores)))

# Ground-truth class ids come from the untouched 'response' column of the test split.
trues = test_data['response']

accuracy = accuracy_score(trues, preds)
precision = precision_score(trues, preds, average='weighted')
recall = recall_score(trues, preds, average='weighted')
f1 = f1_score(trues, preds, average='weighted')
conf_matrix = confusion_matrix(trues, preds)

print('accuray', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)
print(conf_matrix)

loading configuration file /root/Literature/distillroberta/config.json
Model config RobertaConfig {
  "_name_or_path": "/root/Literature/distillroberta",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 1,
 

accuray 0.2543859649122807
precision 0.4399624765478424
recall 0.2543859649122807
f1 0.15438903585545508
[[  8 126   8]
 [  1  50   2]
 [  4  29   0]]


In [None]:
def calculate_recall(F1, precision):
    """Recover recall from an F1 score and a precision value.

    Inverts ``F1 = 2 * precision * recall / (precision + recall)``, giving
    ``recall = F1 * precision / (2 * precision - F1)``.

    Args:
        F1: the F1 score.
        precision: the precision value.

    Returns:
        The recall implied by the two inputs.

    Raises:
        ValueError: if ``2 * precision - F1`` is zero or negative. A zero
            denominator is undefined and a negative one would yield a
            negative "recall", so no valid recall exists for such inputs.
    """
    denominator = 2 * precision - F1
    if denominator <= 0:
        raise ValueError("Invalid input values: F1 must be strictly less than 2 times precision.")
    return (F1 * precision) / denominator


# Back out the recall implied by an F1 / precision pair.
# NOTE(review): where these two figures come from is not shown in this notebook.
# NOTE(review): this rebinds the globals `precision` and `recall`, which the
# evaluation cells also assign; re-running cells out of order will mix them up.
F1 = 0.4836
precision = 0.5323
recall = calculate_recall(F1, precision)
print(f"Recall: {recall}")


Recall: 0.44306416523235803
