In [25]:
import torch
import gc
from transformers import AutoTokenizer,AutoModelForCausalLM,AutoConfig
from datasets import load_dataset
import evaluate

# model name
model_name = "gpt2-xl"

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
# load  model
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,config=config, device_map="auto")



In [14]:
# load dataset
dataset = load_dataset("squad")
vali_ds = dataset['train'].select(range(5))
spilt_ds = dataset['train'].train_test_split(test_size=0.1)
train_ds = spilt_ds['train'].shuffle(seed=42).select(range(2000))
eval_ds = spilt_ds['test'].shuffle(seed=42).select(range(200))

In [16]:
dataset['validation']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [9]:
# clear original dataset
del dataset
del spilt_ds
gc.collect() # 主动触发回收机制

606

In [10]:
# define format prompt function
def format_prompts(examples):
    prompts = [f"Context: {c}\nQuestion: {q}\n" for q, c in zip(examples['question'], examples['context'])]
    return tokenizer(prompts, padding="max_length", truncation=True, max_length=256, return_tensors='pt')

In [11]:
#define preprocess function
def preprocess_function(examples):
    # inputs: "C:{C} Q:{Q} A:" then tokenized as model_inputs
    inputs = [f"Context: {c}\nQuestion: {q}\nAnswer: " for q, c in zip(examples['question'], examples['context'])]
    model_inputs = tokenizer(inputs,padding="max_length", truncation=True, max_length=256,return_tensors='pt')
    
    # targets: answer's text inserted with ',' then tokenized as labels
    targets = [','.join(a['text']) if len(a['text']) > 0 else '' for a in examples['answers']]
    labels = tokenizer(targets,padding="max_length", truncation=True, max_length=256, return_tensors='pt')
    model_inputs["labels"] = labels['input_ids']
    model_inputs["labels_mask"] = labels['attention_mask']
    return model_inputs

tok_train_ds = train_ds.map(preprocess_function, batched=True)
tok_train_ds.set_format(type="torch", columns=["input_ids", "attention_mask","labels","labels_mask"])
print(tok_train_ds)
tok_eval_ds = eval_ds.map(preprocess_function, batched=True)
tok_eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask","labels","labels_mask"])

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2110.11 examples/s]


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels', 'labels_mask'],
    num_rows: 2000
})


Map: 100%|██████████| 200/200 [00:00<00:00, 2264.36 examples/s]


In [13]:
examples = ["Hello", "this", "is", "a", "test", ""]
targets = [','.join(a) if len(a) > 0 else '' for a in examples]
targets

['H,e,l,l,o', 't,h,i,s', 'i,s', 'a', 't,e,s,t', '']

In [28]:
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    fan_in_fan_out=True,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    )

lora_model = get_peft_model(model,config)
lora_model.print_trainable_parameters()

# prepare for training
from transformers import TrainingArguments,Trainer
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    output_dir="./results",
    learning_rate=2e-4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=3,
    )
trainer = Trainer(model=lora_model,
                  args=training_args,
                  tokenizer=tokenizer,
                  train_dataset=tok_train_ds,
                  eval_dataset=tok_eval_ds,
                    )
# train
trainer.train()

trainable params: 2,457,600 || all params: 1,560,068,800 || trainable%: 0.1575




OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 3.19 MiB is free. Process 1293983 has 3.99 GiB memory in use. Process 1296824 has 3.70 GiB memory in use. Process 2063921 has 3.52 GiB memory in use. Process 2242236 has 1.88 GiB memory in use. Including non-PyTorch memory, this process has 10.57 GiB memory in use. Of the allocated memory 10.18 GiB is allocated by PyTorch, and 20.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF