In [1]:
from transformers import GPT2Tokenizer, OPTForCausalLM

model_id = "facebook/opt-6.7b"

model = OPTForCausalLM.from_pretrained(model_id, load_in_8bit=True)
tokenizer = GPT2Tokenizer.from_pretrained(model_id)

Bad pipe message: %s [b'95a8e8fdd41cdc4b\r\nx-scheme: https\r\nx-request-id: 70e4d268482fa4f027007ded9023f539\r\nx-real-ip: 111']
Bad pipe message: %s [b'99.82.105\r\nx-original-forwarded-for: 111.199.', b'.105\r\nx-forwarded-scheme: https\r\nx-forwarded-proto: htt']
Bad pipe message: %s [b',http\r\nx-forwarded-port: 443,80\r\nx-forwarded-host: 1fcab7d27eec46b895a8e8fdd41cdc4b--45085.ap-shanghai.cloudstu', b'o.club\r\nx-forwarded-for: 111.199.82.105, 172.16.5.13, 172.18.138.4, 172.17.83.15,::ffff:10.89.0.2\r\n', b'client-proto-ver: HTTP/2.0\r\nx-client-proto: https\r\nupgrade-insecure-requests: 1\r\nsec-fetch-storage-access: active\r\nsec-', b'tch-site: same-site\r\nsec-fetch-mode: navigate\r\nsec-fetch-dest: iframe\r\nsec-ch-ua-platform: "macOS"\r\ns', b'-ch-ua-mobile: ?0\r\nsec-ch-ua: "Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"\r\nref', b'er: https://1fcab7d27eec46b895a8e8fdd41cdc4b.ap-shanghai.cloudstudio.club/\r\npriority: u=0, i\r\naccept']
Bad pipe message: 

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [2]:
text = "Python is the best programming language."

inputs = tokenizer(text, return_tensors="pt").to(0)  
outputs = model.generate(**inputs, max_length=50)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)

Generated Text: Python is the best programming language.
I'm not sure if you're being sarcastic or not, but I'm going to assume you're being serious.                  


In [5]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [6]:
memory_footprint_bytes = model.get_memory_footprint()
memory_footprint_mib = memory_footprint_bytes / (1024 ** 3)  # 转换为 GB

print(f"{memory_footprint_mib:.2f}GB")
print(model)

6.80GB
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear8bitLt(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear8bitLt(in_features=16384, out_features=4096

In [7]:
from peft import LoraConfig, get_peft_model

# 创建一个LoraConfig对象，用于设置LoRA的配置参数
config = LoraConfig(
    r=8,            # LoRA的秩，影响LoRA矩阵的大小
    lora_alpha=32,  # LoRA适应的比例因子
    # 指定将LoRA应用到的模型模块，通常是attention和全连接层的投影
    target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out"],
    lora_dropout=0.05,     # 在LoRA模块中使用的dropout率
    bias="none",           # 设置bias的使用方式，这里没有使用bias
    task_type="CAUSAL_LM"  # 任务类型，这里设置为因果(自回归）语言模型
)

# 使用get_peft_model函数和给定的配置来获取一个PEFT模型
model = get_peft_model(model, config)
model.print_trainable_parameters()
print(model)

trainable params: 8,388,608 || all params: 6,666,862,592 || trainable%: 0.1258
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 4096, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
          (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (layers): ModuleList(
            (0-31): 32 x OPTDecoderLayer(
              (self_attn): OPTSdpaAttention(
                (k_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict

In [8]:
from datasets import load_dataset

dataset = load_dataset("Abirate/english_quotes")
print(dataset["train"])

README.md: 0.00B [00:00, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Dataset({
    features: ['quote', 'author', 'tags'],
    num_rows: 2508
})


In [9]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))
    
show_random_elements(dataset["train"])

Unnamed: 0,quote,author,tags
0,“The good thing about science is that it's true whether or not you believe in it.”,Neil deGrasse Tyson,"[belief, science, true]"
1,“Donâ€™t grieve. Anything you lose comes round in another form.”,Rumi,"[bereavement, consolation, grief, loss, reincarnation]"
2,"“Religion has actually convinced people that there's an invisible man living in the sky who watches everything you do, every minute of every day. And the invisible man has a special list of ten things he does not want you to do. And if you do any of these ten things, he has a special place, full of fire and smoke and burning and torture and anguish, where he will send you to live and suffer and burn and choke and scream and cry forever and ever 'til the end of time! But He loves you. He loves you, and He needs money! He always needs money! He's all-powerful, all-perfect, all-knowing, and all-wise, somehow just can't handle money!”",George Carlin,"[atheism, humor, life, religion]"
3,“Either write something worth reading or do something worth writing.”,Benjamin Franklin,[hmmm]
4,"“Read, read, read. Read everything -- trash, classics, good and bad, and see how they do it. Just like a carpenter who works as an apprentice and studies the master. Read! You'll absorb it.Then write. If it's good, you'll find out. If it's not, throw it out of the window.”",William Faulkner,"[reading, writing]"
5,“The last enemy that shall be destroyed is death.”,"J.K. Rowling,","[bible, death, enemy]"
6,"“I think that if I ever have kids, and they are upset, I won't tell them that people are starving in China or anything like that because it wouldn't change the fact that they were upset. And even if somebody else has it much worse, that doesn't really change the fact that you have what you have.”","Stephen Chbosky,",[honesty]
7,“The past has no power over the present moment.”,Eckhart Tolle,"[education, inspirational, life, philosophy, truth, wisdom]"
8,“There are wounds that never show on the body that are deeper and more hurtful than anything that bleeds.”,"Laurell K. Hamilton,","[depression, pain, trauma]"
9,“Science and religion are not at odds. Science is simply too young to understand.”,"Dan Brown,","[books, dan-brown, religion, science]"


In [10]:
from transformers import DataCollatorForLanguageModeling

tokenized_dataset = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments, Trainer

save_path = "./data/hf/models/opt-6.7b-lora"

training_args = TrainingArguments(
    output_dir=save_path,           # 指定模型输出和保存的目录
    per_device_train_batch_size=4,  # 每个设备上的训练批量大小
    learning_rate=2e-4,             # 学习率
    fp16=True,                      # 启用混合精度训练，可以提高训练速度，同时减少内存使用
    logging_steps=20,               # 指定日志记录的步长，用于跟踪训练进度
    max_steps=100,                  # 最大训练步长
    num_train_epochs=1              # 训练的总轮数
)

In [12]:
trainer = Trainer(
    model=model,                               # 指定训练时使用的模型
    train_dataset=tokenized_dataset["train"],  # 指定训练数据集
    args=training_args,
    data_collator=data_collator,
)

model.use_cache = False # 禁用模型的自回归生成缓存

Detected kernel version 5.4.241, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
20,1.935
40,1.952
60,1.8087
80,1.8001
100,1.9541


TrainOutput(global_step=100, training_loss=1.8899650192260742, metrics={'train_runtime': 177.8259, 'train_samples_per_second': 2.249, 'train_steps_per_second': 0.562, 'total_flos': 1364172665978880.0, 'train_loss': 1.8899650192260742, 'epoch': 0.1594896331738437})

In [14]:
model.save_pretrained(save_path)

In [15]:
lora_model = trainer.model

inputs = tokenizer(text, return_tensors="pt").to(0)
out = lora_model.generate(**inputs, max_length=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))

  return fn(*args, **kwargs)


Python is the best programming language.
,,,,,, The The The The The.. I I am in in in in in the The The The The I am am a a a Son Son,,,....
