# 导入环境

In [1]:
from datasets import Dataset
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from modelscope import snapshot_download, AutoTokenizer

In [2]:
# Load the train / devel / test splits.
# Each file is read as JSON Lines into a DataFrame and then wrapped as a
# `datasets.Dataset`; both representations are kept for later cells.
train_jsonl_new_path = 'Metal/train.json'
devel_jsonl_new_path = 'Metal/devel.json'
test_jsonl_new_path = 'Metal/test.json'


def _read_split(jsonl_path):
    """Read one JSON Lines file and return it as ``(DataFrame, Dataset)``."""
    frame = pd.read_json(jsonl_path, lines=True)
    return frame, Dataset.from_pandas(frame)


train_df, train_ds = _read_split(train_jsonl_new_path)
val_df, val_ds = _read_split(devel_jsonl_new_path)
test_df, test_ds = _read_split(test_jsonl_new_path)

In [3]:
# Peek at the first three training records (returned as a dict of column -> list).
train_ds[:3]

{'instruction': ['You are an expert with a deep background in the field of material informatics, focusing on extracting material entities and related information from textual data.\nYou are tasked with performing a Named Entity Recognition (NER) operation on the provided text.\nYour goal is to identify and extract entities according to the specific categories related to the study of materials science.\nLet\'s work this out in a step by step way tobe sure we have the right answer.\n\nAll entity types including their detailed definition are listed as follow.\n### Definition:\n## Entity:\n- MAT: Any inorganic solid or alloy, any non-gaseous element (at RT), e.g., "BaTiO3", "titania", "Fe".\n- SPL: Names for crystal structures/phases, e.g., "tetragonal", "fcc", "rutile","perovskite"; or, any symmetry label such as "Pbnm", or "Pnma".\n- DSC: Special descriptions of the type/shape of the sample. Examples include "single crystal", "nanotube", "quantum dot".\n- PRO: Anything measurable that ca

# 处理数据集

In [4]:
model_id = "qwen/Qwen2-7B-Instruct"

# Download the Qwen2 weights from ModelScope into the current directory.
# `snapshot_download` returns the local directory it populated, so that return
# value is the single source of truth for `model_dir` (the previous hard-coded
# assignment was overwritten immediately and has been removed).
model_dir = snapshot_download(model_id, cache_dir="./", revision="master")

# Load the tokenizer that ships with the downloaded checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# System prompt for the materials-science NER task. `process_func` embeds this
# text in the system turn of every tokenized sample.
# NOTE(review): the wording contains typos ("tobe", "as follow") that also show
# up in the dataset's `instruction` field printed above; presumably they should
# only be corrected together with the data and a retrain - confirm before editing.
PROMPT_TEMPLATE="""You are an expert with a deep background in the field of material informatics, \
focusing on extracting material entities and related information from textual data.
You are tasked with performing a Named Entity Recognition (NER) operation on the provided text.
Your goal is to identify and extract entities according to the specific categories related to the study of materials science.
Let's work this out in a step by step way tobe sure we have the right answer.

All entity types including their detailed definition are listed as follow.
### Definition:
## Entity:
- MAT: Any inorganic solid or alloy, any non-gaseous element (at RT), e.g., "BaTiO3", "titania", "Fe".
- SPL: Names for crystal structures/phases, e.g., "tetragonal", "fcc", "rutile","perovskite"; or, any symmetry label such as "Pbnm", or "Pnma".
- DSC: Special descriptions of the type/shape of the sample. Examples include "single crystal", "nanotube", "quantum dot".
- PRO: Anything measurable that can have a unit and a value, e.g., "conductivity", “band gap”; or, any qualitative property or phenomenon exhibited by a material, e.g., "ferroelectric", "metallic".
- APL: Any high-level application such as "photovoltaics", or any specific device such as “field-effect transistor”.
- SMT: Any technique for synthesising a material, e.g., "pulsed laser deposition", "solid state reaction", or any other step in sample production such as "annealing" or "etching".
- CMT: Any method used to characterize a material, experiment or theory: e.g., "photoluminescence", "XRD", "tight binding", "DFT". It can also be a name for an equation or model, such "Bethe-Salpeter equation".

### Notice:
1. Each line of the output must be a valid JSON string.
2. If no entities are detected, the output should state '[]'.
3. Some extracted terms may not be classified into 'MAT', 'SPL', 'DSC', 'PRO', 'SMT', 'CMT',or 'APL'. In this case, discard these words and focus on the specified entities.
"""

In [6]:
def process_func(example):
    """
    Tokenize one dataset record into a causal-LM training sample.

    The prompt is the system turn (PROMPT_TEMPLATE) plus the user turn
    (``example['input']``) in the ``<|im_start|>`` / ``<|im_end|>`` chat layout,
    ending with the opening of the assistant turn. The target is
    ``example['output']`` followed by a single ``tokenizer.pad_token_id``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``. Label
        positions that belong to the prompt are set to -100; the rest repeat
        the target token ids. All three lists are cut to at most 1024 entries.
    """
    MAX_LENGTH = 1024

    prompt_text = (
        f"<|im_start|>system\n{PROMPT_TEMPLATE}<|im_end|>\n"
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )
    prompt_enc = tokenizer(prompt_text, add_special_tokens=False)
    answer_enc = tokenizer(f"{example['output']}", add_special_tokens=False)

    # Target tokens plus the terminating token appended after every answer.
    target_ids = answer_enc["input_ids"] + [tokenizer.pad_token_id]

    full_ids = prompt_enc["input_ids"] + target_ids
    full_mask = prompt_enc["attention_mask"] + answer_enc["attention_mask"] + [1]
    full_labels = [-100] * len(prompt_enc["input_ids"]) + target_ids

    # Slicing is a no-op for sequences that are already within the limit.
    return {
        "input_ids": full_ids[:MAX_LENGTH],
        "attention_mask": full_mask[:MAX_LENGTH],
        "labels": full_labels[:MAX_LENGTH],
    }

In [7]:
# Tokenize every split with process_func, dropping the original text columns
# so only input_ids / attention_mask / labels remain.
train_dataset, val_dataset, test_dataset = (
    split.map(process_func, remove_columns=split.column_names)
    for split in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/4402 [00:00<?, ? examples/s]

Map:   0%|          | 0/511 [00:00<?, ? examples/s]

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

In [8]:
# Decode the first tokenized training sample back to text to eyeball the chat layout.
tokenizer.decode(train_dataset[0]['input_ids'])

'<|im_start|>system\nYou are an expert with a deep background in the field of material informatics, focusing on extracting material entities and related information from textual data.\nYou are tasked with performing a Named Entity Recognition (NER) operation on the provided text.\nYour goal is to identify and extract entities according to the specific categories related to the study of materials science.\nLet\'s work this out in a step by step way tobe sure we have the right answer.\n\nAll entity types including their detailed definition are listed as follow.\n### Definition:\n## Entity:\n- MAT: Any inorganic solid or alloy, any non-gaseous element (at RT), e.g., "BaTiO3", "titania", "Fe".\n- SPL: Names for crystal structures/phases, e.g., "tetragonal", "fcc", "rutile","perovskite"; or, any symmetry label such as "Pbnm", or "Pnma".\n- DSC: Special descriptions of the type/shape of the sample. Examples include "single crystal", "nanotube", "quantum dot".\n- PRO: Anything measurable that

In [9]:
# Decode only the supervised part of sample 1, i.e. the label positions that are not -100.
supervised_ids = [token_id for token_id in train_dataset[1]["labels"] if token_id != -100]
tokenizer.decode(supervised_ids)

"[('MAT', 'silica'), ('PRO', 'magnetic properties'), ('MAT', 'Fe4NiO8Zn'), ('MAT', 'O2Si'), ('DSC', 'nanocomposites'), ('CMT', 'electron paramagnetic resonance'), ('CMT', 'EPR')]<|endoftext|>"

In [10]:
# Sanity-check the first processed sample: readable text first, raw ids second.
print(tokenizer.decode(train_dataset[0]['input_ids']))
print(train_dataset[0]['input_ids'])

# Round-trip the special markers this notebook actually relies on.
# The previous check used the ids 151331/151333/151335 and the string
# '[gMASK]<sop><|system|>', which are GLM-4 markers left over from another
# notebook and carry no meaning for the Qwen2 tokenizer loaded here.
marker_ids = tokenizer.encode('<|im_start|><|im_end|><|endoftext|>', add_special_tokens=False)
print(marker_ids)
print(tokenizer.decode(marker_ids))

<|im_start|>system
You are an expert with a deep background in the field of material informatics, focusing on extracting material entities and related information from textual data.
You are tasked with performing a Named Entity Recognition (NER) operation on the provided text.
Your goal is to identify and extract entities according to the specific categories related to the study of materials science.
Let's work this out in a step by step way tobe sure we have the right answer.

All entity types including their detailed definition are listed as follow.
### Definition:
## Entity:
- MAT: Any inorganic solid or alloy, any non-gaseous element (at RT), e.g., "BaTiO3", "titania", "Fe".
- SPL: Names for crystal structures/phases, e.g., "tetragonal", "fcc", "rutile","perovskite"; or, any symmetry label such as "Pbnm", or "Pnma".
- DSC: Special descriptions of the type/shape of the sample. Examples include "single crystal", "nanotube", "quantum dot".
- PRO: Anything measurable that can have a un

# 创建模型

In [11]:
import torch

# Load the base causal-LM weights in bfloat16; device_map="auto" decides placement.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Lin

In [12]:
model.enable_input_require_grads() # must be called when gradient checkpointing is enabled

In [13]:
# Confirm the dtype the weights were loaded in.
model.dtype

torch.bfloat16

# LoRA 配置

In [14]:
from peft import LoraConfig, TaskType, get_peft_model

# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down) listed in the model printout above.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

config = LoraConfig(
    r=8,                    # LoRA rank
    lora_alpha=32,          # LoRA alpha (scaling numerator)
    lora_dropout=0.1,       # dropout ratio
    target_modules=lora_target_modules,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,   # training mode
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'k_proj', 'o_proj', 'q_proj', 'gate_proj', 'v_proj', 'up_proj', 'down_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [15]:
# Wrap the base model with the LoRA adapters described by `config`.
# Re-running this cell would wrap an already wrapped model; reload the base model first.
model = get_peft_model(model=model, peft_config=config)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='./qwen/Qwen2-7B-Instruct', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'k_proj', 'o_proj', 'q_proj', 'gate_proj', 'v_proj', 'up_proj', 'down_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [16]:
# Report how many parameters are trainable after the LoRA wrap.
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


# 配置训练参数

In [17]:
from swanlab.integration.huggingface import SwanLabCallback
import swanlab

# Metadata recorded alongside the SwanLab run.
swanlab_run_config = {
    "model": model_id,
    "model_dir": model_dir,
}

# Callback handed to the Trainer so training metrics are tracked in SwanLab.
swanlab_callback = SwanLabCallback(
    project="Qwen2-NER-fintune",
    experiment_name="Qwen2-7B-Instruct",
    description="使用通义千问Qwen2-7B-Instruct模型在NER数据集上微调，实现关键实体识别任务。",
    config=swanlab_run_config,
)

In [18]:
# Hyper-parameters are collected in one mapping so they can be read and tuned in one place.
training_kwargs = dict(
    # where checkpoints are written and how often
    output_dir="./output/Qwen2_7B_instruct_lora",
    save_steps=100,
    save_on_each_node=True,
    # batch shape: 4 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # optimisation schedule
    num_train_epochs=6,
    learning_rate=1e-4,
    # logging
    logging_steps=10,
    report_to="none",
)
args = TrainingArguments(**training_kwargs)

In [19]:
from accelerate import Accelerator

accelerator = Accelerator()  # create the accelerator instance

# Only the model is routed through `prepare`. The previous call also passed
# `train_dataset` (a datasets.Dataset) and `args` (TrainingArguments); `prepare`
# hands objects of those types back unchanged, so listing them only suggested
# work that never happened. Both names keep the values assigned in earlier cells.
model = accelerator.prepare(model)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
# Collator that pads each batch; built separately so the Trainer call stays short.
batch_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=batch_collator,
    callbacks=[swanlab_callback],
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.13                                  
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/workspace/nlp_pipline/LLM/信息抽取/NER/llm_微调/swanlog/run-20240712_001952-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39madasd[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen2-7B-Instruct_Jul12_00-19-52[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch -l /workspace/nlp_pipline/LLM/信息抽取/NER/llm_微调/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@adasd/Qwen2-NER-fintune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@adasd/Qwen2-NER-fintune/runs/jvvpozh1km2lcb8y3ujjm[0m[0m


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.584
20,0.2306
30,0.176
40,0.1826
50,0.1488
60,0.1339
70,0.1231
80,0.1174
90,0.1222
100,0.0943




[1m[33mswanlab[0m[0m: Step 1100 on key train/epoch already exists, ignored.


TrainOutput(global_step=1100, training_loss=0.04553380490704016, metrics={'train_runtime': 55147.9234, 'train_samples_per_second': 0.319, 'train_steps_per_second': 0.02, 'total_flos': 4.604493707126047e+17, 'train_loss': 0.04553380490704016, 'epoch': 3.9963669391462306})

# 加载 LoRA 权重并推理

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

# Base weights: the local directory populated by snapshot_download(..., cache_dir="./")
# earlier in this notebook (replaces an absolute path from another machine).
mode_path = './qwen/Qwen2-7B-Instruct'
# LoRA adapter: must sit under the TrainingArguments output_dir used above
# ("./output/Qwen2_7B_instruct_lora"). With save_steps=100 and the run ending at
# global_step=1100, checkpoint-1100 is the last one written; change the suffix
# to evaluate a different checkpoint.
lora_path = './output/Qwen2_7B_instruct_lora/checkpoint-1100'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# Load the base model
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# Attach the LoRA weights
model = PeftModel.from_pretrained(model, model_id=lora_path)

# Query the model in the same layout it was fine-tuned on: the NER instructions
# as the system turn and one raw text as the user turn. PROMPT_TEMPLATE and
# test_df are defined in earlier cells of this notebook.
prompt = test_df["input"].iloc[0]
messages = [
    {"role": "system", "content": PROMPT_TEMPLATE},
    {"role": "user", "content": prompt},
]
inputs = tokenizer.apply_chat_template(messages,
                                       add_generation_prompt=True,
                                       tokenize=True,
                                       return_tensors="pt",
                                       return_dict=True
                                       ).to(model.device)  # follow the model's placement instead of assuming 'cuda'


# top_k=1 restricts sampling to the single most likely token at each step.
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Keep only the newly generated tokens (drop the echoed prompt).
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))