In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [2]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             #load_in_8bit=True,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [3]:
data = load_dataset("enyuan/weibo")

data_train = data["train"]
print(data_train)
print(len(data_train['cleaned_text']))

Downloading data:   0%|          | 0.00/639M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/471450 [00:00<?, ? examples/s]

Dataset({
    features: ['关键词', '连接', '连接.1', 'id1', 'id2', 'id3', 'poi地址', '微博正文', '评论', '博主名', 'Unnamed: 10', '来自', '点赞数', '转发数', '关注数', '粉丝数', '微博数', '性别', 'location', 'verified', 'verified_type', 'verified_reason', '年纪', 'ip_location', 'time', 'cleaned_text', '省份'],
    num_rows: 471450
})
471450


In [4]:
def generate_prompt(content):
    begin = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
    #syst = "<<SYS>> You are a helpful assistant, always answer as helpfully as possible.\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>\n"
    #inst = "Read the following text. Does it mention the Gilbert damping constant of a certain material? If so, list the corresponding material and its Gilbert damping canstant.\n" + content
    syst = "You are a helpful AI assistant, answer the question as short as possible.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>"
    inst = "下面这段文字对于共享单车的情绪是正面的还是负面的还是中性的？Do not return anything except 'positive', 'neutral' or 'negative'."+content
    end = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    prompt = (" ").join([begin, syst, inst, end])
    return prompt

print(generate_prompt('How are you?'))
#print(generate_prompt(data[0]['content']))

<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful AI assistant, answer the question as short as possible.<|eot_id|>
<|start_header_id|>user<|end_header_id|> 下面这段文字对于共享单车的情绪是正面的还是负面的还是中性的？Do not return anything except 'positive', 'neutral' or 'negative'.How are you? <|eot_id|><|start_header_id|>assistant<|end_header_id|>



In [5]:
input_prompt = generate_prompt('How are you?')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=False,
      #top_k=10,
      #top_p=0.9,
      #temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


system You are a helpful AI assistant, answer the question as short as possible.
user 下面这段文字对于共享单车的情绪是正面的还是负面的还是中性的？Do not return anything except 'positive', 'neutral' or 'negative'.How are you? assistant
neutral


In [21]:
input_prompt = generate_prompt('How are you?')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 5
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

In [22]:
print(top_probabilities, top_token_ids)

tensor([[9.8042e-01, 1.0892e-02, 7.4856e-03, 7.8898e-04, 1.7605e-04]],
       device='cuda:0') tensor([[60668, 88007, 31587, 43324, 36590]], device='cuda:0')


In [None]:
import pandas as pd
import time

data = []
save_interval = 100

start_time = time.time()  # 记录开始时间

for i, text in enumerate(data_train['cleaned_text']):
    input_prompt = generate_prompt(text)
    #input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
    #inputs = tokenizer(input_prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to("cuda")
    input_tokens = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    with torch.cuda.amp.autocast():
        generation_output = model.generate(
            input_ids=input_tokens,
            attention_mask=attention_mask,
            max_new_tokens=128,
            do_sample=False,
            #top_k=5,
            #top_p=0.9,
            #temperature=0.2,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
    op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    #print(op)
    
    inst_index = op.find('assistant\n')
    
    if inst_index != -1:
        #print(op[inst_index + len('assistant\n'):])
        data.append({"cleaned_text": text, "sentiment": op[inst_index + len('assistant\n'):]})
    else:
        #print("未找到'assistant\n'标记")
        data.append({"cleaned_text": text, "sentiment":""})

    # 每 save_interval 个迭代保存一次
    if (i + 1) % save_interval == 0:
        df = pd.DataFrame(data)
        df.to_csv("output.csv", index=False, mode='a', header=False)  # 追加模式
        #print(f"已保存到 output.csv，当前迭代次数：{i+1}")
        data = []
        end_time = time.time()  # 记录结束时间
        elapsed_time = end_time - start_time  # 计算用时
        print(f"已保存到 output.csv，当前迭代次数：{i+1}，用时：{elapsed_time:.2f} 秒")

        start_time = end_time  # 更新开始时间，用于计算下一个周期的用时

# 最后一次保存
df = pd.DataFrame(data)
df.to_csv("output.csv", index=False, mode='a', header=False)
print("已保存到 output.csv")

#df = pd.DataFrame(data)
#df.to_csv("output.csv", index=False)


已保存到 output.csv，当前迭代次数：100，用时：21.73 秒
已保存到 output.csv，当前迭代次数：200，用时：21.73 秒
已保存到 output.csv，当前迭代次数：300，用时：20.41 秒
已保存到 output.csv，当前迭代次数：400，用时：21.49 秒


In [32]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

# 确保 tokenizer 有一个 pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def collate_fn(batch):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    return inputs

# 创建 DataLoader
batch_size = 8
dataset = TextDataset(data_train['cleaned_text'][:40])
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# 存储数据的列表
data = []

for batch in dataloader:
    input_tokens = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    
    with torch.cuda.amp.autocast():
        generation_output = model.generate(
            input_ids=input_tokens,
            attention_mask=attention_mask,
            max_new_tokens=128,
            do_sample=True,
            top_k=5,
            top_p=0.9,
            temperature=0.2,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id  # 设置 pad_token_id
        )
    
    for idx in range(len(generation_output)):
        op = tokenizer.decode(generation_output[idx], skip_special_tokens=True)
        inst_index = op.find('assistant\n')
        
        if inst_index != -1:
            result = op[inst_index + len('assistant\n'):]
            print(result)
        else:
            result = "未找到'assistant\n'标记"
            print(result)
        
        # 将结果添加到列表中
        data.append({"i": dataset[idx], "op": result})

# 最终保存所有数据
df = pd.DataFrame(data)
df.to_csv("output.csv", index=False)
print("最终数据保存完成")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记
未找到'assistant
'标记


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB (GPU 0; 23.63 GiB total capacity; 17.46 GiB already allocated; 1.81 GiB free; 21.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [26]:
op

"system You are a helpful AI assistant, answer the question as short as possible.\nuser 下面这段文字对于共享单车的情绪是正面的还是负面的还是中性的？Do not return anything except 'positive', 'neutral' or 'negative'.【中国互联网的2018：AI弯道超车移动支付走向海外】刚刚过去的2017年，中国互联网捷报频传。高铁、支付宝、共享单车、网购成为外国人眼中的中国“新四大发明”，AI异军突起，物联网、大数据、VR技术与实体经济进一步融合。有理由相信：2018年的中国互联网将再谱新篇，续写华章 assistant\npositive"

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained('results/checkpoint-19500')
model.resize_token_embeddings(len(tokenizer))

model.load_adapter('results/checkpoint-19500')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
data = load_dataset("enyuan/Abstracts")
data_train = data["train"]

custom_data = load_dataset('json', data_files='data_eval.json')
data_val = custom_data['train']

# Print the dataset details
print(data_train)
print(data_val)

# Access an example
#example = data_train[0]
#print(example)

def generate_prompt(title, abstract=None, eos_token="</s>"):
  instruction = "The abstract of the paper:\n"
  input = f"{title}\n"
  abstract = f"Abstract: {abstract + ' ' + eos_token if abstract else ''} "
  prompt = (" ").join([instruction, input, abstract])
  return prompt

print(generate_prompt(data_train[0]["title"], data_train[0]["abstract"]))

Dataset({
    features: ['title', 'doi', 'abstract', 'publicationDate'],
    num_rows: 165071
})
Dataset({
    features: ['title', 'abstract', 'publicationDate'],
    num_rows: 559
})
The abstract of the paper:
 Inconel 625 sustainable milling surface integrity and the dependence on alloy processing route
 Abstract: The discovery of deepwater oil and gas sources has altered the scenario of world production of oil products, attracting even more attention to nickel superalloys. However, this class of materials can be used in several applications. Furthermore, nickel superalloys are highly dependent on their processing history, and the manner in which superalloys react to machining can directly affect the finished product. This work aims to evaluate the surface integrity of two different materials after cryogenic side-milling in conditions that stimulate severe plastic deformation (SPD) and high heat generation. The results show that the material response to machining depends strongly on 

In [4]:
input_prompt = generate_prompt(data_train[50]["title"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The abstract of the paper:
 Effect of cryogenic cooling on residual stresses and surface finish of 316L during hybrid manufacturing
 Abstract:   In this work, a novel approach for reducing the residual stress in the welded joints of stainless steel is presented. A new process called Hybrid Manufacturing (HM) was developed to reduce the residual stress in the welded joints by using two different techniques namely, laser beam welding (LBW) and cryogenic treatment (CT). The effectiveness of HM technique has been studied with respect to the reduction of residual stress and improvement in surface roughness. The results showed that the residual stress can be reduced up to 40% when compared to conventional LBW method. Moreover, the surface roughness can also be improved significantly as shown by the Ra value which decreases from 25.87µm to 19.31µm after CT.
The full text of the article: http://www.sciencedirect.com/science/article/pii/S092583881400132X


In [4]:
with open('materials.txt', 'r') as file:
    word_list = file.read().splitlines()

In [7]:
new_data = {
    'title': word_list,
    'abstract': [s.replace('_', '') for s in word_list],
    'doi': ['material'] * len(word_list),  # 假设新数据集中没有doi信息
    'publicationDate': [None] * len(word_list)  # 假设新数据集中没有publicationDate信息
}
new_dataset = Dataset.from_dict(new_data)

data_train = concatenate_datasets([data_train, new_dataset])

new_data = {
    'title': [s.replace('_', '') for s in word_list],
    'abstract': word_list,
    'doi': ['material'] * len(word_list),  # 假设新数据集中没有doi信息
    'publicationDate': [None] * len(word_list)  # 假设新数据集中没有publicationDate信息
}
new_dataset = Dataset.from_dict(new_data)

data_train = concatenate_datasets([data_train, new_dataset])

In [8]:
data_val = data_train.select(range(200))

In [9]:
def generate_prompt(type, title, abstract=None, eos_token="</s>"):
    if type == 'material':
        instruction = "The material :\n"
        input = f"{title} is"
        output = f"{abstract + ' ' + eos_token if abstract else ''} "
        prompt = (" ").join([instruction, input, output])
    else:
        instruction = "The abstract of the paper:\n"
        input = f"{title}\n"
        output = f"Abstract: {abstract + ' ' + eos_token if abstract else ''} "
        prompt = (" ").join([instruction, input, output])
    return prompt

print(generate_prompt(data_train[-1]["doi"], data_train[-1]["title"], data_train[-1]["abstract"]))

The material :
 NiFeAlO4 is NiFeAlO_4 </s> 


In [8]:
lora_config = LoraConfig(
        r=256,
        lora_alpha=512,
        lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

In [11]:
# Step 1: Add new tokens to the tokenizer
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Added 5552 tokens


In [12]:
# Step 2: Freeze all parameters in the model
#for param in model.parameters():
#    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the entire embedding layer
# Assuming you might want to fine-tune all embeddings, but here's how to selectively unfreeze
embeddings.weight.requires_grad = True

In [5]:
# 准备训练参数
training_args = TrainingArguments(
    output_dir='results',            # 输出目录
    num_train_epochs=2,              # 总训练轮数
    per_device_train_batch_size=4,   # 训练的batch size
    per_device_eval_batch_size=4,    # 验证的batch size
    gradient_accumulation_steps=4, 
    #gradient_checkpointing=True,
    #optim = "paged_adamw_32bit",
    optim = "adamw_torch",
    bf16=True,
    #fp16=True,
    warmup_steps=300,                # 预热步数
    learning_rate = 1e-4,
    max_grad_norm = 0.2,
    #max_steps = 50,
    #warmup_ratio = 0.03,
    #weight_decay=0.01,               # 权重衰减
    save_strategy="steps",           # 设置保存策略为"steps"
    save_steps=300,                  # 每500步保存一次模型
    save_total_limit=3,              # 最多保存3个检查点
    evaluation_strategy="epoch",     # 设置评估策略为"steps"
    group_by_length=True,
    #eval_steps=10000
)

In [6]:
# gradient checkpointing enabling
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

In [None]:
def formatting_func(prompt):
  output = []

  for a, d, s in zip(prompt["doi"], prompt["title"], prompt["abstract"]):
    op = generate_prompt(a, d, s)
    output.append(op)

  return output


trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
#for name, module in trainer.model.named_modules():
#    if "norm" in name:
#        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss




In [None]:
def formatting_func(prompt):
  output = []

  for d, s in zip(prompt["title"], prompt["abstract"]):
    op = generate_prompt(d, s)
    output.append(op)

  return output


trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
#for name, module in trainer.model.named_modules():
#    if "norm" in name:
#        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")

Map:   0%|          | 0/165071 [00:00<?, ? examples/s]

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss




In [19]:
model.push_to_hub("enyuan/llama_2_7b_materials")



adapter_model.safetensors:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/enyuan/llama_2_7b_materials/commit/f3e916ad96f32cf5b0ab4fc51e5eca07fd5a38e7', commit_message='Upload LlamaForCausalLM', commit_description='', oid='f3e916ad96f32cf5b0ab4fc51e5eca07fd5a38e7', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
tokenizer.push_to_hub("enyuan/llama_2_7b_materials")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/enyuan/llama_2_7b_materials/commit/cccd6362887ae7730b7a9689bf36a3408e330a34', commit_message='Upload tokenizer', commit_description='', oid='cccd6362887ae7730b7a9689bf36a3408e330a34', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The material is:"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 200
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)
"""
# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")
"""
# Filter tokens with IDs less than 32000
mask = top_token_ids >= 32000
filtered_top_token_ids = top_token_ids[mask]
filtered_top_probabilities = top_probabilities[mask]

# Convert probabilities to a human-readable format (e.g., Python list)
filtered_top_probabilities = filtered_top_probabilities.squeeze().tolist()
filtered_top_token_ids = filtered_top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(filtered_top_token_ids, filtered_top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low magnetic damping constant.
 The material is:
Word: BiFeO_3, Probability: 0.0095
Word: Fe_3O_4, Probability: 0.0045
Word: Mn_2V_3O_12, Probability: 0.0043
Word: BaAl_xCr_yFe_11O_19, Probability: 0.0034
Word: NiRh_2S_4, Probability: 0.0033
Word: Ti_0.94Co_0.03La_0.03O_2, Probability: 0.0031
Word: Gd_0.67Sr_0.33MnO_3, Probability: 0.0029
Word: Fe_0.8Ga_0.2, Probability: 0.0025
Word: NiFe_2O_4, Probability: 0.0025
Word: SrSm_2Fe_2O_7, Probability: 0.0020
Word: Bi_0.5La_0.5MnO_3, Probability: 0.0020
Word: Cu_0.5Fe_0.5Cr_2S_4, Probability: 0.0019
Word: Fe_xZn_2-xMo_3O_8, Probability: 0.0016
Word: CaMn_3V_4O_12, Probability: 0.0015
Word: CdFe_2O_4, Probability: 0.0015
Word: Ni_3O_3, Probability: 0.0015
Word: Ni_3Sn_2, Probability: 0.0015
Word: Ni_50Mn_29Ga_21, Probability: 0.0014
Word: Tl_2NaFeF_6, Probability: 0.0012
Word: Ni_1.25-xZn_xPb_0.25Fe_1.5O_4, Probability: 0.0012
Word: Li_3V_2, Probability: 0.0012
Word: SrSn_0.97-xFe_xSb_0.03O_3-, 

In [30]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The material is:"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetocrystalline anisotropy.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 200
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)
"""
# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")
"""
# Filter tokens with IDs less than 32000
mask = top_token_ids >= 32000
filtered_top_token_ids = top_token_ids[mask]
filtered_top_probabilities = top_probabilities[mask]

# Convert probabilities to a human-readable format (e.g., Python list)
filtered_top_probabilities = filtered_top_probabilities.squeeze().tolist()
filtered_top_token_ids = filtered_top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(filtered_top_token_ids, filtered_top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low magnetocrystalline anisotropy.
 The material is:
Word: BiFeO_3, Probability: 0.0149
Word: NiFe_2O_4, Probability: 0.0040
Word: Fe_3O_4, Probability: 0.0031
Word: Gd_0.67Sr_0.33MnO_3, Probability: 0.0031
Word: Mn_2V_3O_12, Probability: 0.0029
Word: Ti_0.94Co_0.03La_0.03O_2, Probability: 0.0027
Word: CaMn_3V_4O_12, Probability: 0.0021
Word: Fe_3Ni, Probability: 0.0018
Word: Fe_xZn_2-xMo_3O_8, Probability: 0.0018
Word: Fe_0.8Ga_0.2, Probability: 0.0018
Word: Ni_3O_3, Probability: 0.0018
Word: Cu_0.5Fe_0.5Cr_2S_4, Probability: 0.0017
Word: ZnFe_2O_4, Probability: 0.0016
Word: La_0.3R_0.2Sr_0.5Ti_0.5Fe_0.5O_3, Probability: 0.0016
Word: Mn_2O_3, Probability: 0.0016
Word: Co_2TiN, Probability: 0.0015
Word: SrSm_2Fe_2O_7, Probability: 0.0014
Word: BaFe_12O_19, Probability: 0.0013
Word: BaAl_xCr_yFe_11O_19, Probability: 0.0011
Word: SrSn_0.97-xFe_xSb_0.03O_3-, Probability: 0.0010
Word: Bi_0.5La_0.5MnO_3, Probability: 0.0010
Word: BiFe_1, Probab

In [32]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The material is:"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low density of states at the Fermi level.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 200
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)
"""
# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")
"""
# Filter tokens with IDs less than 32000
mask = top_token_ids >= 32000
filtered_top_token_ids = top_token_ids[mask]
filtered_top_probabilities = top_probabilities[mask]

# Convert probabilities to a human-readable format (e.g., Python list)
filtered_top_probabilities = filtered_top_probabilities.squeeze().tolist()
filtered_top_token_ids = filtered_top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(filtered_top_token_ids, filtered_top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low density of states at the Fermi level.
 The material is:
Word: BiFeO_3, Probability: 0.0074
Word: MgMn_xCr_2-xO_4, Probability: 0.0062
Word: Mn_2V_3O_12, Probability: 0.0058
Word: Fe_3O_4, Probability: 0.0053
Word: NiFe_2O_4, Probability: 0.0033
Word: Fe_3Ni, Probability: 0.0030
Word: Ni_3O_3, Probability: 0.0027
Word: Gd_0.67Sr_0.33MnO_3, Probability: 0.0025
Word: CaMn_3V_4O_12, Probability: 0.0022
Word: SrSn_0.97-xFe_xSb_0.03O_3-, Probability: 0.0021
Word: ZnFe_2O_4, Probability: 0.0019
Word: Tl_2NaFeF_6, Probability: 0.0018
Word: Co_2TiN, Probability: 0.0017
Word: CuNMn_3, Probability: 0.0016
Word: Ga_1-xSn_xCMn_3, Probability: 0.0015
Word: NiRh_2S_4, Probability: 0.0015
Word: La_0.3R_0.2Sr_0.5Ti_0.5Fe_0.5O_3, Probability: 0.0014
Word: La_0.67Sr_0.16Ca_0.17MnO_3, Probability: 0.0013
Word: BaFe_12O_19, Probability: 0.0013
Word: Co_21Mo_2B_6, Probability: 0.0012
Word: Mn_4N, Probability: 0.0012
Word: Fe_0.8Ga_0.2, Probability: 0.0012
W

In [35]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The material is:"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low conductivity.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 200
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)
"""
# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")
"""
# Filter tokens with IDs less than 32000
mask = top_token_ids >= 32000
filtered_top_token_ids = top_token_ids[mask]
filtered_top_probabilities = top_probabilities[mask]

# Convert probabilities to a human-readable format (e.g., Python list)
filtered_top_probabilities = filtered_top_probabilities.squeeze().tolist()
filtered_top_token_ids = filtered_top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(filtered_top_token_ids, filtered_top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low conductivity.
 The material is:
Word: BiFeO_3, Probability: 0.0080
Word: Fe_3O_4, Probability: 0.0077
Word: Ti_0.94Co_0.03La_0.03O_2, Probability: 0.0026
Word: SrSm_2Fe_2O_7, Probability: 0.0020
Word: Mn_2V_3O_12, Probability: 0.0020
Word: BaAl_xCr_yFe_11O_19, Probability: 0.0018
Word: Cu_0.5Fe_0.5Cr_2S_4, Probability: 0.0018
Word: NiFe_2O_4, Probability: 0.0018
Word: Bi_0.5La_0.5MnO_3, Probability: 0.0016
Word: Fe_0.8Ga_0.2, Probability: 0.0015
Word: Gd_0.67Sr_0.33MnO_3, Probability: 0.0015
Word: CaMn_3V_4O_12, Probability: 0.0011
Word: SrSn_0.97-xFe_xSb_0.03O_3-, Probability: 0.0011
Word: Ni_3O_3, Probability: 0.0010
Word: Na_2NiSi_4O_10, Probability: 0.0010
Word: V_2O_5, Probability: 0.0010
Word: Co_21Mo_2B_6, Probability: 0.0010
Word: Li_0.46Zn_0.04Fe_2.5O_4, Probability: 0.0009
Word: C_5R_5FeC_6R_6, Probability: 0.0009
Word: Fe_3Ni, Probability: 0.0008
Word: Fe_3W_3C, Probability: 0.0008
Word: Fe_xZn_2-xMo_3O_8, Probability: 0.000

In [44]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    #instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The material is:"
    prompt = (" ").join([input, output])
    return prompt

input_prompt = generate_prompt('high band gap.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 100
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

# Filter tokens with IDs less than 32000
mask = top_token_ids >= 32000
filtered_top_token_ids = top_token_ids[mask]
filtered_top_probabilities = top_probabilities[mask]

# Convert probabilities to a human-readable format (e.g., Python list)
filtered_top_probabilities = filtered_top_probabilities.squeeze().tolist()
filtered_top_token_ids = filtered_top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(filtered_top_token_ids, filtered_top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Magnetic materials with high band gap.
 The material is:
Word: BiFeO_3, Probability: 0.0214
Word: NiFe_2O_4, Probability: 0.0130
Word: Fe_3O_4, Probability: 0.0122
Word: CaMn_3V_4O_12, Probability: 0.0095
Word: MgMn_xCr_2-xO_4, Probability: 0.0081
Word: CoFe_2, Probability: 0.0074
Word: Ca_xFe_3-xO_4, Probability: 0.0058
Word: Fe_3Ni, Probability: 0.0042
Word: CoFe_2O_4, Probability: 0.0038
Word: Li_0.46Zn_0.04Fe_2.5O_4, Probability: 0.0037
Word: BaFe_12O_19, Probability: 0.0037
Word: Co_2TiN, Probability: 0.0036
Word: Ni_0.5Zn_0.5Fe_2O_4, Probability: 0.0036
Word: CsCoCl_3, Probability: 0.0034
Word: Cu_0.5Fe_0.5Cr_2S_4, Probability: 0.0032
Word: ZnFe_2O_4, Probability: 0.0026
Word: Fe_0.8Ga_0.2, Probability: 0.0026
Word: Ge_0.99Mn_0.01, Probability: 0.0026
Word: La_0.3R_0.2Sr_0.5Ti_0.5Fe_0.5O_3, Probability: 0.0026
Word: Fe_2B, Probability: 0.0026
Word: Ni_xZn_2-xGeO_4, Probability: 0.0026
Word: LiNi_0.65-xCo_0.1Mn_0.25Cr_xO_2, Probability: 0.0025
Word: Mn_3GaC, Probability: 0.0023
Wo

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained('results/checkpoint-8400')
model.resize_token_embeddings(len(tokenizer))

model.load_adapter('results/checkpoint-8400')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
original_tokenizer = AutoTokenizer.from_pretrained(model_name)
original_embeddings = model.get_input_embeddings().weight.detach().clone()

tokenizer = AutoTokenizer.from_pretrained('results/checkpoint-8400')
model.resize_token_embeddings(len(tokenizer))
model.load_adapter('results/checkpoint-8400')


embeddings = model.get_input_embeddings().weight.data
embeddings[:len(original_tokenizer)] = original_embeddings[:len(original_tokenizer)]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Magnetic materials with {prompt}\n"
    output = f"The molecular formula of the material:"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = model(input_tokens).logits

# 在计算softmax之前，为了数值稳定性，从logits中减去每个logit的最大值
logits_stable = logits - torch.max(logits, dim=-1, keepdim=True).values

probabilities = torch.softmax(logits_stable[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 10
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

In [9]:
len(probabilities[0])

37553

In [23]:
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901]], device='cuda:0')

In [21]:
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901, 37551]],
       device='cuda:0')

In [67]:
model.save_pretrained('result', save_embedding_layers=True)

In [68]:
tokenizer.save_pretrained('result')

('result/tokenizer_config.json',
 'result/special_tokens_map.json',
 'result/tokenizer.model',
 'result/added_tokens.json',
 'result/tokenizer.json')

In [34]:
tokenizer.save_pretrained('results')

('results/tokenizer_config.json',
 'results/special_tokens_map.json',
 'results/tokenizer.model',
 'results/added_tokens.json',
 'results/tokenizer.json')

In [None]:
for param in model.parameters():
    print(param.dtype)

In [22]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(37553, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=256, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=256, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=40

In [15]:
for name, param in model.named_parameters():
    print(param.requires_grad, name, param.dtype)

False base_model.model.model.embed_tokens.base_layer.weight torch.float16
True base_model.model.model.embed_tokens.lora_embedding_A.default torch.float16
True base_model.model.model.embed_tokens.lora_embedding_B.default torch.float16
False base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.l

In [14]:
# Verify which parameters are trainable
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable: {name}")

Trainable: base_model.model.model.embed_tokens.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.s

In [None]:
for name, param in model.named_parameters():
    print(f"Trainable: {name}", param.requires_grad)

In [14]:
model.parameters()

<generator object Module.parameters at 0x7faf7c779ee0>

In [15]:
model.get_input_embeddings()

Embedding(38544, 4096)

In [18]:
model.num_parameters()

6922694656

In [19]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(38545, 4096)