In [11]:
# Instruction text for the materials-science NER task. It is stored verbatim as
# the "instruction" field of every record built by convert_to_specified_format
# in the cell below.
# NOTE(review): the visible code never passes this string through str.format,
# so the doubled braces "{{" / "}}" near the end reach the records literally —
# confirm whether single braces were intended.
# NOTE(review): the prompt asks for a JSON object with "entities" and "reasons",
# while the converter below emits only the str() of a list of (label, text)
# tuples as "output" — confirm this mismatch is intended.
# NOTE(review): "tobe" in the step-by-step sentence looks like a typo for
# "to be"; left untouched because the text is runtime data.
PROMPT='''You are an expert with a deep background in the field of material informatics, \
focusing on extracting material entities and related information from textual data.
You are tasked with performing a Named Entity Recognition (NER) operation on the provided text.
Your goal is to identify and extract entities according to the specific categories related to the study of materials science.
Let's work this out in a step by step way tobe sure we have the right answer.

All entity types including their detailed definition are listed as follow.
### Definition:
## Entity:
- MAT: Any inorganic solid or alloy, any non-gaseous element (at RT), e.g., "BaTiO3", "titania", "Fe".
- SPL: Names for crystal structures/phases, e.g., "tetragonal", "fcc", "rutile","perovskite"; or, any symmetry label such as "Pbnm", or "Pnma".
- DSC: Special descriptions of the type/shape of the sample. Examples include "single crystal", "nanotube", "quantum dot".
- PRO: Anything measurable that can have a unit and a value, e.g., "conductivity", “band gap”; or, any qualitative property or phenomenon exhibited by a material, e.g., "ferroelectric", "metallic".
- APL: Any high-level application such as "photovoltaics", or any specific device such as “field-effect transistor”.
- SMT: Any technique for synthesising a material, e.g., "pulsed laser deposition", "solid state reaction", or any other step in sample production such as "annealing" or "etching".
- CMT: Any method used to characterize a material, experiment or theory: e.g., "photoluminescence", "XRD", "tight binding", "DFT". It can also be a name for an equation or model, such "Bethe-Salpeter equation".

### Notice:
1. Each line of the output must be a valid JSON string.
2. If no entities are detected, the output should state '[]'.
3. Some extracted terms may not be classified into 'MAT', 'SPL', 'DSC', 'PRO', 'SMT', 'CMT',or 'APL'. \
In this case, discard these words and focus on the specified entities.

### Output format
## Entity Representation:
- Respond in the format of [('entity type', 'entity name'),...].
- The extracted entity name must be the same as in the original text.

## Your output should be a single JSON object in the following format:
{{
  "entities": "[('entity type', 'entity name')]",
  "reasons":"Give your reasons."
}}'''

In [24]:
import json

# Paths used by this cell: the text file that is read as input and the file
# that convert_to_specified_format writes its records to.
input_file = 'llm_微调/Metal/devel.txt'
output_file = 'llm_微调/Metal/devel.json'

def _parse_bio_paragraph(paragraph):
    """Split one BIO-tagged paragraph into its tokens and (label, text) entity pairs."""
    tokens = []
    entities = []
    span_words = []
    span_label = None

    def close_span():
        # Emit the entity collected so far (if any) and reset the span state.
        nonlocal span_words, span_label
        if span_words:
            entities.append((span_label, " ".join(span_words)))
        span_words = []
        span_label = None

    for line in paragraph.strip().split('\n'):
        parts = line.split()
        # Blank or malformed lines (anything but "<token> <tag>") are ignored.
        if len(parts) != 2:
            continue

        word, label = parts
        # Every well-formed token goes into the sentence exactly once.
        tokens.append(word)

        if label.startswith('B-'):
            close_span()
            span_words = [word]
            span_label = label[2:]
        elif label.startswith('I-') and span_words:
            # An I- tag extends the open span; its type suffix is not compared
            # with the label taken from the opening B- tag.
            span_words.append(word)
        else:
            # 'O', an I- tag with no open span, or an unrecognised tag: the
            # token is outside any entity, so just close whatever is open.
            close_span()

    close_span()
    return tokens, entities


def convert_to_specified_format(text, instruction=None, output_path=None):
    """Convert BIO-tagged text into instruction-tuning records and save them.

    ``text`` is split into paragraphs on blank lines; inside a paragraph each
    line is expected to hold two whitespace-separated fields, a token and its
    tag (``O``, ``B-<label>`` or ``I-<label>``). For every paragraph that
    contains at least one token a record is produced with the keys
    ``"instruction"``, ``"input"`` (``"text:"`` followed by the space-joined
    tokens) and ``"output"`` (the ``str()`` of a list of ``(label, text)``
    tuples). The records are written to ``output_path`` as one JSON object per
    line.

    Args:
        text: The raw BIO-tagged text.
        instruction: Value of the ``"instruction"`` field. Defaults to the
            module-level ``PROMPT``.
        output_path: File the records are written to. Defaults to the
            module-level ``output_file``.

    Returns:
        The list of record dicts that was written.
    """
    # Fall back to the notebook-level globals only when no override is given,
    # so existing single-argument calls behave as before.
    if instruction is None:
        instruction = PROMPT
    if output_path is None:
        output_path = output_file

    results = []
    for paragraph in text.strip().split('\n\n'):
        tokens, entities = _parse_bio_paragraph(paragraph)
        # Skip paragraphs without tokens (empty input, runs of blank lines)
        # instead of emitting a record with an empty sentence.
        if not tokens:
            continue

        input_text = " ".join(tokens)
        results.append({
            "instruction": instruction,
            "input": f"text:{input_text}",
            # The tuple list is stored as its Python string representation.
            "output": str(entities),
        })

    with open(output_path, "w", encoding="utf-8") as file:
        for message in results:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")

    return results

# Read the whole input file into memory as a single string.
with open(input_file, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Convert the format; the function itself writes the records to output_file.
convert_to_specified_format(input_text)

# Report where the converted records were saved.
print(f"转换结果已保存到 {output_file}")

转换结果已保存到 llm_微调/Metal/devel.json


## MPNs数据集转换

In [25]:
import pandas as pd

# Load the MPNs training file: every line is parsed as one JSON value and the
# results are collected in a list. `json` is not imported in this cell; it is
# imported by the conversion cell above.
with open("llm_微调/MPNs/train.json", 'r', encoding='utf-8') as f:
    input_data = [json.loads(line) for line in f]

In [None]:
# Preview only the first few records rather than dumping the whole list into
# the cell output.
input_data[:3]

In [35]:
import json

# Conversion function for the span-annotated examples loaded above.
# Note: this rebinds the name convert_to_specified_format, which an earlier
# cell uses for the BIO-text converter.
def convert_to_specified_format(examples):
    """Build one instruction-tuning record per span-annotated example.

    For each example the first entry of ``example['sentences']`` is taken as
    the token list. ``example['ner']`` (empty when missing) is iterated as
    groups of ``(start, end, label)`` triples; each triple yields
    ``(label, text)`` where ``text`` is the space-joined tokens from ``start``
    through ``end`` inclusive.

    NOTE(review): spans from every group in ``'ner'`` are sliced out of the
    first sentence only — confirm each example holds a single sentence.

    Args:
        examples: Iterable of dicts with a ``'sentences'`` key and an optional
            ``'ner'`` key.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"``. ``"output"`` is a JSON string, so the pairs appear as
        two-element arrays.
    """
    def build_record(example):
        # Token source: first sentence of the example.
        tokens = example['sentences'][0]

        # Flatten the grouped span triples into (label, text) pairs.
        pairs = [
            (label, ' '.join(tokens[first:last + 1]))
            for group in example.get('ner', [])
            for first, last, label in group
        ]

        joined_tokens = ' '.join(tokens)
        return {
            "instruction": "根据文本数据执行命名实体识别任务。",
            "input": f"text:'{joined_tokens}'",
            "output": json.dumps(pairs, ensure_ascii=False),
        }

    return [build_record(example) for example in examples]

# Run the conversion on the loaded MPNs examples.
converted_data = convert_to_specified_format(input_data)

# Write the converted records to a JSON file as a single indented array.
mpns_output_file = 'llm_微调/MPNs/converted_train.json'
with open(mpns_output_file, 'w', encoding='utf-8') as json_file:
    json.dump(converted_data, json_file, ensure_ascii=False, indent=4)

# Report the path that was actually written; the previous message named
# "converted_output.json", which is not the file created above.
print(f"数据已成功保存到 {mpns_output_file}")

数据已成功保存到 converted_output.json
