In [8]:
import json

def is_valid_data(data):
    if "entities" not in data or "relations" not in data:
        return False
    for entity in data["entities"]:
        if "id" not in entity or "label" not in entity or "start_offset" not in entity or "end_offset" not in entity:
            return False
    for relation in data["relations"]:
        if "id" not in relation or "from_id" not in relation or "to_id" not in relation or "type" not in relation:
            return False
    return True

def process_line(line):
    line = line.replace("```json", "").replace("```", "").strip()
    return line

def extract_data(input_path, output_path, error_log_path):
    valid_data_count = 0
    line_count = 0
    buffer = ""
    inside_object = False
    inside_array = False
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(error_log_path, 'w', encoding='utf-8') as error_log:
        for line in infile:
            line_count += 1
            line = process_line(line)

            if not line:
                continue  # 跳过空行

            if line.startswith("[") and not inside_array:
                inside_array = True
                buffer = line
            elif line.startswith("]") and inside_array:
                inside_array = False
                buffer += line
                try:
                    data = json.loads(buffer)
                    if isinstance(data, list):
                        for item in data:
                            if is_valid_data(item):
                                outfile.write(json.dumps(item, ensure_ascii=False) + '\n')
                                valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(buffer)
                    error_log.write(f"Line {line_count}: {buffer}\n")
                buffer = ""
            elif inside_array:
                buffer += line
            elif line.startswith("{") and not inside_object:
                inside_object = True
                buffer = line
            elif line.startswith("}") and inside_object:
                inside_object = False
                buffer += line
                try:
                    data = json.loads(buffer)
                    if is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(buffer)
                    error_log.write(f"Line {line_count}: {buffer}\n")
                buffer = ""
            elif inside_object:
                buffer += line
            else:
                try:
                    data = json.loads(line)
                    if is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(line)
                    error_log.write(f"Line {line_count}: {line}\n")

    print(f"Total valid data count: {valid_data_count}")
    print(f"Total errors: {len(errors)}")
    return valid_data_count

# 定义文件路径
input_path = 'answer.txt'
output_path = 'test2.json'
error_log_path = 'error_log2.txt'

# 运行提取数据的函数
valid_data_count = extract_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")


Total valid data count: 280
Total errors: 3681
Total valid data count: 280


In [3]:
import json

def is_valid_data(data):
    if "entities" not in data:
        return False
    for entity in data["entities"]:
        if "id" not in entity or "label" not in entity or "start_offset" not in entity or "end_offset" not in entity:
            return False
    return True

def process_line(line):
    return line.strip()

def extract_data(input_path, output_path, error_log_path):
    valid_data_count = 0
    line_count = 0
    buffer = ""
    inside_object = False
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(
            error_log_path, 'w', encoding='utf-8') as error_log:
        for line in infile:
            line_count += 1
            line = process_line(line)

            if not line:
                continue  # 跳过空行

            if line.startswith("{") and not inside_object:
                inside_object = True
                buffer += line
            elif line.startswith("}") and inside_object:
                buffer += line
                inside_object = False
                try:
                    data = json.loads(buffer)
                    if isinstance(data, dict) and is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(buffer)
                    error_log.write(f"Line {line_count}: {buffer}\n")
                buffer = ""
            elif inside_object:
                buffer += line
            else:
                try:
                    data = json.loads(line)
                    if isinstance(data, dict) and is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(line)
                    error_log.write(f"Line {line_count}: {line}\n")

    print(f"Total valid data count: {valid_data_count}")
    print(f"Total errors: {len(errors)}")
    return valid_data_count

# 定义文件路径
input_path = 'NER.txt'
output_path = 'output_NER.json'
error_log_path = 'log.txt'

# 运行提取数据的函数
valid_data_count = extract_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")


Total valid data count: 1
Total errors: 3859
Total valid data count: 1


In [5]:
import re

# 读取文件内容
file_path = 'NER.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# 使用正则表达式匹配所有的JSON对象
pattern = re.compile(r'\{.*?\}(?=\n|$)', re.DOTALL)
matches = pattern.findall(content)

# 将匹配到的JSON对象逐行写入新文件
output_file_path = 'extracted_NER.json'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for match in matches:
        output_file.write(match.strip() + '\n')

print(f'Extracted {len(matches)} JSON objects to {output_file_path}')


Extracted 1230 JSON objects to extracted_NER.json


In [6]:
import re
import json

# 读取文件内容
file_path = 'NER.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# 使用正则表达式匹配所有的JSON对象，尝试修正未封闭的JSON对象
pattern = re.compile(r'\{.*?\}(?=\n|$)', re.DOTALL)
matches = pattern.findall(content)

# 修正匹配到的JSON对象
corrected_matches = []
for match in matches:
    try:
        json_obj = json.loads(match)  # 检查JSON是否有效
        corrected_matches.append(match.strip())
    except json.JSONDecodeError:
        # 尝试修复JSON对象
        if match.count('{') != match.count('}'):
            match += '}' * (match.count('{') - match.count('}'))
        try:
            json.loads(match)  # 再次检查修复后的JSON
            corrected_matches.append(match.strip())
        except json.JSONDecodeError:
            print(f"Failed to correct JSON object: {match}")

# 将修正后的JSON对象逐行写入新文件
output_file_path = 'extracted_NER.json'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for match in corrected_matches:
        output_file.write(match + '\n')

print(f'Extracted {len(corrected_matches)} JSON objects to {output_file_path}')


Failed to correct JSON object: {"id": 1, "text": "第 3 章 个旧锡铜多金属矿床深部成矿地质背景",
"entities": [
      {"id": 1, "label": "矿床知识本体", "start_offset": 0, "end_offset": 23, "text": "第 3 章 个旧锡铜多金属矿床深部成矿地质背景"},
      {"id": 2, "label": "矿床", "start_offset": 6, "end_offset": 15, "text": "个旧锡铜多金属矿床"}}
Failed to correct JSON object: {
  "id": 1,
  "text": "云南个旧锡-铜多金属矿床，位于昆明以南约300km 处，是世界上最大的锡多金属矿床之一(图3-1)。",
  "entities": [
    {"id": 1, "label": "矿床", "start_offset": 2, "end_offset": 12, "text": "个旧锡-铜多金属矿床"},
    {"id": 2, "label": "地理位置、方位、方向等", "start_offset": 15, "end_offset": 20, "text": "昆明"},
    {"id": 3, "label": "地理位置、方位、方向等", "start_offset": 22, "end_offset": 28, "text": "以南约300km"},
    {"id": 4, "label": "矿床", "start_offset": 38, "end_offset": 44, "text": "锡多金属矿床"},
    {"id": 5, "label": "图表", "start_offset": 50, "end_offset": 56, "text": "图3-1"}}
Failed to correct JSON object: {"id": 1, "text": "矿田面积约1500km², 从北至南分布五座大型锡-铜-铅-锌多金属矿床，分别是马拉格、松树脚、高松、老厂和卡房矿床，包含平均品位为1%的锡矿石 300Mt, 平均品位为2%的铜矿石

In [10]:
import json


def is_valid_data(data):
    if "entities" not in data :
        return False
    for entity in data["entities"]:
        if "id" not in entity or "label" not in entity or "start_offset" not in entity or "end_offset" not in entity:
            return False
    return True


def process_line(line):
    line = line.replace("```json", "").replace("```", "").strip()
    return line


def extract_data(input_path, output_path, error_log_path):
    valid_data_count = 0
    line_count = 0
    buffer = ""
    inside_object = False
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(
            error_log_path, 'w', encoding='utf-8') as error_log:
        for line in infile:
            line_count += 1
            line = process_line(line)

            if not line:
                continue  # 跳过空行

            if line.startswith("{") and not inside_object:
                inside_object = True
                buffer += line
            elif line.startswith("}") and inside_object:
                buffer += line
                inside_object = False
                try:
                    data = json.loads(buffer)
                    if isinstance(data, dict) and is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(buffer)
                    error_log.write(f"Line {line_count}: {buffer}\n")
                buffer = ""
            elif inside_object:
                buffer += line
            else:
                try:
                    data = json.loads(line)
                    if isinstance(data, dict) and is_valid_data(data):
                        outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                except json.JSONDecodeError as e:
                    errors.append(line)
                    error_log.write(f"Line {line_count}: {line}\n")

    print(f"Total valid data count: {valid_data_count}")
    print(f"Total errors: {len(errors)}")
    return valid_data_count

# 定义文件路径
input_path = 'NER.txt'
output_path = 'output_ner.json'
error_log_path = 'log_NER.txt'


# 运行提取数据的函数
valid_data_count = extract_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")

Total valid data count: 159
Total errors: 610
Total valid data count: 159


In [18]:
import json

def is_valid_data(data):
    if "entities" not in data or "text" not in data:
        return False
    for entity in data["entities"]:
        if "id" not in entity or "label" not in entity or "start_offset" not in entity or "end_offset" not in entity or "text" not in entity:
            return False
    return True

def extract_data(input_path, output_path, error_log_path):
    valid_data_count = 0
    line_count = 0
    buffer = ""
    inside_json_block = False
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(
            error_log_path, 'w', encoding='utf-8') as error_log:
        for line in infile:
            line_count += 1
            line = line.strip()

            # 检查是否进入或退出```json```标记
            if line.startswith("```json"):
                inside_json_block = True
                buffer = ""
                continue
            elif line.startswith("```") and inside_json_block:
                inside_json_block = False
                # 处理多个JSON对象
                objects = buffer.split('}{')
                for i in range(len(objects)):
                    if i > 0:
                        objects[i] = '{' + objects[i]
                    if i < len(objects) - 1:
                        objects[i] = objects[i] + '}'
                    try:
                        data = json.loads(objects[i])
                        if is_valid_data(data):
                            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                            valid_data_count += 1
                        else:
                            error_log.write(f"Invalid data at line {line_count}: {objects[i]}\n")
                    except json.JSONDecodeError as e:
                        error_log.write(f"JSON decode error at line {line_count}: {objects[i]}\n")
                buffer = ""
                continue

            if inside_json_block:
                buffer += line

    print(f"Total valid data count: {valid_data_count}")
    print(f"Total errors: {line_count - valid_data_count}")
    return valid_data_count

# 定义文件路径
input_path = 'NER.txt'
output_path = 'output_ner.json'
error_log_path = 'log_NER.txt'

# 运行提取数据的函数
valid_data_count = extract_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")

Total valid data count: 467
Total errors: 17396
Total valid data count: 467


In [5]:
import json
import re

def is_valid_data(data):
    if "entities" not in data or "text" not in data:
        return False
    for entity in data["entities"]:
        if "id" not in entity or "label" not in entity or "start_offset" not in entity or "end_offset" not in entity or "text" not in entity:
            return False
    return True

def clean_json_string(json_string):
    # 去除 JSON 对象内部的多余逗号
    json_string = re.sub(r',\s*}', '}', json_string)
    json_string = re.sub(r',\s*]', ']', json_string)
    return json_string

def extract_data(input_path, output_path, error_log_path):
    valid_data_count = 0
    line_count = 0
    buffer = ""
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(
            error_log_path, 'w', encoding='utf-8') as error_log:
        for line in infile:
            line_count += 1
            line = line.strip()
            if line:
                buffer += line

        # 将多个 JSON 对象分割开
        json_objects = buffer.replace('}{', '}\n{').split('\n')

        for json_str in json_objects:
            json_str = clean_json_string(json_str)
            try:
                data = json.loads(json_str)
                if is_valid_data(data):
                    outfile.write(json.dumps(data, ensure_ascii=False) + '\n')
                    valid_data_count += 1
                else:
                    error_log.write(f"Invalid data at line {line_count}: {json_str}\n")
            except json.JSONDecodeError as e:
                error_log.write(f"JSON decode error at line {line_count}: {json_str}\n")

    print(f"Total valid data count: {valid_data_count}")
    print(f"Total errors: {line_count - valid_data_count}")
    return valid_data_count

# 定义文件路径
input_path = 'NER.jsonl'
output_path = 'output_ner.json'
error_log_path = 'log_NER.txt'

# 运行提取数据的函数
valid_data_count = extract_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")


Total valid data count: 1233
Total errors: 15404
Total valid data count: 1233


In [20]:
import json

def merge_relations_with_entities(user_input, response):
    user_data = json.loads(user_input)

    # 尝试解析 response 为 JSON 对象或 JSON 数组
    try:
        relations_data = json.loads(response)
        if isinstance(relations_data, dict) and 'relations' in relations_data:
            user_data['relations'] = relations_data['relations']
        elif isinstance(relations_data, list):
            user_data['relations'] = relations_data
        else:
            user_data['relations'] = []
    except json.JSONDecodeError:
        user_data['relations'] = []

    return user_data

def process_data(input_path, output_path, error_log_path):
    line_count = 0
    valid_data_count = 0
    errors = []

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(
            error_log_path, 'w', encoding='utf-8') as error_log:
        lines = infile.readlines()
        user_input = ""
        response = ""
        processing_user_input = False

        for line in lines:
            line_count += 1
            line = line.strip()

            if line.startswith("User Input:"):
                processing_user_input = True
                user_input = line.replace("User Input: ", "")
            elif line.startswith("Response:"):
                processing_user_input = False
                response = line.replace("Response: ", "")

                # 先移除 ```json 和 ``` 标签
                if response.startswith("```json"):
                    response = response[7:].strip()  # Remove starting ```json
                if response.endswith("```"):
                    response = response[:-3].strip()  # Remove ending ```

                try:
                    merged_data = merge_relations_with_entities(user_input, response)
                    outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                    valid_data_count += 1
                except Exception as e:
                    error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")
                user_input = ""
                response = ""

    return valid_data_count

# 定义文件路径
input_path = 'RE.txt'
output_path = 'output_RE.jsonl'
error_log_path = 'log_RE.txt'

# 运行处理数据的函数
valid_data_count = process_data(input_path, output_path, error_log_path)
valid_data_count

994

In [30]:
import json
import re

def extract_relations(response):
    # 使用正则表达式提取 "relations" 部分
    match = re.search(r'"relations":\s*(\[[^\]]*\])', response, re.DOTALL)
    if match:
        return match.group(1)
    return "[]"

def merge_relations_with_entities(user_input, response):
    user_data = json.loads(user_input)

    try:
        relations_str = extract_relations(response)
        relations_data = json.loads(relations_str)
        user_data['relations'] = relations_data
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {str(e)} with response: {response}")
        user_data['relations'] = []
    except Exception as e:
        print(f"Error merging data: {str(e)}")
        user_data['relations'] = []

    return user_data

def process_data(input_path, output_path, error_log_path):
    line_count = 0
    valid_data_count = 0

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(error_log_path, 'w', encoding='utf-8') as error_log:
        lines = infile.readlines()
        user_input = ""
        response = ""
        processing_user_input = False

        for line in lines:
            line_count += 1
            line = line.strip()

            if line.startswith("User Input:"):
                if user_input and response:
                    try:
                        merged_data = merge_relations_with_entities(user_input, response)
                        outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                    except Exception as e:
                        error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")
                user_input = line.replace("User Input: ", "")
                response = ""
                processing_user_input = True
                error_log.write(f"Read User Input: {user_input}\n")  # Debug log
            elif line.startswith("Response:"):
                processing_user_input = False
                response = line.replace("Response: ", "")
                error_log.write(f"Read Response: {response}\n")  # Debug log
            else:
                if not processing_user_input and line:
                    response += " " + line

        # 处理文件结尾的最后一个记录
        if user_input and response:
            try:
                merged_data = merge_relations_with_entities(user_input, response)
                outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                valid_data_count += 1
            except Exception as e:
                error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")

    return valid_data_count

# 定义文件路径
input_path = 'RE.txt'
output_path = 'output_RE.jsonl'
error_log_path = 'log_RE.txt'

# 运行处理数据的函数
valid_data_count = process_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")


JSONDecodeError: Unterminated string starting at: line 1 column 594 (char 593) with response: "relations": [ {"id": 1, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 6374, "end_text": "BIMF分量", "type": "分为"}, {"id": 2, "from_id": 6374, "start_text": "BIMF分量", "to_id": 6357, "end_text": "BIMF₁", "type": "包含"}, {"id": 3, "from_id": 6357, "start_text": "BIMF₁", "to_id": 6358, "end_text": "BIMF₂", "type": "和"}, {"id": 4, "from_id": 6358, "start_text": "BIMF₂", "to_id": 6359, "end_text": "BIMF₃", "type": "和"}, {"id": 5, "from_id": 6374, "start_text": "BIMF分量", "to_id": 6359, "end_text": "BIMF₃", "type": "包含"}, {"id": 6, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 6361, "end_text": "[Res(m,n)]", "type": "和"}, {"id": 7, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 8651, "end_text": "3个", "type": "分为"}, {"id": 8, "from_id": 8651, "start_text": "3个", "to_id": 6360, "end_text": "频率依次降低", "type": "为"}, {"id": 9, "from_id": 6360, "start_text": "频率依次降低", "to_id": 6357, "end_t

In [4]:
import json
import re

def clean_json_string(json_string):
    """
    Clean JSON string to ensure it is a valid JSON array or object.
    This includes handling cases where the JSON string is not properly terminated.
    """
    json_string = json_string.strip()
    if not (json_string.startswith('{') or json_string.startswith('[')):
        json_string = '{' + json_string
    if not (json_string.endswith('}') or json_string.endswith(']')):
        json_string = json_string + '}'
    return json_string

def extract_relations(response):
    # 使用正则表达式提取 "relations" 部分，处理不完整的数组
    match = re.search(r'"relations":\s*(\[[^\]]*\])', response, re.DOTALL)
    if match:
        return match.group(1)
    return "[]"

def merge_relations_with_entities(user_input, response):
    user_data = json.loads(user_input)

    try:
        relations_str = extract_relations(response)
        relations_str = clean_json_string(relations_str)
        relations_data = json.loads(relations_str)
        user_data['relations'] = relations_data
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {str(e)} with response: {response}")
        user_data['relations'] = []
    except Exception as e:
        print(f"Error merging data: {str(e)}")
        user_data['relations'] = []

    return user_data

def process_data(input_path, output_path, error_log_path):
    line_count = 0
    valid_data_count = 0

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(error_log_path, 'w', encoding='utf-8') as error_log:
        lines = infile.readlines()
        user_input = ""
        response_lines = []
        processing_user_input = False

        for line in lines:
            line_count += 1
            line = line.strip()

            if line.startswith("User Input:"):
                if user_input and response_lines:
                    response = ' '.join(response_lines)
                    try:
                        merged_data = merge_relations_with_entities(user_input, response)
                        outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                    except Exception as e:
                        error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")
                user_input = line.replace("User Input: ", "")
                response_lines = []
                processing_user_input = True
                error_log.write(f"Read User Input: {user_input}\n")  # Debug log
            elif line.startswith("Response:"):
                processing_user_input = False
                response = line.replace("Response: ", "")
                response_lines.append(response)
                error_log.write(f"Read Response: {response}\n")  # Debug log
            else:
                if not processing_user_input and line:
                    response_lines.append(line)

        # 处理文件结尾的最后一个记录
        if user_input and response_lines:
            response = ' '.join(response_lines)
            try:
                merged_data = merge_relations_with_entities(user_input, response)
                outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                valid_data_count += 1
            except Exception as e:
                error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")

    return valid_data_count

# 定义文件路径
input_path = 'RE.jsonal'
output_path = 'output_RE.jsonl'
error_log_path = 'log_RE.txt'

# 运行处理数据的函数
valid_data_count = process_data(input_path, output_path, error_log_path)
print(f"Total valid data count: {valid_data_count}")

JSONDecodeError: Unterminated string starting at: line 1 column 594 (char 593) with response: "relations": [ {"id": 1, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 6374, "end_text": "BIMF分量", "type": "分为"}, {"id": 2, "from_id": 6374, "start_text": "BIMF分量", "to_id": 6357, "end_text": "BIMF₁", "type": "包含"}, {"id": 3, "from_id": 6357, "start_text": "BIMF₁", "to_id": 6358, "end_text": "BIMF₂", "type": "和"}, {"id": 4, "from_id": 6358, "start_text": "BIMF₂", "to_id": 6359, "end_text": "BIMF₃", "type": "和"}, {"id": 5, "from_id": 6374, "start_text": "BIMF分量", "to_id": 6359, "end_text": "BIMF₃", "type": "包含"}, {"id": 6, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 6361, "end_text": "[Res(m,n)]", "type": "和"}, {"id": 7, "from_id": 6373, "start_text": "研究区混合重力数据", "to_id": 8651, "end_text": "3个", "type": "分为"}, {"id": 8, "from_id": 8651, "start_text": "3个", "to_id": 6360, "end_text": "频率依次降低", "type": "为"}, {"id": 9, "from_id": 6360, "start_text": "频率依次降低", "to_id": 6357, "end_t

In [5]:
import json
import re

def clean_json_string(json_string):
    """
    Clean JSON string to ensure it is a valid JSON array.
    """
    json_string = json_string.strip()
    if not json_string.startswith('['):
        json_string = '[' + json_string
    if not json_string.endswith(']'):
        json_string = json_string + ']'
    return json_string

def extract_relations(response):
    """
    Extract the "relations" portion from the response.
    """
    match = re.search(r'"relations":\s*(\[.*)', response, re.DOTALL)
    if match:
        relations_str = match.group(1)
        # Try to find the matching closing bracket for the JSON array
        open_brackets = 0
        for i, char in enumerate(relations_str):
            if char == '[':
                open_brackets += 1
            elif char == ']':
                open_brackets -= 1
                if open_brackets == 0:
                    return relations_str[:i+1]
    return "[]"

def merge_relations_with_entities(user_input, response):
    """
    Merge the relations data with the user input JSON.
    """
    user_data = json.loads(user_input)

    try:
        relations_str = extract_relations(response)
        relations_str = clean_json_string(relations_str)
        relations_data = json.loads(relations_str)
        user_data['relations'] = relations_data
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {str(e)} with response: {response}")
        user_data['relations'] = []
    except Exception as e:
        print(f"Error merging data: {str(e)}")
        user_data['relations'] = []

    return user_data

def process_data(input_path, output_path, error_log_path):
    """
    Process the input file, extract the relations data, and merge with the user input data.
    """
    line_count = 0
    valid_data_count = 0

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile, open(error_log_path, 'w', encoding='utf-8') as error_log:
        lines = infile.readlines()
        user_input = ""
        response_lines = []
        processing_user_input = False

        for line in lines:
            line_count += 1
            line = line.strip()

            if line.startswith("User Input:"):
                if user_input and response_lines:
                    response = ' '.join(response_lines)
                    try:
                        merged_data = merge_relations_with_entities(user_input, response)
                        outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                        valid_data_count += 1
                    except Exception as e:
                        error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")
                user_input = line.replace("User Input: ", "")
                response_lines = []
                processing_user_input = True
                error_log.write(f"Read User Input: {user_input}\n")  # Debug log
            elif line.startswith("Response:"):
                processing_user_input = False
                response = line.replace("Response: ", "")
                response_lines.append(response)
                error_log.write(f"Read Response: {response}\n")  # Debug log
            else:
                if not processing_user_input and line:
                    response_lines.append(line)

        # 处理文件结尾的最后一个记录
        if user_input and response_lines:
            response = ' '.join(response_lines)
            try:
                merged_data = merge_relations_with_entities(user_input, response)
                outfile.write(json.dumps(merged_data, ensure_ascii=False) + '\n')
                valid_data_count += 1
            except Exception as e:
                error_log.write(f"Error processing data at line {line_count}: {str(e)}\n")

    return valid_data_count

# 定义文件路径
input_path = 'RE.jsonal'
output_path = 'output_RE.jsonl'
error_log_path = 'log_RE.txt'

# 运行处理数据的函数
valid_data_count = process_data(input_path, output_path, error_log_path)
valid_data_count


994