In [29]:
import re
import json

def parse_charlabels(charlabels_content):
    charlabels = {}
    lines = charlabels_content.strip().split("\n")
    char_pattern = re.compile(r"\[(\d+)\(\d+\)\]\s+'(.+?)'")
    for line in lines:
        match = char_pattern.match(line.strip().rstrip(','))
        if match:
            char_index = int(match.group(1))
            description = match.group(2)
            charlabels[char_index] = description
    return charlabels

def parse_statelabels(statelabels_content):
    statelabels = {}
    lines = statelabels_content.strip().split("\n")
    current_char = None
    states = []

    for line in lines:
        if re.match(r'^\d+', line):
            if current_char is not None:
                statelabels[current_char] = states
            parts = line.split(' ', 2)
            current_char = int(parts[0])
            states = parts[1].strip().strip(',').split("' '")
            states = [state.strip("'") for state in states]
        else:
            additional_states = line.strip().strip(',').split("' '")
            additional_states = [state.strip("'") for state in additional_states]
            states.extend(additional_states)

    if current_char is not None:
        statelabels[current_char] = states

    return statelabels

def combine_labels_and_states(charlabels, statelabels):
    character_info = {}
    for char_index, description in charlabels.items():
        states = statelabels.get(char_index, [])
        state_dict = {str(i + 1): state for i, state in enumerate(states)}
        character_info[str(char_index)] = {
            "description": description,
            "states": state_dict
        }
    return character_info

def extract_nexus_sections(nexus_content):
    charlabels_content = ""
    statelabels_content = ""
    lines = nexus_content.strip().split("\n")
    in_charlabels = False
    in_statelabels = False

    for line in lines:
        if "CHARLABELS" in line:
            in_charlabels = True
            continue
        if "STATELABELS" in line:
            in_statelabels = True
            continue
        if ";" in line:
            in_charlabels = False
            in_statelabels = False
        
        if in_charlabels:
            charlabels_content += line + "\n"
        if in_statelabels:
            statelabels_content += line + "\n"

    return charlabels_content, statelabels_content

def parse_nexus_file(file_path):
    with open(file_path, 'r') as file:
        nexus_content = file.read()

    charlabels_content, statelabels_content = extract_nexus_sections(nexus_content)
    
    # 解析CHARLABELS部分
    charlabels = parse_charlabels(charlabels_content)

    # 解析STATELABELS部分
    statelabels = parse_statelabels(statelabels_content)

    # 结合解析结果生成character_info字典
    character_info = combine_labels_and_states(charlabels, statelabels)
    
    return character_info

# 示例使用
file_path = "D:/桌面/taxonomy_primary_result/The_GPT-4_result/Dataset_3 (The Lycopodiales (Diphasiastrum, Huperzia, Isoetes, Lycopodium, Selaginella)) 4/Information gain methods/nexdata"
character_info = parse_nexus_file(file_path)

# 打印结果
print(json.dumps(character_info, indent=4))

# 保存结果到文件
output_path = "D:/桌面/character_info.json"
with open(output_path, "w") as f:
    json.dump(character_info, f, indent=4)


{
    "1": {
        "description": "stems <elongation>",
        "states": {
            "1": "elongated"
        }
    },
    "2": {
        "description": "stems <carriage>",
        "states": {
            "1": "suberect",
            "2": "creeping, and rooting from cha"
        }
    },
    "3": {
        "description": "stems <manner of branching>",
        "states": {
            "1": "overtly"
        }
    },
    "4": {
        "description": "stems <whether dorsiventral>",
        "states": {
            "1": "dorsiventrally"
        }
    },
    "5": {
        "description": "stems <whether with flattened",
        "states": {
            "1": "with",
            "2": "with strongly flattened branch"
        }
    },
    "6": {
        "description": "stems <presence of secondary t",
        "states": {
            "1": "with"
        }
    },
    "7": {
        "description": "the old leaf bases <in Isoetes",
        "states": {
            "1": "persistent"
        }
    

In [58]:
import re
import json

def parse_charlabels(charlabels_content):
    charlabels = {}
    lines = charlabels_content.strip().split("\n")
    char_pattern = re.compile(r"\[(\d+)\(\d+\)\]\s+'(.+?)'")
    for line in lines:
        match = char_pattern.match(line.strip().rstrip(','))
        if match:
            char_index = int(match.group(1))
            description = match.group(2)
            charlabels[char_index] = description
    return charlabels

def parse_statelabels(statelabels_content):
    statelabels = {}
    lines = statelabels_content.strip().split("\n")
    current_char = None
    states = []

    for line in lines:
        if re.match(r'^\d+', line):
            if current_char is not None:
                statelabels[current_char] = states
            parts = line.split(' ', 1)
            current_char = int(parts[0])
            states = parts[1].strip().strip(',').split("' '")
            states = [state.strip("'") for state in states]
        else:
            additional_states = line.strip().strip(',').split("' '")
            additional_states = [state.strip("'") for state in additional_states]
            states.extend(additional_states)

    if current_char is not None:
        statelabels[current_char] = states

    return statelabels

def combine_labels_and_states(charlabels, statelabels):
    character_info = {}
    for char_index, description in charlabels.items():
        states = statelabels.get(char_index, [])
        state_dict = {str(i + 1): state for i, state in enumerate(states)}
        character_info[str(char_index)] = {
            "description": description,
            "states": state_dict
        }
    return character_info

def extract_nexus_sections(nexus_content):
    charlabels_content = ""
    statelabels_content = ""
    lines = nexus_content.strip().split("\n")
    in_charlabels = False
    in_statelabels = False

    for line in lines:
        if "CHARLABELS" in line:
            in_charlabels = True
            continue
        if "STATELABELS" in line:
            in_statelabels = True
            continue
        if ";" in line:
            in_charlabels = False
            in_statelabels = False
        
        if in_charlabels:
            charlabels_content += line + "\n"
        if in_statelabels:
            statelabels_content += line + "\n"

    return charlabels_content, statelabels_content

def parse_nexus_file(file_path):
    with open(file_path, 'r') as file:
        nexus_content = file.read()

    charlabels_content, statelabels_content = extract_nexus_sections(nexus_content)
    
    # 解析CHARLABELS部分
    charlabels = parse_charlabels(charlabels_content)

    # 解析STATELABELS部分
    statelabels = parse_statelabels(statelabels_content)

    # 结合解析结果生成character_info字典
    character_info = combine_labels_and_states(charlabels, statelabels)
    
    return character_info

# 示例使用
file_path = "D:/桌面/taxonomy_primary_result/The_GPT-4_result/Dataset_3 (The Lycopodiales (Diphasiastrum, Huperzia, Isoetes, Lycopodium, Selaginella)) 4/Information gain methods/nexdata"
character_info = parse_nexus_file(file_path)
print(json.dumps(character_info, indent=4))



{
    "1": {
        "description": "stems <elongation>",
        "states": {
            "1": "elongated, with numerous small",
            "2": "short and tuberous, with sheat"
        }
    },
    "2": {
        "description": "stems <carriage>",
        "states": {
            "1": "suberect, and rooting at the b",
            "2": "creeping, and rooting directly",
            "3": "creeping, and rooting from cha"
        }
    },
    "3": {
        "description": "stems <manner of branching>",
        "states": {
            "1": "overtly dichotomising vegetati",
            "2": "ostensibly monopodial vegetati"
        }
    },
    "4": {
        "description": "stems <whether dorsiventral>",
        "states": {
            "1": "dorsiventrally organized, with",
            "2": "not dorsiventrally organized"
        }
    },
    "5": {
        "description": "stems <whether with flattened",
        "states": {
            "1": "with non-flattened branches",
            "2": "wit

In [48]:
import json

# 示例 character_info 字典，包含特征描述和状态
character_info = {
    "1": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "2": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "5": {
        "description": "the main stems <of the assimil",
        "states": {
            "1": "bright green",
            "2": "dull green"
        }
    },
    "11": {
        "description": "<foliage> leaves <appressed or",
        "states": {
            "1": "appressed",
            "2": "spreading"
        }
    },
    "12": {
        "description": "leaves <whether hair-pointed>",
        "states": {
            "1": "with long, filiform hair-like",
            "2": "not hair-pointed"
        }
    },
    "18": {
        "description": "the megaspores <of Isoetes, su",
        "states": {
            "1": "covered with short, blunt tube",
            "2": "covered with long, fragile spi",
            "3": "with a reticulate ornamentatio"
        }
    },
    "20": {
        "description": "the primary branches <carriage",
        "states": {
            "1": "ascending",
            "2": "spreading",
            "3": "drooping"
        }
    },
    "22": {
        "description": "the first <primary> branch int",
        "states": {
            "1": "much shorter than the subtendi",
            "2": "at least as long as the subten"
        }
    }
}

# 示例分类检索表
classification_key = {
    "Character 1": {
        "State 1": {
            "Character 2": {
                "State 1": "Huperzia selago",
                "State 2": {
                    "Character 20": {
                        "State 1": "Lycopodiella inundata",
                        "State 2 and 3": "Lycopodium annotinum",
                        "State 1 and 2 and 3": {
                            "Character 12": {
                                "State 1": "Lycopodium clavatum",
                                "State 2": {
                                    "Character 5": {
                                        "State 2": "Diphasiastrum complanatum",
                                        "State 1": "Diphasiastrum alpinum"
                                    }
                                }
                            }
                        }
                    }
                },
                "State 3": {
                    "Character 11": {
                        "State 1 and 2": "Selaginella kraussiana",
                        "State 1": "Selaginella selaginoides"
                    }
                }
            }
        },
        "State 2": {
            "Character 18": {
                "State 1": "Isoetes lacustris",
                "State 2": "Isoetes echinospora",
                "State 3": "Isoetes histrix"
            }
        }
    }
}

def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                state_descriptions.append(" and ".join(filter(None, descriptions)))
            state_key = f"State {' and '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# 处理分类检索表
updated_classification_key = replace_indices_with_descriptions_in_key(classification_key, character_info)

# 打印更新后的分类检索表
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4))


Updated Classification Key:
{
    "Character 1: the shoots <dimorphism>": {
        "State 1: conspicuously dimorphic: the c": {
            "Character 2: the shoots <dimorphism>": {
                "State 1: conspicuously dimorphic: the c": "Huperzia selago",
                "State 2: distinguishable as fertile and": {
                    "Character 20: the primary branches <carriage": {
                        "State 1: ascending": "Lycopodiella inundata",
                        "State 2 and and and 3: spreading /  / drooping": "Lycopodium annotinum",
                        "State 1 and and and 2 and and and 3: ascending /  / spreading /  / drooping": {
                            "Character 12: leaves <whether hair-pointed>": {
                                "State 1: with long, filiform hair-like": "Lycopodium clavatum",
                                "State 2: not hair-pointed": {
                                    "Character 5: the main stems <of the assimil": {
            

In [32]:
print(classification_key)

{'Character 1': {'State 1': {'Character 2': {'State 1': 'Huperzia selago', 'State 2': {'Character 20': {'State 1': 'Lycopodiella inundata', 'State 2 and 3': 'Lycopodium annotinum', 'State 1 and 2 and 3': {'Character 12': {'State 1': 'Lycopodium clavatum', 'State 2': {'Character 5': {'State 2': 'Diphasiastrum complanatum', 'State 1': 'Diphasiastrum alpinum'}}}}}}, 'State 3': {'Character 11': {'State 1 and 2': 'Selaginella kraussiana', 'State 1': 'Selaginella selaginoides'}}}}, 'State 2': {'Character 18': {'State 1': 'Isoetes lacustris', 'State 2': 'Isoetes echinospora', 'State 3': 'Isoetes histrix'}}}}


In [39]:
import json
# 示例分类检索表
classification_key = {
    "Character 1": {
        "State 1": {
            "Character 2": {
                "State 1": "Huperzia selago",
                "State 2": {
                    "Character 20": {
                        "State 1": "Lycopodiella inundata",
                        "State 2 and 3": "Lycopodium annotinum",
                        "State 1 and 2 and 3": {
                            "Character 12": {
                                "State 1": "Lycopodium clavatum",
                                "State 2": {
                                    "Character 5": {
                                        "State 2": "Diphasiastrum complanatum",
                                        "State 1": "Diphasiastrum alpinum"
                                    }
                                }
                            }
                        }
                    }
                },
                "State 3": {
                    "Character 11": {
                        "State 1 and 2": "Selaginella kraussiana",
                        "State 1": "Selaginella selaginoides"
                    }
                }
            }
        },
        "State 2": {
            "Character 18": {
                "State 1": "Isoetes lacustris",
                "State 2": "Isoetes echinospora",
                "State 3": "Isoetes histrix"
            }
        }
    }
}

def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                state_descriptions.append(" and ".join(filter(None, descriptions)))
            state_key = f"State {' '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# 处理分类检索表
updated_classification_key = replace_indices_with_descriptions_in_key(classification_key, character_info)

# 打印更新后的分类检索表
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4))


Updated Classification Key:
{
    "Character": "Character1",
    "States": {
        "1": {
            "Character": "Character2",
            "States": {
                "1": [
                    "Huperzia selago"
                ],
                "2": {
                    "Character": "Character5",
                    "States": {
                        "1": {
                            "Character": "Character20",
                            "States": {
                                "1": [
                                    "Lycopodiella inundata"
                                ],
                                "2 and 3": [
                                    "Lycopodium annotinum"
                                ],
                                "1 and 2 and 3": [
                                    "Lycopodium clavatum"
                                ]
                            }
                        },
                        "2": [
                            "Dip

In [50]:
import json

# 示例 character_info 字典，包含特征描述和状态
character_info = {
    "1": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "2": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "5": {
        "description": "the main stems <of the assimil",
        "states": {
            "1": "bright green",
            "2": "dull green"
        }
    },
    "11": {
        "description": "<foliage> leaves <appressed or",
        "states": {
            "1": "appressed",
            "2": "spreading"
        }
    },
    "12": {
        "description": "leaves <whether hair-pointed>",
        "states": {
            "1": "with long, filiform hair-like",
            "2": "not hair-pointed"
        }
    },
    "18": {
        "description": "the megaspores <of Isoetes, su",
        "states": {
            "1": "covered with short, blunt tube",
            "2": "covered with long, fragile spi",
            "3": "with a reticulate ornamentatio"
        }
    },
    "20": {
        "description": "the primary branches <carriage",
        "states": {
            "1": "ascending",
            "2": "spreading",
            "3": "drooping"
        }
    },
    "22": {
        "description": "the first <primary> branch int",
        "states": {
            "1": "much shorter than the subtendi",
            "2": "at least as long as the subten"
        }
    }
}

# 示例输入，实际为JSON字符串，需要先转换为字典
input_data = {
    '1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n                }\n            }\n        },\n        "3": {\n            "Character": "Character8",\n            "States": {\n                "1": {\n                    "Character": "Character4",\n                    "States": {\n                        "1": ["Selaginella kraussiana"],\n                        "2": ["Selaginella selaginoides"]\n                    }\n                }\n            }\n        }\n    }\n}',
    '2': '{\n    "Character": "Character20",\n    "States": {\n        "1": ["Isoetes histrix"],\n        "2 and 3": ["Isoetes lacustris"],\n        "1 and 2 and 3": ["Isoetes echinospora"]\n    }\n}'
}

# 将输入字符串转换为字典
classification_result = {key: json.loads(value) for key, value in input_data.items()}

# 递归函数将结构转换为所需格式
def convert_structure(node):
    if "Character" in node and "States" in node:
        character = node["Character"]
        states = node["States"]
        converted = {f"Character {character.replace('Character', '')}": {}}
        for state, sub_node in states.items():
            state_key = f"State {state}"
            if isinstance(sub_node, list):
                converted[f"Character {character.replace('Character', '')}"][state_key] = sub_node[0] if len(sub_node) == 1 else sub_node
            elif isinstance(sub_node, dict):
                converted[f"Character {character.replace('Character', '')}"][state_key] = convert_structure(sub_node)
        return converted
    return node

# 处理分类检索表
converted_result = {}
for key, value in classification_result.items():
    converted_result[f"Character {key}"] = convert_structure(value)

# 递归函数替换特征和状态描述
def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                state_descriptions.append(" and ".join(filter(None, descriptions)))
            state_key = f"State {' '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# 替换特征和状态描述
updated_classification_key = replace_indices_with_descriptions_in_key(converted_result, character_info)

# 打印更新后的分类检索表
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4, ensure_ascii=False))


Updated Classification Key:
{
    "Character 1: the shoots <dimorphism>": {
        "Character 2: the shoots <dimorphism>": {
            "State 1: conspicuously dimorphic: the c": "Huperzia selago",
            "State 2: distinguishable as fertile and": {
                "Character 12: leaves <whether hair-pointed>": {
                    "State 1: with long, filiform hair-like": "Lycopodium clavatum",
                    "State 2: not hair-pointed": {
                        "Character 9": {
                            "State 1": {
                                "Character 5": {
                                    "State 2": "Diphasiastrum alpinum",
                                    "State 3": "Diphasiastrum complanatum"
                                }
                            },
                            "State 2": {
                                "Character 20": {
                                    "State 1": "Lycopodiella inundata",
                                    

In [56]:
import json

# 示例初始分类结果
classification_results = {
    '1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n                }\n            }\n        },\n        "3": {\n            "Character": "Character8",\n            "States": {\n                "1": {\n                    "Character": "Character4",\n                    "States": {\n                        "1": ["Selaginella kraussiana"],\n                        "2": ["Selaginella selaginoides"]\n                    }\n                }\n            }\n        }\n    }\n}',
    '2': '{\n    "Character": "Character20",\n    "States": {\n        "1": ["Isoetes histrix"],\n        "2 and 3": ["Isoetes lacustris"],\n        "1 and 2 and 3": ["Isoetes echinospora"]\n    }\n}'
}

# 示例初始分类信息
parsed_initial_classification = {
    'Character': 'Character1',
    'States': {
        '1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'],
        '2': ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris']
    }
}

# 示例字符信息
character_info = {
    '1': {'description': 'Initial Character', 'states': {'1': 'Group 1', '2': 'Group 2'}},
    '2': {'description': 'Leaf shape', 'states': {'1': 'Linear', '2': 'Ovate', '3': 'Elliptic'}},
    '12': {'description': 'Flower color', 'states': {'1': 'Red', '2': 'Yellow'}},
    '9': {'description': 'Height', 'states': {'1': '<10cm', '2': '>10cm'}},
    '5': {'description': 'Stem texture', 'states': {'2': 'Smooth', '3': 'Rough'}},
    '20': {'description': 'Habitat', 'states': {'1': 'Aquatic', '2': 'Terrestrial', '2 and 3': 'Mixed'}},
    '8': {'description': 'Root type', 'states': {'1': 'Fibrous', '2': 'Taproot'}},
    '4': {'description': 'Leaf margin', 'states': {'1': 'Entire', '2': 'Serrate'}}
}

# Parse the API response JSON strings
parsed_classification_results = {key: json.loads(value) for key, value in classification_results.items()}

# Function to combine the initial and secondary classification results
def combine_results(initial, secondary, state_key):
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            initial["States"][state_key] = list(set(initial_states + secondary))  # Merge two lists and remove duplicates
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            for key, value in secondary["States"].items():
                if key not in initial_states:
                    initial_states[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"Conflicting types for key {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"Unexpected type for initial states: {type(initial_states)}")

# Dynamically combine all secondary classification results
for state_key, secondary in parsed_classification_results.items():
    combine_results(parsed_initial_classification, secondary, state_key)

# Function to replace indices with descriptions
def replace_indices_with_descriptions(node, character_info, parent_char_index=None):
    if "Character" in node and "States" in node:
        character = node["Character"].replace('Character', '')
        states = node["States"]
        char_description = character_info[character]['description']
        updated_node = {f"Character {character}: {char_description}": {}}
        for state, sub_node in states.items():
            if parent_char_index:
                state_descriptions = [character_info[parent_char_index]['states'].get(s.strip(), '') for s in state.split('and')]
                state_key = f"State {state}: {' / '.join(state_descriptions)}"
            else:
                state_key = f"State {state}"
            if isinstance(sub_node, dict):
                updated_node[f"Character {character}: {char_description}"][state_key] = replace_indices_with_descriptions(sub_node, character_info, character)
            else:
                updated_node[f"Character {character}: {char_description}"][state_key] = sub_node
        return updated_node
    return node

# Replace indices with descriptions in the final result
final_result_with_descriptions = replace_indices_with_descriptions(parsed_initial_classification, character_info)

# Print the final result with descriptions
print("Final Result with Descriptions:")
print(json.dumps(final_result_with_descriptions, indent=4, ensure_ascii=False))


Final Result with Descriptions:
{
    "Character 1: Initial Character": {
        "State 1": {
            "Character 2: Leaf shape": {
                "State 1: Group 1": [
                    "Huperzia selago"
                ],
                "State 2: Group 2": {
                    "Character 12: Flower color": {
                        "State 1: Linear": [
                            "Lycopodium clavatum"
                        ],
                        "State 2: Ovate": {
                            "Character 9: Height": {
                                "State 1: Red": {
                                    "Character 5: Stem texture": {
                                        "State 2: >10cm": [
                                            "Diphasiastrum alpinum"
                                        ],
                                        "State 3: ": [
                                            "Diphasiastrum complanatum"
                                        ]
   

In [63]:
import json

# 示例初始分类结果
classification_results = {
    '1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n                }\n            }\n        },\n        "3": {\n            "Character": "Character8",\n            "States": {\n                "1": {\n                    "Character": "Character4",\n                    "States": {\n                        "1": ["Selaginella kraussiana"],\n                        "2": ["Selaginella selaginoides"]\n                    }\n                }\n            }\n        }\n    }\n}',
    '2': '{\n    "Character": "Character20",\n    "States": {\n        "1": ["Isoetes histrix"],\n        "2 and 3": ["Isoetes lacustris"],\n        "1 and 2 and 3": ["Isoetes echinospora"]\n    }\n}'
}

# 示例初始分类信息
parsed_initial_classification = {
    'Character': 'Character1',
    'States': {
        '1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'],
        '2': ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris']
    }
}

# Parse the API response JSON strings
parsed_classification_results = {key: json.loads(value) for key, value in classification_results.items()}

# Function to combine the initial and secondary classification results
def combine_results(initial, secondary, state_key):
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            initial["States"][state_key] = list(set(initial_states + secondary))  # Merge two lists and remove duplicates
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            for key, value in secondary["States"].items():
                if key not in initial_states:
                    initial_states[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"Conflicting types for key {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"Unexpected type for initial states: {type(initial_states)}")

# Dynamically combine all secondary classification results
for state_key, secondary in parsed_classification_results.items():
    combine_results(parsed_initial_classification, secondary, state_key)

# Function to replace indices with descriptions, including handling multiple state descriptions
def replace_indices_with_descriptions(node, character_info, parent_char_index=None):
    if "Character" in node and "States" in node:
        character = node["Character"].replace('Character', '')
        states = node["States"]
        char_description = character_info[character]['description']
        updated_node = {f"Character {character}: {char_description}": {}}
        for state, sub_node in states.items():
            if parent_char_index:
                state_parts = state.split('and')
                state_descriptions = [character_info[parent_char_index]['states'].get(s.strip(), '') for s in state_parts]
                state_key = f"State {state}: {' / '.join(state_descriptions)}"
            else:
                state_key = f"State {state}"
            if isinstance(sub_node, dict):
                updated_node[f"Character {character}: {char_description}"][state_key] = replace_indices_with_descriptions(sub_node, character_info, character)
            else:
                updated_node[f"Character {character}: {char_description}"][state_key] = sub_node
        return updated_node
    return node

# Replace indices with descriptions in the final result
final_result_with_descriptions = replace_indices_with_descriptions(parsed_initial_classification, character_info)

# Print the final result with descriptions
print("Final Result with Descriptions:")
print(json.dumps(final_result_with_descriptions, indent=4, ensure_ascii=False))


Final Result with Descriptions:
{
    "Character 1: stems <elongation>": {
        "State 1": {
            "Character 2: stems <carriage>": {
                "State 1: elongated, with numerous small": [
                    "Huperzia selago"
                ],
                "State 2: short and tuberous, with sheat": {
                    "Character 12: leaves <whether hair-pointed>": {
                        "State 1: suberect, and rooting at the b": [
                            "Lycopodium clavatum"
                        ],
                        "State 2: creeping, and rooting directly": {
                            "Character 9: leaves <arrangement>": {
                                "State 1: with long, filiform hair-like": {
                                    "Character 5: stems <whether with flattened": {
                                        "State 2: not 4-ranked": [
                                            "Diphasiastrum alpinum"
                                

In [68]:
import json

# 示例 initial_classification 和 character_info 字典
initial_classification = {
    'Character': 'Character1',
    'States': {
        '1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'],
        '2': ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris']
    }
}

character_info = {
    "1": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "2": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "5": {
        "description": "the main stems <of the assimil",
        "states": {
            "1": "bright green",
            "2": "dull green"
        }
    },
    "12": {
        "description": "leaves <whether hair-pointed>",
        "states": {
            "1": "with long, filiform hair-like",
            "2": "not hair-pointed"
        }
    },
    "20": {
        "description": "the primary branches <carriage",
        "states": {
            "1": "ascending",
            "2": "spreading",
            "3": "drooping"
        }
    }
}

# 示例 input_data
input_data = {
    '1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n                }\n            }\n        },\n        "3": {\n            "Character": "Character8",\n            "States": {\n                "1": {\n                    "Character": "Character4",\n                    "States": {\n                        "1": ["Selaginella kraussiana"],\n                        "2": ["Selaginella selaginoides"]\n                    }\n                }\n            }\n        }\n    }\n}',
    '2': '{\n    "Character": "Character20",\n    "States": {\n        "1": ["Isoetes histrix"],\n        "2 and 3": ["Isoetes lacustris"],\n        "1 and 2 and 3": ["Isoetes echinospora"]\n    }\n}'
}

# 将输入字符串转换为字典
classification_result = {key: json.loads(value) for key, value in input_data.items()}

# 递归函数将结构转换为所需格式
def convert_structure(node):
    if "Character" in node and "States" in node:
        character = node["Character"]
        states = node["States"]
        converted = {f"Character {character.replace('Character', '')}": {}}
        for state, sub_node in states.items():
            state_key = f"State {state}"
            if isinstance(sub_node, list):
                converted[f"Character {character.replace('Character', '')}"][state_key] = sub_node[0] if len(sub_node) == 1 else sub_node
            elif isinstance(sub_node, dict):
                converted[f"Character {character.replace('Character', '')}"][state_key] = convert_structure(sub_node)
        return converted
    return node

# 处理分类检索表
converted_result = {}
for key, value in classification_result.items():
    converted_result[f"Character {key}"] = convert_structure(value)

# 将初始分类与其他结果整合
def combine_results(initial, secondary, state_key):
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            initial["States"][state_key] = list(set(initial_states + secondary))  # 合并两个列表并去重
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            for key, value in secondary["States"].items():
                if key not in initial_states:
                    initial_states[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"冲突的类型，键 {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"初始状态的意外类型: {type(initial_states)}")

# 动态合并所有次级分类结果
for state_key, secondary in classification_result.items():
    combine_results(initial_classification, secondary, state_key)

# 将合并后的结果转换为所需格式
converted_initial_classification = convert_structure(initial_classification)

# 递归函数替换特征和状态描述
def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                state_descriptions.append(" and ".join(filter(None, descriptions)))
            state_key = f"State {' '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# 替换特征和状态描述
updated_classification_key = replace_indices_with_descriptions_in_key(converted_initial_classification, character_info)

# 打印更新后的分类检索表
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4, ensure_ascii=False))


Updated Classification Key:
{
    "Character 1: the shoots <dimorphism>": {
        "State 1: conspicuously dimorphic: the c": {
            "Character 2: the shoots <dimorphism>": {
                "State 1: conspicuously dimorphic: the c": "Huperzia selago",
                "State 2: distinguishable as fertile and": {
                    "Character 12: leaves <whether hair-pointed>": {
                        "State 1: with long, filiform hair-like": "Lycopodium clavatum",
                        "State 2: not hair-pointed": {
                            "Character 9": {
                                "State 1": {
                                    "Character 5": {
                                        "State 2": "Diphasiastrum alpinum",
                                        "State 3": "Diphasiastrum complanatum"
                                    }
                                },
                                "State 2": {
                                    "Character 20

In [70]:
import json

# 示例 parsed_initial_classification 和 character_info 字典
parsed_initial_classification = {
    'Character': 'Character1',
    'States': {
        '1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'],
        '2': ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris']
    }
}

character_info = {
    "1": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "2": {
        "description": "the shoots <dimorphism>",
        "states": {
            "1": "conspicuously dimorphic: the c",
            "2": "distinguishable as fertile and",
            "3": "all green and alike vegetative"
        }
    },
    "5": {
        "description": "the main stems <of the assimil",
        "states": {
            "1": "bright green",
            "2": "dull green"
        }
    },
    "12": {
        "description": "leaves <whether hair-pointed>",
        "states": {
            "1": "with long, filiform hair-like",
            "2": "not hair-pointed"
        }
    },
    "20": {
        "description": "the primary branches <carriage",
        "states": {
            "1": "ascending",
            "2": "spreading",
            "3": "drooping"
        }
    }
}

# 示例 input_data
input_data = {
    '1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n                }\n            }\n        },\n        "3": {\n            "Character": "Character8",\n            "States": {\n                "1": {\n                    "Character": "Character4",\n                    "States": {\n                        "1": ["Selaginella kraussiana"],\n                        "2": ["Selaginella selaginoides"]\n                    }\n                }\n            }\n        }\n    }\n}',
    '2': '{\n    "Character": "Character20",\n    "States": {\n        "1": ["Isoetes histrix"],\n        "2 and 3": ["Isoetes lacustris"],\n        "1 and 2 and 3": ["Isoetes echinospora"]\n    }\n}'
}

# 将输入字符串转换为字典
classification_result = {key: json.loads(value) for key, value in input_data.items()}

# 递归函数将结构转换为所需格式
def convert_structure(node):
    if "Character" in node and "States" in node:
        character = node["Character"]
        states = node["States"]
        converted = {f"Character {character.replace('Character', '')}": {}}
        for state, sub_node in states.items():
            state_key = f"State {state}"
            if isinstance(sub_node, list):
                converted[f"Character {character.replace('Character', '')}"][state_key] = sub_node[0] if len(sub_node) == 1 else sub_node
            elif isinstance(sub_node, dict):
                converted[f"Character {character.replace('Character', '')}"][state_key] = convert_structure(sub_node)
        return converted
    return node

# 处理分类检索表
converted_result = {}
for key, value in classification_result.items():
    converted_result[f"Character {key}"] = convert_structure(value)

# 将初始分类与其他结果整合
def combine_results(initial, secondary, state_key):
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            initial["States"][state_key] = list(set(initial_states + secondary))  # 合并两个列表并去重
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            for key, value in secondary["States"].items():
                if key not in initial_states:
                    initial_states[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"冲突的类型，键 {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"初始状态的意外类型: {type(initial_states)}")

# 动态合并所有次级分类结果
for state_key, secondary in classification_result.items():
    combine_results(parsed_initial_classification, secondary, state_key)

# 将合并后的结果转换为所需格式
converted_initial_classification = convert_structure(parsed_initial_classification)

# 递归函数替换特征和状态描述
def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                state_descriptions.append(" and ".join(filter(None, descriptions)))
            state_key = f"State {' '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# 替换特征和状态描述
updated_classification_key = replace_indices_with_descriptions_in_key(converted_initial_classification, character_info)

# 打印更新后的分类检索表
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4, ensure_ascii=False))


Updated Classification Key:
{
    "Character 1: the shoots <dimorphism>": {
        "State 1: conspicuously dimorphic: the c": {
            "Character 2: the shoots <dimorphism>": {
                "State 1: conspicuously dimorphic: the c": "Huperzia selago",
                "State 2: distinguishable as fertile and": {
                    "Character 12: leaves <whether hair-pointed>": {
                        "State 1: with long, filiform hair-like": "Lycopodium clavatum",
                        "State 2: not hair-pointed": {
                            "Character 9": {
                                "State 1": {
                                    "Character 5": {
                                        "State 2": "Diphasiastrum alpinum",
                                        "State 3": "Diphasiastrum complanatum"
                                    }
                                },
                                "State 2": {
                                    "Character 20