In [None]:
# Import necessary packages
import json  # For handling JSON data
from openai import OpenAI  # For interacting with OpenAI API
import os  # For interacting with the operating system, such as file paths
import re  # For regular expressions, useful for pattern matching in strings
import pandas as pd

In [34]:
# Function to convert letter to number
def letter_to_number(letter):
    """Translate a letter state symbol into its numeric state code.

    'A' maps to '10', 'B' to '11', and so on.

    :param letter: Single-character state symbol.
    :return: The numeric code as a string.
    """
    # NOTE(review): lowercase symbols are not normalised ('a' yields '42');
    # confirm whether the matrices can contain them.
    return str(ord(letter) - ord('A') + 10)

# Function to parse matrix content
def parse_matrix(matrix_content):
    """Parse the body of a NEXUS MATRIX block into a DataFrame.

    The content must alternate between a taxon-name line and a line holding
    that taxon's character states, one symbol per character. Symbols are
    translated as follows:

    * ``?``  -> ``'Missing'``
    * ``-``  -> ``'Not Applicable'``
    * letter -> numeric code via ``letter_to_number``
    * ``(..)`` -> the enclosed states joined with commas, e.g. ``(12)`` gives
      ``'1,2'`` and ``(AB)`` gives ``'10,11'``
    * anything else is kept verbatim

    :param matrix_content: Text found between ``MATRIX`` and the closing ``;``.
    :return: DataFrame with a ``taxa`` column followed by ``Character1`` ..
             ``CharacterN``, or ``None`` if the DataFrame cannot be built.
    :raises ValueError: If the content is empty, does not pair every taxon
             line with a state line, or contains an unclosed ``(``.
    """
    # Blank lines carry no data and would otherwise shift the name/state pairing.
    lines = [line for line in matrix_content.strip().split('\n') if line.strip()]
    if not lines:
        raise ValueError("Matrix content is empty.")
    if len(lines) % 2 != 0:
        raise ValueError(
            "Matrix content must alternate taxon-name and state lines; "
            f"found {len(lines)} non-empty lines."
        )

    data = []
    for i in range(0, len(lines), 2):
        taxa = lines[i].strip().strip("'")
        traits = lines[i + 1].strip()
        species_traits = []
        j = 0
        while j < len(traits):
            symbol = traits[j]
            if symbol == '(':
                closing = traits.find(')', j + 1)
                if closing == -1:
                    raise ValueError(f"Unclosed '(' in the state line of taxon {taxa!r}.")
                # Collect the member states as a list so that multi-digit codes
                # produced from letters stay intact when joined; separators
                # written inside the group are not states themselves.
                states = [
                    letter_to_number(member) if member.isalpha() else member
                    for member in traits[j + 1:closing]
                    if member != ',' and not member.isspace()
                ]
                species_traits.append(','.join(states))
                j = closing  # the shared increment below steps past ')'
            elif symbol == '?':
                species_traits.append('Missing')
            elif symbol == '-':
                species_traits.append('Not Applicable')
            elif symbol.isalpha():
                species_traits.append(letter_to_number(symbol))
            else:
                species_traits.append(symbol)
            j += 1
        data.append([taxa] + species_traits)

    # The widest row decides how many character columns the table gets.
    max_traits = max(len(row) - 1 for row in data)
    headers = ['taxa'] + [f'Character{i + 1}' for i in range(max_traits)]
    try:
        df = pd.DataFrame(data, columns=headers)
        return df
    except Exception as e:
        print(f"Error creating DataFrame: {e}")
        return None

# Function to convert NEXUS to CSV
def convert_nexus_to_csv(file_path, output_path):
    """Read a NEXUS file, parse its MATRIX block and write it out as CSV.

    The file is decoded with the first of utf-8, gbk and latin1 that succeeds.
    Every failure is reported with ``print`` rather than raised, so callers
    must check the return value.

    :param file_path: Path of the NEXUS file to read.
    :param output_path: Path of the CSV file to write.
    :return: The parsed DataFrame, or ``None`` if the file is missing, has no
             MATRIX block, or the block cannot be parsed.
    """
    try:
        encodings = ['utf-8', 'gbk', 'latin1']  # List of encodings to try
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"Successfully read file with encoding: {encoding}")
                break
            except UnicodeDecodeError:
                print(f"Failed to read file with encoding: {encoding}")
                continue
        else:
            raise ValueError("Failed to read file with all attempted encodings.")

        # Report a missing MATRIX block by name instead of letting the
        # attribute access on None produce an opaque message.
        matrix_match = re.search(r'MATRIX\s*(.*?)\s*;', content, re.DOTALL)
        if matrix_match is None:
            raise ValueError("No MATRIX block found in the NEXUS file.")
        matrix_content = matrix_match.group(1).strip()

        df = parse_matrix(matrix_content)
        # parse_matrix signals a DataFrame construction failure with None.
        if df is None:
            raise ValueError("The MATRIX block could not be converted to a DataFrame.")
        df.to_csv(output_path, index=False)
        return df
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"Appear error：{e}")
    return None

# Function to build knowledge graph from CSV
def build_knowledge_graph(matrix):
    """Turn a taxa-by-character table into a nested dictionary.

    The first column supplies the taxon name; every remaining column becomes
    an entry under that taxon's ``'Characteristics'``. Values are stored as
    strings, and comma-separated multi-state strings are rewritten with
    ``' and '`` between the states.

    :param matrix: DataFrame whose first column holds the taxon names.
    :return: ``{taxon: {'Characteristics': {column: state_string}}}``.
    """
    def _describe(value):
        # Only string cells can hold the comma-joined multi-state form.
        if isinstance(value, str) and ',' in value:
            value = value.replace(',', ' and ')
        return str(value)

    character_columns = matrix.columns[1:]
    graph = {}
    for _, record in matrix.iterrows():
        graph[record.iloc[0]] = {
            'Characteristics': {
                name: _describe(record[name]) for name in character_columns
            }
        }
    return graph

# Function to save knowledge graph as JSON
def save_knowledge_graph_as_json(knowledge_graph, file_path):
    """Write the knowledge graph to ``file_path`` as JSON indented by four spaces.

    :param knowledge_graph: JSON-serialisable object to store.
    :param file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w') as handle:
        handle.write(json.dumps(knowledge_graph, indent=4))

# Main function to combine all steps
def nexus_to_knowledge_graph(nexus_file_path, csv_output_path, json_output_path):
    """Run the whole pipeline: NEXUS file -> CSV -> knowledge graph -> JSON file.

    :param nexus_file_path: Path of the NEXUS file to read.
    :param csv_output_path: Path where the parsed matrix is written as CSV.
    :param json_output_path: Path where the knowledge graph is written as JSON.
    :return: The knowledge graph dictionary, or ``None`` when the NEXUS file
             could not be converted into a DataFrame.
    """
    # Step 1: Convert NEXUS to CSV
    df = convert_nexus_to_csv(nexus_file_path, csv_output_path)
    if df is None:
        print("Failed to create the DataFrame from NEXUS file.")
        return None

    # Step 2: Build the knowledge graph from the DataFrame
    knowledge_graph = build_knowledge_graph(df)

    # Step 3: Save the knowledge graph as a JSON file
    save_knowledge_graph_as_json(knowledge_graph, json_output_path)

    # Return the knowledge graph for further use
    return knowledge_graph

# Example usage.
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running this cell elsewhere.
nexus_file_path = "D:/桌面/taxonomy_primary_result/The_GPT-4_result/Dataset_3 (The Lycopodiales (Diphasiastrum, Huperzia, Isoetes, Lycopodium, Selaginella)) 4/Information gain methods/nexdata"
csv_output_path = "D:/桌面/process_data_2.csv"
json_output_path = "D:/桌面/knowledge_graph.json"
# Process the file and get the knowledge graph. The result is None when the
# conversion fails; later cells read `knowledge_graph` as a module-level name.
knowledge_graph = nexus_to_knowledge_graph(nexus_file_path, csv_output_path, json_output_path)


Successfully read file with encoding: utf-8


In [124]:
def parse_charlabels(charlabels_content):
    """Extract character descriptions from the text of a CHARLABELS section.

    Each line is expected to look like ``[3(3)] 'description'`` with an
    optional trailing comma; lines that do not fit are ignored.

    :param charlabels_content: Raw text of the CHARLABELS section.
    :return: Dict mapping the character index (int) to its description.
    """
    label_re = re.compile(r"\[(\d+)\(\d+\)\]\s+'(.+?)'")
    entries = (raw.strip().rstrip(',') for raw in charlabels_content.strip().split("\n"))
    hits = (label_re.match(entry) for entry in entries)
    return {int(hit.group(1)): hit.group(2) for hit in hits if hit}

def parse_statelabels(statelabels_content):
    """Extract the state names of each character from a STATELABELS section.

    A line whose first token is a number starts a new character; the rest of
    that line, and any following lines up to the next numbered line, list the
    character's states as consecutive single-quoted strings
    (``'state one' 'state two'``). Blank and comma-only lines are skipped, and
    the character number may be separated from its states by any whitespace.

    :param statelabels_content: Raw text of the STATELABELS section.
    :return: Dict mapping the character index (int) to its list of state names.
    """
    def _split_states(text):
        # Drop surrounding whitespace and the trailing comma separator first.
        cleaned = text.strip().strip(',').strip()
        if not cleaned:
            return []
        return [state.strip("'") for state in cleaned.split("' '")]

    statelabels = {}
    current_char = None
    states = []  # continuation lines seen before any header land here and are discarded

    for line in statelabels_content.strip().split("\n"):
        entry = line.strip()
        if not entry:
            # A blank line must not contribute an empty state name.
            continue

        parts = entry.split(None, 1)
        index_token = parts[0].rstrip(',')
        if re.fullmatch(r'\d+', index_token):
            current_char = int(index_token)
            # A header may legitimately carry no states on its own line.
            states = _split_states(parts[1]) if len(parts) > 1 else []
            # Stored by reference so continuation lines extend the same list.
            statelabels[current_char] = states
        else:
            states.extend(_split_states(entry))

    return statelabels

def combine_labels_and_states(charlabels, statelabels):
    """Merge character descriptions with their state names.

    States are numbered from 1 in the order they appear; a character with no
    entry in ``statelabels`` gets an empty state mapping.

    :param charlabels: Dict of character index -> description.
    :param statelabels: Dict of character index -> list of state names.
    :return: ``{str(index): {"description": ..., "states": {"1": ..., ...}}}``.
    """
    return {
        str(index): {
            "description": label,
            "states": {
                str(position): name
                for position, name in enumerate(statelabels.get(index, []), start=1)
            },
        }
        for index, label in charlabels.items()
    }

def extract_nexus_sections(nexus_content):
    """Pull the raw text of the CHARLABELS and STATELABELS sections out of a NEXUS file.

    A section starts on the line after the one containing its keyword and
    ends at the first line containing ``;``. Text preceding that ``;`` on the
    same line still belongs to the section and is kept.

    :param nexus_content: Full text of the NEXUS file.
    :return: Tuple ``(charlabels_content, statelabels_content)``; each is the
             section's lines, every one terminated by a newline, or ``""``
             when the section is absent.
    """
    charlabels_lines = []
    statelabels_lines = []
    in_charlabels = False
    in_statelabels = False

    for line in nexus_content.strip().split("\n"):
        if "CHARLABELS" in line:
            in_charlabels = True
            continue
        if "STATELABELS" in line:
            in_statelabels = True
            continue

        if ";" in line:
            # The terminator often shares a line with the last label, so keep
            # whatever precedes it instead of discarding the whole line.
            remainder = line.split(";", 1)[0]
            if remainder.strip():
                if in_charlabels:
                    charlabels_lines.append(remainder)
                if in_statelabels:
                    statelabels_lines.append(remainder)
            in_charlabels = False
            in_statelabels = False
            continue

        if in_charlabels:
            charlabels_lines.append(line)
        if in_statelabels:
            statelabels_lines.append(line)

    charlabels_content = "".join(entry + "\n" for entry in charlabels_lines)
    statelabels_content = "".join(entry + "\n" for entry in statelabels_lines)
    return charlabels_content, statelabels_content

def parse_nexus_file(file_path):
    """Read a NEXUS file and build a description of its characters and states.

    The file is decoded with the first of utf-8, gbk and latin1 that
    succeeds, matching how ``convert_nexus_to_csv`` reads the same file, so
    the result does not depend on the platform's default encoding.

    :param file_path: Path of the NEXUS file.
    :return: Dict keyed by character index (as a string), each value holding
             the character's ``"description"`` and its ``"states"`` mapping.
    :raises ValueError: If none of the attempted encodings can decode the file.
    """
    nexus_content = None
    for encoding in ('utf-8', 'gbk', 'latin1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                nexus_content = file.read()
            break
        except UnicodeDecodeError:
            continue
    if nexus_content is None:
        raise ValueError("Failed to read file with all attempted encodings.")

    charlabels_content, statelabels_content = extract_nexus_sections(nexus_content)

    # Analyzing the CHARLABELS section
    charlabels = parse_charlabels(charlabels_content)

    # Analyzing the STATELABELS section
    statelabels = parse_statelabels(statelabels_content)

    # Combine parsing results to generate character_info dictionary
    character_info = combine_labels_and_states(charlabels, statelabels)

    return character_info

# Example usage.
# NOTE(review): absolute path on the author's machine; adjust before running elsewhere.
file_path = "D:/桌面/taxonomy_primary_result/The_GPT-4_result/Dataset_3 (The Lycopodiales (Diphasiastrum, Huperzia, Isoetes, Lycopodium, Selaginella)) 4/Information gain methods/nexdata"
character_info = parse_nexus_file(file_path)
# Prints the whole dictionary of character descriptions and state names.
print(character_info)

{'1': {'description': 'stems <elongation>', 'states': {'1': 'elongated, with numerous small', '2': 'short and tuberous, with sheat'}}, '2': {'description': 'stems <carriage>', 'states': {'1': 'suberect, and rooting at the b', '2': 'creeping, and rooting directly', '3': 'creeping, and rooting from cha'}}, '3': {'description': 'stems <manner of branching>', 'states': {'1': 'overtly dichotomising vegetati', '2': 'ostensibly monopodial vegetati'}}, '4': {'description': 'stems <whether dorsiventral>', 'states': {'1': 'dorsiventrally organized, with', '2': 'not dorsiventrally organized'}}, '5': {'description': 'stems <whether with flattened', 'states': {'1': 'with non-flattened branches', '2': 'with only slightly flattened b', '3': 'with strongly flattened branch'}}, '6': {'description': 'stems <presence of secondary t', 'states': {'1': 'with anomalous secondary thick', '2': 'without secondary thickening'}}, '7': {'description': 'the old leaf bases <in Isoetes', 'states': {'1': 'persistent o

In [35]:
# Create the OpenAI client used by the API calls in this notebook.
# The key is read from the OPENAI_API_KEY environment variable; os.getenv
# returns None when the variable is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Build the prompt for the initial classification request sent through
# client.chat.completions.create. The conversation is assembled from four
# messages: two "system" messages (persona/output format, then the task
# description), one pre-written "assistant" message that restates the task,
# and one "user" message carrying the morphological matrix.
messages_initial = [
    # Set the system role to focus on taxonomy tasks.
    # Emphasize the system's understanding of morphological matrices, information gain,
    # and the construction of classification keys in the system setup.
    {"role": "system",
     "content":
         """You are a helpful taxonomist assistant.
         You are skilled at calculating the correct information gain to choose the character that best divides species into even groups based on their states.
         Based on the selected character, classify the species into different groups according to their states.
         For each group with more than two species, continue selecting characters to further classify this group until each group only has one species.
         After multiple classifications, determine the final classification levels and record each classifying character and its state. 
         Finally, generate a taxonomic key.
         You need to strictly ensure that you categorize all species when selecting the initial character, and that you don't ignore any of the species.
         Please format the classification result as follows:
        ```
        {
            "Character": "Character1",
            "States": {
                "State 1": ["species1", "species2", ...],
                "State 2": ["species3", "species4", ...],
                "State ...": ["species5", "species6", ...]
            }
        }
        ```
        Ensure that the response follows this format exactly.
        Additionally,exactly ensure that all species are included in the initial classification result.
         """},

    # Main task request: construct the classification key, with the details needed for classification.
    # This includes: the main objective, information gain calculation, ignoring invalid states, clarifying multi-character states,
    # guiding correct selection based on the significance of information gain, and standardizing the final output format for subsequent extraction of API results.
    # NOTE(review): this message is also sent with the "system" role.
    {"role": "system",
     "content": """
                Generate the taxonomic key based on the provided morphological matrix. The matrix includes all species and their different states for each character.
                The process involves selecting a character to classify the species into groups. Repeat this classification within each subgroup until each group contains only one species.
                Information gain measures how much the uncertainty in the dataset is reduced after using a character for classification. It helps in selecting characters that minimize the entropy of the subset after classification, leading to better classification results.
                Please select the initial classification character for all species based on the morphological matrix and information gain methods.
                In the morphological matrix, 'Missing' and 'Not applicable' are invalid states. If a character has invalid states for the group being classified, it should be ignored.
                States are represented by numbers, and like the '1 and 2' means multiple states should be treated as a single state type.(such as '3' and '2 and 3'  is the different state, these are two separate states, when i choose character to based on different state to distinguish the species)
                You need to calculate the information gain for each character, and choose the highest information gain result, The higher the Information Gain result, the greater the contribution of the feature to the classification.
                Now I will show you the morphological matrix. Please provide only the initial classification character and the categorization of species based on its state, and you should singly show this called as # initial character classify result #
            """},

    # Pre-written assistant turn that summarizes and restates the task.
    # It is part of the prompt (not a model reply) and is meant to reinforce the
    # instructions and the expected output format.
    {"role": "assistant",
     "content": """
                Understood. I will generate the taxonomic key based on the provided morphological matrix. Here is a summary of the steps I will follow:
                1. The matrix includes all species and their different states for each character.
                2. I will select a character to classify the species into groups and repeat this classification within each subgroup until each group contains only one species, and i'll not ignore any species.
                3. I will use information gain to measure how much the uncertainty in the dataset is reduced after using a feature for classification. This helps in selecting features that minimize the entropy of the subset after classification, leading to better classification results.
                4. I will select the initial classification character for all species based on the morphological matrix and information gain methods.
                5. In the morphological matrix, 'Missing' and 'Not applicable' are considered invalid states. If a character has invalid states for the group being classified, it will be ignored.
                6. States are represented by numbers. For example, '1 and 2' means multiple states should be treated as a single state type. (such as '1' and '1 and 2'  is the different state, these are two separate states, when i choose character to based on different state to distinguish the species)
                7. I will use information gain to calculate all character and choose the highest information gain result, The higher the Information Gain result, the greater the contribution of the feature to the classification.so i need to make sure the result is Average classification
                8. The final result will provide only the initial classification character and the categorization of species based on its state.
                9. I will use all the species in their entirety, strictly making sure to categorize all of them! (need to make sure contain all species)
                10. Don't need to show how i calculate, only need to show the final result, and please show the final result in #initail character classify result# block, Don't have errors where the state and species don't match
                Please provide the morphological matrix data so that I can proceed with the initial classification.
            """},

    # Input the corresponding morphological matrix information for analysis.
    # The f-string interpolates the module-level `knowledge_graph` object directly,
    # so the model receives its Python string representation.
    {"role": "user",
     "content": f"Here is morphological matrix:{knowledge_graph}"}
]

# Request the initial classification from the model.
# temperature=0 and max_tokens=1000 are set to save costs and avoid long, redundant outputs.
# NOTE(review): a reply longer than max_tokens would be cut off — confirm the
# limit is large enough for bigger matrices.
initial_character_info = client.chat.completions.create(
    model="gpt-4o",
    messages=messages_initial ,
    stop=None,
    max_tokens=1000,
    temperature=0,
    n=1
)

# Keep the text of the first (and only, since n=1) choice in a variable so that
# later cells can parse it. Nothing is written to disk here.
initial_response = initial_character_info.choices[0].message.content

# If used as a whole pipeline to transfer the results, ignore this print.
# However, for debugging, you can use this print statement to check the response.
print(initial_response)


# Initial Character Classify Result #
```
{
    "Character": "Character1",
    "States": {
        "1": ["Diphasiastrum alpinum", "Diphasiastrum complanatum", "Huperzia selago", "Lycopodiella inundata", "Lycopodium annotinum", "Lycopodium clavatum", "Selaginella kraussiana", "Selaginella selaginoides"],
        "2": ["Isoetes echinospora", "Isoetes histrix", "Isoetes lacustris"]
    }
}
```


In [122]:
def parse_classification_result(result_text):
    """Extract the chosen character and its state groups from a model reply.

    The reply is searched with regular expressions rather than parsed as
    JSON, so surrounding text and code fences are tolerated. Species arrays
    may be written on one line or spread over several lines.

    :param result_text: Text containing ``"Character": "<name>"`` and one or
                        more ``"<state>": ["species", ...]`` entries.
    :return: ``{"Character": <name>, "States": {<state>: [species, ...]}}``.
    :raises ValueError: If the character, the states, or the species of a
                        state cannot be found. The error is printed before it
                        is re-raised.
    """
    classification = {"Character": None, "States": {}}
    try:
        # Attempt to match the Character
        character_match = re.search(r'"Character": "([^"]+)"', result_text)
        if character_match is None:
            raise ValueError("Character not found in the result text.")
        classification["Character"] = character_match.group(1)

        # Attempt to match each State and the corresponding species.
        # re.DOTALL lets the bracketed list span several lines; without it a
        # pretty-printed array would be skipped silently.
        state_sections = re.findall(r'"([^"]+)":\s*\[(.*?)\]', result_text, re.DOTALL)
        if not state_sections:
            raise ValueError("No states found in the result text.")

        for state, species_block in state_sections:
            species_list = re.findall(r'"([^"]+)"', species_block)
            if not species_list:
                raise ValueError(f"No species found for state {state}.")
            classification["States"][state] = species_list

    except Exception as e:
        print(f"Error parsing classification result: {e}")
        # Re-raise with the original traceback so the caller decides what to do.
        raise

    return classification
# Debug output: show the type of the raw API response before parsing it.
print(type(initial_response))
# Parse the raw reply into {"Character": ..., "States": {...}}; this raises if
# the reply does not contain the expected structure.
parsed_initial_classification = parse_classification_result(initial_response)
print(parsed_initial_classification)

# Function to generate groups from the classification result
def generate_groups_from_classification(classification_result):
    """
    Flatten the "States" mapping of a classification result into a list.

    :param classification_result: Dictionary with a "States" entry mapping each state to its species list
    :return: List of (state, species_list) tuples, in the mapping's own order
    """
    return list(classification_result["States"].items())

# Generate groups from the parsed initial classification
groups = generate_groups_from_classification(parsed_initial_classification)
print(groups)
# Debug output of the first two groups.
# NOTE(review): indexing [1] raises IndexError if the initial classification
# produced fewer than two states.
print(groups[0])
print(groups[1])


<class 'str'>
{'Character': 'Character1', 'States': {'1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'], '2': ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris']}}
[('1', ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides']), ('2', ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris'])]
('1', ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'])
('2', ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris'])


In [37]:
# API call function for continued grouping for each subgroup
def classify_group(group_species):
    """Ask the model for a nested taxonomic key covering one group of species.

    Two API calls are made through the module-level ``client``: the first asks
    for a recursive classification of the group, the second asks the model to
    rewrite that answer in the nested JSON layout shown in the prompt. The
    group's data is looked up in the module-level ``knowledge_graph``.

    :param group_species: Iterable of species names; each must be a key of
                          ``knowledge_graph`` (a missing name fails the lookup).
    :return: The raw text of the second reply. It is printed as well, and is
             not parsed or validated here.
    """
    # Create a sub-matrix for the group of species
    group_matrix = {species: knowledge_graph[species] for species in group_species}
    group_matrix_str = json.dumps(group_matrix, ensure_ascii=False)
    
    # Define the messages for the API call
    messages_secondary = [
        {"role": "system",
         "content":
             """
             You are a helpful taxonomist assistant.
             You are skilled at calculating the correct information gain to choose the character that best divides species into even groups based on their states.
             Based on the selected character, classify the species into different groups according to their states.
             For each group with more than two species, continue selecting characters to further classify this group until each group only has one species.
             After multiple classifications, determine the final classification levels and record each classifying character and its state.
             Finally, generate a taxonomic key.
             ***IMPORTANT: Ensure that each group contains only one species in the final classification result. don't appear the result like state :[species A, species B], need to choose character continue class this teo species***
             """},
        {"role": "system",
         "content":
             """
             Generate the nested taxonomic key based on the provided morphological matrix.
             The process involves selecting a character to classify the species into groups. Repeat this classification within each subgroup until each group contains only one species.
             Information gain measures how much the uncertainty in the dataset is reduced after using a character for classification. It helps in selecting characters that minimize the entropy of the subset after classification, leading to better classification results.
             Please select the classification character for these group's species based on the morphological matrix and information gain methods.
             In the morphological matrix, 'Missing' and 'Not applicable' are invalid states. If a character has invalid states for the group being classified, it should be ignored.
             States are represented by numbers. For example, '1 and 2' means multiple states should be treated as a single state type and this multi-state characterization should not be confused with the single states within it (the state of '3' and '2 and 3' is different state, when you choose the character to based on the state to distinguish need to careful handle). The initial character should have no more than three state types.
             You need to calculate the information gain for each character and choose the highest information gain result. The higher the information gain result, the greater the contribution of the feature to the classification.
             After selecting the initial classification character and categorizing the species based on its state, repeat the process within each subgroup. For each subgroup, select the character with the highest information gain to further classify the species. Continue this process recursively until each group contains only one species.
             Now I will show you the morphological matrix. Please provide the classification character and the categorization of species based on its state. Then, continue to classify each subgroup recursively, showing the chosen character and categorization for each subgroup. Please present the result in a structured format, with each step clearly labeled.
             Please don't show how you analyze and calculate, please show me the final result.
             """},
        {"role": "assistant",
         "content":
             """
             Understood. I will generate the nested taxonomic key based on the provided morphological matrix. Here is a summary of the steps I will follow:
             1. The matrix includes all species and their different states for each character.
             2. I will select a character to classify the species into groups and repeat this classification within each subgroup until each group contains only one species.
             3. I will use information gain to measure how much the uncertainty in the dataset is reduced after using a feature for classification. This helps in selecting features that minimize the entropy of the subset after classification, leading to better classification results.
             4. I will select the classification character for the group's species based on the morphological matrix and information gain methods.
             5. In the morphological matrix, 'Missing' and 'Not applicable' are considered invalid states. If a character has invalid states for the group being classified, it will be ignored.
             6. States are represented by numbers. For example, '2 and 3' means multiple states should be treated as a single state type, and this multi-state characterization should not be confused with the individual states (like '2', '3') within it (such as '3' and '2 and 3' are different states, these are two separate states, when I choose a character based on different states to distinguish the species). The classification character should have no more than three state types.
             7. I will use information gain to calculate all characters and choose the highest information gain result. The higher the information gain result, the greater the contribution of the feature to the classification.
             8. The final result will provide only the initial classification character and the categorization of species based on its state.
             9. Don't need to show how the process about choosing, only need to show the final result as a nested structure, and I will store the result in #character classify result# block.
             Please provide the group morphological matrix data so that I can proceed with the classification.
             """},
        {"role": "user", "content": f"Here is the group information need to be classify and include the morphological matrix {group_matrix_str}"}
    ]
    
    # Make the API call to classify the group
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages_secondary,
        stop=None,
        temperature=0,
        max_tokens=1000,
        n=1
    )
    result_secondary = response.choices[0].message.content
    # Debugging aid: print(f"API response for group {group_species}: {result_secondary}")
    
    # Define messages for formatting the response to JSON
    messages_JSON = [
        {"role": "system",
         "content":
             """
             You are a helpful JSON format converter.
             You can express the nested structure as a JSON result based on the corresponding content.
             """},
        {"role": "system",
         "content":
             """
             Please format the classification result as follows:
             ```
             # Final taxonomic key result JSON format #
             {
                 "Character": "CharacterX",
                 "States": {
                     "1": ["speciesA"],
                     "2": {
                         "Character": "CharacterY",
                         "States": {
                             "1": ["speciesB"],
                             "2": ["speciesC"]
                         }
                     }
                 }
             }
             ```
             Ensure that the response follows this format exactly.
             """},
        {"role": "assistant",
         "content":
             """
             Understood. I'll convert the nested structure you gave me into JSON format and store it in # final result #.
             Please provide what you need to convert the format.
             """},
        {"role": "user", "content": f"Here are the taxonomic results for the nested schema representation {result_secondary}"}
    ]
    
    # Make the API call to format the response as JSON.
    # `response` is reused: from here on it holds the second reply.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages_JSON,
        stop=None,
        temperature=0,
        max_tokens=1500,
        n=1
    )
    json_result = response.choices[0].message.content
    print(json_result)
    return json_result


In [38]:
# Function to clean and extract JSON string
def extract_json_string(json_string):
    """Return the text from the first '{' to the last '}' of ``json_string``.

    Anything outside that span, such as prose or code fences, is discarded.
    An empty string is returned when there is no '{', no '}', or the last '}'
    comes before the first '{'.

    :param json_string: Text that is expected to contain a JSON object.
    :return: The outermost brace-delimited substring, or ``""``.
    """
    opening = json_string.find('{')
    closing = json_string.rfind('}')

    # Covers a missing '{', a missing '}' and braces in the wrong order.
    if opening == -1 or closing < opening:
        return ""

    return json_string[opening:closing + 1].strip()


In [39]:
def recursive_classification(groups, final_classification, classification_results, depth=0, max_depth=10):
    """
    Recursively classify groups of species and collect the results.

    Single-species groups are stored in ``final_classification`` under the
    species name. Groups reached at ``max_depth`` are stored unsplit under
    their state. Every other group is classified via ``classify_group`` and
    the subgroups derived from that result are processed one level deeper.

    :param groups: Iterable of (state, species_list) pairs to be processed;
                   it is not modified
    :param final_classification: Dict that receives the final classification (mutated in place)
    :param classification_results: Dict that receives the cleaned API result per state (mutated in place)
    :param depth: Current recursion depth
    :param max_depth: Maximum recursion depth
    :return: Final classification result (the same dict that was passed in)
    :raises Exception: Any error raised while processing a group is logged and re-raised
    """
    # Initialize state and current_group so the error handler can always report them
    state, current_group = None, []
    # Work on a snapshot so the caller's list is left intact and can be reused afterwards
    for group in list(groups):
        try:
            state, current_group = group
            print(f"Processing group with state: {state}, species: {current_group}, at depth: {depth}")

            # If the current group has only one species, add it to the final classification
            if len(current_group) == 1:
                final_classification[current_group[0]] = current_group
            # If the current recursion depth has reached the maximum depth, stop further classification
            elif depth >= max_depth:
                print(f"Reached max depth {max_depth}. Stopping further classification for group: {current_group}")
                final_classification[state] = current_group
            else:
                # Call the classify_group function to classify the current group
                classification_result = classify_group(current_group)
                # Clean the API classification result to extract the JSON string
                cleaned_classification_result = extract_json_string(classification_result)
                # Store the classification result in classification_results
                # NOTE(review): results are keyed by state only, so equal state labels at
                # different depths would overwrite each other - confirm this is intended.
                classification_results[state] = cleaned_classification_result

                # Parse the classification result and create new subgroups for further classification
                # NOTE(review): the raw (uncleaned) response is parsed here; presumably
                # parse_classification_result does its own cleanup - verify.
                parsed_result = parse_classification_result(classification_result)
                new_groups = generate_groups_from_classification(parsed_result)

                # Recursively process the new subgroups, increasing the recursion depth
                recursive_classification(new_groups, final_classification, classification_results, depth + 1, max_depth)

        except Exception as e:
            # Log the failing group, then re-raise with the original traceback
            print(f"Error processing group with state: {state}, species: {current_group}, at depth: {depth}")
            print(f"Exception: {e}")
            raise

    return final_classification

In [72]:
# Driver cell: runs the recursive classification over `groups`.
# Assumes `groups` has already been created by an earlier cell.
max_depth =  5  # Can be adjusted based on the hierarchical structure of input data and application requirements
# NOTE(review): the suitable depth presumably depends on how many species each initial group holds - confirm

# Dictionary to store the final classification where each species is classified individually
final_classification = {}

# Dictionary to store the API classification results for each state
classification_results = {}

# Print the initial state of groups and dictionaries for debugging purposes
print("Initial groups:", groups)
print("Initial final_classification:", final_classification)
print("Initial classification_results:", classification_results)

# Call the recursive_classification function to process the groups and store the results
# (both dictionaries above are filled in place by the call)
final_classification = recursive_classification(groups, final_classification, classification_results, depth=0, max_depth=max_depth)

# Print the final classification results
print("Final Classification:")
print(json.dumps(final_classification, indent=2, ensure_ascii=False))

# Print the classification results from the API calls
print("\nClassification Results:")
print(classification_results)


Initial groups: [('1', ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides']), ('2', ['Isoetes echinospora', 'Isoetes histrix', 'Isoetes lacustris'])]
Initial final_classification: {}
Initial classification_results: {}
Processing group with state: 1, species: ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides'], at depth: 0
```json
{
    "Character": "Character2",
    "States": {
        "1": ["Huperzia selago"],
        "2": {
            "Character": "Character5",
            "States": {
                "1": {
                    "Character": "Character9",
                    "States": {
                        "1": ["Diphasiastrum alpinum"],
                        "2": {
                       

In [73]:

# Rebuild `groups` from the parsed initial classification so later cells can look up
# the species list that belongs to each state.
groups = generate_groups_from_classification(parsed_initial_classification)
# Debug output: raw JSON string stored for each state
print(classification_results)

{'1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character5",\n            "States": {\n                "1": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": ["Diphasiastrum alpinum"],\n                        "2": {\n                            "Character": "Character12",\n                            "States": {\n                                "1": ["Lycopodium clavatum"],\n                                "2": {\n                                    "Character": "Character20",\n                                    "States": {\n                                        "1": ["Lycopodiella inundata"],\n                                        "2": ["Lycopodium annotinum"],\n                                        "3": ["Lycopodium annotinum"]\n                                    }\n                                }\n                          

In [78]:
def extract_paths(node, path=None):
    """
    Walk a nested classification tree and yield one entry per listed species.

    :param node: Dict with 'Character' and 'States' keys; anything else yields nothing
    :param path: Character -> state choices accumulated so far (defaults to an empty dict)
    :return: Generator of (species, path) tuples, where path maps each character
             name (spaces removed) to the state chosen on the way to that species
    """
    trail = {} if path is None else path

    # Nodes that are not character/state nodes contribute nothing
    if not ('Character' in node and 'States' in node):
        return

    character_name = node['Character'].replace(" ", "").strip()
    for state, child in node['States'].items():
        # Each branch gets its own copy of the trail extended by this decision
        branch = trail.copy()
        branch[character_name] = state
        if isinstance(child, dict):
            yield from extract_paths(child, branch)
            continue
        for species in child:
            yield species, branch

# Process each classification result and extract the path.
# final_results maps state key -> {species: {"Characteristics": {character: state}}}
final_results = {}

for key, json_str in classification_results.items():
    # Each stored value is a JSON string; json.loads raises if it is empty or malformed
    classification_data = json.loads(json_str)
    species_paths = list(extract_paths(classification_data))

    # A species that appears more than once keeps only its last extracted path
    formatted_results = {}
    for species, path in species_paths:
        formatted_results[species] = {"Characteristics": path}
    
    final_results[key] = formatted_results
    

In [79]:
def check_state_match(state, correct_state):
    """
    Decide whether a predicted state agrees with the reference state.

    A missing reference never matches. When the reference is a multi-state
    value ("a and b"), every part of the predicted state must be one of the
    reference parts; otherwise the two strings must be equal.

    :param state: Predicted state string
    :param correct_state: Reference state string, or None
    :return: True if the predicted state is accepted, False otherwise
    """
    separator = " and "
    if correct_state is None:
        return False
    if separator not in correct_state:
        return state == correct_state

    allowed_parts = correct_state.split(separator)
    for part in state.split(separator):
        if part not in allowed_parts:
            return False
    return True

# Validate classification results and log errors
def validate_results(final_results, knowledge_graph):
    """
    Compare extracted classification paths against the knowledge graph.

    :param final_results: {key: {species: {"Characteristics": {character: state}}}}
    :param knowledge_graph: {species: {"Characteristics": {character: state}}}
    :return: List of error dicts; one "Mismatch" entry per species with at least
             one wrong or unknown character state, and one "Species not found in
             knowledge graph" entry per species absent from the knowledge graph
    """
    issues = []
    for group_key, species_map in final_results.items():
        for name, record in species_map.items():
            # Unknown species are reported as-is and not checked further
            if name not in knowledge_graph:
                issues.append({
                    "species": name,
                    "key": group_key,
                    "error": "Species not found in knowledge graph",
                    "error_result": record["Characteristics"]
                })
                continue

            wrong_states = {}
            for raw_character, observed in record["Characteristics"].items():
                # Character names are compared with all spaces removed
                character = raw_character.replace(" ", "").strip()
                expected = knowledge_graph[name]["Characteristics"].get(character)
                if expected is None or not check_state_match(observed, expected):
                    wrong_states[character] = {"error_state": observed, "correct_state": expected}

            if wrong_states:
                issues.append({
                    "species": name,
                    "key": group_key,
                    "error": "Mismatch",
                    "error_result": wrong_states,
                    "correct_result": {
                        character: knowledge_graph[name]["Characteristics"].get(character)
                        for character in wrong_states
                    }
                })
    return issues

In [80]:
def get_species_list_for_state(groups, key):
    """
    Look up the species list of the first group whose state equals ``key``.

    :param groups: Iterable of (state, species_list) pairs
    :param key: State label to search for
    :return: The matching species list, or an empty list when no group matches
             (an empty match is reported as "not found" as well)
    """
    species_list = next((members for state, members in groups if state == key), [])

    if species_list:
        print(f"Processing species list for state '{key}': {species_list}")
    else:
        print(f"Key {key} not found in groups")
    return species_list

In [81]:
def correct_classification(errors, classification_results, knowledge_graph):
    """
    Re-generate the classification of every state that has validation errors.

    For each distinct error key, the species of that state are looked up in the
    module-level ``groups``, their knowledge-graph entries are sent to the chat
    API (module-level ``client``) together with the error description, and the
    answer is converted to JSON by a second API call. The cleaned JSON string
    replaces ``classification_results[key]``.

    Each key is corrected at most once per call, using the first error reported
    for it; the result is re-validated by the caller.

    :param errors: List of error dicts as produced by validate_results
    :param classification_results: Dict of state key -> JSON string (mutated in place)
    :param knowledge_graph: Dict of species -> characteristics used as ground truth
    :return: The updated classification_results dict (always, even if nothing was corrected)
    """
    handled_keys = set()
    for error in errors:
        key = error['key']
        # Several errors usually share one key; one regeneration per key is enough
        if key in handled_keys:
            continue
        handled_keys.add(key)
        
        species_list = get_species_list_for_state(groups, key)
        if not species_list:
            continue
        
        group_matrix = {s: knowledge_graph[s] for s in species_list}
        group_matrix_str = json.dumps(group_matrix, ensure_ascii=False)  
        
        messages2 = [
            {"role": "system",
             "content": """
             You are a helpful taxonomist assistant.
             You are skilled at calculating the correct information gain to choose the character that best divides species into even groups based on their states.
             Based on the selected character, classify the species into different groups according to their states.
             For each group with more than two species, continue selecting characters to further classify this group until each group only has one species.
             After multiple classifications, determine the final classification levels and record each classifying character and its state.
             Finally, generate a taxonomic key.
             You are able to avoid the same error in your results based on the corrected results previously passed to you
             ***IMPORTANT: Ensure that each group contains only one species in the final classification result. don't appear the result like state :[species A, species B]***
             """},
            {"role": "system",
             "content": """
             Generate the nested taxonomic key based on the provided morphological matrix.
             The process involves selecting a character to classify the species into groups. Repeat this classification within each subgroup until each group contains only one species.
             Information gain measures how much the uncertainty in the dataset is reduced after using a character for classification. It helps in selecting characters that minimize the entropy of the subset after classification, leading to better classification results.
             Please select the classification character for these group's species based on the morphological matrix and information gain methods.
             In the morphological matrix, 'Missing' and 'Not applicable' are invalid states. If a character has invalid states for the group being classified, it should be ignored.
             States are represented by numbers. For example, '1 and 2' means multiple states should be treated as a single state type and this multi-state characterization should not be confused with the single states within it (the state of '3' and '2 and 3' is different state, when you choose the character to based on the state to distinguish need to careful handle). The initial character should have no more than three state types.
             You need to calculate the information gain for each character and choose the highest information gain result. The higher the information gain result, the greater the contribution of the feature to the classification.
             After selecting the initial classification character and categorizing the species based on its state, repeat the process within each subgroup. For each subgroup, select the character with the highest information gain to further classify the species. Continue this process recursively until each group contains only one species.
             in the results, each group only allow one species, need to choose suitable character
             Now I will show you the morphological matrix. Please provide the classification character and the categorization of species based on its state. Then, continue to classify each subgroup recursively, showing the chosen character and categorization for each subgroup. Please present the result in a structured format, with each step clearly labeled.
             Please don't show how you analyze and calculate, please show me the final result.
             """},
            {"role": "user", "content": f"""
            This is the result of the error you generated in the previous API call. 
            In this file I have provided you with the CORRECT result. 
            Please strictly adhere to the use of the correct species feature status message!{error}
            """},
            {"role": "assistant", "content": f"""
            I will strictly use the correct species feature state information for evaluation, 
            while I will avoid using these incorrect feature state information that appeared previously in the classification results
            """},
            {"role": "user", "content": f"Here is the group information need to be classify and include the morphological matrix {group_matrix_str}"}
        ]
        
        # First call: produce a corrected nested key for this group
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages2,
            stop=None,
            temperature=0,
            max_tokens=1000,
            n=1
        )
        corrected_result = response.choices[0].message.content
        
        messages_JSON = [
        {"role": "system",
         "content":
             """
             You are a helpful JSON format converter.
             You can express the nested structure as a JSON result based on the corresponding content.
             """},
        {"role": "system",
         "content":
             """
             Please format the classification result as follows:
             ```
             # Final taxonomic key result JSON format #
             {
                 "Character": "CharacterX",
                 "States": {
                     "1": ["speciesA"],
                     "2": {
                         "Character": "CharacterY",
                         "States": {
                             "1": ["speciesB"],
                             "2": ["speciesC"]
                         }
                     }
                 }
             }
             ```
             Ensure that the response follows this format exactly.
             """},
        {"role": "assistant",
         "content":
             """
             Understood. I'll convert the nested structure you gave me into JSON format and store it in # final result #.
             Please provide what you need to convert the format.
             """},
        {"role": "user", "content": f"Here are the taxonomic results for the nested schema representation {corrected_result}"}
        ]
    
        # Second call: convert the corrected key into the JSON layout described above
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages_JSON,
            stop=None,
            temperature=0,
            max_tokens=1500,
            n=1
        )
        json_result = response.choices[0].message.content
        json_cleaned_result = extract_json_string(json_result)
        print(json_cleaned_result)
        classification_results[key] = json_cleaned_result

    # Return after the loop so every key is handled and the caller never receives None
    return classification_results

In [82]:
# Debug output: inspect the stored JSON strings before running the correction loop
print(classification_results)

{'1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character5",\n            "States": {\n                "1": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": ["Diphasiastrum alpinum"],\n                        "2": {\n                            "Character": "Character12",\n                            "States": {\n                                "1": ["Lycopodium clavatum"],\n                                "2": {\n                                    "Character": "Character20",\n                                    "States": {\n                                        "1": ["Lycopodiella inundata"],\n                                        "2": ["Lycopodium annotinum"],\n                                        "3": ["Lycopodium annotinum"]\n                                    }\n                                }\n                          

In [83]:
# Cycle checks and corrections
# Upper bound on correction rounds: without it, an error the model keeps reproducing
# would trigger an endless sequence of API calls.
MAX_CORRECTION_ROUNDS = 10

errors = validate_results(final_results, knowledge_graph)
correction_round = 0
# Repeat correction + validation until no errors remain or the round limit is reached
while errors and correction_round < MAX_CORRECTION_ROUNDS:
    correction_round += 1
    # Fix current categorization errors (based on the API)
    corrected = correct_classification(errors, classification_results, knowledge_graph)
    # The dictionary is updated in place; guard against a None return so that
    # the `.items()` call below cannot fail
    if corrected is not None:
        classification_results = corrected
    # Reset final_results so it only holds the corrected categorization results
    final_results = {}
    # Iterate over the corrected classification results and extract species classification paths
    for key, json_str in classification_results.items():
        classification_data = json.loads(json_str)
        species_paths = list(extract_paths(classification_data))
        # Format the extracted classification paths
        formatted_results = {}
        for species, path in species_paths:
            formatted_results[species] = {"Characteristics": path}
        # Add the formatted classification results to final_results
        final_results[key] = formatted_results
    # Re-validate the corrected results; an empty list ends the loop
    errors = validate_results(final_results, knowledge_graph)

if errors:
    print(f"Stopped after {MAX_CORRECTION_ROUNDS} correction rounds with {len(errors)} unresolved error(s).")

# Save the final classification results (UTF-8, non-ASCII names kept readable as in the other dumps)
with open('final_classification.json', 'w', encoding='utf-8') as f:
    json.dump(final_results, f, indent=4, ensure_ascii=False)
print("Final classification results have been saved to 'final_classification.json'.")
print(json.dumps(final_results, indent=4, ensure_ascii=False))

Processing species list for state '1': ['Diphasiastrum alpinum', 'Diphasiastrum complanatum', 'Huperzia selago', 'Lycopodiella inundata', 'Lycopodium annotinum', 'Lycopodium clavatum', 'Selaginella kraussiana', 'Selaginella selaginoides']
{
    "Character": "Character2",
    "States": {
        "1": ["Huperzia selago"],
        "2": {
            "Character": "Character5",
            "States": {
                "1": {
                    "Character": "Character20",
                    "States": {
                        "1": ["Lycopodiella inundata"],
                        "2 and 3": ["Lycopodium annotinum"],
                        "1 and 2 and 3": ["Lycopodium clavatum"]
                    }
                },
                "2": ["Diphasiastrum alpinum"],
                "3": ["Diphasiastrum complanatum"]
            }
        },
        "3": {
            "Character": "Character8",
            "States": {
                "1": ["Selaginella kraussiana"],
                "2": ["

In [96]:
# Debug output: show the corrected classification results and confirm the container type
print(classification_results)
print(type(classification_results))

{'1': '{\n    "Character": "Character2",\n    "States": {\n        "1": ["Huperzia selago"],\n        "2": {\n            "Character": "Character12",\n            "States": {\n                "1": ["Lycopodium clavatum"],\n                "2": {\n                    "Character": "Character9",\n                    "States": {\n                        "1": {\n                            "Character": "Character5",\n                            "States": {\n                                "2": ["Diphasiastrum alpinum"],\n                                "3": ["Diphasiastrum complanatum"]\n                            }\n                        },\n                        "2": {\n                            "Character": "Character20",\n                            "States": {\n                                "1": ["Lycopodiella inundata"],\n                                "2 and 3": ["Lycopodium annotinum"]\n                            }\n                        }\n                    }\n      

In [117]:
# Debug output: inspect character_info (defined in another cell) before it is used for descriptions
print(character_info)

{}


In [125]:
# Parse every stored JSON string into a nested dict (state key -> classification tree)
classification_result = {key: json.loads(value) for key, value in classification_results.items()}

# Recursive function converts the structure into the desired format
def convert_structure(node):
    """
    Convert a {"Character": ..., "States": ...} tree into the display layout
    {"Character N": {"State s": species-or-subtree}}.

    :param node: Classification node; anything without both keys is returned unchanged
    :return: Converted dict. Single-element species lists collapse to the bare
             species, longer (or empty) lists are kept, nested dicts are
             converted recursively, and other value types are omitted
    """
    if not ("Character" in node and "States" in node):
        return node

    character = node["Character"]
    states = node["States"]
    label = f"Character {character.replace('Character', '')}"

    branches = {}
    for state, child in states.items():
        state_label = f"State {state}"
        if isinstance(child, dict):
            branches[state_label] = convert_structure(child)
        elif isinstance(child, list):
            branches[state_label] = child[0] if len(child) == 1 else child
    return {label: branches}

# Convert each per-state classification tree into the display layout,
# keyed by "Character <state key>"
converted_result = {}
for key, value in classification_result.items():
    converted_result[f"Character {key}"] = convert_structure(value)

# Integration of initial categorization with other results
def combine_results(initial, secondary, state_key):
    """
    Merge a secondary classification into ``initial["States"][state_key]`` in place.

    - Nothing is done when ``secondary`` is empty.
    - A missing state is simply set to ``secondary``.
    - Two species lists are concatenated with duplicates removed (first
      occurrence order is kept); a list is replaced by a non-list ``secondary``.
    - Two nodes are merged state by state: states missing from the existing
      node are added, states present in both are merged recursively.

    :param initial: Node dict with a "States" mapping (mutated in place)
    :param secondary: Species list or node dict to merge in
    :param state_key: State label under which ``secondary`` belongs
    :raises ValueError: If an existing node meets a non-dict ``secondary`` or the
                        existing value is neither a list nor a dict
    """
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            # Merge the two lists and remove duplicates, keeping a deterministic order
            initial["States"][state_key] = list(dict.fromkeys(initial_states + secondary))
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            # State labels live under the node's "States" mapping, not on the node itself
            existing_branches = initial_states.setdefault("States", {})
            for key, value in secondary["States"].items():
                if key not in existing_branches:
                    existing_branches[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"冲突的类型，键 {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"初始状态的意外类型: {type(initial_states)}")

# Merge every per-state classification tree into the initial classification.
# combine_results modifies parsed_initial_classification in place.
for state_key, secondary in classification_result.items():
    combine_results(parsed_initial_classification, secondary, state_key)

# Convert the merged result into the desired display format
converted_initial_classification = convert_structure(parsed_initial_classification)

# Recursive Function Replacement Characterization and State Description
def replace_indices_with_descriptions_in_key(key, character_info, parent_char_index=None):
    """
    Rewrite a converted classification key so that "Character N" and "State s"
    labels carry their textual descriptions.

    :param key: Dict in the {"Character N": {"State s": subtree}} layout
    :param character_info: Dict of character index -> {"description": str, "states": {state: str}}
    :param parent_char_index: Index of the character whose states are being labelled
                              (None at the top level)
    :return: New dict with described labels. Characters missing from
             ``character_info`` and any other entries are copied unchanged
    """
    updated_key = {}
    for char_state, subtree in key.items():
        if char_state.startswith("Character"):
            parts = char_state.split()
            if len(parts) > 1:
                char_index = parts[1]
                if char_index in character_info:
                    char_description = f"Character {char_index}: {character_info[char_index]['description']}"
                    if isinstance(subtree, dict):
                        updated_subtree = replace_indices_with_descriptions_in_key(subtree, character_info, char_index)
                        updated_key[char_description] = updated_subtree
                    else:
                        updated_key[char_description] = subtree
                else:
                    updated_key[char_state] = subtree
            else:
                updated_key[char_state] = subtree
        elif char_state.startswith("State") and parent_char_index:
            states = char_state.split()[1:]
            state_descriptions = []
            for state in states:
                individual_states = state.split("and")
                descriptions = [character_info[parent_char_index]["states"].get(s.strip(), "") for s in individual_states]
                described = " and ".join(filter(None, descriptions))
                # Tokens without a description (notably the word "and" in "2 and 3")
                # must not leave an empty segment in the label
                if described:
                    state_descriptions.append(described)
            state_key = f"State {' '.join(states)}: {' / '.join(state_descriptions)}"
            if isinstance(subtree, dict):
                updated_key[state_key] = replace_indices_with_descriptions_in_key(subtree, character_info, parent_char_index)
            else:
                updated_key[state_key] = subtree
        else:
            updated_key[char_state] = subtree
    return updated_key

# Replace character/state indices with their descriptions from character_info
updated_classification_key = replace_indices_with_descriptions_in_key(converted_initial_classification, character_info)

# Print the updated classification key
print("Updated Classification Key:")
print(json.dumps(updated_classification_key, indent=4, ensure_ascii=False))


Updated Classification Key:
{
    "Character 1: stems <elongation>": {
        "State 1: elongated, with numerous small": {
            "Character 2: stems <carriage>": {
                "State 1: suberect, and rooting at the b": "Huperzia selago",
                "State 2: creeping, and rooting directly": {
                    "Character 12: leaves <whether hair-pointed>": {
                        "State 1: with long, filiform hair-like": "Lycopodium clavatum",
                        "State 2: not hair-pointed": {
                            "Character 9: leaves <arrangement>": {
                                "State 1: 4-ranked on the branches": {
                                    "Character 5: stems <whether with flattened": {
                                        "State 2: with only slightly flattened b": "Diphasiastrum alpinum",
                                        "State 3: with strongly flattened branch": "Diphasiastrum complanatum"
                                    

In [126]:
# Initial result: an alias of parsed_initial_classification (same object, not a copy),
# so merging below also changes parsed_initial_classification
initial_result = parsed_initial_classification

# Parse the API response JSON strings into nested dicts (state key -> classification tree)
parsed_classification_results = {key: json.loads(value) for key, value in classification_results.items()}

# Function to combine the initial and secondary classification results
def combine_results(initial, secondary, state_key):
    """
    Merge a secondary classification into ``initial["States"][state_key]`` in place.

    - Nothing is done when ``secondary`` is empty.
    - A missing state is simply set to ``secondary``.
    - Two species lists are concatenated with duplicates removed (first
      occurrence order is kept); a list is replaced by a non-list ``secondary``.
    - Two nodes are merged state by state: states missing from the existing
      node are added, states present in both are merged recursively.

    :param initial: Node dict with a "States" mapping (mutated in place)
    :param secondary: Species list or node dict to merge in
    :param state_key: State label under which ``secondary`` belongs
    :raises ValueError: If an existing node meets a non-dict ``secondary`` or the
                        existing value is neither a list nor a dict
    """
    if not secondary:
        return

    initial_states = initial["States"].get(state_key)
    if initial_states is None:
        initial["States"][state_key] = secondary
        return

    if isinstance(initial_states, list):
        if isinstance(secondary, list):
            # Merge two lists and remove duplicates, keeping a deterministic order
            initial["States"][state_key] = list(dict.fromkeys(initial_states + secondary))
        else:
            initial["States"][state_key] = secondary
    elif isinstance(initial_states, dict):
        if isinstance(secondary, dict):
            # State labels live under the node's "States" mapping, not on the node itself
            existing_branches = initial_states.setdefault("States", {})
            for key, value in secondary["States"].items():
                if key not in existing_branches:
                    existing_branches[key] = value
                else:
                    combine_results(initial_states, value, key)
        else:
            raise ValueError(f"Conflicting types for key {state_key}: {type(initial_states)} vs {type(secondary)}")
    else:
        raise ValueError(f"Unexpected type for initial states: {type(initial_states)}")

# Dynamically combine all secondary classification results into initial_result (modified in place)
for state_key, secondary in parsed_classification_results.items():
    combine_results(initial_result, secondary, state_key)

# Function to display the final classification result
def display_classification(result, indent=0):
    """
    Print a classification tree as an indented outline and return a copy of its structure.

    :param result: Node dict with "Character" and "States" entries
    :param indent: Number of spaces to prefix each printed line with
    :return: Dict with "Character" and "States" rebuilt from the printed entries,
             or an empty dict when either entry is missing or empty. State values
             that are neither lists nor dicts are skipped
    """
    pad = " " * indent
    character = result.get("Character")
    states = result.get("States")

    if not (character and states):
        return {}

    rebuilt_states = {}
    classification = {"Character": character, "States": rebuilt_states}
    print(f"{pad}1. **{character}:**")
    for state, content in states.items():
        if isinstance(content, dict):
            # Nested node: print the state header, then the subtree four spaces deeper
            print(f"{pad}   - State \"{state}\":")
            rebuilt_states[state] = display_classification(content, indent + 4)
        elif isinstance(content, list):
            print(f"{pad}   - State \"{state}\": {', '.join(content)}")
            rebuilt_states[state] = content
    return classification

# Display the final classification result as an indented outline;
# final_result holds the structure rebuilt by display_classification
final_result = display_classification(initial_result)
# Optional JSON dump of the rebuilt structure (left disabled)
# print("\nFinal Result JSON:")
# print(json.dumps(final_result, indent=2))


1. **Character1:**
   - State "1":
    1. **Character2:**
       - State "1": Huperzia selago
       - State "2":
        1. **Character12:**
           - State "1": Lycopodium clavatum
           - State "2":
            1. **Character9:**
               - State "1":
                1. **Character5:**
                   - State "2": Diphasiastrum alpinum
                   - State "3": Diphasiastrum complanatum
               - State "2":
                1. **Character20:**
                   - State "1": Lycopodiella inundata
                   - State "2 and 3": Lycopodium annotinum
       - State "3":
        1. **Character8:**
           - State "1":
            1. **Character4:**
               - State "1": Selaginella kraussiana
               - State "2": Selaginella selaginoides
   - State "2":
    1. **Character20:**
       - State "1": Isoetes histrix
       - State "2 and 3": Isoetes lacustris
       - State "1 and 2 and 3": Isoetes echinospora
