In [None]:
import json
import re
from collections import defaultdict, Counter

In [None]:
"""API data cleaning"""
# Read the raw API extraction results. The parsed JSON object is kept in `data`
# and is iterated further down as file name -> list of compound records.
with open('API_extract_results.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())


def remove_phospholipid_suffix(name):
    """Strip a trailing phospholipid marker from a compound name.

    The recognised suffixes are "-phospholipid complex", "-phospholipid" and
    "-PC". Matching is case-insensitive, and the longest suffix is tried first
    so that "-phospholipid complex" is not left half-stripped.

    Args:
        name: Compound name as a string.

    Returns:
        The name with the matched suffix removed (original casing of the
        remaining part is preserved), or the name unchanged when no suffix
        matches.
    """
    lowered = name.lower()
    for suffix in ("-phospholipid complex", "-phospholipid", "-PC"):
        # Compare lower-case against lower-case: testing the lower-cased name
        # against the literal "-PC" can never succeed.
        if lowered.endswith(suffix.lower()):
            return name[:-len(suffix)]
    return name


# Possible APIs per file. Each entry holds three candidates:
#   "phospholipid": first extracted name that carried a phospholipid suffix (suffix removed)
#   "most_common":  the suffix-stripped name that was extracted most often in the file
#   "product":      first name seen on a compound whose roles include "product"
# Only files that yield at least one compound name get an entry.
possible_APIs = defaultdict(lambda: {"phospholipid": None, "most_common": None, "product": None})

for file_name, compounds in data.items():
    # Frequency of each suffix-stripped compound name within this file
    name_counter = Counter()
    phospholipid_name = None
    product_name = None

    for compound_info in compounds:
        api_record = compound_info['API']
        if 'compound' not in api_record:
            continue

        # A record without the 'Compound' level is skipped, exactly like a
        # record without 'compound' or 'names', instead of raising KeyError.
        compound = api_record['compound'].get('Compound', {})
        if 'names' not in compound:
            continue
        compound_roles = compound.get('roles', [])

        for name in compound['names']:
            name_without_suffix = remove_phospholipid_suffix(name)
            name_counter[name_without_suffix] += 1

            # Keep only the first name that actually had a phospholipid suffix
            if phospholipid_name is None and name != name_without_suffix:
                phospholipid_name = name_without_suffix

            # Keep only the first name belonging to a compound marked as a product
            if product_name is None and "product" in compound_roles:
                product_name = name_without_suffix

    # Store an entry only when the file produced at least one name
    if name_counter:
        most_common_name = name_counter.most_common(1)[0][0]
        possible_APIs[file_name] = {
            "phospholipid": phospholipid_name,
            "most_common": most_common_name,
            "product": product_name,
        }

# Plain-dict copy with the keys in a fixed order: phospholipid, most_common, product
reordered_APIs = {
    file_name: {key: apis[key] for key in ("phospholipid", "most_common", "product")}
    for file_name, apis in possible_APIs.items()
}

# Explicit encoding, matching the one used to read the input file
with open('cleaned_API_results.json', 'w', encoding='utf-8') as f:
    json.dump(reordered_APIs, f)

# Display the possible APIs for each file. This must be the last expression of
# the cell, otherwise the notebook does not render it.
reordered_APIs

In [None]:
"""Molar Ratio data cleaning"""
with open('MolarRatio_extract_results.json', 'r', encoding='utf-8') as f:
    molar_ratio_data = json.load(f)

# Regular expression for the a:b ratio format. It is applied with `match`, which
# anchors only at the start of the string: a value must begin with
# "<digits>:<digits>", and any text after that is accepted.
ratio_format = re.compile(r'\d+:\d+')

# Per file, keep only the data points whose raw_value is in a:b ratio format.
# Every file keeps its key, even when none of its data points survive.
cleaned_molar_ratio_data = {}
for file_name, data_points in molar_ratio_data.items():
    valid_data_points = []
    for data_point in data_points:
        # A missing 'MolarRatio'/'raw_value' or a null / non-string raw_value is
        # not a ratio either: drop the point instead of letting the lookup or
        # the regex call raise and abort the whole cell.
        raw_value = (data_point.get('MolarRatio') or {}).get('raw_value')
        if isinstance(raw_value, str) and ratio_format.match(raw_value):
            valid_data_points.append(data_point)
    cleaned_molar_ratio_data[file_name] = valid_data_points

# Save the results to a JSON file (explicit encoding, matching the input file)
with open('cleaned_MolarRatio_results.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_molar_ratio_data, f)