# Packages

In [None]:
import requests
import re
import pandas as pd
import warnings
import json
from tqdm import tqdm

# Data

In [None]:
# Load the LLM v2 predictions (subset to the manually annotated compounds).
# The matched-ID column becomes 'ChEBI'; the other annotation columns are discarded.
llm_subset = (
    pd.read_csv(
        '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_B/llm_subset_v2.csv',
        index_col=0,
    )
    .rename(columns={'Matched COMPOUND_ID': 'ChEBI'})
    .drop(columns=['Matched Compound Name', 'Source', 'Correct COMPOUND_ID'])
)

# Normalise the query names and coerce ChEBI to numbers (unparseable values become NaN).
llm_subset = llm_subset.assign(
    Query=llm_subset['Query'].str.strip().str.lower(),
    ChEBI=pd.to_numeric(llm_subset['ChEBI'], errors='coerce'),
)

# Keep only rows with a usable ChEBI ID, store it as a nullable integer, renumber the rows.
llm_subset = llm_subset.dropna(subset=['ChEBI'])
llm_subset = llm_subset.astype({'ChEBI': 'Int64'}).reset_index(drop=True)

# Proteomics - Just using KEGG API

In [None]:
# Read the proteomics table, discard its last seven columns and the 'sample_id'
# column, then collect the remaining column labels (presumably UniProt accessions -
# confirm against the file) into a one-column frame.
prot = (
    pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Analysis/COVID_Pro_UniProt_Final.csv')
    .iloc[:, :-7]
    .drop(columns='sample_id')
)
column_vector_df = pd.DataFrame(prot.columns, columns=['UniProt'])
column_vector_df

In [None]:
# function to convert UniProt IDs to KEGG GENE ID
def convert_uniprot_to_kegg(uniprot_id, timeout=30):
    """Look up a KEGG gene ID for a UniProt accession via the KEGG REST ``conv`` endpoint.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession, inserted into the request URL as ``uniprot:<id>``.
    timeout : float, optional
        Seconds to wait for the KEGG server before ``requests`` raises a
        timeout error (default 30). Without it a stalled connection would
        hang the whole ``apply``.

    Returns
    -------
    str or None
        The text after the first colon of the second tab-separated field on
        the FIRST line of the response (e.g. ``"43740568"`` for
        ``"up:P0DTC2\\tsarscov2:43740568"``). ``None`` when the status code is
        not 200, the body is empty, or the first line is not of the form
        ``<source>\\t<prefix>:<id>``.

    Raises
    ------
    requests.RequestException
        Network errors and timeouts are not caught here.
    """
    url = f"https://rest.kegg.jp/conv/genes/uniprot:{uniprot_id}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # The response can hold several mappings, one per line. Splitting the whole
    # text on tabs (as was done before) lets the start of the second line leak
    # into the ID, so only the first line is parsed.
    lines = response.text.strip().splitlines()
    if not lines:
        return None

    fields = lines[0].split("\t")
    if len(fields) < 2:
        return None

    # Second field looks like "<prefix>:<id>"; keep only the part after the prefix.
    parts = fields[1].split(":")
    if len(parts) < 2:
        return None
    return parts[1]

# Query KEGG once per UniProt entry (one HTTP request per row) and store the
# returned ID, or None when nothing was found, in a new 'KEGG' column.
column_vector_df['KEGG'] = column_vector_df['UniProt'].apply(convert_uniprot_to_kegg)

# Plain-text dump of the whole mapping table.
print(column_vector_df)

# Save the mapping in the current working directory, without the row index.
column_vector_df.to_csv('uniprot_to_kegg.csv', index=False)

# Metabolomics Method 1: KEGG API

In [None]:
# function to convert ChEBI IDs to KEGG COMPOUND ID
def convert_chebi_to_kegg(chebi_id, timeout=30):
    """Look up a KEGG COMPOUND ID for a ChEBI ID via the KEGG REST ``conv`` endpoint.

    Parameters
    ----------
    chebi_id : int or str
        ChEBI identifier, inserted into the request URL as ``chebi:<id>``.
    timeout : float, optional
        Seconds to wait for the KEGG server before ``requests`` raises a
        timeout error (default 30). Without it a stalled connection would
        hang the whole ``apply``.

    Returns
    -------
    str or None
        The text after the first colon of the second tab-separated field on
        the FIRST line of the response (e.g. ``"C00001"`` for
        ``"chebi:15377\\tcpd:C00001"``). ``None`` when the status code is not
        200, the body is empty, or the first line is not of the form
        ``<source>\\t<prefix>:<id>``.

    Raises
    ------
    requests.RequestException
        Network errors and timeouts are not caught here.
    """
    url = f"https://rest.kegg.jp/conv/compound/chebi:{chebi_id}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # The response can hold several mappings, one per line. Splitting the whole
    # text on tabs (as was done before) returned values such as "C00001\nchebi",
    # so only the first line is parsed.
    lines = response.text.strip().splitlines()
    if not lines:
        return None

    fields = lines[0].split("\t")
    if len(fields) < 2:
        return None

    # Second field looks like "cpd:<id>"; keep only the part after the prefix.
    parts = fields[1].split(":")
    if len(parts) < 2:
        return None
    return parts[1]

# Add a 'KEGG' column: one KEGG REST request per ChEBI ID, None when no ID is returned.
llm_subset['KEGG'] = llm_subset['ChEBI'].apply(convert_chebi_to_kegg)
# Save the result in the current working directory, without the row index.
# NOTE(review): the final combining cell reads 'chebi_to_kegg_api.csv', not
# 'chebi_to_kegg.csv' - confirm whether this file is renamed/moved by hand.
llm_subset.to_csv('chebi_to_kegg.csv', index=False)

In [None]:
# Write the current llm_subset frame to the Pathway_Prediction folder.
# The row index is written too, because index=False is not passed here.
llm_subset.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/chebi_compounds.csv')

# Metabolomics Method 2: ChEBI conversion tool (database accession)

In [None]:
# Re-read the LLM v2 predictions (subset to the manually annotated compounds) so that
# this method starts from the file on disk rather than from the frame modified above.
raw_predictions_path = '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_B/llm_subset_v2.csv'
llm_subset = (
    pd.read_csv(raw_predictions_path, index_col=0)
    .rename(columns={'Matched COMPOUND_ID': 'ChEBI'})
    .drop(columns=['Matched Compound Name', 'Source', 'Correct COMPOUND_ID'])
)

# Lower-case and trim the query names; coerce ChEBI to numbers (bad values become NaN).
llm_subset = llm_subset.assign(
    Query=llm_subset['Query'].str.strip().str.lower(),
    ChEBI=pd.to_numeric(llm_subset['ChEBI'], errors='coerce'),
)

# Drop rows without a numeric ChEBI ID, store the ID as nullable integer, renumber rows.
llm_subset = llm_subset.dropna(subset=['ChEBI'])
llm_subset = llm_subset.astype({'ChEBI': 'Int64'}).reset_index(drop=True)

In [None]:
# Read the ChEBI database-accession table (tab-separated) used for the offline lookup.
chebi_db = pd.read_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/database_accession.tsv',
    sep='\t',
)
chebi_db

In [None]:
# Cast both ID columns to strings so they can be compared with == in the lookup below.
chebi_db['ID'] = chebi_db['ID'].astype(str)
llm_subset['ChEBI'] = llm_subset['ChEBI'].astype(str)

# function to match ChEBI IDs and extract the KEGG accession number
def get_kegg_accession(manual_entry, chebi_db):
    """Return the first KEGG COMPOUND accession listed for a ChEBI ID.

    Parameters
    ----------
    manual_entry : int or str
        ChEBI ID to look up; compared by its string form.
    chebi_db : pandas.DataFrame
        Accession table with the columns 'ID', 'TYPE' and 'ACCESSION_NUMBER'.

    Returns
    -------
    object or None
        The 'ACCESSION_NUMBER' of the first row whose 'ID' equals the entry and
        whose 'TYPE' is 'KEGG COMPOUND accession'; ``None`` if no row matches.
    """
    wanted_id = str(manual_entry)

    # Compare as strings on BOTH sides. Previously this only worked if the
    # caller had already cast chebi_db['ID'] to str; with an integer column
    # every lookup silently returned None.
    id_matches = chebi_db['ID'].astype(str) == wanted_id
    is_kegg_row = chebi_db['TYPE'] == 'KEGG COMPOUND accession'

    hits = chebi_db.loc[id_matches & is_kegg_row, 'ACCESSION_NUMBER']
    if hits.empty:
        return None
    return hits.iloc[0]

# Look up every ChEBI ID of llm_subset in chebi_db; chebi_db is forwarded to the
# lookup function as its second positional argument.
llm_subset['KEGG_ID'] = llm_subset['ChEBI'].apply(get_kegg_accession, args=(chebi_db,))


In [None]:
# Save the database-accession results in the current working directory (row index included).
# NOTE(review): the final combining cell reads this file name from the
# Pathway_Prediction folder - confirm that is the working directory here.
llm_subset.to_csv('chebi_to_kegg_pd.csv')

# Metabolomics Method 3: MetaboAnalyst

In [None]:
# This conversion was done manually on the MetaboAnalyst website, so there is no code to show.

# CREATING A FINAL DATAFRAME WITH ALL METHODS!

In [3]:
import numpy as np  # np.where is used below; numpy was never imported in this notebook
import pandas as pd

# Load the ChEBI -> KEGG results of each conversion method.
api_method = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/chebi_to_kegg_api.csv')
pd_method = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/chebi_to_kegg_pd.csv')
ma_method = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/chebi_to_kegg_ma.csv')

# Prepare the frames so that each one exposes its result in a 'KEGG' column.
# .copy() makes ma_method an independent frame, so adding 'Source' below does
# not raise SettingWithCopyWarning.
ma_method = ma_method[['KEGG', 'Query']].copy()
pd_method = pd_method.rename(columns={'KEGG_ID': 'KEGG'})
# Remove the literal newline + 'chebi' residue from the saved API results
# (presumably left behind when a KEGG response held more than one line).
api_method['KEGG'] = api_method['KEGG'].str.replace(r'\nchebi', '', regex=True)

# Tag each frame with the method it came from.
api_method['Source'] = 'API Method'
pd_method['Source'] = 'PD Method'
ma_method['Source'] = 'MA Method'

# The three frames are expected to have the same length (the LLM v2 0.75
# predictions) and are combined row by row, so check that before combining.
# Each 'KEGG' column may contain NaN where that method gave no prediction.
assert len(ma_method) == len(api_method) == len(pd_method), (
    "conversion tables differ in length: "
    f"MA={len(ma_method)}, API={len(api_method)}, PD={len(pd_method)}"
)

# MetaboAnalyst (MA) is the starting set of predictions; its gaps are filled
# first from the KEGG API results and then from the database-accession (PD) results.
combined = ma_method.copy()
combined['KEGG'] = combined['KEGG'].fillna(api_method['KEGG']).fillna(pd_method['KEGG'])

# Record which method supplied each KEGG ID, in the same priority order as the
# fill above. Rows for which no method has an ID get a missing Source instead
# of being attributed to 'PD Method'.
ma_has_id = ma_method['KEGG'].notna().to_numpy()
api_has_id = api_method['KEGG'].notna().to_numpy()
pd_has_id = pd_method['KEGG'].notna().to_numpy()
combined['Source'] = np.where(
    ma_has_id, 'MA Method',
    np.where(api_has_id, 'API Method',
             np.where(pd_has_id, 'PD Method', None)))

# Final table: query metabolite name, its KEGG ID and the method that supplied it.
final_combined_df = combined[['Query', 'KEGG', 'Source']]

final_combined_df



In [None]:
# Save the combined table to the Results_G folder (row index included, since index=False is not passed).
final_combined_df.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/final_kegg_id.csv')