# Packages

In [None]:
# Dependencies

import numpy as np
from pandas import DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import sklearn 
import sspa
import sspa.utils
import gseapy.plot as gp
import networkx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px
import urllib.request
import statsmodels
import networkx as nx
import math
import itertools 
from scipy.stats import hypergeom as hg
import textwrap
from itertools import chain
import missforest
import pathintegrate


# Loading in raw data

In [None]:
# Load the cleaned COVID metabolomics table and index it by sample ID.
metabolomics_data_processed = pd.read_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Processing/Processing_Cleaned/cleaned_metabolomics_data_covid.csv'
).set_index('sample_id')

# Everything except the trailing 7 columns is kept as the measurement matrix;
# its column names are trimmed and lower-cased so they line up with the
# normalised "Query" names used by the annotation tables.
metabolomics_data_processed_final = metabolomics_data_processed.iloc[:, :-7].rename(
    columns=lambda name: name.strip().lower()
)

# The trailing 7 columns (presumably sample metadata) are kept aside so they
# can be re-attached to each ID-mapped matrix.
last_7_columns = metabolomics_data_processed.iloc[:, -7:]
last_7_columns


In [None]:
# Load the cleaned COVID proteomics table and show it as the cell output.
proteomics_data_processed = pd.read_csv(
    filepath_or_buffer='/Users/judepops/Documents/PathIntegrate/Code/Processing/Processing_Cleaned/cleaned_proteomics_data_covid.csv'
)
proteomics_data_processed

# Proteomics ID-converted matrix: built using the Olink panel metadata

In [None]:
# Index by sample, then keep only the numeric columns minus 'Age'.
proteomics_data_processed = proteomics_data_processed.set_index('sample_id')
numeric_data = (
    proteomics_data_processed
    .select_dtypes(include=[np.number])
    .drop(columns='Age')
)

# Olink panel metadata supplies the gene-description -> UniProt lookup.
olink = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Processing/proteomics_metadata.csv')
gene_description = olink["gene_description"]
uniprot = olink["uniprot"]
uniprot_id = dict(zip(gene_description, uniprot))

# Replace column names with UniProt IDs; columns without an entry in the
# lookup keep their original name.
uniprot_proteomic = numeric_data.rename(columns=uniprot_id)
uniprot_proteomic

In [None]:
# Columns taken unchanged from the full proteomics table.
columns_to_move = proteomics_data_processed.loc[
    :, ['Who', 'Race', 'Age', 'Group', 'Condition_Group', 'Age_Group', 'Race_Group']
]

# Place them to the right of the UniProt-labelled matrix and save the result.
proteomics_final = pd.concat([uniprot_proteomic, columns_to_move], axis='columns')
proteomics_final.to_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Pathway_Analysis/COVID_Pro_UniProt_Final.csv'
)

# Creating ID-converted matrices for each of the different metabolite ID mapping methods 

### Manual Method

In [None]:
# Build the manual name -> ChEBI conversion table.
columns_to_drop = ['Input Compound Name', 'Automated_Match', 'Automated_ChEBI', 'Confusion_Matrix', 'Unnamed: 4', 'Manual_Match']
manual = (
    pd.read_csv(
        '/Users/judepops/Documents/PathIntegrate/Code/Manual_Annotation/manual_annotations_raw_final_2.csv',
        index_col=0,
    )
    # Non-numeric ChEBI entries become NaN and those rows are discarded.
    .assign(Manual_ChEBI=lambda df: pd.to_numeric(df['Manual_ChEBI'], errors='coerce'))
    .dropna(subset=['Manual_ChEBI'])
    .drop(columns=columns_to_drop)
    .rename(columns={'Compound Name': 'Query', 'Manual_ChEBI': 'ChEBI'})
    # Names are trimmed and lower-cased to match the metabolomics column names;
    # IDs are stored as nullable integers.
    .assign(
        Query=lambda df: df['Query'].str.strip().str.lower(),
        ChEBI=lambda df: df['ChEBI'].astype('Int64'),
    )
    .reset_index(drop=True)
)

# Map the metabolomics matrix to ChEBI IDs, re-attach the trailing columns
# and save.
# NOTE(review): the output filename is spelled "Maual"; it is left unchanged
# here because other scripts may read this exact path.
processed_data_mapped_manual = sspa.map_identifiers(
    manual,
    output_id_type="ChEBI",
    matrix=metabolomics_data_processed_final,
)
processed_data_mapped_manual_final = pd.concat(
    [processed_data_mapped_manual, last_7_columns],
    axis=1,
)
processed_data_mapped_manual_final.to_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_D/COVID_Met_ChEBI_Maual.csv'
)


### MetaboAnalyst Method

In [None]:
# Build the MetaboAnalyst name -> ChEBI conversion table.
columns_to_drop = ['Match', 'HMDB', 'PubChem', 'KEGG', 'METLIN', 'SMILES', 'Comment']
conversion_table_met = (
    pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Metaboanalyst_Annotation/name_map_original_csv.csv')
    .drop(columns=columns_to_drop)
    .assign(
        # Names are trimmed and lower-cased to match the metabolomics columns.
        Query=lambda df: df['Query'].str.strip().str.lower(),
        # Non-numeric ChEBI entries become NaN.
        ChEBI=lambda df: pd.to_numeric(df['ChEBI'], errors='coerce'),
    )
    .dropna(subset=['ChEBI'])
    .astype({'ChEBI': 'Int64'})
)

# Per the original note, asterisks make MetaboAnalyst names mismatch the data
# columns. Every Query that matches no metabolomics column gets a trailing
# '*' appended so that it can match a starred column name.
query_values = conversion_table_met['Query'].unique()
metabolomics_columns = set(metabolomics_data_processed_final.columns)
non_matching_columns = set(query_values).difference(metabolomics_columns)
conversion_table_met['Query'] = [
    f"{name}*" if name in non_matching_columns else name
    for name in conversion_table_met['Query']
]

# Map the metabolomics matrix to ChEBI IDs, re-attach the trailing columns
# and save.
processed_data_mapped_metaboanalyst = sspa.map_identifiers(
    conversion_table_met,
    output_id_type="ChEBI",
    matrix=metabolomics_data_processed_final,
)
processed_data_mapped_metaboanalyst_final = pd.concat(
    [processed_data_mapped_metaboanalyst, last_7_columns],
    axis=1,
)
processed_data_mapped_metaboanalyst_final.to_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_D/COVID_Met_ChEBI_Metaboanalyst.csv'
)



### LLM method - using best threshold (0.75)

In [None]:
# Build the LLM name -> ChEBI conversion table from the 0.75-threshold column.
llm_subset = (
    pd.read_csv(
        '/Users/judepops/Documents/PathIntegrate/Code/LLM_Annotation/manual_automated_subset.csv',
        index_col=0,
    )
    .loc[:, ['Query', 'Matched COMPOUND_ID 0.75']]
    .rename(columns={'Matched COMPOUND_ID 0.75': 'ChEBI'})
    .assign(
        # Names are trimmed and lower-cased to match the metabolomics columns.
        Query=lambda df: df['Query'].str.strip().str.lower(),
        # Non-numeric IDs become NaN and are dropped in the next step.
        ChEBI=lambda df: pd.to_numeric(df['ChEBI'], errors='coerce'),
    )
    .dropna(subset=['ChEBI'])
    .astype({'ChEBI': 'Int64'})
)

# Map the metabolomics matrix to ChEBI IDs, re-attach the trailing columns,
# save, and show the result as the cell output.
processed_data_mapped_llm = sspa.map_identifiers(
    llm_subset,
    output_id_type="ChEBI",
    matrix=metabolomics_data_processed_final,
)
processed_data_mapped_llm_final = pd.concat(
    [processed_data_mapped_llm, last_7_columns],
    axis=1,
)
processed_data_mapped_llm_final.to_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_D/COVID_Met_ChEBI_LLM.csv'
)
processed_data_mapped_llm_final

### LLM V2 Method - this is a separate file that was computed with a 0.75 threshold and provided the best predictions

In [None]:
# Build the LLM V2 name -> ChEBI conversion table.
# Note: this rebinds `llm_subset`, replacing the table from the previous
# LLM cell.
llm_subset = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_B/llm_subset_v2.csv', index_col=0)
llm_subset = llm_subset.rename(columns={'Matched COMPOUND_ID': 'ChEBI'})
llm_subset = llm_subset.drop(columns=['Matched Compound Name', 'Source', 'Correct COMPOUND_ID'])

# Names are trimmed and lower-cased to match the metabolomics columns;
# non-numeric IDs become NaN and those rows are dropped before the cast to a
# nullable integer type.
llm_subset['Query'] = llm_subset['Query'].str.strip().str.lower()
llm_subset['ChEBI'] = pd.to_numeric(llm_subset['ChEBI'], errors='coerce')
llm_subset.dropna(subset=['ChEBI'], inplace=True)
llm_subset['ChEBI'] = llm_subset['ChEBI'].astype('Int64')


# creating mapped file
processed_data_mapped_llm_v2 = sspa.map_identifiers(llm_subset, output_id_type="ChEBI", matrix=metabolomics_data_processed_final)
# The trailing columns must be attached to the V2-mapped matrix. Concatenating
# `processed_data_mapped_llm` here would write the V1 mapping to the V2 file.
processed_data_mapped_llm_final_v2 = pd.concat([processed_data_mapped_llm_v2, last_7_columns], axis=1)
processed_data_mapped_llm_final_v2.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_D/COVID_Met_ChEBI_LLM_V2.csv')
processed_data_mapped_llm_final_v2

# Pathway analysis comparison with different ID-converted matrices

In [None]:
# Initialising the pathway database files

# Reactome pathways (human, multi-omics), downloading the latest release
reactome_pathways = sspa.process_reactome(organism="Homo sapiens", download_latest=True, omics_type='multiomics')

# Every compound that appears in at least one Reactome pathway.
# chain.from_iterable flattens the per-pathway lists in linear time;
# sum(lists, []) would re-copy the growing list at every step.
all_reactome_cpds = set(chain.from_iterable(sspa.utils.pathwaydf_to_dict(reactome_pathways).values()))

### MetaboAnalyst pathway mapping results

In [None]:
# Column labels of the MetaboAnalyst-mapped matrix are cast to str, then
# intersected with the set of compounds present in Reactome.
processed_data_mapped_metaboanalyst.columns = processed_data_mapped_metaboanalyst.columns.map(str)
mapped_annotated_cpds_metaboanalyst = set(processed_data_mapped_metaboanalyst.columns).intersection(all_reactome_cpds)
print(len(mapped_annotated_cpds_metaboanalyst))

# Pathway table as {pathway: [compounds]}
pathways_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

# For each pathway, how many matrix columns are members of it
pathway_molecule_count = {
    pathway: sum(1 for cpd in processed_data_mapped_metaboanalyst.columns if cpd in members)
    for pathway, members in pathways_dict.items()
}

# Tabulate the counts
pathway_molecule_count_df = pd.DataFrame({
    'Pathway': list(pathway_molecule_count.keys()),
    'Molecule_Count': list(pathway_molecule_count.values()),
})

# Largest pathways first for display; the mean is taken only over pathways
# with at least one mapped molecule.
pathway_molecule_count_df_sorted = pathway_molecule_count_df.sort_values(by='Molecule_Count', ascending=False)
non_zero_molecule_count_df = pathway_molecule_count_df[pathway_molecule_count_df['Molecule_Count'].gt(0)]
mean_molecule_count_non_zero_metaboanalyst = non_zero_molecule_count_df['Molecule_Count'].mean()
pathway_molecule_count_df_sorted.head(50), mean_molecule_count_non_zero_metaboanalyst


### Creating a pathway results dataframe

In [None]:
# Summary table with one row per mapping method, seeded with MetaboAnalyst.
#   ChEBI           - number of columns in the mapped matrix
#   Pathways        - size of the overlap between mapped columns and Reactome compounds
#   Avg Pathway Met - mean molecule count over pathways with a non-zero count
# NOTE(review): 'Pathways' holds a compound-overlap count rather than a count
# of pathways - confirm the intended metric/label.
pathway_results = pd.DataFrame(
    data=[[
        len(processed_data_mapped_metaboanalyst.columns),
        len(mapped_annotated_cpds_metaboanalyst),
        mean_molecule_count_non_zero_metaboanalyst,
    ]],
    columns=['ChEBI', 'Pathways', 'Avg Pathway Met'],
    index=['MetaboAnalyst'],
)

### Gather pathway mapping statistics for other mapped dataframes

LLM

In [None]:
# Column labels of the LLM-mapped matrix are cast to str, then intersected
# with the set of compounds present in Reactome.
processed_data_mapped_llm.columns = processed_data_mapped_llm.columns.map(str)
mapped_annotated_cpds_llm = set(processed_data_mapped_llm.columns) & set(all_reactome_cpds)
# Report the LLM overlap (not the MetaboAnalyst one computed in an earlier cell).
print(len(mapped_annotated_cpds_llm))

# pathway dataframe to a dictionary
pathways_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

# dictionary to store the count of molecules for each pathway
pathway_molecule_count = {k: len([i for i in processed_data_mapped_llm.columns if i in v]) for k, v in pathways_dict.items()}

# converting dictionary to dataframe
pathway_molecule_count_df = pd.DataFrame(list(pathway_molecule_count.items()), columns=['Pathway', 'Molecule_Count'])

# sorting dataframe by molecule count; the mean is taken only over pathways
# with at least one mapped molecule
pathway_molecule_count_df_sorted = pathway_molecule_count_df.sort_values(by='Molecule_Count', ascending=False)
non_zero_molecule_count_df = pathway_molecule_count_df[pathway_molecule_count_df['Molecule_Count'] > 0]
mean_molecule_count_non_zero_llm = non_zero_molecule_count_df['Molecule_Count'].mean()
pathway_molecule_count_df_sorted.head(50), mean_molecule_count_non_zero_llm


In [None]:
# Add the LLM row: mapped-matrix column count, size of the overlap with
# Reactome compounds, and mean molecule count over non-zero pathways.
pathway_results.loc['LLM'] = [len(processed_data_mapped_llm.columns), len(mapped_annotated_cpds_llm), mean_molecule_count_non_zero_llm]

LLM v2

In [None]:
# Column labels of the LLM V2-mapped matrix are cast to str, then intersected
# with the set of compounds present in Reactome.
processed_data_mapped_llm_v2.columns = processed_data_mapped_llm_v2.columns.map(str)
mapped_annotated_cpds_llm_v2 = set(processed_data_mapped_llm_v2.columns) & set(all_reactome_cpds)
# Report the LLM V2 overlap (not the MetaboAnalyst one computed in an earlier cell).
print(len(mapped_annotated_cpds_llm_v2))

# pathway dataframe to a dictionary
pathways_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

# dictionary to store the count of molecules for each pathway
pathway_molecule_count = {k: len([i for i in processed_data_mapped_llm_v2.columns if i in v]) for k, v in pathways_dict.items()}

# converting dictionary to dataframe
pathway_molecule_count_df = pd.DataFrame(list(pathway_molecule_count.items()), columns=['Pathway', 'Molecule_Count'])

# sorting dataframe by molecule count; the mean is taken only over pathways
# with at least one mapped molecule
pathway_molecule_count_df_sorted = pathway_molecule_count_df.sort_values(by='Molecule_Count', ascending=False)
non_zero_molecule_count_df = pathway_molecule_count_df[pathway_molecule_count_df['Molecule_Count'] > 0]
mean_molecule_count_non_zero_llm_v2 = non_zero_molecule_count_df['Molecule_Count'].mean()
pathway_molecule_count_df_sorted.head(50), mean_molecule_count_non_zero_llm_v2


In [None]:
# Add the LLM_V2 row: mapped-matrix column count, size of the overlap with
# Reactome compounds, and mean molecule count over non-zero pathways.
pathway_results.loc['LLM_V2'] = [len(processed_data_mapped_llm_v2.columns), len(mapped_annotated_cpds_llm_v2), mean_molecule_count_non_zero_llm_v2]

Manual

In [None]:
# Column labels of the manually mapped matrix are cast to str, then
# intersected with the set of compounds present in Reactome.
processed_data_mapped_manual.columns = processed_data_mapped_manual.columns.map(str)
mapped_annotated_cpds_manual = set(processed_data_mapped_manual.columns).intersection(all_reactome_cpds)
print(len(mapped_annotated_cpds_manual))

# Pathway table as {pathway: [compounds]}
pathways_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

# For each pathway, how many matrix columns are members of it
pathway_molecule_count = {
    pathway: sum(1 for cpd in processed_data_mapped_manual.columns if cpd in members)
    for pathway, members in pathways_dict.items()
}

# Tabulate the counts
pathway_molecule_count_df = pd.DataFrame({
    'Pathway': list(pathway_molecule_count.keys()),
    'Molecule_Count': list(pathway_molecule_count.values()),
})

# Largest pathways first for display; the mean is taken only over pathways
# with at least one mapped molecule.
pathway_molecule_count_df_sorted = pathway_molecule_count_df.sort_values(by='Molecule_Count', ascending=False)
non_zero_molecule_count_df = pathway_molecule_count_df[pathway_molecule_count_df['Molecule_Count'].gt(0)]
mean_molecule_count_non_zero_manual = non_zero_molecule_count_df['Molecule_Count'].mean()
pathway_molecule_count_df_sorted.head(50), mean_molecule_count_non_zero_manual


In [None]:
# Add the Manual row: mapped-matrix column count, size of the overlap with
# Reactome compounds, and mean molecule count over non-zero pathways.
pathway_results.loc['Manual'] = [len(processed_data_mapped_manual.columns), len(mapped_annotated_cpds_manual), mean_molecule_count_non_zero_manual]

### Final df

In [None]:
# Display the combined summary table (one row per mapping method).
pathway_results

In [None]:
import matplotlib.pyplot as plt

# Order the mapping methods by ascending 'Pathways' value.
pathway_results_sorted = pathway_results.sort_values(by='Pathways')

# Grouped bars: 'Pathways' on the left axis, 'Avg Pathway Met' on a twin
# right axis, drawn side by side for each method.
bar_width = 0.35
index = range(len(pathway_results_sorted))
right_bar_positions = [pos + bar_width for pos in index]
tick_positions = [pos + bar_width / 2 for pos in index]

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

ax1.bar(index, pathway_results_sorted['Pathways'], width=bar_width, label='Pathways', color='b')
ax2.bar(right_bar_positions, pathway_results_sorted['Avg Pathway Met'], width=bar_width, label='Avg Pathway Met', color='r')

# Ticks sit midway between each pair of bars and carry the method names.
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(pathway_results_sorted.index)

ax1.set(xlabel='Number', title='Pathway Results')
ax1.set_ylabel('Pathways', color='b')
ax2.set_ylabel('Avg Pathway Met', color='r')

# A figure-level legend collects the labelled bars from both axes.
fig.legend(loc="upper left", bbox_to_anchor=(0,1), bbox_transform=ax1.transAxes)
plt.show()
