In [None]:
#Load libraries 

import pandas as pd
import sspa
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
import scipy

In [None]:
# Metabolomics run: abundance table plus the Reactome compound-level pathway table.
# NOTE: the proteomics loading cell assigns the same three names (`df`,
# `analytes_in_df`, `reactome_pathways`); whichever of the two cells ran last
# determines what the downstream cells see.
df = pd.read_csv(
    '../Data/Su_COVID_metabolomics_processed_commoncases.csv',
    index_col=0,
)
# Every column except the final two is treated as an analyte.
analytes_in_df = df.columns[:-2]
# Author's recorded result: 2294 Reactome pathways
reactome_pathways = sspa.process_gmt(
    "../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt"
)


In [None]:
# Proteomics run: abundance table plus the Reactome UniProt-level pathway table.
# NOTE: this overwrites `df`, `analytes_in_df` and `reactome_pathways` from the
# metabolomics loading cell.
df = pd.read_csv(
    '../Data/Su_COVID_proteomics_processed_commoncases.csv',
    index_col=0,
)
# Every column except the final two is treated as an analyte.
analytes_in_df = df.columns[:-2]
# Author's recorded result: 2596 Reactome pathways
reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile='../Data/UniProt2Reactome_All_Levels_ver84.txt',
    download_latest=False,
    filepath=None,
)
reactome_pathways

In [None]:
# Collect every distinct analyte identifier that appears in any Reactome pathway.
# Column 0 is skipped (presumably the pathway name - confirm against sspa's output).
# Flattening approach taken from
# https://stackoverflow.com/questions/26977076/pandas-unique-values-multiple-columns
# ravel('K') flattens in the order the elements are stored in memory and returns
# a view when possible.
reactome_members = pd.Series(reactome_pathways.iloc[:, 1:].values.ravel('K'))

# Pathway rows are padded to a common width ('' for the metabolomic table,
# None for the proteomic one, judging by the filters used in the coverage loop).
# Drop the padding - and NaN - so the placeholder is not counted as an analyte.
# Counts recorded before this filter was added may include the placeholder.
reactome_members = reactome_members[reactome_members.notna() & (reactome_members != '')]
analytes_in_reactome = reactome_members.unique()

In [None]:
# Analytes measured in the dataset that also belong to at least one Reactome
# pathway. Going through sets de-duplicates both sides before intersecting.
analytes_mapping_reactome = list(
    set(analytes_in_reactome).intersection(set(analytes_in_df))
)

In [None]:
# These three counts are enough for a Venn diagram (a bar graph is unsuitable
# because the Reactome total dwarfs the dataset; a contingency table is
# unsuitable because there is no double-negative cell).
# Printed one per line, in this order:
#   1. analytes in the dataset                 (author recorded: 333 metabolites, 454 proteins)
#   2. unique analytes in Reactome             (author recorded: 2393 metabolites, 11596 proteins)
#   3. dataset analytes that map to Reactome   (author recorded: 78 metabolites, 354 proteins)
print(
    len(analytes_in_df),
    len(analytes_in_reactome),
    len(analytes_mapping_reactome),
    sep='\n',
)

In [None]:
# Share of dataset analytes that map to a Reactome pathway, in percent.
# Author recorded: 23.42% of metabolites, 77.97% of proteins.
print(len(analytes_mapping_reactome) / len(analytes_in_df) * 100)

Compute the average size of the Reactome pathways, then calculate, for each Reactome pathway, the percentage of its analytes that are present in the dataset:

In [None]:
# Per-pathway coverage of whichever dataset is currently loaded in
# `analytes_in_df` / `reactome_pathways`.
pathway_len = []          # size of every Reactome pathway
number_coverage = []      # dataset analytes per pathway, kept only when >= 2 are present
percentage_coverage = []  # same pathways, expressed as a percentage of pathway size

# Build the lookup set once instead of once per pathway.
dataset_analytes = set(analytes_in_df)

for _, pathway_row in reactome_pathways.iterrows():

    # Column 0 is skipped; the remaining cells hold the pathway members plus
    # padding. The metabolomic table was filtered on '' and the proteomic table
    # on None; dropping '', None and NaN together means the same line works for
    # both omics without commenting code in and out.
    members = [m for m in pathway_row.iloc[1:] if pd.notna(m) and m != '']

    # Number of pathway members that were measured in the dataset
    number_mapping = len(set(members) & dataset_analytes)

    # 2 is the minimum number of analytes for a pathway to be used by sspa kPCA
    if number_mapping >= 2:
        number_coverage.append(number_mapping)
        # Fraction of the pathway covered by the dataset, for plotting
        percentage_coverage.append((number_mapping/len(members)) * 100)

    # Pathway size is recorded for every pathway, covered or not
    pathway_len.append(len(members))


print("Average length of Reactome pathway: ", sum(pathway_len) / len(pathway_len), "analytes")
print("Number of pathways that will be detected by sspa: ", len(percentage_coverage))
print("Average analytes present in pathway: ", sum(number_coverage) / len(number_coverage), "analytes")
print("Average pathway coverage: ", sum(percentage_coverage) / len(percentage_coverage), "%")

Note: root pathways are removed in other steps of the analysis, so the number of pathways there will not match the numbers reported here.

Metabolomic: <br> 
Average length of Reactome pathway:  13.640366172624237 analytes <br>
Number of pathways that will be detected by sspa:  160 <br>
Average analytes present in pathway:  5.125 analytes <br>
Average pathway coverage:  12.772397118617086 % <br>

Proteomic: <br>
Average length of Reactome pathway:  49.74306625577812 analytes <br>
Number of pathways that will be detected by sspa:  603 <br>
Average analytes present in pathway:  6.200663349917082 analytes <br>
Average pathway coverage:  9.070738437503643 % <br>

To plot the data for both omics, run the metabolomic code first and save the results under another variable name, then repeat for the proteomic data.

In [None]:
# Displays the stored metabolomic copy of `number_coverage`.
# The assignment is commented out, so this name only exists if it was assigned
# by hand while the metabolomic data was loaded; on a fresh kernel this cell
# raises NameError. NOTE(review): do not simply uncomment it - after a
# top-to-bottom run `number_coverage` holds the proteomic results.
metabolomic_number_coverage #= number_coverage

In [None]:
# Displays the stored proteomic copy of `number_coverage`.
# The assignment is commented out, so this name only exists if it was assigned
# by hand while the proteomic data was loaded; on a fresh kernel this cell
# raises NameError.
proteomic_number_coverage# = number_coverage

In [None]:
# Long-form table of per-pathway analyte counts (log10-scaled) for both omics,
# with a `Type` column for splitting the histogram by hue.
meta_df = pd.DataFrame(
    {'val': np.log10(metabolomic_number_coverage)}
).assign(Type="Metabolomic")

pro_df = pd.DataFrame(
    {'val': np.log10(proteomic_number_coverage)}
).assign(Type="Proteomic")

# ignore_index gives the stacked frame a unique 0..n-1 index. Without it both
# halves keep their own 0-based labels, and duplicated index labels can break
# label-aligned operations downstream (some seaborn versions reportedly fail
# on long-form data with a duplicated index).
number_df = pd.concat([meta_df, pro_df], axis=0, ignore_index=True)
number_df

In [None]:
# Overlaid histograms of the (log10) number of dataset analytes per pathway,
# one colour per omics type.
sns.set(font_scale=2)
sns.set_style("ticks")  # same as "white" but with ticks

fig, ax = plt.subplots(figsize=(10.5, 6))

# Colour order is chosen so that metabolomics is orange and proteomics is blue
palette = ['tab:orange', 'tab:blue']

sns.histplot(
    number_df,
    x="val",
    hue="Type",
    palette=palette,
    edgecolor="k",
    bins=30,
    ax=ax,
)

ax.set_title('Pathway coverage', fontsize=30, pad=12)
ax.set_xlabel('Number of analytes mapping to Reactome pathway (log10)', fontsize=22, labelpad=8)
ax.set_ylabel('Count', fontsize=22, labelpad=12);

# To export the figure:
#plt.savefig( '../Figures/number_pathway_coverage.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

In [None]:
# Displays the stored metabolomic copy of `percentage_coverage`.
# The assignment is commented out, so this name only exists if it was assigned
# by hand while the metabolomic data was loaded; on a fresh kernel this cell
# raises NameError. NOTE(review): do not simply uncomment it - after a
# top-to-bottom run `percentage_coverage` holds the proteomic results.
metabolomic_percentage_coverage # = percentage_coverage

In [None]:
# Displays the stored proteomic copy of `percentage_coverage`.
# The assignment is commented out, so this name only exists if it was assigned
# by hand while the proteomic data was loaded; on a fresh kernel this cell
# raises NameError.
proteomic_percentage_coverage #= percentage_coverage

In [None]:
# Long-form table of per-pathway coverage percentages for both omics, with a
# `Type` column for splitting the histogram by hue.
# NOTE: this rebinds `number_df`, which previously held the log10 counts.
meta_df = pd.DataFrame(
    {'val': metabolomic_percentage_coverage}
).assign(Type="Metabolomic")

pro_df = pd.DataFrame(
    {'val': proteomic_percentage_coverage}
).assign(Type="Proteomic")

# ignore_index gives the stacked frame a unique 0..n-1 index. Without it both
# halves keep their own 0-based labels, and duplicated index labels can break
# label-aligned operations downstream (some seaborn versions reportedly fail
# on long-form data with a duplicated index).
number_df = pd.concat([meta_df, pro_df], axis=0, ignore_index=True)
number_df

In [None]:
# Largest metabolomic pathway coverage, then the distribution of all values.
print(max(metabolomic_percentage_coverage))

ax = sns.histplot(metabolomic_percentage_coverage, bins=30)
ax.set_title('Pathway coverage', fontsize=16)
ax.set_xlabel('Percentage of metabolites mapping to Reactome pathway (%)', fontsize=11)
ax.set_ylabel('Count', fontsize=13)

In [None]:
# Largest proteomic pathway coverage, then the distribution of all values.
print(max(proteomic_percentage_coverage))

ax = sns.histplot(proteomic_percentage_coverage, bins=30)
ax.set_title('Pathway coverage', fontsize=16)
ax.set_xlabel('Percentage of proteins mapping to Reactome pathway (%)', fontsize=11)
ax.set_ylabel('Count', fontsize=13)

In [None]:
# Overlaid histograms of the percentage of each pathway covered by the
# dataset, one colour per omics type.
sns.set(font_scale=2)
sns.set_style("ticks")  # same as "white" but with ticks

fig, ax = plt.subplots(figsize=(10.5, 6))

# Colour order is chosen so that metabolomics is orange and proteomics is blue
palette = ['tab:orange', 'tab:blue']

sns.histplot(
    number_df,
    x="val",
    hue="Type",
    palette=palette,
    edgecolor="k",
    bins=30,
    ax=ax,
)

ax.set_title('Pathway coverage', fontsize=30, pad=12)
ax.set_xlabel('Percentage of analytes mapping to Reactome pathway (%)', fontsize=22, labelpad=8)
ax.set_ylabel('Count', fontsize=22, labelpad=12);

# To export the figure:
#plt.savefig( '../Figures/percent_pathway_coverage.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

Calculating pathway scores and getting the pathways in common between both:

In [None]:
# Metabolomics: load the abundance table and the Reactome compound pathways,
# then compute kPCA pathway scores with sspa.
# Paths use the same '../Data/' prefix as the loading cells at the top of the
# notebook, so every cell resolves files from one working directory.
# NOTE: `meta_df` is rebound here; earlier cells used the name for plot tables.
meta_df = pd.read_csv('../Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
metabolomic_reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

# Every column except the final two is treated as an analyte.
metabolomic_kpca_scores = sspa.sspa_kpca(meta_df.iloc[:,:-2], metabolomic_reactome_pathways)

analytes_in_meta_df = meta_df.columns[:-2]

In [None]:
# Proteomics: load the abundance table and the Reactome UniProt pathways,
# then compute kPCA pathway scores with sspa.
# Paths use the same '../Data/' prefix as the loading cells at the top of the
# notebook, so every cell resolves files from one working directory.
# NOTE(review): the first proteomic cell reads
# 'UniProt2Reactome_All_Levels_ver84.txt' while this one reads
# 'UniProt2Reactome_All_Levels.txt' - confirm both refer to the same release.
# NOTE: `pro_df` is rebound here; earlier cells used the name for plot tables.
pro_df = pd.read_csv('../Data/Su_COVID_proteomics_processed_commoncases.csv', index_col=0)
proteomic_reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile='../Data/UniProt2Reactome_All_Levels.txt',
    download_latest=False,
    filepath=None,
)

# Every column except the final two is treated as an analyte.
proteomic_kpca_scores = sspa.sspa_kpca(pro_df.iloc[:,:-2], proteomic_reactome_pathways)

analytes_in_pro_df = pro_df.columns[:-2]

In [None]:
# Pathways that received a kPCA score in both omics layers
# (going through sets removes duplicates).
intersection = list(
    set(metabolomic_kpca_scores.columns.tolist())
    & set(proteomic_kpca_scores.columns.tolist())
)

# Root pathways are read from a two-column, header-less sheet:
# column 0 supplies the keys and column 1 the values of `root_pathway_dict`.
# The path uses the same '../Data/' prefix as the loading cells at the top of
# the notebook.
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = dict(zip(root_path[0], root_path[1]))
root_pathway_names = list(root_pathway_dict.keys())

# Drop root pathways; dict membership is a hash lookup, so the list is not
# rescanned for every candidate.
common_pathways = [pathway for pathway in intersection if pathway not in root_pathway_dict]

In [None]:
# Restrict each pathway-definition table to the pathways shared by both omics.
metabolite_in_common = metabolomic_reactome_pathways.index.isin(common_pathways)
common_pathway_metabolite_df = metabolomic_reactome_pathways[metabolite_in_common]

protein_in_common = proteomic_reactome_pathways.index.isin(common_pathways)
common_pathway_protein_df = proteomic_reactome_pathways[protein_in_common]

In [None]:
# Metabolomic coverage of the pathways common to both omics.
# NOTE: the proteomic cell that follows reuses `pathway_len`, `number_coverage`
# and `percentage_coverage`, so these results are replaced once it runs.

pathway_len = []          # size of every pathway
number_coverage = []      # dataset metabolites per pathway, kept only when >= 2 are present
percentage_coverage = []  # same pathways, expressed as a percentage of pathway size

# Build the lookup set once instead of once per pathway.
measured_metabolites = set(analytes_in_meta_df)

for _, pathway_row in common_pathway_metabolite_df.iterrows():

    # Column 0 is skipped; the remaining cells hold the pathway members plus
    # padding. Dropping '' as before, plus None/NaN, keeps the pathway size
    # correct whichever placeholder the pathway table uses.
    meta_list2 = [m for m in pathway_row.iloc[1:] if pd.notna(m) and m != '']

    # Number of pathway members that were measured in the dataset
    number_mapping = len(set(meta_list2) & measured_metabolites)

    # 2 is the minimum number of analytes for a pathway to be used by sspa kPCA
    if number_mapping >= 2:
        number_coverage.append(number_mapping)
        # Fraction of the pathway covered by the dataset, for plotting
        percentage_coverage.append((number_mapping/len(meta_list2)) * 100)

    # Pathway size is recorded for every pathway, covered or not
    pathway_len.append(len(meta_list2))

In [None]:
# Proteomic coverage of the pathways common to both omics.
# NOTE: this resets `pathway_len`, `number_coverage` and `percentage_coverage`,
# replacing whatever the metabolomic cell stored in them.

pathway_len = []          # size of every pathway
number_coverage = []      # dataset proteins per pathway, kept only when >= 2 are present
percentage_coverage = []  # same pathways, expressed as a percentage of pathway size

# Build the lookup set once instead of once per pathway.
measured_proteins = set(analytes_in_pro_df)

for _, pathway_row in common_pathway_protein_df.iterrows():

    # Column 0 is skipped; the remaining cells hold the pathway members plus
    # padding. pd.notna drops None (as `!= None` did) and also NaN, which
    # `!= None` would have counted as a pathway member; '' is dropped too.
    pro_list2 = [p for p in pathway_row.iloc[1:] if pd.notna(p) and p != '']

    # Number of pathway members that were measured in the dataset
    number_mapping = len(set(pro_list2) & measured_proteins)

    # 2 is the minimum number of analytes for a pathway to be used by sspa kPCA
    if number_mapping >= 2:
        number_coverage.append(number_mapping)
        # Fraction of the pathway covered by the dataset, for plotting
        percentage_coverage.append((number_mapping/len(pro_list2)) * 100)

    # Pathway size is recorded for every pathway, covered or not
    pathway_len.append(len(pro_list2))


In [None]:
# Displays the full `number_coverage` list from whichever coverage loop ran most recently.
number_coverage

In [None]:
# Empty summary table with one row per common pathway, taking its row labels
# and their order from the metabolite pathway table.
full_df = pd.DataFrame(index=common_pathway_metabolite_df.index)

In [None]:
# Recompute the metabolite coverage here, keyed by pathway ID, instead of
# reading `number_coverage` / `percentage_coverage`: those lists are shared by
# the metabolomic and proteomic loops above, so after a top-to-bottom run they
# hold the proteomic values and would be written into the metabolite columns.
measured_metabolites = set(analytes_in_meta_df)
metabolite_number = {}
metabolite_percent = {}

for pathway_id, pathway_row in common_pathway_metabolite_df.iterrows():
    # Column 0 is skipped; drop the row padding ('' / None / NaN).
    members = [m for m in pathway_row.iloc[1:] if pd.notna(m) and m != '']
    mapped = len(set(members) & measured_metabolites)
    metabolite_number[pathway_id] = mapped
    # An empty pathway has no defined coverage; avoid dividing by zero.
    metabolite_percent[pathway_id] = (mapped / len(members)) * 100 if members else np.nan

# Assigning Series (rather than lists) aligns the values on the pathway index.
# Unlike the coverage loops, pathways with fewer than 2 mapped metabolites are
# kept rather than skipped, so there is always exactly one value per row.
full_df['Metabolite number'] = pd.Series(metabolite_number)
full_df['Metabolite percent'] = pd.Series(metabolite_percent)

In [None]:
# Recompute the protein coverage here, keyed by pathway ID, instead of
# assigning the `number_coverage` / `percentage_coverage` lists. Those lists
# follow the row order of `common_pathway_protein_df`, whereas `full_df` is
# indexed by `common_pathway_metabolite_df.index`; list assignment is
# positional, so values could land on the wrong pathway if the orders differ.
measured_proteins = set(analytes_in_pro_df)
protein_number = {}
protein_percent = {}

for pathway_id, pathway_row in common_pathway_protein_df.iterrows():
    # Column 0 is skipped; drop the row padding ('' / None / NaN).
    members = [p for p in pathway_row.iloc[1:] if pd.notna(p) and p != '']
    mapped = len(set(members) & measured_proteins)
    protein_number[pathway_id] = mapped
    # An empty pathway has no defined coverage; avoid dividing by zero.
    protein_percent[pathway_id] = (mapped / len(members)) * 100 if members else np.nan

# Assigning Series (rather than lists) aligns the values on the pathway index.
# Unlike the coverage loops, pathways with fewer than 2 mapped proteins are
# kept rather than skipped, so there is always exactly one value per row.
full_df['Protein number'] = pd.Series(protein_number)
full_df['Protein percent'] = pd.Series(protein_percent)

In [None]:
# Five pathways with the most mapped metabolites
full_df.sort_values('Metabolite number', ascending=False).head(5)

In [None]:
# Five pathways with the highest metabolite coverage percentage
full_df.sort_values('Metabolite percent', ascending=False).head(5)

In [None]:
# Five pathways with the most mapped proteins
full_df.sort_values('Protein number', ascending=False).head(5)

In [None]:
# Five pathways with the highest protein coverage percentage
full_df.sort_values('Protein percent', ascending=False).head(5)