In [1]:
#Load libraries 
import pandas as pd
import sspa
import seaborn as sns
import matplotlib.pyplot as plt
import scipy


In [2]:
df = pd.read_csv('../Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)
reactome_pathways = sspa.process_gmt("../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

In [None]:
df = pd.read_csv('../Data/Su_COVID_proteomics_processed_commoncases.csv', index_col=0)
reactome_pathways = sspa.process_reactome('Homo sapiens', infile = '../Data/UniProt2Reactome_all_Levels_ver84.txt', download_latest = False, filepath = None)

In [3]:
#Convert pathway ID to name
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}

root_pathway_names = list(root_pathway_dict.keys())
#Remove root pathways
reactome_pathways = reactome_pathways[~reactome_pathways.index.isin(root_pathway_names)]

In [4]:
for i in range(len(df.index)):
    if df.WHO_status[i] == '1-2':
        df['Group'][i] = 'Mild'
    else:
        df['Group'][i] = 'Severe'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Group'][i] = 'Severe'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Group'][i] = 'Mild'


### Over-representation analysis

In [None]:
#Initiate an ORA object 
ora = sspa.sspa_ora(df.iloc[:,:-2], df["Group"], reactome_pathways, 0.05, custom_background=None)
#Carry out ORA
ora_res = ora.over_representation_analysis()
print(len(ora.DA_molecules))
display(ora.DA_test_res.sort_values(by="P-value"))
display(ora_res.sort_values(by="P-value"))


top_10_pathways = ora_res.sort_values(by="P-value").iloc[0:10, :]

In [None]:
#Change name to have a line break otherwise it won't fit on the plot
top_10_pathways.Pathway_name[62] = '\n Transport of inorganic cations/anions and \n amino acids/oligopeptides   '

In [None]:

plt.figure(figsize=(9, 5))

sns.set(font_scale = 2)
sns.set_style("ticks") # same as "white" but with ticks

#If you want to colour by significance
#bar_color = ['tab:green' if float(i) < 0.05 else 'tab:grey' for i in top_20_pathways['P-value']]
#sns.barplot(data=top_10_pathways, y="Pathway_name", x="P-value", orient="h", palette=bar_color) #rocket, magma

#If you want to colour by gradient
ax = sns.barplot(data=top_10_pathways, y="Pathway_name", x="P-value", orient="h", palette="rocket",dodge=False, width=0.8) #rocket, magma

#Increase space between the bars and the axis
ax.relim()
ax.autoscale_view()


#Add pathway coverage
label = []
for i in top_10_pathways.Coverage:
    num = i.split('/')
    print(num)
    percent = (int(num[0]) / int(num[1])) *100
    label.append("  "+ str(round(percent,1))+ '%')

print(label)

ax.bar_label(ax.containers[0], labels=label)

#plt.title('Metabolomics',fontsize=22,pad=10)
plt.xlabel('Unadjusted p-value',fontsize=26)
plt.ylabel('Pathway name',fontsize=26) 
plt.xlim(0, 0.12)


plt.axvline(0.03, c="black")


#plt.savefig( '../Figures/proteomic_ORA_top_10.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

In [None]:
list(top_10_pathways.Coverage)

For integrated data, take the two ORA results and combine the p-values with Fisher's method, which is what is commonly done (See Maghsoudi et al., 2021 for examples):

In [None]:
metabolomic_ora = ora_res.sort_values(by="P-value")

In [None]:
proteomic_ora = ora_res.sort_values(by="P-value")

In [None]:
metabolomic_ora.index = metabolomic_ora["ID"]
metabolomic_ora.drop(columns = ["ID"]) #using Sara's code to drop root pathways

In [None]:
proteomic_ora.index = proteomic_ora["ID"]
proteomic_ora.drop(columns = ["ID"]) #using Sara's code to drop root pathways

In [None]:
result = metabolomic_ora.merge(proteomic_ora, how='inner',right_index=True, left_index = True)
result[:5]

In [None]:
result["combined_pval"] = 0

In [None]:
for i in range(len(result.index)):
    ID_list = [result["P-value_x"][i],result["P-value_y"][i]]
    print(ID_list)
    test_stat,pval = scipy.stats.combine_pvalues(ID_list, method='fisher',weights=None)
    print(pval)
    result["combined_pval"][i] = pval


In [None]:
result[:10]

In [None]:
result.Coverage_x[0]

In [None]:
top_10_pathways = result.sort_values(by="combined_pval").iloc[0:10, :]

In [None]:
#Obtain pathway coverage for the top 10 results
df = pd.read_csv("../Data/Su_integrated_data.csv", index_col=0)
reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str") #Dtype warning because in some columns, some values are in string format whereas some are in integer format, that's why I specify dtype="str"
kpca_scores = sspa.sspa_kpca(df.iloc[:,:-2], reactome_pathways)


#Convert pathway ID to name
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}

root_pathway_names = list(root_pathway_dict.keys())
#Using Sara's code, remove root pathways
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))



#Filter out the molecules in the pathways that are not present in the dataset
#Obtain all unique values in dataset
compounds_present = list(df.columns[:-2])
filtered_dict = {} 

#Obtain pathways and corresponding molecules for all Reactome pathways, store as dictionary
orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

#Filter out dictionary to retain only the pathways that remain after kPCA
my_keys = kpca_scores.columns
pathways_dict = {key: orig_dict[key] for key in my_keys}


#My code adapted from Cecilia's
#If the key values are not part of the compounds in dataset then remove
for key,value in pathways_dict.items():
    new_val = [item for item in value if item in compounds_present]
    if len(new_val) >= 2: #at least two compounds in the pathway
        filtered_dict[key] = new_val

In [None]:

plt.figure(figsize=(9, 5.5))
sns.set(font_scale = 2)

sns.set_style("ticks") # same as "white" but with ticks

ax = sns.barplot(data=top_10_pathways, y="Pathway_name_x", x="combined_pval", orient="h", palette="rocket") #rocket, magma


#Increase space between the bars and the axis
ax.relim()
ax.autoscale_view()



label = []
# percent1 = []
# percent2 = []

#Getting pathway coverage by taking the average of the metabolomic and proteomic pathway coverage (not ideal)
# for i in range(10):
#     pathway1 =  top_10_pathways.Coverage_x[i]
#     num = pathway1.split('/')
#     percent1 = (int(num[0]) / int(num[1])) *100
#     print(percent1)

#     pathway2 =  top_10_pathways.Coverage_y[i]
#     num = pathway2.split('/')
#     percent2 = (int(num[0]) / int(num[1])) *100
#     print(percent2)

#     percent = (percent1+percent2)/2
#     print(percent)
#     label.append("  "+ str(round(percent,1))+ '%')

# print(label)


#Getting pathway coverage by using the Reactome pathway definitions concatenated together
for i in range(10):
    pathway_nam = top_10_pathways.iloc[i,0]
    
    num_in_df = len(filtered_dict[pathway_nam])
    num_whole_pathway = len(pathways_dict[pathway_nam])

    percent = (num_in_df/num_whole_pathway) * 100

    label.append("  "+ str(round(percent,1))+ '%')

ax.bar_label(ax.containers[0], labels=label)

#plt.title('ORA for integrated data',fontsize=22,pad=10)
plt.xlabel('Unadjusted p-value',fontsize=26)
plt.ylabel('Pathway name',fontsize=26) 
plt.xlim(0, 0.57);

plt.axvline(0.05, c="black")

#plt.savefig( '../Figures/integrated_ORA_top_10.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

### Gene-set Enrichment Analysis

In [5]:
gsea_res = sspa.sspa_gsea(df.iloc[:,:-2], df["Group"], reactome_pathways)

In [7]:
df

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,133693,133694,28036,28238,76341,89312,17861,89188,WHO_status,Group
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV001,-0.342522,-0.830841,-0.703451,-0.451278,0.153207,-0.443252,-0.228725,0.501998,-0.861886,-0.344155,...,-0.465505,0.055753,-0.225622,0.400398,0.163210,1.579262,-0.162221,-0.603790,3-4,Severe
INCOV002,-0.125585,-1.002246,0.399273,-0.828341,-0.980576,0.373921,-1.076979,-0.131876,-0.869093,-0.467326,...,0.509336,0.627269,0.321718,-0.158514,0.001469,-1.051485,1.299330,0.578560,5-7,Severe
INCOV003,-0.859326,-0.409159,3.573100,-0.546783,-0.993647,1.547649,0.370131,-0.020399,-0.703786,-0.015627,...,2.056610,1.717149,-0.297723,-1.276891,-0.658288,2.141594,-1.091183,0.447197,5-7,Severe
INCOV004,-0.562999,-1.560249,-1.269383,-1.437919,0.108734,-0.192066,0.161591,-0.586759,-1.457246,-0.500612,...,0.793065,-0.033125,-0.366082,-0.642671,1.724436,2.074193,-1.021184,-0.410849,3-4,Severe
INCOV005,-0.741957,0.387909,-0.711036,-0.711515,0.670720,0.610591,-0.322241,-0.793451,-1.230681,-0.246493,...,-0.184967,-0.469553,-0.293869,0.558366,-0.390308,-0.455737,-0.535223,-0.847727,3-4,Severe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV135,-2.075777,-0.485936,-0.671225,0.069483,0.042579,0.357250,-0.505272,-0.378441,-0.444915,-0.368108,...,-0.516557,-0.392716,-0.328170,1.013774,-0.391566,-0.407585,1.041224,-0.866190,1-2,Mild
INCOV136,-0.174602,1.167644,0.055499,0.113342,-0.663851,0.799367,0.004697,0.748894,1.328753,0.391221,...,-0.741578,-0.730981,-0.380517,-1.251641,0.389163,-0.603048,0.941619,-0.511694,3-4,Severe
INCOV137,0.274900,-0.014841,-0.928512,0.885263,-1.591862,0.230250,0.057818,-0.775962,-0.179700,-0.534786,...,-1.306132,-1.246428,1.763824,1.258387,0.448359,-0.508931,0.268474,-0.997206,5-7,Severe
INCOV139,0.772804,1.408806,1.473315,2.182921,-0.380455,-0.175916,0.376092,0.412578,1.853690,-0.368432,...,-1.050924,-0.909450,0.962331,0.292543,-0.404695,0.919776,-1.226216,0.673867,3-4,Severe


In [6]:
display(gsea_res.sort_values(by="P-adjust FDR")) #0.54 is the lowest for metabolomic, 0.056 for proteomic
#display(gsea_res.sort_values(by="P-value"))

Unnamed: 0,Pathway_ID,Pathway_name,ES,NES,P-value,P-adjust FDR,P-adjust FWER,Tag %,Entity %,Leading_edge
5,R-HSA-425407,SLC-mediated transmembrane transport,0.452285,1.474488,0.051383,0.537223,0.585,15/33,31.23%,4208;16680;16199;16283;15344;16040;15756;17568...
8,R-HSA-5663205,Infectious disease,0.911178,1.442434,0.043738,0.555563,0.696,1/2,8.71%,16680
10,R-HSA-73614,Pyrimidine salvage,0.757897,1.418139,0.067485,0.566262,0.761,2/4,23.72%,17802;17568
6,R-HSA-1614635,Sulfur amino acid metabolism,0.703485,1.451648,0.071571,0.580345,0.666,4/6,26.43%,30831;17509;17482;17750
3,R-HSA-2408522,Selenoamino acid metabolism,0.96654,1.48528,0.006237,0.580781,0.55,1/2,3.30%,30831
...,...,...,...,...,...,...,...,...,...,...
32,R-HSA-2262752,Cellular responses to stress,0.736364,1.16961,0.298419,1.0,0.999,3/3,27.03%,15756;17351;16827
115,R-HSA-5619063,Defective SLC29A3 causes histiocytosis-lymphad...,-0.396341,-0.758293,0.737288,1.0,1.0,5/5,61.26%,16708;16704;16335;17562;17596
31,R-HSA-83936,Transport of nucleosides and free purine and p...,0.539713,1.189375,0.266145,1.0,0.997,4/8,39.34%,16040;17568;17368;17596
39,R-HSA-1483257,Phospholipid metabolism,-0.563098,-1.101136,0.371032,1.0,1.0,2/5,3.90%,18132;17553


In [None]:
#Change name to have a line break otherwise it won't fit on the plot
gsea_res.Pathway_name[0] = 'Regulation of Insulin-like Growth Factor (IGF) transport and \n uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)'

In [None]:
top_10_pathways_gsea

In [None]:
from matplotlib.lines import Line2D #To colour the bar by enrichment

top_10_pathways_gsea = gsea_res.sort_values(by="P-value").iloc[0:10, :]
plt.figure(figsize=(10, 7))

sns.set(font_scale = 2) #sns.set(font_scale = 1.2) for metabolomic because of long label
sns.set_style("ticks") # same as "white" but with ticks

# set bar colour based on normalised enrichment score sign
bar_color = ['tab:red' if float(i) > 0 else 'tab:blue' for i in top_10_pathways_gsea['NES']]
ax = sns.barplot(data=top_10_pathways_gsea, y="Pathway_name", x="P-value", orient="h", palette=bar_color)

#Increase space between the bars and the axis
ax.relim()
ax.autoscale_view()



label = []
for pathway in top_10_pathways_gsea["Entity %"]:
    label.append("  "+ pathway)

print(label)

ax.bar_label(ax.containers[0], labels=label)

#plt.title('GSEA for proteomic data',fontsize=22,pad=10)
plt.xlabel('Unadjusted p-value',fontsize=26)
plt.ylabel('Pathway name',fontsize=26) ;

plt.axvline(0.05, c="black")

# add legend
custom_lines = [Line2D([0], [0], color='tab:red', lw=4),
                Line2D([0], [0], color='tab:blue', lw=4)]
plt.legend(handles=custom_lines, labels=['Positive enrichment score', 'Negative enrichment score'],loc="upper right")

#plt.savefig( '../Figures/metabolomic_GSEA_top_10.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

For integrated data, take the two GSEA results and combine the p-values with Fisher's method, which is what is commonly done (See Maghsoudi et al., 2021 for examples):

In [None]:
metabolomic_gsea = gsea_res.sort_values(by="P-value")
metabolomic_gsea = metabolomic_gsea.set_index(['Pathway_ID'])

In [None]:
proteomic_gsea = gsea_res.sort_values(by="P-value")
proteomic_gsea = proteomic_gsea.set_index(['Pathway_ID'])

In [None]:
metabolomic_gsea

In [None]:
proteomic_gsea

In [None]:
result = metabolomic_gsea.merge(proteomic_gsea, how='inner',right_index=True, left_index = True)
result[:5]

In [None]:
result["combined_pval"] = 0

In [None]:
for i in range(len(result.index)):
    ID_list = [result["P-value_x"][i],result["P-value_y"][i]]
    print(ID_list)
    test_stat,pval = scipy.stats.combine_pvalues(ID_list, method='fisher',weights=None)
    print(pval)
    result["combined_pval"][i] = pval


In [None]:
#Obtain pathway coverage for the top 10 results
df = pd.read_csv("../Data/Su_integrated_data.csv", index_col=0)
reactome_pathways = pd.read_csv("../Data/Reactome_multi_omics_ChEBI_Uniprot.csv", index_col=0,dtype="str") #Dtype warning because in some columns, some values are in string format whereas some are in integer format, that's why I specify dtype="str"
kpca_scores = sspa.sspa_kpca(df.iloc[:,:-2], reactome_pathways)


#Convert pathway ID to name
root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}

root_pathway_names = list(root_pathway_dict.keys())
#Using Sara's code, remove root pathways
kpca_scores = kpca_scores.drop(columns = list(set(root_pathway_names) & set(kpca_scores.columns)))



#Filter out the molecules in the pathways that are not present in the dataset
#Obtain all unique values in dataset
compounds_present = list(df.columns[:-2])
filtered_dict = {} 

#Obtain pathways and corresponding molecules for all Reactome pathways, store as dictionary
orig_dict = sspa.utils.pathwaydf_to_dict(reactome_pathways)

#Filter out dictionary to retain only the pathways that remain after kPCA
my_keys = kpca_scores.columns
pathways_dict = {key: orig_dict[key] for key in my_keys}


#My code adapted from Cecilia's
#If the key values are not part of the compounds in dataset then remove
for key,value in pathways_dict.items():
    new_val = [item for item in value if item in compounds_present]
    if len(new_val) >= 2: #at least two compounds in the pathway
        filtered_dict[key] = new_val

In [None]:
from matplotlib.lines import Line2D

top_10_pathways_gsea = result.sort_values(by="combined_pval").iloc[0:10, :]
plt.figure(figsize=(10, 7))

sns.set(font_scale = 2) #sns.set(font_scale = 1.2) for metabolomic because of long label
sns.set_style("ticks") # same as "white" but with ticks

# set bar colour based on normalised enrichment score sign
bar_color = ['tab:red' if float(i) > 0 else 'tab:blue' for i in top_10_pathways_gsea['NES_x']]
ax = sns.barplot(data=top_10_pathways_gsea, y="Pathway_name_x", x="combined_pval", orient="h", palette=bar_color)

#Increase space between the bars and the axis
ax.relim()
ax.autoscale_view()


label=[]
# for i in range(10):
#     pathway1 =  top_10_pathways_gsea["Entity %_x"][i][:-1]
    
#     pathway2 =  top_10_pathways_gsea["Entity %_y"][i][:-1]

#     percent = (float(pathway1)+float(pathway2))/2
#     print(percent)
#     label.append("  "+ str(round(percent,1))+ '%')

# print(label)

for i in range(10):

    pathway_nam = top_10_pathways_gsea.index[i]
    
    num_in_df = len(filtered_dict[pathway_nam])
    num_whole_pathway = len(pathways_dict[pathway_nam])

    percent = (num_in_df/num_whole_pathway) * 100

    label.append("  "+ str(round(percent,1))+ '%')


print(label)



ax.bar_label(ax.containers[0], labels=label)

#plt.title('GSEA for integrated data',fontsize=22, pad=10)
plt.xlabel('Unadjusted p-value',fontsize=26)
plt.ylabel('Pathway name',fontsize=26) 
plt.xlim(0, 0.21);

plt.axvline(0.05, c="black")

#Add legend
custom_lines = [Line2D([0], [0], color='tab:red', lw=4),
                Line2D([0], [0], color='tab:blue', lw=4)]
plt.legend(handles=custom_lines, labels=['Positive enrichment score', 'Negative enrichment score'],loc="upper right")


#plt.savefig( '../Figures/integrated_GSEA_top_10.png' , dpi=300,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')