In [None]:
import pandas as pd
import sspa
import seaborn as sns
import matplotlib.pyplot as plt
import scipy


In [None]:
# Metabolomics run: read the processed table (first CSV column becomes the index)
# and parse the Reactome compound pathways from the R84 GMT file.
# NOTE(review): the proteomics loading cell assigns the same two names, so whichever
# loading cell ran most recently decides what every later cell analyses.
df = pd.read_csv(
    '../Data/Su_COVID_metabolomics_processed_commoncases.csv',
    index_col=0,
)
reactome_pathways = sspa.process_gmt(
    "../Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt"
)

In [None]:
# Proteomics run: read the processed table (first CSV column becomes the index)
# and build the Reactome pathways from the local UniProt2Reactome mapping file
# (download_latest=False, so nothing is requested from the network here).
# NOTE(review): this overwrites `df` and `reactome_pathways` from the metabolomics
# loading cell; run only the loading cell for the omics layer being analysed.
df = pd.read_csv(
    '../Data/Su_COVID_proteomics_processed_commoncases.csv',
    index_col=0,
)
reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile='../Data/UniProt2Reactome_all_Levels_ver84.txt',
    download_latest=False,
    filepath=None,
)

In [None]:
#Convert pathway ID to name
#root_path = pd.read_excel('../Data/Root_pathways.xlsx', header=None)
#root_pathway_dict = {root_path[0][i]:root_path[1][i] for i in range(0,len(root_path))}

#root_pathway_names = list(root_pathway_dict.keys())
#Remove root pathways
#reactome_pathways = reactome_pathways[~reactome_pathways.index.isin(root_pathway_names)]

In [None]:
# Label every sample by severity: WHO status '1-2' becomes 'Mild', anything else
# (including a missing status) becomes 'Severe'.
# The column is assigned in one step instead of writing df['Group'][i] row by row:
# chained indexing raises KeyError when 'Group' does not exist yet, does not write
# through under pandas Copy-on-Write, and relied on integer keys being treated as
# positions on the sample index.
# NOTE(review): later cells pass df.iloc[:, :-2] as the abundance matrix, which
# presumably assumes WHO_status and Group are the last two columns -- verify.
df["Group"] = df["WHO_status"].eq("1-2").map({True: "Mild", False: "Severe"})


### Over-representation analysis

In [None]:
# Over-representation analysis on the currently loaded omics table.
# df.iloc[:, :-2] passes every column except the last two as the abundance matrix
# (presumably the trailing columns are WHO_status and Group -- verify); 0.05 is the
# cutoff argument handed to sspa_ora.
ora = sspa.sspa_ora(
    df.iloc[:, :-2],
    df["Group"],
    reactome_pathways,
    0.05,
    custom_background=None,
)
ora_res = ora.over_representation_analysis()

# Report how many entries ora.DA_molecules holds, then show the differential
# abundance test table and the ORA table, each ordered by p-value.
print(len(ora.DA_molecules))
display(ora.DA_test_res.sort_values(by="P-value"))
display(ora_res.sort_values(by="P-value"))

# Ten pathways with the smallest unadjusted p-values.
top_10_pathways = ora_res.sort_values(by="P-value").head(10)
fig, ax = plt.subplots(figsize=(9, 3.6))

# Alternative: colour bars by significance instead of by gradient
#bar_color = ['tab:green' if float(i) < 0.05 else 'tab:grey' for i in top_10_pathways['P-value']]
#sns.barplot(data=top_10_pathways, y="Pathway_name", x="P-value", orient="h", palette=bar_color, ax=ax)

# Gradient colouring (other palettes tried: rocket, magma)
sns.barplot(
    data=top_10_pathways,
    y="Pathway_name",
    x="P-value",
    orient="h",
    palette="rocket",
    ax=ax,
)

# NOTE(review): the title is hard-coded to the metabolomic run; adjust it when this
# cell is executed on the proteomics table.
ax.set_title('ORA for metabolomic data', fontsize=15)
ax.set_xlabel('Unadjusted p-value', fontsize=13)
ax.set_ylabel('Pathway name', fontsize=13)

# Vertical reference line at p = 0.05
ax.axvline(0.05, c="black")

#plt.savefig( 'Figures/metabolomic_ORA_top_10.png' , dpi=200,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

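For integrated data, run the ORA cell above once per omics layer (metabolomics, then proteomics) and store each result in the matching cell below. Then combine the two sets of pathway p-values with Fisher's method, as is commonly done (see Maghsoudi et al., 2021 for examples):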

In [None]:
# Keep a copy of the current ORA table, ordered by ascending p-value, as the
# metabolomic result.
# NOTE(review): `ora_res` is whatever the ORA cell produced last, so run this cell
# only straight after the metabolomics run.
metabolomic_ora = ora_res.sort_values("P-value", ascending=True)

In [None]:
# Keep a copy of the current ORA table, ordered by ascending p-value, as the
# proteomic result.
# NOTE(review): `ora_res` is whatever the ORA cell produced last, so run this cell
# only straight after the proteomics run; otherwise it duplicates the metabolomic
# table.
proteomic_ora = ora_res.sort_values("P-value", ascending=True)

In [None]:
# Index the metabolomic ORA table by pathway ID so the two omics tables can be
# merged on their index. The original dropped the "ID" column without storing the
# result, so the column was never removed; set_index moves it into the index in
# one step. The guard keeps the cell safe to re-run once "ID" is already the index.
if "ID" in metabolomic_ora.columns:
    metabolomic_ora = metabolomic_ora.set_index("ID")
metabolomic_ora.head()

In [None]:
# Index the proteomic ORA table by pathway ID so the two omics tables can be
# merged on their index. The original dropped the "ID" column without storing the
# result, so the column was never removed; set_index moves it into the index in
# one step. The guard keeps the cell safe to re-run once "ID" is already the index.
if "ID" in proteomic_ora.columns:
    proteomic_ora = proteomic_ora.set_index("ID")
proteomic_ora.head()

In [None]:
# Inner-join the two ORA tables on their index, keeping only index values present
# in both. Columns shared by the two tables get pandas' default suffixes:
# _x for the metabolomic side, _y for the proteomic side.
result = pd.merge(
    metabolomic_ora,
    proteomic_ora,
    how='inner',
    left_index=True,
    right_index=True,
)
result.head(5)

In [None]:
# Placeholder column for the Fisher-combined p-values. It starts as float NaN
# rather than integer 0 so the column already has the dtype of the values written
# into it, and a row that is never filled shows up as missing instead of looking
# like a p-value of 0.
result["combined_pval"] = float("nan")

In [None]:
# Combine each pathway's metabolomic (P-value_x) and proteomic (P-value_y) ORA
# p-values with Fisher's method.
# The values are collected in a list and written back with a single column
# assignment. The original wrote result["combined_pval"][i] row by row, which is
# chained indexing (it does not write through under pandas Copy-on-Write) and used
# integer keys on a label-based index. The two prints per row are dropped; inspect
# `result` in the next cell instead.
combined_pvals = []
for pval_pair in zip(result["P-value_x"], result["P-value_y"]):
    # combine_pvalues yields (statistic, p-value); only the p-value is kept.
    test_stat, pval = scipy.stats.combine_pvalues(list(pval_pair), method='fisher', weights=None)
    combined_pvals.append(pval)
result["combined_pval"] = combined_pvals


In [None]:
# Preview the first ten rows of the merged table, including combined_pval.
result.head(10)

In [None]:
# Bar plot of the ten pathways with the smallest Fisher-combined ORA p-values.
# Pathway names are taken from the metabolomic side of the merge (Pathway_name_x).
top_10_pathways = result.sort_values(by="combined_pval").head(10)
fig, ax = plt.subplots(figsize=(9, 3.6))
sns.barplot(
    data=top_10_pathways,
    y="Pathway_name_x",
    x="combined_pval",
    orient="h",
    palette="rocket",  # other palettes tried: rocket, magma
    ax=ax,
)

ax.set_title('ORA for integrated data', fontsize=15)
ax.set_xlabel('Unadjusted p-value', fontsize=13)
ax.set_ylabel('Pathway name', fontsize=13)

# Vertical reference line at p = 0.05
ax.axvline(0.05, c="black")

# NOTE(review): unlike the other figures, this one is written to disk on every
# run; the 'Figures' directory must already exist.
fig.savefig('Figures/integrated_ORA_top_10.png', dpi=200, bbox_inches='tight', pad_inches=0.2, facecolor='w')

### Gene-set Enrichment Analysis

In [None]:
# GSEA on the currently loaded omics table: every column except the last two is
# passed as the abundance matrix, with df["Group"] as the sample labels.
# NOTE(review): presumably the two trailing columns are WHO_status and Group -- verify.
gsea_res = sspa.sspa_gsea(
    df.iloc[:, :-2],
    df["Group"],
    reactome_pathways,
)

In [None]:
# Show the GSEA table ordered by unadjusted p-value.
# Sorting by FDR instead was left disabled; the lowest adjusted values noted by the
# author were 0.55 (metabolomic) and 0.06 (proteomic).
#display(gsea_res.sort_values(by="P-adjust FDR"))
display(gsea_res.sort_values("P-value", ascending=True))

In [None]:
# Insert a line break into the long IGF pathway name so it fits on the plot.
# The row is found by its name instead of by the hard-coded key 0: the original
# gsea_res.Pathway_name[0] = ... renamed whichever pathway sat at 0 (only correct
# when that row is the IGF pathway) and, being chained assignment, does not write
# through under pandas Copy-on-Write. Nothing changes when the pathway is absent,
# and re-running the cell is harmless.
# NOTE(review): assumes the unwrapped name equals the wrapped one with ' \n '
# replaced by a single space -- confirm against the Pathway_name values.
wrapped_igf_name = 'Regulation of Insulin-like Growth Factor (IGF) transport and \n uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)'
unwrapped_igf_name = wrapped_igf_name.replace(' \n ', ' ')
gsea_res["Pathway_name"] = gsea_res["Pathway_name"].replace(unwrapped_igf_name, wrapped_igf_name)

In [None]:
from matplotlib.lines import Line2D

# Bar plot of the ten GSEA pathways with the smallest unadjusted p-values.
top_10_pathways_gsea = gsea_res.sort_values(by="P-value").head(10)
fig, ax = plt.subplots(figsize=(9, 3.6))

# One colour per bar: red when the normalised enrichment score is positive,
# blue otherwise.
bar_color = []
for nes in top_10_pathways_gsea['NES']:
    bar_color.append('tab:red' if float(nes) > 0 else 'tab:blue')

sns.barplot(
    data=top_10_pathways_gsea,
    y="Pathway_name",
    x="P-value",
    orient="h",
    palette=bar_color,
    ax=ax,
)

# NOTE(review): the title is hard-coded to the proteomic run; adjust it when this
# cell is executed on the metabolomics table.
ax.set_title('GSEA for proteomic data', fontsize=15)
ax.set_xlabel('Unadjusted p-value', fontsize=13)
ax.set_ylabel('Pathway name', fontsize=13)

# Vertical reference line at p = 0.05
ax.axvline(0.05, c="black")

# Manual legend explaining the two bar colours
custom_lines = [
    Line2D([0], [0], color='tab:red', lw=4),
    Line2D([0], [0], color='tab:blue', lw=4),
]
ax.legend(
    handles=custom_lines,
    labels=['Positive enrichment score', 'Negative enrichment score'],
    loc="upper right",
)

#plt.savefig( 'Figures/proteomic_GSEA_top_10.png' , dpi=200,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')

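For integrated data, run the GSEA cell above once per omics layer (metabolomics, then proteomics) and store each result in the matching cell below. Then combine the two sets of pathway p-values with Fisher's method, as is commonly done (see Maghsoudi et al., 2021 for examples):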

In [None]:
# Keep a copy of the current GSEA table, ordered by ascending p-value, as the
# metabolomic result.
# NOTE(review): `gsea_res` is whatever the GSEA cell produced last, so run this
# cell only straight after the metabolomics run.
metabolomic_gsea = gsea_res.sort_values("P-value", ascending=True)

In [None]:
# Keep a copy of the current GSEA table, ordered by ascending p-value, as the
# proteomic result.
# NOTE(review): `gsea_res` is whatever the GSEA cell produced last, so run this
# cell only straight after the proteomics run; otherwise it duplicates the
# metabolomic table.
proteomic_gsea = gsea_res.sort_values("P-value", ascending=True)

In [None]:
# Display the stored metabolomic GSEA table (sorted by p-value) for inspection.
metabolomic_gsea

In [None]:
# Display the stored proteomic GSEA table (sorted by p-value) for inspection.
proteomic_gsea

In [None]:
# Inner-join the two GSEA tables on their index, keeping only index values present
# in both. Shared columns get pandas' default suffixes: _x for the metabolomic
# side, _y for the proteomic side.
# NOTE(review): unlike the ORA tables, no pathway-ID index is set here; this
# assumes the sspa_gsea output index already identifies the pathway -- verify,
# otherwise rows are paired by position rather than by pathway.
result = pd.merge(
    metabolomic_gsea,
    proteomic_gsea,
    how='inner',
    left_index=True,
    right_index=True,
)
result.head(5)

In [None]:
# Placeholder column for the Fisher-combined p-values. It starts as float NaN
# rather than integer 0 so the column already has the dtype of the values written
# into it, and a row that is never filled shows up as missing instead of looking
# like a p-value of 0.
result["combined_pval"] = float("nan")

In [None]:
# Combine each pathway's metabolomic (P-value_x) and proteomic (P-value_y) GSEA
# p-values with Fisher's method.
# The values are collected in a list and written back with a single column
# assignment. The original wrote result["combined_pval"][i] row by row, which is
# chained indexing (it does not write through under pandas Copy-on-Write) and
# relied on integer keys matching the index. The two prints per row are dropped;
# inspect `result` directly instead.
combined_pvals = []
for pval_pair in zip(result["P-value_x"], result["P-value_y"]):
    # combine_pvalues yields (statistic, p-value); only the p-value is kept.
    test_stat, pval = scipy.stats.combine_pvalues(list(pval_pair), method='fisher', weights=None)
    combined_pvals.append(pval)
result["combined_pval"] = combined_pvals


In [None]:
from matplotlib.lines import Line2D

# Bar plot of the ten pathways with the smallest Fisher-combined GSEA p-values.
top_10_pathways_gsea = result.sort_values(by="combined_pval").head(10)
fig, ax = plt.subplots(figsize=(9, 3.6))

# One colour per bar: red when the normalised enrichment score is positive,
# blue otherwise.
# NOTE(review): only NES_x (the metabolomic side of the merge) decides the colour;
# the proteomic score NES_y is not consulted.
bar_color = []
for nes in top_10_pathways_gsea['NES_x']:
    bar_color.append('tab:red' if float(nes) > 0 else 'tab:blue')

sns.barplot(
    data=top_10_pathways_gsea,
    y="Pathway_name_x",
    x="combined_pval",
    orient="h",
    palette=bar_color,
    ax=ax,
)

ax.set_title('GSEA for integrated data', fontsize=15)
ax.set_xlabel('Unadjusted p-value', fontsize=13)
ax.set_ylabel('Pathway name', fontsize=13)

# Vertical reference line at p = 0.05
ax.axvline(0.05, c="black")

# Manual legend explaining the two bar colours
custom_lines = [
    Line2D([0], [0], color='tab:red', lw=4),
    Line2D([0], [0], color='tab:blue', lw=4),
]
ax.legend(
    handles=custom_lines,
    labels=['Positive enrichment score', 'Negative enrichment score'],
    loc="upper right",
)


#plt.savefig( 'Figures/integrated_GSEA_top_10.png' , dpi=200,bbox_inches = 'tight' , pad_inches = 0.2 , facecolor='w')