### Create multi-omics pathways

In [1]:
import pandas as pd
import sspa

Downloading the multi-omics pathways is as easy as downloading them through the sspa package. However, the latest pathway version is v85, whereas I used v84 for the single-omics metabolomics and proteomics datasets. Therefore, I have to manually concatenate the v84 pathway files for metabolomics and proteomics.

Method 1: Downloading from the sspa package

In [None]:
# Fetch the latest Reactome release for mouse, requesting the multi-omics pathway set
reactome_mouse_latest_mo = sspa.process_reactome(
    "Mus musculus",
    download_latest=True,
    filepath=".",
    omics_type='multiomics',
)

In [None]:
# Fetch the latest Reactome release for human, requesting the multi-omics pathway set
reactome_human_latest_mo = sspa.process_reactome(
    "Homo sapiens",
    download_latest=True,
    filepath=".",
    omics_type='multiomics',
)

Method 2: Manually concatenating specific version files

In [None]:
#Metabolomic pathways (ChEBI IDs), parsed from the local Reactome R84 compounds GMT file
metabolomic_reactome_pathways = sspa.process_gmt("Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

In [None]:
# Proteomic pathways (UniProt IDs): built from a local UniProt2Reactome mapping
# file instead of downloading the latest release.
proteomic_reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile='Data/UniProt2Reactome_All_Levels.txt',
    download_latest=False,
    filepath=None,
)

In [None]:
# Give the row index the name 'Pathway_ID', then show the table.
proteomic_reactome_pathways = proteomic_reactome_pathways.rename_axis(index='Pathway_ID')
proteomic_reactome_pathways

Currently the only index is the Pathway ID. I create a multi-index so that the Pathway name becomes part of the index as well.

In [None]:
# Move Pathway_name from a regular column into the index, producing a two-level
# (existing index, Pathway_name) MultiIndex.
# append=True keeps the current index as the first level, and set_index removes
# the column it consumes, so no separate drop() is needed.
# The guard makes the cell safe to re-run: once the column has been moved into
# the index there is nothing left to do (the in-place version raised KeyError
# on a second run).
if 'Pathway_name' in metabolomic_reactome_pathways.columns:
    metabolomic_reactome_pathways = metabolomic_reactome_pathways.set_index(
        'Pathway_name', append=True
    )

In [None]:
# Move Pathway_name from a regular column into the index, producing a two-level
# (existing index, Pathway_name) MultiIndex.
# append=True keeps the current index as the first level, and set_index removes
# the column it consumes, so no separate drop() is needed.
# The guard makes the cell safe to re-run: once the column has been moved into
# the index there is nothing left to do (the in-place version raised KeyError
# on a second run).
if 'Pathway_name' in proteomic_reactome_pathways.columns:
    proteomic_reactome_pathways = proteomic_reactome_pathways.set_index(
        'Pathway_name', append=True
    )

In [None]:
# Show both re-indexed pathway tables one after the other.
display(metabolomic_reactome_pathways, proteomic_reactome_pathways)

In [None]:
# Outer-join the metabolomic and proteomic pathway tables on their row index
# (pathway ID + Pathway_name), so pathways found in only one table are kept.
# Column labels that occur in both tables receive pandas' default _x / _y suffixes.
reactome_mo = metabolomic_reactome_pathways.merge(
    proteomic_reactome_pathways,
    how='outer',
    left_index=True,
    right_index=True,
)
reactome_mo


In [None]:
# Inspect the right-most columns (position 4000 onwards) of the merged table.
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the slice needs an explicit display() to actually be shown.
display(reactome_mo.iloc[:, 4000:])
reactome_mo

In [None]:
# Turn the Pathway_name index level back into an ordinary column; the remaining
# index level is left untouched. The index is shown afterwards as a check.
reactome_mo = reactome_mo.reset_index(level='Pathway_name')
reactome_mo.index


In [95]:
# Write the merged multi-omics pathway table to CSV (the index is written as the first column)
reactome_mo.to_csv("Data/Reactome_multi_omics_ChEBI_Uniprot.csv")

In [71]:
# Read the saved pathway table back in as a check.
# Every column is loaded as text: some columns hold a mix of string and integer
# values, which otherwise triggers a dtype warning on load.
mo_pathways = pd.read_csv(
    "Data/Reactome_multi_omics_ChEBI_Uniprot.csv",
    index_col=0,
    dtype="str",
)

In [96]:
# Preview the reloaded multi-omics pathway table
mo_pathways

Unnamed: 0_level_0,Pathway_name,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,...,2588,2589,2590,2591,2592,2593,2594,2595,2596,2597
Pathway_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R-HSA-1059683,Interleukin-6 signaling,30616,456216,,,,,,,,...,,,,,,,,,,
R-HSA-109581,Apoptosis,61120,4705,456216,28494,36080,15377,43474,47575,30616,...,,,,,,,,,,
R-HSA-109582,Hemostasis,15366,91144,15377,15378,15379,456215,456216,35366,57895,...,,,,,,,,,,
R-HSA-109606,Intrinsic Pathway for Apoptosis,456216,28494,36080,15377,43474,47575,30616,,,...,,,,,,,,,,
R-HSA-109703,PKB-mediated events,456216,57836,15377,58165,456215,30616,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
R-HSA-983695,Antigen activates B Cell Receptor (BCR) leadin...,57540,456216,57836,203600,36080,29108,17815,30616,58456,...,,,,,,,,,,
R-HSA-983705,Signaling by the B Cell Receptor (BCR),57540,11750,456216,29034,57836,58189,203600,29105,36080,...,,,,,,,,,,
R-HSA-983712,Ion channel transport,15365,17544,57613,49552,15377,15378,35475,30616,456216,...,,,,,,,,,,
R-HSA-991365,Activation of GABAB receptors,58189,29103,17552,59888,43474,18420,37565,,,...,,,,,,,,,,


### Create multiomic dataset

In [74]:
#Load datasets (sample IDs are taken from the first CSV column)
metabolomic_df = pd.read_csv('Data/Su_COVID_metabolomics_processed_ChEBI.csv', index_col=0)
# Remove the literal "-BL" suffix from the metabolomics sample IDs.
# str.rstrip('-BL') is NOT a suffix removal: it strips any trailing run of the
# characters '-', 'B' and 'L', so an ID that happens to end in one of those
# characters would be truncated as well. The end-anchored regex removes only
# the exact "-BL" suffix and leaves every other ID untouched.
metabolomic_df.index = metabolomic_df.index.str.replace(r'-BL$', '', regex=True)

proteomic_df = pd.read_csv('Data/Su_COVID_proteomics_processed.csv', index_col=0)

In [76]:
# Restrict both omics tables to the samples they have in common.
list1 = list(metabolomic_df.index)
list2 = list(proteomic_df.index)

# Sample IDs present in both tables (the sets collapse duplicate IDs), keeping
# only those whose ID starts with "INCOV".
intersection = [
    sample_id
    for sample_id in set(list1) & set(list2)
    if sample_id.startswith("INCOV")
]

# Boolean row selection: keep the rows whose sample ID is in the shared list.
metabolomic_df = metabolomic_df.loc[metabolomic_df.index.isin(intersection)]
proteomic_df = proteomic_df.loc[proteomic_df.index.isin(intersection)]


In [84]:
# Place the metabolomics and proteomics columns side by side (axis=1 aligns rows on the sample index).
# NOTE(review): iloc[:,:-2] drops the last two metabolomics columns — presumably the
# WHO_status / Group metadata that proteomic_df also carries; confirm against the input file.
concat_omics = pd.concat([metabolomic_df.iloc[:,:-2], proteomic_df], axis=1)

In [88]:
# Write the combined multi-omics abundance table to CSV (sample IDs are written as the first column)
concat_omics.to_csv("Data/Su_multi_omics_data.csv")

In [90]:
#Read the saved multi-omics table back in to check it; the first CSV column becomes the index
multiomic_df = pd.read_csv("Data/Su_multi_omics_data.csv", index_col=0)

In [94]:
# Check the column dtypes of the reloaded table
multiomic_df.dtypes

1372          float64
16610         float64
72665         float64
27823         float64
30915         float64
               ...   
Q9Y5L3        float64
Q9Y5V3        float64
Q9Y653        float64
WHO_status     object
Group          object
Length: 789, dtype: object

In [None]:
#Test kPCA on whole dataset, using the merged multi-omics pathway table
# NOTE(review): multiomic_df still contains the non-numeric WHO_status and Group
# columns (see the dtypes listing) — confirm sspa_kpca ignores columns that are not
# pathway members, or drop them before calling.
kpca_scores_all = sspa.sspa_kpca(multiomic_df, mo_pathways)