In [1]:
#Load libraries 

import pandas as pd
import sspa
import scipy
import numpy as np 
import random
import sys #to get the array job number when running an array job with the HPC

In [2]:
#Load the processed metabolomics table; the first CSV column (sample IDs) becomes the row index.
#The last two columns (WHO_status, Group) are metadata rather than measurements.
df = pd.read_csv('Data/Su_COVID_metabolomics_processed_commoncases.csv', index_col=0)

In [3]:
#Read the Reactome compound pathway definitions from the local GMT file
reactome_pathways = sspa.process_gmt("Data/Reactome_Homo_sapiens_pathways_compounds_R84.gmt")

#Read the root pathways (spreadsheet has no header row): column 0 becomes the key, column 1 the value
root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
root_pathway_dict = dict(zip(root_path[0], root_path[1]))
root_pathway_names = list(root_pathway_dict)

In [4]:

#Look-up table from sample ID (the dataframe index) to that sample's WHO status
sample_dict = df["WHO_status"].to_dict()

In [5]:
#Permute the sample labels: each row keeps its measurements but receives a randomly drawn sample name
sample_names = list(df.index)
random.shuffle(sample_names)
print(sample_names)

#Make a copy of the original dataframe but replace with the shuffled labels
df_shuffled = df.copy()
df_shuffled.index = sample_names

#Change the WHO status to match the shuffled label
#(one column assignment in row order instead of a per-row .loc loop)
df_shuffled["WHO_status"] = [sample_dict[sample] for sample in sample_names]

display(df_shuffled)

#Split by the (shuffled) severity label and remove the metadata columns by name,
#so the split does not rely on the metadata being the last two columns
metadata_columns = ["WHO_status", "Group"]
is_mild = df_shuffled["WHO_status"] == '1-2'
is_severe = df_shuffled["WHO_status"].isin(['3-4', '5-7'])

df_mild = df_shuffled.loc[is_mild].drop(columns=metadata_columns) #45 samples, remove the metadata
df_severe = df_shuffled.loc[is_severe].drop(columns=metadata_columns) #83 samples

['INCOV042', 'INCOV114', 'INCOV054', 'INCOV014', 'INCOV122', 'INCOV088', 'INCOV063', 'INCOV036', 'INCOV100', 'INCOV102', 'INCOV012', 'INCOV020', 'INCOV035', 'INCOV024', 'INCOV017', 'INCOV002', 'INCOV044', 'INCOV009', 'INCOV025', 'INCOV018', 'INCOV123', 'INCOV117', 'INCOV078', 'INCOV001', 'INCOV037', 'INCOV084', 'INCOV127', 'INCOV097', 'INCOV126', 'INCOV033', 'INCOV121', 'INCOV038', 'INCOV041', 'INCOV139', 'INCOV094', 'INCOV076', 'INCOV043', 'INCOV067', 'INCOV039', 'INCOV051', 'INCOV098', 'INCOV070', 'INCOV034', 'INCOV119', 'INCOV055', 'INCOV137', 'INCOV110', 'INCOV013', 'INCOV016', 'INCOV073', 'INCOV031', 'INCOV027', 'INCOV069', 'INCOV132', 'INCOV045', 'INCOV053', 'INCOV135', 'INCOV080', 'INCOV077', 'INCOV128', 'INCOV106', 'INCOV049', 'INCOV068', 'INCOV008', 'INCOV011', 'INCOV004', 'INCOV103', 'INCOV133', 'INCOV066', 'INCOV030', 'INCOV142', 'INCOV093', 'INCOV058', 'INCOV131', 'INCOV081', 'INCOV079', 'INCOV105', 'INCOV022', 'INCOV059', 'INCOV052', 'INCOV108', 'INCOV112', 'INCOV029', 'IN

Unnamed: 0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,133693,133694,28036,28238,76341,89312,17861,89188,WHO_status,Group
INCOV042,-0.342522,-0.830841,-0.703451,-0.451278,0.153207,-0.443252,-0.228725,0.501998,-0.861886,-0.344155,...,-0.465505,0.055753,-0.225622,0.400398,0.163210,1.579262,-0.162221,-0.603790,1-2,COVID19
INCOV114,-0.125585,-1.002246,0.399273,-0.828341,-0.980576,0.373921,-1.076979,-0.131876,-0.869093,-0.467326,...,0.509336,0.627269,0.321718,-0.158514,0.001469,-1.051485,1.299330,0.578560,1-2,COVID19
INCOV054,-0.859326,-0.409159,3.573100,-0.546783,-0.993647,1.547649,0.370131,-0.020399,-0.703786,-0.015627,...,2.056610,1.717149,-0.297723,-1.276891,-0.658288,2.141594,-1.091183,0.447197,3-4,COVID19
INCOV014,-0.562999,-1.560249,-1.269383,-1.437919,0.108734,-0.192066,0.161591,-0.586759,-1.457246,-0.500612,...,0.793065,-0.033125,-0.366082,-0.642671,1.724436,2.074193,-1.021184,-0.410849,3-4,COVID19
INCOV122,-0.741957,0.387909,-0.711036,-0.711515,0.670720,0.610591,-0.322241,-0.793451,-1.230681,-0.246493,...,-0.184967,-0.469553,-0.293869,0.558366,-0.390308,-0.455737,-0.535223,-0.847727,1-2,COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV003,-2.075777,-0.485936,-0.671225,0.069483,0.042579,0.357250,-0.505272,-0.378441,-0.444915,-0.368108,...,-0.516557,-0.392716,-0.328170,1.013774,-0.391566,-0.407585,1.041224,-0.866190,5-7,COVID19
INCOV074,-0.174602,1.167644,0.055499,0.113342,-0.663851,0.799367,0.004697,0.748894,1.328753,0.391221,...,-0.741578,-0.730981,-0.380517,-1.251641,0.389163,-0.603048,0.941619,-0.511694,1-2,COVID19
INCOV062,0.274900,-0.014841,-0.928512,0.885263,-1.591862,0.230250,0.057818,-0.775962,-0.179700,-0.534786,...,-1.306132,-1.246428,1.763824,1.258387,0.448359,-0.508931,0.268474,-0.997206,3-4,COVID19
INCOV120,0.772804,1.408806,1.473315,2.182921,-0.380455,-0.175916,0.376092,0.412578,1.853690,-0.368432,...,-1.050924,-0.909450,0.962331,0.292543,-0.404695,0.919776,-1.226216,0.673867,1-2,COVID19


In [7]:
#Display the original dataframe (labels not shuffled) for comparison with df_shuffled
df

Unnamed: 0_level_0,1372,16610,72665,27823,30915,37373,16831,545959,17050,16359,...,133693,133694,28036,28238,76341,89312,17861,89188,WHO_status,Group
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INCOV001,-0.342522,-0.830841,-0.703451,-0.451278,0.153207,-0.443252,-0.228725,0.501998,-0.861886,-0.344155,...,-0.465505,0.055753,-0.225622,0.400398,0.163210,1.579262,-0.162221,-0.603790,3-4,COVID19
INCOV002,-0.125585,-1.002246,0.399273,-0.828341,-0.980576,0.373921,-1.076979,-0.131876,-0.869093,-0.467326,...,0.509336,0.627269,0.321718,-0.158514,0.001469,-1.051485,1.299330,0.578560,5-7,COVID19
INCOV003,-0.859326,-0.409159,3.573100,-0.546783,-0.993647,1.547649,0.370131,-0.020399,-0.703786,-0.015627,...,2.056610,1.717149,-0.297723,-1.276891,-0.658288,2.141594,-1.091183,0.447197,5-7,COVID19
INCOV004,-0.562999,-1.560249,-1.269383,-1.437919,0.108734,-0.192066,0.161591,-0.586759,-1.457246,-0.500612,...,0.793065,-0.033125,-0.366082,-0.642671,1.724436,2.074193,-1.021184,-0.410849,3-4,COVID19
INCOV005,-0.741957,0.387909,-0.711036,-0.711515,0.670720,0.610591,-0.322241,-0.793451,-1.230681,-0.246493,...,-0.184967,-0.469553,-0.293869,0.558366,-0.390308,-0.455737,-0.535223,-0.847727,3-4,COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV135,-2.075777,-0.485936,-0.671225,0.069483,0.042579,0.357250,-0.505272,-0.378441,-0.444915,-0.368108,...,-0.516557,-0.392716,-0.328170,1.013774,-0.391566,-0.407585,1.041224,-0.866190,1-2,COVID19
INCOV136,-0.174602,1.167644,0.055499,0.113342,-0.663851,0.799367,0.004697,0.748894,1.328753,0.391221,...,-0.741578,-0.730981,-0.380517,-1.251641,0.389163,-0.603048,0.941619,-0.511694,3-4,COVID19
INCOV137,0.274900,-0.014841,-0.928512,0.885263,-1.591862,0.230250,0.057818,-0.775962,-0.179700,-0.534786,...,-1.306132,-1.246428,1.763824,1.258387,0.448359,-0.508931,0.268474,-0.997206,5-7,COVID19
INCOV139,0.772804,1.408806,1.473315,2.182921,-0.380455,-0.175916,0.376092,0.412578,1.853690,-0.368432,...,-1.050924,-0.909450,0.962331,0.292543,-0.404695,0.919776,-1.226216,0.673867,3-4,COVID19


In [None]:
#Function to calculate the squared Spearman correlation matrix 

def squared_spearman_corr(data):
    """Return the squared Spearman correlation matrix of the kPCA pathway scores.

    Pathway scores are computed from ``data`` with ``sspa.sspa_kpca`` using the
    notebook-level ``reactome_pathways``. Score columns whose name appears in the
    notebook-level ``root_pathway_names`` are dropped before correlating.

    Parameters
    ----------
    data : DataFrame handed unchanged to ``sspa.sspa_kpca``
        (presumably samples x metabolites - confirm against the sspa documentation).

    Returns
    -------
    squared_spearman_coef : 2-D array
        Element-wise square of the Spearman coefficients between every pair of
        remaining pathway score columns.
    pathway_names : list
        The score column names, in the same order as the matrix rows/columns.
    """
    #Import the submodule explicitly: a bare "import scipy" does not guarantee that scipy.stats is loaded
    from scipy import stats as scipy_stats

    kpca_scores = sspa.sspa_kpca(data, reactome_pathways)
    root_columns = list(set(root_pathway_names) & set(kpca_scores.columns))
    kpca_scores = kpca_scores.drop(columns = root_columns) #using Sara's code to drop root pathways

    spearman_results = scipy_stats.spearmanr(kpca_scores)
    spearman_coef = np.asarray(spearman_results[0]) #correlation coefficients (spearman_results[1] gives the p-values)

    if spearman_coef.ndim == 0:
        #With exactly two score columns spearmanr returns a single coefficient instead of a matrix;
        #expand it so callers always receive a square matrix
        rho = float(spearman_coef)
        spearman_coef = np.array([[1.0, rho], [rho, 1.0]])

    squared_spearman_coef = np.square(spearman_coef)

    return squared_spearman_coef,list(kpca_scores.columns)




#Function to calculate the (signed) difference between two matrices and return one value per unique edge

def delta_squared_list(data1,data2,edgelist):
    """Return the per-edge difference ``data1 - data2`` as a flat list.

    Only the strictly lower triangle of the difference matrix is returned, so
    self-comparisons (the diagonal) and duplicate comparisons (the upper
    triangle) are left out. Values are ordered row by row:
    (1,0), (2,0), (2,1), (3,0), ...

    Parameters
    ----------
    data1, data2 : square 2-D array-likes of identical shape.
    edgelist : sequence of node names, one per matrix row/column. Only its
        length is used, to check that the names match the matrices.

    Returns
    -------
    list of float with n*(n-1)/2 entries for an n x n input.

    Raises
    ------
    ValueError
        If the difference is not a square matrix or ``edgelist`` has the wrong length.
    """
    delta_squared = np.asarray(data1, dtype=float) - np.asarray(data2, dtype=float)

    if delta_squared.ndim != 2 or delta_squared.shape[0] != delta_squared.shape[1]:
        raise ValueError("Expected square matrices, got shape " + str(delta_squared.shape))
    if len(edgelist) != delta_squared.shape[0]:
        raise ValueError("edgelist has " + str(len(edgelist)) + " names but the matrices have "
                         + str(delta_squared.shape[0]) + " rows")

    #Read the strictly lower triangle directly by index. A NaN difference therefore stays in the list;
    #dropping it would shift every later edge and break the alignment of edges between runs.
    rows, cols = np.tril_indices(delta_squared.shape[0], k=-1)
    finallist = delta_squared[rows, cols].tolist()

    return(finallist)

In [None]:
#Squared Spearman matrices of the pathway scores for each (shuffled) severity group
spearman_mild,edgelist_mild = squared_spearman_corr(df_mild)
spearman_severe,edgelist_severe = squared_spearman_corr(df_severe)

#The matrices are subtracted element-wise, so both must describe the same pathways in the same order
if edgelist_mild != edgelist_severe:
    raise ValueError("Mild and severe pathway score matrices do not contain the same pathways in the same order")
edgelist = edgelist_severe

output = delta_squared_list(spearman_mild,spearman_severe,edgelist)

In [None]:
#Option 1: I can use the 'pickle' package to pickle the list
#Advantages: Stores as a list, which can be accessed upon reading in the file, and saves memory by about a half compared to saving as regular text file
#Disadvantages: Is not readable in text format (must be 'unpickled' in Python first). Because each file has to be unpickled before use, the list cannot be indexed directly and all the files have to be read in full, which requires a lot of memory and is computationally expensive (even the metabolomics data cannot handle it)

#with open('Results/Run'+index_num + '.txt', "wb") as file_output:  
#       pickle.dump(output,file_output)

#with open("file.txt", "rb") as file_input:   # Unpickling
#   list1 = pickle.load(file_input)

In [None]:
#Array-job number passed on the command line; it is used to name this run's output file.
#When the cell runs inside a Jupyter kernel, sys.argv[1] is typically the kernel's own "-f" option
#rather than an array number, so a missing argument or an option flag falls back to a fixed label
#instead of raising IndexError or writing "Run-f.txt".
if len(sys.argv) > 1 and not sys.argv[1].startswith('-'):
    index_num = sys.argv[1]  #this should return the array number within the array job
else:
    index_num = 'interactive'

#Option 2: Write as a regular text file with a comma between values
with open ('Results/Run'+index_num+'.txt', 'w') as file:
     file.write(','.join(str(i) for i in output))