In [2]:
#Load libraries 

import os
import random
import sys #to get the array job number when running an array job with the HPC

import numpy as np
import pandas as pd
import scipy
import scipy.stats #explicit submodule import: `import scipy` alone does not guarantee that scipy.stats is loaded
import sspa

In [3]:
#Load dataset
#Proteomics table: samples are the index, and it carries the "WHO_status" column used below
df = pd.read_csv('Data/Su_COVID_proteomics_processed.csv', index_col=0)

#Metabolomics table: only its sample names are used in this cell
df2= pd.read_csv('Data/Su_COVID_metabolomics_processed_ChEBI.csv', index_col=0)
#Remove the literal '-BL' suffix only. str.rstrip('-BL') strips any trailing run of the
#characters '-', 'B' and 'L', so it would also eat into an ID that happens to end in B or L.
df2.index = df2.index.str.replace(r'-BL$', '', regex=True)


#Obtain common samples and subset accordingly
common_samples = set(df.index) & set(df2.index) #set removes duplicates
#Sorted so the list is identical on every run (set iteration order is arbitrary)
intersection = sorted(sample for sample in common_samples if sample.startswith("INCOV"))
print(len(intersection))

df = df[df.index.isin(intersection)]

#Make a dictionary with the WHO status for each sample
sample_dict = df["WHO_status"].to_dict()

128


In [4]:
#Load the Reactome pathways from the local UniProt-to-Reactome mapping file
#(download_latest = False, so presumably nothing is fetched from the web - confirm against the sspa docs)
reactome_pathways = sspa.process_reactome(
    'Homo sapiens',
    infile = 'Data/UniProt2Reactome_all_Levels.txt',
    download_latest = False,
    filepath = None,
)

#Read the root pathways sheet (no header row): column 0 becomes the dictionary key, column 1 the value
root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
root_pathway_dict = dict(zip(root_path[0], root_path[1]))
root_pathway_names = list(root_pathway_dict)

In [5]:
#Permute the sample names. No seed is set, so every execution draws a different permutation.
sample_names = list(df.index)
random.shuffle(sample_names)
print(sample_names)

#Make a copy of the original dataframe but replace with the shuffled labels
df_shuffled = df.copy()
df_shuffled.index = sample_names

#Change the WHO status to match the shuffled label
#(one column assignment instead of a per-row .loc write; still raises KeyError for an unknown sample)
df_shuffled["WHO_status"] = [sample_dict[sample] for sample in df_shuffled.index]

display(df_shuffled)

#Split by severity and remove the metadata columns by name, so the split does not
#depend on "WHO_status" and "Group" being the last two columns of the table
metadata_columns = ["WHO_status", "Group"]
is_mild = df_shuffled["WHO_status"] == '1-2'
is_severe = df_shuffled["WHO_status"].isin(['3-4', '5-7'])

df_mild = df_shuffled[is_mild].drop(columns=metadata_columns) #45 samples, remove the metadata
df_severe = df_shuffled[is_severe].drop(columns=metadata_columns) #83 samples

['INCOV037', 'INCOV116', 'INCOV070', 'INCOV049', 'INCOV041', 'INCOV010', 'INCOV053', 'INCOV142', 'INCOV128', 'INCOV003', 'INCOV121', 'INCOV103', 'INCOV122', 'INCOV018', 'INCOV007', 'INCOV071', 'INCOV073', 'INCOV096', 'INCOV036', 'INCOV078', 'INCOV019', 'INCOV015', 'INCOV092', 'INCOV131', 'INCOV139', 'INCOV135', 'INCOV133', 'INCOV027', 'INCOV004', 'INCOV124', 'INCOV061', 'INCOV086', 'INCOV022', 'INCOV021', 'INCOV060', 'INCOV033', 'INCOV081', 'INCOV054', 'INCOV079', 'INCOV051', 'INCOV093', 'INCOV119', 'INCOV120', 'INCOV126', 'INCOV035', 'INCOV014', 'INCOV038', 'INCOV023', 'INCOV016', 'INCOV112', 'INCOV066', 'INCOV026', 'INCOV094', 'INCOV069', 'INCOV057', 'INCOV136', 'INCOV043', 'INCOV115', 'INCOV031', 'INCOV106', 'INCOV088', 'INCOV108', 'INCOV107', 'INCOV052', 'INCOV055', 'INCOV059', 'INCOV005', 'INCOV125', 'INCOV082', 'INCOV044', 'INCOV048', 'INCOV111', 'INCOV098', 'INCOV002', 'INCOV109', 'INCOV067', 'INCOV072', 'INCOV058', 'INCOV050', 'INCOV009', 'INCOV029', 'INCOV134', 'INCOV087', 'IN

Unnamed: 0,O00182,O00220,O00253,O14763,O14836,O43915,O94907,P00797,P01127,P01241,...,Q9ULX7,Q9UNK0,Q9Y478,Q9Y4K4,Q9Y5A7,Q9Y5L3,Q9Y5V3,Q9Y653,WHO_status,Group
INCOV037,1.250252,0.873682,-0.222247,0.622561,0.353355,-0.286163,-0.526134,1.805033,-0.324710,-0.667005,...,-0.406587,-0.917903,-0.857745,-1.117503,-1.213145,-0.701155,-0.131201,2.071499,3-4,COVID19
INCOV116,1.393905,-0.309504,0.744007,0.508746,-0.017290,-0.852881,0.173400,0.180716,0.090011,0.505406,...,-1.138131,-0.475021,-0.399031,0.232609,0.141556,-1.339680,0.430852,-0.558821,1-2,COVID19
INCOV070,1.099240,0.804700,0.173683,0.925977,0.187447,-1.468490,0.405765,2.116331,0.281151,0.898135,...,-2.009084,-0.781981,-0.497887,-0.021875,-0.315858,-1.098720,0.118642,1.368320,5-7,COVID19
INCOV049,1.257048,0.599060,1.193907,0.633282,2.507842,0.642288,-1.607032,-1.290692,-2.349914,-0.326495,...,-0.146722,-1.486478,-0.888337,-0.983907,-1.025775,-0.940002,-0.383188,1.140530,3-4,COVID19
INCOV041,0.930230,0.882323,0.323857,1.495380,0.014325,-0.440281,-0.905957,0.521953,-1.400319,0.930726,...,-0.575130,-1.192291,-0.723072,-0.509965,-0.949089,0.457545,0.024298,-0.333001,5-7,COVID19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INCOV077,0.161129,-1.000615,-1.227069,-0.943569,-0.543900,-0.831409,-0.737041,-0.107560,-0.856326,-0.028323,...,1.129367,0.267868,-0.056972,-0.183219,-0.187084,-0.339814,-0.261669,-0.767177,1-2,COVID19
INCOV105,0.088239,0.326718,-0.187982,0.127236,-0.015408,-1.419804,1.594489,0.067971,0.859896,-0.207040,...,0.431856,1.370419,1.840750,1.313571,1.767903,-0.884657,1.320978,2.368223,3-4,COVID19
INCOV068,0.908461,0.112776,-0.064481,0.873255,1.224368,-1.116571,1.520038,1.652775,0.902458,-0.798092,...,0.116792,1.396261,1.383526,1.393687,1.719381,-0.632084,1.215266,-0.501149,1-2,COVID19
INCOV001,-0.376521,-0.111862,-0.349217,-0.182360,-1.014664,-0.425059,1.334060,-1.275512,1.070715,0.248137,...,0.647299,1.379380,1.722397,1.163267,2.163652,-1.046371,1.246955,-0.338830,3-4,COVID19


In [7]:
#Function to calculate the squared Spearman correlation matrix 

def squared_spearman_corr(data):
    """Compute squared Spearman correlations between kPCA pathway scores.

    Pathway scores are obtained with ``sspa.sspa_kpca`` using the module-level
    ``reactome_pathways``; any column named in the module-level
    ``root_pathway_names`` is removed before correlating.

    Parameters
    ----------
    data : pandas.DataFrame
        Table passed straight to ``sspa.sspa_kpca`` (presumably samples x proteins
        with metadata columns already removed - confirm against the caller).

    Returns
    -------
    tuple
        ``(squared_spearman_coef, pathway_names)`` where ``squared_spearman_coef``
        is the element-wise square of the Spearman correlation coefficients and
        ``pathway_names`` is the list of score columns the matrix refers to.
    """
    kpca_scores = sspa.sspa_kpca(data, reactome_pathways)
    #Drop root pathways; names that are not among the score columns are ignored
    kpca_scores = kpca_scores.drop(columns=root_pathway_names, errors="ignore")

    #Element [0] holds the correlation coefficients, element [1] the p-values
    spearman_coef = scipy.stats.spearmanr(kpca_scores)[0]

    #With exactly two columns scipy returns a single coefficient instead of a 2x2 matrix;
    #rebuild the matrix so the result always lines up with the returned pathway names
    if np.ndim(spearman_coef) == 0 and kpca_scores.shape[1] == 2:
        spearman_coef = np.array([[1.0, spearman_coef], [spearman_coef, 1.0]])

    squared_spearman_coef = np.square(spearman_coef)

    return squared_spearman_coef,list(kpca_scores.columns)




#Function to calculate the (signed) difference between two matrices and return one value per edge (unique pair)

def delta_squared_list(data1,data2,edgelist):
    """Return ``data1 - data2`` for every unique off-diagonal pair as a flat list.

    Parameters
    ----------
    data1, data2 : array-like
        Square numeric matrices of identical shape.
    edgelist : list
        Row/column labels of the matrices; only its length is used, to check
        that it matches the matrix size.

    Returns
    -------
    list of float
        The signed differences taken from strictly below the diagonal, in
        row-major order: (1,0), (2,0), (2,1), (3,0), ... No absolute value is
        taken. NaN entries are kept, so the list always has n*(n-1)/2 values
        and a given position always refers to the same pair.

    Raises
    ------
    ValueError
        If the difference matrix is not square with one row per ``edgelist`` entry.
    """
    delta_squared = np.asarray(data1, dtype=float) - np.asarray(data2, dtype=float)

    n_nodes = len(edgelist)
    if delta_squared.shape != (n_nodes, n_nodes):
        raise ValueError(
            "Expected a %d x %d matrix to match edgelist, got shape %s"
            % (n_nodes, n_nodes, delta_squared.shape)
        )

    #Strictly-lower-triangle positions: skips the diagonal (self comparisons) and the
    #mirrored upper half (duplicate comparisons). Unlike DataFrame.stack(), this does
    #not drop NaN values, so the output length cannot vary between runs.
    rows, cols = np.tril_indices(n_nodes, k=-1)
    finallist = delta_squared[rows, cols].tolist()

    return(finallist)

In [8]:
#Squared Spearman correlation matrix of the pathway scores for each severity group
spearman_mild, edgelist_mild = squared_spearman_corr(df_mild)
spearman_severe, edgelist_severe = squared_spearman_corr(df_severe)

#The two matrices are subtracted element by element, so they must describe the same pathways in the same order
assert edgelist_mild == edgelist_severe, "Mild and severe pathway lists differ; the correlation matrices are not comparable"
edgelist = edgelist_severe

output = delta_squared_list(spearman_mild,spearman_severe,edgelist)

In [10]:
#Number of values returned by delta_squared_list (entries taken from below the diagonal of the difference matrix)
len(output)

166753

In [None]:
#Option 1: I can use the 'pickle' package to pickle the list
#Advantages: Stores as a list, which can be accessed upon reading in the file, and saves memory by about a half compared to saving as regular text file
#Disadvantages: Is not readable in text format (must be 'unpickled' in Python first). Since it needs to be unpickled first, the list cannot be indexed directly and all the files have to be read in altogether, which requires a lot of memory and is computationally expensive (even the metabolomics data cannot be handled this way)

#with open('Results/Run'+index_num + '.txt', "wb") as file_output:  
#       pickle.dump(output,file_output)

#with open("file.txt", "rb") as file_input:   # Unpickling
#   list1 = pickle.load(file_input)

In [None]:
#The array job number is expected as the first command-line argument
#NOTE(review): inside an interactive Jupyter kernel sys.argv[1] is usually a kernel launch flag, not a job number - verify before running this cell interactively
if len(sys.argv) < 2:
    raise SystemExit("Missing array job number: pass it as the first command-line argument")
index_num = sys.argv[1]  #this should return the array number within the array job

#Option 2: Write as a regular text file with a comma between values
os.makedirs('Results', exist_ok=True) #create the output folder if needed; exist_ok avoids an error when it is already there
with open('Results/Run'+index_num+'.txt', 'w') as file:
    file.write(','.join(str(i) for i in output))