In [None]:
#Load libraries 
import pandas as pd
import sspa
import scipy
import numpy as np 
#import os #get path location
#import pickle

In [None]:
#Read in the multi-omics dataset (the first CSV column is used as the row index)
multiomic_df = pd.read_csv(
    "Data/Su_multi_omics_data.csv",
    index_col=0,
)

In [None]:
#Read in the pathway file (the first CSV column is used as the row index).
#Every column is read as a string (dtype="str"): some columns mix string and integer values,
#which otherwise triggers a dtype warning.
mo_pathways = pd.read_csv(
    "Data/Reactome_multi_omics_ChEBI_Uniprot.csv",
    index_col=0,
    dtype="str",
)

In [None]:
#Read in the root pathways from the local Excel sheet (the sheet has no header row)
root_path = pd.read_excel('Data/Root_pathways.xlsx', header=None)
#Map every value of column 0 to the value of column 1 in the same row
root_pathway_dict = dict(zip(root_path[0].to_numpy(), root_path[1].to_numpy()))
#The keys (column 0) are the names used later to drop root pathways
root_pathway_names = [*root_pathway_dict]

In [None]:
#Test kPCA on the whole dataset, then remove the root pathways from the score columns
kpca_scores_all = (
    sspa.sspa_kpca(multiomic_df, mo_pathways)  #737 pathways
    #using Sara's code to drop root pathways (only those that are present as columns)
    .pipe(lambda scores: scores.drop(columns=list(set(root_pathway_names).intersection(scores.columns))))
)

kpca_scores_all #710 pathways

In [None]:
#Separate the dataset into two groups by WHO status; the last two columns (metadata) are removed
df_mild = multiomic_df.loc[multiomic_df["WHO_status"].eq('1-2')].iloc[:, :-2] #45 samples
df_severe = multiomic_df.loc[multiomic_df["WHO_status"].isin(['3-4', '5-7'])].iloc[:, :-2] #83 samples

### Step 1: Determine initial test-statistic

In [None]:
#Function to calculate the squared Spearman correlation matrix

def squared_spearman_corr(data):
    """Return the squared Spearman correlations between kPCA pathway scores.

    The kPCA scores are computed with ``sspa.sspa_kpca(data, mo_pathways)``; columns
    listed in ``root_pathway_names`` are dropped before the correlation is computed.
    Both ``mo_pathways`` and ``root_pathway_names`` are read from the notebook's
    global namespace.

    Parameters
    ----------
    data : table passed straight to ``sspa.sspa_kpca``
        NOTE(review): presumably samples x molecules without metadata columns - confirm.

    Returns
    -------
    tuple
        ``(squared_spearman_coef, columns)``: the element-wise square of the
        correlation returned by ``scipy.stats.spearmanr`` and the list of the
        score columns that were correlated, in column order.

    Notes
    -----
    NOTE(review): per the SciPy documentation, ``spearmanr`` returns a scalar instead
    of a matrix when only two columns are given - confirm that more than two
    pathways always remain here.
    """
    #Import the submodule explicitly: a bare `import scipy` does not guarantee that
    #`scipy.stats` is loaded on every SciPy version
    from scipy import stats as scipy_stats

    kpca_scores = sspa.sspa_kpca(data, mo_pathways)
    root_columns = list(set(root_pathway_names) & set(kpca_scores.columns))
    kpca_scores = kpca_scores.drop(columns = root_columns) #using Sara's code to drop root pathways

    spearman_results = scipy_stats.spearmanr(kpca_scores)
    squared_spearman_coef = np.square(spearman_results[0]) #correlation coefficients (spearman_results[1] gives the p-values)

    return squared_spearman_coef,list(kpca_scores.columns)




#Function to calculate the absolute difference between two matrices and list it once per edge (pathway pair)

def absolute_val(data1,data2,edgelist):
    """Absolute element-wise difference of two square matrices, one row per edge.

    Only the strict lower triangle is reported, so the diagonal (an item compared
    with itself) and the mirrored duplicate of every pair are left out.

    Parameters
    ----------
    data1, data2 : array-like, shape (n, n)
        Matrices to compare (here: squared Spearman correlation matrices).
    edgelist : sequence of length n
        Labels of the rows/columns of both matrices, in matrix order. Labels are
        converted with ``str`` when the edge names are built.

    Returns
    -------
    pandas.DataFrame
        A single column ``"Initial_tstat"`` holding ``|data1 - data2|``. The index is
        named ``"Edges"`` and holds ``"<row label>, <column label>"``. Rows follow
        row-major order of the lower triangle: (1,0), (2,0), (2,1), ...
        Pairs whose difference is NaN are omitted.

    Raises
    ------
    ValueError
        If the difference is not a square 2-D matrix or ``edgelist`` does not
        match its size.
    """
    abs_rho_squared = np.absolute(np.asarray(data1, dtype=float) - np.asarray(data2, dtype=float))

    if abs_rho_squared.ndim != 2 or abs_rho_squared.shape[0] != abs_rho_squared.shape[1]:
        raise ValueError("data1 and data2 must be square 2-D matrices, got shape %s" % (abs_rho_squared.shape,))
    labels = [str(label) for label in edgelist]
    if len(labels) != abs_rho_squared.shape[0]:
        raise ValueError("edgelist has %d labels but the matrices have %d rows" % (len(labels), abs_rho_squared.shape[0]))

    #Strict lower triangle (k=-1 excludes the diagonal), returned in row-major order
    row_idx, col_idx = np.tril_indices(abs_rho_squared.shape[0], k=-1)
    values = abs_rho_squared[row_idx, col_idx]

    #Leave out undefined differences
    #NOTE(review): a dropped NaN pair shortens the list - confirm this cannot misalign it with the permutation results
    keep = ~np.isnan(values)
    edges = [labels[r] + ", " + labels[c] for r, c in zip(row_idx[keep], col_idx[keep])]

    return pd.DataFrame({"Initial_tstat": values[keep]}, index=pd.Index(edges, name="Edges"))

Note: For the differences in squared correlation computed on the unshuffled (i.e. real) data, I keep the index (the pathway edges). Because the edges are already recorded there and their order is the same in every permutation, there is no need to store them again for each permutation.

In [None]:
#Squared Spearman correlation matrix of each group, together with the pathway (column) order it was computed in
spearman_mild,edgelist_mild = squared_spearman_corr(df_mild)
spearman_severe,edgelist = squared_spearman_corr(df_severe)

#Both matrices must describe the same pathways in the same order; otherwise the
#element-wise difference below would compare unrelated pathway pairs
assert edgelist_mild == edgelist, "mild and severe groups produced different pathway lists"

output = absolute_val(spearman_mild,spearman_severe,edgelist)

In [None]:
#251695 pathway pairs: the 710 * 709 / 2 unordered pairs of the 710 pathways
output

In [None]:
#Save the initial test statistics (one row per edge) for the permutation test
output.to_csv(
    "Data/permutation_test_files_integrated/initial_tstats.csv"
)

### Step 2: Shuffle the labels

### Step 3: Read in the permutation files 

### Step 4: Compare the difference in edges with other networks

Comparing with the significant edges in the proteomic differential network (after FDR correction) and in the metabolomic differential network (before FDR correction, because only 10 edges remain afterwards):

In [9]:
def edge_num (omics, filename='sig_edges.txt'):
    """Read the edges stored on the first line of an edge file.

    The file ``Data/permutation_test_files_<omics>/<filename>`` is opened and its
    first line is split on commas. Consecutive fields are paired into one edge;
    the first character of the second field of each pair is dropped.
    NOTE(review): this assumes the layout ``"a, b,c, d"`` (a space only after the
    comma inside a pair) - confirm against the code that writes these files.

    Parameters
    ----------
    omics : str
        Suffix of the ``Data/permutation_test_files_`` directory to read from.
    filename : str, optional
        Name of the file inside that directory (default ``'sig_edges.txt'``).

    Returns
    -------
    list of tuple
        One ``(first, second)`` tuple per edge, in file order. An empty file (or an
        empty first line) gives an empty list.

    Raises
    ------
    ValueError
        If the first line holds an odd number of comma-separated fields.
    """
    path = 'Data/permutation_test_files_'+omics+'/'+filename
    with open(path) as f:
        first_line = f.readline()

    #Drop the line terminator so that it does not stay attached to the last name
    fields = first_line.rstrip('\r\n').split(",")
    if fields == ['']:
        return []
    if len(fields) % 2:
        raise ValueError("%s: expected an even number of comma-separated fields, got %d" % (path, len(fields)))

    return [(first, second[1:]) for first, second in zip(fields[::2], fields[1::2])]


In [10]:
#Significant edges of the proteomic and of the integrated differential network (read in this order)
proteomic_edges, integrated_edges = (edge_num(omics_name) for omics_name in ('proteomics', 'integrated'))

In [16]:
#Significant metabolomic edges before FDR correction, parsed from the first line of the file
with open('Data/permutation_test_files_metabolomics/sig_edges_beforefdr.txt') as f:
    lines = f.readlines()

edges = lines[0].split(",")
#Consecutive fields form one edge; the first character of the second field is dropped
edges_remaining = [(edges[pos], edges[pos + 1][1:]) for pos in range(0, len(edges), 2)]

metabolomic_edges = edges_remaining

In [23]:
#Number of edges read from the integrated network's sig_edges.txt
len(integrated_edges)

24600

In [24]:
#Edges present in both the integrated and the proteomic network.
#NOTE(review): edges are compared as ordered tuples, so (a, b) and (b, a) count as
#different edges - confirm both files list every pathway pair in the same orientation
intersection = list(set(integrated_edges) & set(proteomic_edges))
len(intersection)

21000