In [1]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from gudhi.weighted_rips_complex import WeightedRipsComplex
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from gudhi.representations import Landscape
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import pandas as pd 
import numpy as np
import gudhi
import math
import dcor 
import os

In [2]:
#READ RNA COUNT MATRICES (.TSV) FILES AND MERGE INTO A SINGLE DATASET 

In [3]:
def read_rnaseq(path):
    """Load one pickled RNA-seq table and keep only the biomarker genes.

    Parameters
    ----------
    path : str
        Dataset name; the table is read from ``'/rnaseq/<path>.pkl'``.

    Returns
    -------
    pandas.DataFrame
        The columns of the pickled table whose names also occur in column
        ``"x"`` of ``/biomarkers/mapk.csv``. ``np.intersect1d`` returns the
        shared names sorted and de-duplicated, so the columns come back in
        that order.
    """
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    expression = pd.read_pickle('/rnaseq/' + path + '.pkl')

    # The biomarker list is a ';'-separated table; gene names live in column "x".
    marker_table = pd.read_csv('/biomarkers/mapk.csv', index_col=0, sep=';')
    marker_genes = marker_table["x"].to_numpy()

    shared_genes = np.intersect1d(expression.columns, marker_genes)
    return expression[shared_genes]

In [None]:
def read_microarray(path):
    """Load one pickled microarray table and keep only the biomarker genes.

    The file name ends in ``.pkl``, so it is read with ``pd.read_pickle``
    (the previous ``pd.read_csv`` call cannot parse a pickle). Reading the
    pickle also preserves the stored row index.

    Parameters
    ----------
    path : str
        Dataset name; the table is read from ``'/microarray/<path>.pkl'``.

    Returns
    -------
    pandas.DataFrame
        The columns of the table whose names also occur in column ``"x"`` of
        ``/biomarkers/mapk.csv``, in the sorted order produced by
        ``np.intersect1d``.
    """
    file_directory = '/microarray/' + path + '.pkl'

    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    # NOTE(review): if the microarray data is really stored as CSV, the '.pkl'
    # suffix above is what needs changing instead - confirm against the data.
    df = pd.read_pickle(file_directory)

    filepath_biomarkers = '/biomarkers/mapk.csv'

    # The biomarker list is a ';'-separated table; gene names live in column "x".
    biomarkers = pd.read_csv(filepath_biomarkers, index_col=0, sep=';')
    biomarkers = biomarkers["x"].to_numpy()
    gene_exprs = df[np.intersect1d(df.columns, biomarkers)]

    return gene_exprs

In [6]:
#MERGE THE DATASETS (DIFFERENT CANCER TYPES) AND SPLIT INTO TRAIN (70%) AND TEST (30%)

In [7]:
def merge_split(R, S):
    """Stack two expression tables and split them 70/30 into train and test.

    The row index of each table is used as the class label. A single
    ``LabelEncoder`` is fitted on the labels of the merged table and then
    applied to both splits, so a given label always maps to the same integer
    in train and test (re-fitting on the test split could assign different
    codes when the two splits do not contain the same set of labels).

    Parameters
    ----------
    R, S : pandas.DataFrame
        Tables to concatenate row-wise; their index values are the labels.

    Returns
    -------
    tuple
        ``(train, test, train_labs, test_labs)`` where ``train`` / ``test``
        are NumPy arrays of the split rows and ``train_labs`` / ``test_labs``
        are the integer-encoded index labels of those rows.
    """
    gene_mtrx = pd.concat([R, S], axis=0)
    # Fixed seed so the split is reproducible between runs.
    train, test = train_test_split(gene_mtrx, test_size=0.3, train_size=0.7, random_state=0, shuffle=True)

    # Fit once on every label, then only transform: one shared label -> code map.
    labelencoding = preprocessing.LabelEncoder()
    labelencoding.fit(gene_mtrx.index.to_list())
    train_labs = labelencoding.transform(train.index.to_list())
    test_labs = labelencoding.transform(test.index.to_list())

    return np.array(train), np.array(test), train_labs, test_labs

In [10]:
#INTERGENE CORRELATION MEASURE USING DCOR 

In [11]:
def intergene_correlation_measure(DF):
    """Return the gene-by-gene dissimilarity ``1 - distance correlation``.

    Parameters
    ----------
    DF : 2-D array
        Indexed as ``DF[:, k]``, so each column is treated as one gene.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n, n)`` matrix with ``n = DF.shape[1]``; entry ``(a, b)``
        is ``1 - dcor.distance_correlation(DF[:, a], DF[:, b])`` and the
        diagonal is 0.
    """
    n_cols = DF.shape[1]

    # Start from the identity: every column has correlation 1 with itself.
    corr = np.identity(n_cols)

    # Visit each unordered pair once and mirror the value across the diagonal.
    rows, cols = np.triu_indices(n_cols, k=1)
    for a, b in zip(rows, cols):
        value = dcor.distance_correlation(DF[:, a], DF[:, b])
        corr[a, b] = value
        corr[b, a] = value

    return 1 - corr

In [15]:
#PER-PATIENT DISTANCE MATRIX DERIVED FROM THE GLOBAL DISTANCE MATRIX

In [16]:
def patient_correlation_measure(F, M):
    """Build one patient's weighted gene-gene distance matrix.

    For every pair ``i < j`` the entry is

        sqrt(M[i,j]**2 + b**2 + 2*F[i]**2 + 2*F[j]**2) / 2,
        with b = |F[i]**2 - F[j]**2| / M[i,j]

    and the matrix is symmetric. The diagonal is 0, as expected of a distance
    matrix (the previous version added the identity and so reported a
    self-distance of 1).

    Parameters
    ----------
    F : 1-D sequence
        Per-gene values for one patient, indexed as ``F[i]``.
    M : 2-D array
        Global gene-gene distance matrix, indexed as ``M[i, j]``;
        ``M.shape[1]`` is taken as the number of genes.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(num_genes, num_genes)`` matrix with a zero diagonal.
    """
    num_genes = M.shape[1]
    dist = np.zeros((num_genes, num_genes))

    for i in range(num_genes):

        for j in range(i + 1, num_genes):

            # NOTE(review): M[i, j] == 0 divides by zero here (inf/nan with
            # NumPy scalars); decide how identical genes should be handled.
            b = abs((F[i]**2 - F[j]**2) / M[i, j])
            weighted = math.sqrt(M[i, j]**2 + b**2 + 2*F[i]**2 + 2*F[j]**2)/2 #weighting algorithm derived from Mandal et al., 2020

            # Fill both triangles directly; the diagonal stays at zero.
            dist[i, j] = weighted
            dist[j, i] = weighted

    return dist

In [18]:
#TOPOLOGICAL DATA ANALYSIS USING GUDHI; RETURNS PER-PATIENT PERSISTENCE DIAGRAMS IN THE ZEROTH DIMENSION

In [89]:
def simplicial_patient(X, M, name):
    """Compute one dimension-0 persistence diagram per patient.

    For each element of ``X`` the per-patient distance matrix is built with
    ``patient_correlation_measure``, a Rips complex is constructed from it,
    and the dimension-0 persistence intervals are collected. Bars whose death
    value is ``+inf`` are removed from every diagram.

    Parameters
    ----------
    X : iterable
        One per-gene vector per patient; each is passed as ``F`` to
        ``patient_correlation_measure``.
    M : 2-D array
        Global gene-gene distance matrix, passed through unchanged.
    name : object
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    list of numpy.ndarray
        One ``(n_bars, 2)`` array of finite ``[birth, death]`` pairs per
        patient, in the order of ``X``.
    """
    dgms = []

    for s in X:

        distance_matrix = patient_correlation_measure(s, M)
        # RipsComplex is reached through the imported ``gudhi`` package and its
        # arguments are keyword-only, so the matrix must be passed by name.
        simplex_tree = gudhi.RipsComplex(distance_matrix=distance_matrix).create_simplex_tree(max_dimension=1) #Weights used include per-patient gene expressions

        # Depending on the GUDHI release, collapse_edges() either edits the
        # tree in place (returns None) or returns the collapsed tree - TODO
        # confirm against the installed version. Use whichever we are given.
        collapsed = simplex_tree.collapse_edges()
        if collapsed is not None:
            simplex_tree = collapsed
        simplex_tree.expansion(2)
        simplex_tree.persistence()

        # Only dimension 0 is returned, so only dimension 0 is extracted.
        # reshape keeps an (n, 2) layout even if no interval is reported.
        intervals = np.asarray(simplex_tree.persistence_intervals_in_dimension(0)).reshape(-1, 2)
        dgms.append(intervals)

    # Drop the essential (never-dying) bars while keeping the 2-column shape.
    remove_infinity = lambda barcode: barcode[barcode[:, 1] != np.inf]
    dgms = list(map(remove_infinity, dgms))

    return dgms

In [92]:
#SKLEARN PIPELINE WITH GUDHI PERSISTENT DIAGRAM OUTPUT 

In [93]:
def TDA_representations(train_dataset):
    """Create the unfitted persistence-landscape + MLP classification pipeline.

    Parameters
    ----------
    train_dataset : sized collection
        Only its length is used: ``len(train_dataset)`` becomes both the
        landscape ``resolution`` and the upper end of ``sample_range``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"TDA"`` (``Landscape`` with 10 landscapes) followed by
        ``"Estimator"`` (``MLPClassifier`` with early stopping enabled).
    """
    n_samples = len(train_dataset)

    landscape_step = Landscape(num_landscapes=10, resolution=n_samples, sample_range=[0, n_samples])
    classifier_step = MLPClassifier(early_stopping=True)

    return Pipeline([("TDA", landscape_step), ("Estimator", classifier_step)])

In [96]:
#MODEL TRAINING 

In [97]:
def model_train(dgms, labs, Tdgsm, Tlabs, pipe_Landscape):
    """Fit the landscape pipeline and report its train and test accuracy.

    Only the pipeline passed in is fitted and scored. The previous version
    also scored ``trained_model_Image``, a name this function never defines,
    and computed F1 / recall / precision values that were never returned
    (the binary-average recall and precision would additionally fail on
    multiclass labels); both have been removed.

    Parameters
    ----------
    dgms, labs
        Training diagrams and their labels, passed to ``fit`` and ``score``.
    Tdgsm, Tlabs
        Test diagrams and their labels, passed to ``score``.
    pipe_Landscape
        Estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``score(X, y)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)``: 2-D arrays of shape ``(1, 1)`` holding the training score
        and the test score respectively.
    """
    trained_model_landscape = pipe_Landscape.fit(dgms, labs)

    train_score = trained_model_landscape.score(dgms, labs)
    test_score = trained_model_landscape.score(Tdgsm, Tlabs)

    # Kept as 2-D row vectors, one column per model (only Landscape exists here).
    a = np.array([[train_score]])
    b = np.array([[test_score]])

    return a, b