In [1]:
import os
import gudhi
import dcor
import math
import numpy as np
import pandas as pd
from gudhi import RipsComplex
from sklearn import preprocessing
from gudhi.representations import Landscape, PersistenceImage
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import quantile_transform
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
#READ GENE EXPRESSION MATRICES AS PICKLE FILES

In [3]:
def read_files_rna_dataframe(path, ref):
    """Load a pickled gene-expression table and label every row with `ref`.

    Parameters
    ----------
    path : str
        File name appended to the hard-coded '/directory/' prefix.
    ref : object
        Value written into every entry of the loaded object's index.

    Returns
    -------
    The unpickled object (presumably a pandas DataFrame -- confirm against the
    stored files) with its index replaced by `ref` repeated once per row.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    file_directory = '/directory/' + path
    df = pd.read_pickle(file_directory)

    # Every row gets the same index value, so the index acts as a class label.
    df.index = [ref] * df.shape[0]

    return df

In [11]:
#Split into train and test

In [12]:
def train_test(case, control, name):
    """Restrict the samples to the biomarker genes and split them 70/30.

    Parameters
    ----------
    case, control : pandas.DataFrame
        Sample-by-gene tables whose index carries the class label of each row.
    name : str
        File name (appended to '/directory/') of a CSV whose "x" column lists
        the biomarker gene names to keep.

    Returns
    -------
    tuple
        ``(train, test, train_labs, test_labs)``: the two feature matrices as
        NumPy arrays and the integer-encoded labels of their rows.
    """
    X_new = pd.concat([case, control])
    filepath = '/directory/' + name

    biomarkers = pd.read_csv(filepath, index_col=0)
    biomarkers = biomarkers["x"].to_numpy()
    # Keep only the genes present in both the expression table and the list.
    X_new = X_new[np.intersect1d(X_new.columns, biomarkers)]

    # Fit the encoder ONCE on every label so the train and test splits share
    # the same label -> integer mapping (re-fitting per split can assign
    # different codes when a split is missing a class).
    labelencoding = preprocessing.LabelEncoder()
    labelencoding.fit(X_new.index.to_list())

    # `random_state` (previously misspelled, which raised TypeError) pins the
    # shuffle so the split is reproducible.
    train, test = train_test_split(X_new, test_size=0.3, train_size=0.7, shuffle=True, random_state=0)

    train_labs = labelencoding.transform(train.index.to_list())
    test_labs = labelencoding.transform(test.index.to_list())

    return (np.array(train), np.array(test), train_labs, test_labs)

In [33]:
#Distance Correlation Matrix

In [34]:
def intergene_correlation_measure(DF):
    """Return the gene-by-gene matrix of ``1 - distance correlation``.

    `DF` is indexed as ``DF[:, k]``, so it must be a 2-D array-like whose
    columns are the genes. Each unordered pair of columns is scored once with
    ``dcor.distance_correlation``; the result is mirrored to a symmetric
    matrix with a unit diagonal and subtracted from 1, so the returned matrix
    has zeros on its diagonal.
    """
    gene_count = DF.shape[1]
    upper = np.zeros((gene_count, gene_count))

    # Strict upper triangle, visited row by row.
    rows, cols = np.triu_indices(gene_count, k=1)
    for a, b in zip(rows.tolist(), cols.tolist()):
        upper[a, b] = dcor.distance_correlation(DF[:, a], DF[:, b])  # Distance Correlations

    similarity = upper + upper.T + np.eye(gene_count)

    return 1 - similarity

In [44]:
#Per-patient Distance Correlations 

In [45]:
def patient_correlation_measure(F, M):
    """Combine a gene-gene matrix with one patient's per-gene values.

    For every pair ``i < j`` the entry is ``M[i, j] + (F[i] + F[j])`` (after
    transposing `F`). The strict upper triangle is mirrored to make the
    result symmetric, and the diagonal is set to 1.

    Parameters
    ----------
    F : array-like with a ``.T`` attribute
        Per-gene values, indexed by gene position.
    M : 2-D array
        Gene-by-gene matrix; its column count fixes the output size.
    """
    weights = F.T
    gene_count = M.shape[1]
    upper = np.zeros((gene_count, gene_count))

    gene_pairs = ((a, b) for a in range(gene_count) for b in range(a + 1, gene_count))
    for a, b in gene_pairs:
        upper[a, b] = M[a, b] + (weights[a] + weights[b])

    return upper + upper.T + np.eye(gene_count)

In [40]:
#Persistent Homology Calculation

In [41]:
def simplicial_patient(X, M):
    """Turn each patient's weighted gene graph into persistence-image features.

    For every row `s` of `X`, a patient-specific distance matrix is built with
    ``patient_correlation_measure(s, M)``, a Rips filtration is computed from
    it (edge-collapsed, then expanded to dimension 3), and the persistence
    intervals in dimensions 0, 1 and 2 are collected. Each dimension is then
    vectorised with a ``PersistenceImage`` and the three blocks are stacked
    column-wise.

    Parameters
    ----------
    X : iterable
        One entry per patient, passed as `F` to ``patient_correlation_measure``.
    M : 2-D array
        Gene-by-gene matrix shared by all patients.

    Returns
    -------
    numpy.ndarray
        One row per patient: H0, H1 and H2 persistence images side by side.

    Notes
    -----
    NOTE(review): `resolution` is not defined in this function; it is read
    from the enclosing notebook scope -- confirm it is set before calling.
    The persistence diagram of the LAST patient is plotted as a side effect.
    """

    def _finite_bars(barcode):
        """Drop bars with infinite death; always return a (k, 2) array."""
        bars = np.asarray(barcode, dtype=float).reshape(-1, 2)
        return bars[np.isfinite(bars[:, 1])]

    Persistent_diagrams0, Persistent_diagrams1, Persistent_diagrams2 = [], [], []
    diag = None  # stays None when X is empty, so the plot below is skipped

    for s in X:

        distance_matrix = patient_correlation_measure(s, M)
        # Pass the matrix by keyword: RipsComplex's leading argument is
        # `points`, so a positional matrix is not read as distances.
        rips_complex = RipsComplex(distance_matrix=distance_matrix).create_simplex_tree(max_dimension=1) #Weights used include per-patient gene expressions

        rips_complex.collapse_edges()
        rips_complex.expansion(3)

        # Computed once per patient; the intervals below read from this result.
        diag = rips_complex.persistence()

        # The essential (infinite-death) bars are removed before vectorising:
        # the image range is derived from the data and cannot include inf.
        Persistent_diagrams0.append(_finite_bars(rips_complex.persistence_intervals_in_dimension(0)))
        Persistent_diagrams1.append(_finite_bars(rips_complex.persistence_intervals_in_dimension(1)))
        Persistent_diagrams2.append(_finite_bars(rips_complex.persistence_intervals_in_dimension(2)))

    # Plain-Python spelling of `%config InlineBackend.figure_format = 'retina'`,
    # so the function body is valid outside IPython's input transformer.
    try:
        ipython_shell = get_ipython()
    except NameError:
        ipython_shell = None
    if ipython_shell is not None:
        ipython_shell.run_line_magic('config', "InlineBackend.figure_format = 'retina'")

    if diag is not None:
        gudhi.plot_persistence_diagram(diag)

    image = PersistenceImage(resolution=resolution)

    # Re-fitted per homology dimension.
    samplelandscape0img = image.fit_transform(Persistent_diagrams0)
    samplelandscape1img = image.fit_transform(Persistent_diagrams1)
    samplelandscape2img = image.fit_transform(Persistent_diagrams2)

    return np.column_stack((samplelandscape0img, samplelandscape1img, samplelandscape2img))

In [30]:
#Multi-layer Perceptron 

In [48]:
def MLPClass(X_train, X_test, y_train, y_test, lbl):
    """Grid-search an MLP classifier and score it on the test and train sets.

    A ``GridSearchCV`` (2-fold, all cores) over activation, solver, alpha and
    learning-rate schedule is fitted on the training data. A classification
    report for the test set is printed under the heading `lbl`.

    Returns
    -------
    numpy.ndarray
        Twelve percentages, test value followed by train value for each of:
        macro F1, micro F1, macro recall, macro precision, accuracy,
        balanced accuracy.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import classification_report

    def summarise(y_true, y_hat):
        # Order here fixes the order of the returned score vector.
        return [
            f1_score(y_true, y_hat, average="macro"),
            f1_score(y_true, y_hat, average="micro"),
            recall_score(y_true, y_hat, average="macro"),
            precision_score(y_true, y_hat, average="macro"),
            accuracy_score(y_true, y_hat),
            balanced_accuracy_score(y_true, y_hat),
        ]

    search_grid = dict(
        activation=['tanh', 'relu', 'logistic'],
        solver=['sgd', 'adam'],
        alpha=[0.0001, 0.05],
        learning_rate=['constant', 'adaptive'],
    )

    base_model = MLPClassifier(max_iter=2000)  # Epochs 2000
    search = GridSearchCV(base_model, search_grid, n_jobs=-1, cv=2)
    search = search.fit(X_train, y_train)

    # Test data
    test_pred = search.predict(X_test)

    print(lbl, 'Results on the Test Set:')
    print(classification_report(y_test, test_pred))

    held_out = summarise(y_test, test_pred)

    # Train data
    train_pred = search.predict(X_train)
    fitted = summarise(y_train, train_pred)

    # Interleave as (test, train) per metric and express as percentages.
    return np.array([value * 100 for pair in zip(held_out, fitted) for value in pair])

In [None]:
# Tabulate the scores: one row per cohort, one column per metric/split.
# The column order matches the score vector returned by MLPClass
# (test value, then train value, for each metric).
# NOTE(review): `all_metrics` is not defined in the visible cells --
# presumably one MLPClass score vector per cohort, in the row order below;
# confirm it is built before this cell runs.
metrics = pd.DataFrame(all_metrics,
                       index=['BRCA', 'COAD', 'LUAD', 'PRAD'],
                       columns=['F1 Score MacroTest', 'F1 Score MacroTrain',
                                'F1 Score MicroTest', 'F1 Score MicroTrain',
                                'RecallTest', 'RecallTrain',
                                'PrecisionTest', 'PrecisionTrain',
                                'AccuracyTest', 'AccuracyTrain',
                                'BalancedAccTest', 'BalancedAccTrain'])

metrics