In [None]:
# !pip install pytorch-tabnet

In [None]:
# !pip install tab-transformer-pytorch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gc
import os
import sys

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [3]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

In [4]:
## perform classification using XGBoost
def perform_classification(df, random_state):
    """
    Evaluate an XGBoost classifier with stratified 5-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a binary ``label`` column (0/1); every other column is
        used as a feature.
    random_state : int
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    dict
        ``accuracy`` and ``cm`` (pooled over all out-of-fold predictions),
        ``auroc`` (pooled), ``y_proba`` (out-of-fold probability of class 1,
        in the row order of ``df``) and ``y_true`` (labels in the same order).
    """
    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()

    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    model = XGBClassifier(device='cuda')

    # One cross-validation pass only: fitting a second time just to obtain hard
    # labels doubles the training cost and lets accuracy/CM and AUROC come from
    # different fitted models.
    y_proba = cross_val_predict(model, X, y, cv=skfold, method='predict_proba')[:, 1]  ## probs of class 1 only

    # Same rule XGBClassifier.predict applies for binary problems (prob > 0.5 -> class 1).
    y_pred = (y_proba > 0.5).astype(int)

    results = {
        'accuracy': accuracy_score(y, y_pred),
        'auroc': roc_auc_score(y, y_proba),
        'cm': confusion_matrix(y, y_pred),
        'y_proba': y_proba,
        'y_true': y
    }
    return results

In [5]:
def perform_classification_svc(df, random_state):
    """
    Evaluate a support-vector classifier with stratified 5-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column; every other column is used as a feature.
    random_state : int
        Seed for the shuffled ``StratifiedKFold`` split and for the SVC's
        internal probability calibration.

    Returns
    -------
    dict
        ``accuracy``, ``auroc``, ``cm`` (all pooled over the out-of-fold
        predictions), ``y_proba`` (out-of-fold probability of class 1, in the
        row order of ``df``) and ``y_true``.
    """
    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()

    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # probability=True fits an internal cross-validated calibration that draws
    # random numbers; without random_state the probabilities (and AUROC) change
    # from run to run even though the outer split is seeded.
    model = SVC(probability=True, random_state=random_state)

    # Hard labels and calibrated probabilities are requested separately on
    # purpose: SVC.predict follows the decision function and may disagree with
    # thresholding predict_proba at 0.5.
    y_pred = cross_val_predict(model, X, y, cv=skfold)
    y_proba = cross_val_predict(model, X, y, cv=skfold, method='predict_proba')[:, 1]  ## select the probs of class 1 only

    del model
    gc.collect()

    accuracy = accuracy_score(y, y_pred)
    auroc = roc_auc_score(y, y_proba)
    cm = confusion_matrix(y, y_pred)

    results = {
        'accuracy':accuracy,
        'auroc':auroc,
        'cm':cm,
        'y_proba':y_proba,
        'y_true':y
    }
    return results

In [6]:
def perform_classification_mlp(df, random_state):
    """
    Evaluate a scikit-learn MLP classifier with stratified 5-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column; every other column is used as a feature.
    random_state : int
        Seed for the shuffled ``StratifiedKFold`` split and for the MLP itself.

    Returns
    -------
    dict
        ``accuracy``, ``auroc``, ``cm`` (all pooled over the out-of-fold
        predictions), ``y_proba`` (out-of-fold probability of class 1, in the
        row order of ``df``) and ``y_true``.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.neural_network import MLPClassifier

    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()

    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    model = MLPClassifier(random_state=random_state)
    y_pred = cross_val_predict(model, X, y, cv=skfold)
    y_proba = cross_val_predict(model, X, y, cv=skfold, method='predict_proba')[:, 1]  ## select the probs of class 1 only

    del model
    gc.collect()

    accuracy = accuracy_score(y, y_pred)
    auroc = roc_auc_score(y, y_proba)
    cm = confusion_matrix(y, y_pred)

    results = {
        'accuracy':accuracy,
        'auroc':auroc,
        'cm':cm,
        'y_proba':y_proba,
        'y_true':y
    }
    return results

In [7]:
def perform_tabnet(df, random_state):
    """
    Evaluate TabNet with stratified 5-fold cross-validation.

    A fresh ``TabNetClassifier`` is trained on each training fold and scored
    on the held-out fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column; every other column is used as a feature.
    random_state : int
        Seed for the shuffled ``StratifiedKFold`` split and for TabNet.

    Returns
    -------
    dict
        ``accuracy`` and ``auroc`` are the *means of the per-fold scores*;
        ``cm`` is the element-wise sum of the per-fold confusion matrices;
        ``y_proba`` / ``y_true`` are the per-fold class-1 probabilities and
        labels concatenated in fold order (not in the row order of ``df``).
    """
    accuracies = []
    aurocs = []
    conf_mats = []
    y_trues = []
    y_preds_proba = []

    X = df.drop(columns=['label']).to_numpy()
    y = df['label'].to_numpy()

    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    for train_idx, test_idx in skfold.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        tabnet_model = TabNetClassifier(
            device_name = 'cuda',
            seed = random_state,
            verbose = 0
        )
        # The held-out fold is passed as eval_set for metric reporting only.
        # patience=0 is documented by pytorch-tabnet as "no early stopping",
        # so the held-out fold should not steer which weights are kept —
        # NOTE(review): revisit this if patience is ever raised above 0.
        tabnet_model.fit(
            X_train, y_train,
            eval_set = [(X_test, y_test)],
            # max_epochs = 3,
            eval_metric = ['auc', 'accuracy'],
            batch_size = 512,
            patience = 0
        )

        y_pred = tabnet_model.predict(X_test)
        y_pred_prob = tabnet_model.predict_proba(X_test)[:,1]

        y_trues.append(y_test)
        y_preds_proba.append(y_pred_prob)

        # Per-fold metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        aurocs.append(roc_auc_score(y_test, y_pred_prob))
        conf_mats.append(confusion_matrix(y_test, y_pred))

        # Release the fold's model before the next one is built. Collect
        # garbage first so that memory held by unreachable objects is actually
        # free by the time the CUDA caching allocator is asked to give it back.
        del tabnet_model
        gc.collect()
        torch.cuda.empty_cache()

    # Aggregate across folds
    mean_accuracy = np.mean(accuracies)
    mean_auroc = np.mean(aurocs)
    total_cm = np.sum(conf_mats, axis=0)  # Summing up all confusion matrices

    results = {
        'accuracy':mean_accuracy,
        'auroc':mean_auroc,
        'cm':total_cm,
        'y_proba':np.hstack(y_preds_proba),
        'y_true':np.hstack(y_trues),
    }

    return results

In [8]:
def weighted_fusion(prob_model1, prob_model2, weight_model1, weight_model2):
    """Return the weighted sum ``weight_model1 * prob_model1 + weight_model2 * prob_model2``.

    Works element-wise on arrays as well as on plain scalars; the weights are
    used as given (they are not normalised here).
    """
    contribution_1 = weight_model1 * prob_model1
    contribution_2 = weight_model2 * prob_model2
    return contribution_1 + contribution_2

In [9]:
def compute_weight(acc1, acc2, alpha):
    """
    Softmax weight of the first score against the second.

    Computes ``exp(alpha*acc1) / (exp(alpha*acc1) + exp(alpha*acc2))``.
    ``compute_weight(a, b, alpha) + compute_weight(b, a, alpha)`` equals 1 up
    to floating-point rounding, and ``alpha == 0`` gives 0.5.

    Parameters
    ----------
    acc1, acc2 : float or array-like
        Scores (e.g. accuracies) of the two models.
    alpha : float
        Sharpness of the weighting; larger values favour the better model more.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Weight assigned to the model with score ``acc1``.
    """
    scaled1 = alpha * acc1
    scaled2 = alpha * acc2

    # Subtract the larger exponent before exponentiating: the ratio is unchanged
    # mathematically, but exp() can no longer overflow to inf (inf/inf -> nan).
    shift = np.maximum(scaled1, scaled2)
    exp1 = np.exp(scaled1 - shift)
    exp2 = np.exp(scaled2 - shift)
    return exp1 / (exp1 + exp2)

In [10]:
def compute_fusion_results(results_cnv, results_rna):
    """
    Late-fuse the class-1 probabilities of the CNV and RNA-seq models.

    Each model is weighted by a softmax over the two accuracies whose
    sharpness ``alpha = 10 * |exp(acc_cnv) - exp(acc_rna)|`` grows with the
    accuracy gap; equal accuracies give ``alpha == 0`` and therefore equal
    weights of 0.5.

    Parameters
    ----------
    results_cnv, results_rna : dict
        Result dictionaries providing ``'y_proba'`` (class-1 probabilities)
        and ``'accuracy'``. The probabilities must refer to the same samples
        in the same order. If both dictionaries also contain ``'y_true'``,
        the two label vectors are checked for equality.

    Returns
    -------
    fused_probs : array-like
        Weighted combination of the two probability vectors.
    fused_preds : numpy.ndarray of int
        ``fused_probs`` thresholded at 0.5 (>= 0.5 -> class 1).

    Raises
    ------
    ValueError
        If both inputs carry ``'y_true'`` and the label vectors differ, i.e.
        the two models were not evaluated on identically ordered samples.
    """
    # Element-wise fusion is only meaningful when row i is the same sample in
    # both modalities; differing labels are a sure sign that it is not.
    if 'y_true' in results_cnv and 'y_true' in results_rna:
        if not np.array_equal(results_cnv['y_true'], results_rna['y_true']):
            raise ValueError(
                "CNV and RNA results have different 'y_true' vectors; "
                "the samples are not aligned and their probabilities cannot be fused."
            )

    # Only class-1 probabilities are needed; the fused prediction is derived from them.
    luad_prob_cnv = results_cnv['y_proba']
    luad_prob_rna = results_rna['y_proba']

    acc_cnv = results_cnv['accuracy']
    acc_rna = results_rna['accuracy']

    alpha = np.abs(np.exp(acc_cnv) - np.exp(acc_rna))*10

    weight_cnv = compute_weight(acc_cnv, acc_rna, alpha)
    weight_rna = compute_weight(acc_rna, acc_cnv, alpha)

    fused_probs = weighted_fusion(luad_prob_cnv, luad_prob_rna, weight_cnv, weight_rna)
    fused_preds = (fused_probs >= 0.5).astype(int)

    return fused_probs, fused_preds

In [11]:
def dump_all_results(results, fname, seed,
                     out_dir="Z:/multiomics based manuscript/results_for_xena_rna_cnv_only"):
    """
    Write one run's summary metrics to ``<out_dir>/<fname>_<seed>.csv``.

    Parameters
    ----------
    results : dict
        Must provide ``'accuracy'``, ``'auroc'`` and ``'cm'``; ``'cm'`` is a
        2x2 confusion matrix whose flattened order is TN, FP, FN, TP.
    fname : str
        File-name stem (model / modality tag).
    seed : int
        Seed of the run; appended to the file name.
    out_dir : str, optional
        Destination directory. Defaults to the location used so far; it is
        created if it does not exist yet.

    The CSV has a single row with the columns
    ``Accuracy, Mean AUROC, TN, FP, FN, TP``.
    """
    # Extract confusion matrix values
    TN, FP, FN, TP = results['cm'].ravel()

    # Create results DataFrame
    results_df = pd.DataFrame({
        "Accuracy": [results['accuracy']],
        "Mean AUROC": [results['auroc']],
        "TN": [TN],
        "FP": [FP],
        "FN": [FN],
        "TP": [TP]
    })

    # Make sure the target directory exists so a long run does not fail at the
    # very last step.
    os.makedirs(out_dir, exist_ok=True)
    results_df.to_csv(os.path.join(out_dir, f"{fname}_{seed}.csv"), index=False)

In [12]:
# Windows paths are written as raw strings: in a normal string literal the
# sequences "\m", "\d", "\g", "\w" and "\c" are invalid escapes, which newer
# Python versions warn about. The resulting path strings are unchanged.
RNA_DIR = r'Z:\multiomics based manuscript\datasets\gene_exp\with_common_patients_processed'
CNV_DIR = r'Z:\multiomics based manuscript\datasets\cnv\with_common_patients_processed'

# RNA-seq expression: LUAD is the positive class (1), LUSC the negative class (0).
rna_luad_xena = pd.read_csv(os.path.join(RNA_DIR, 'csv_rna_common_luad.csv'))
rna_lusc_xena = pd.read_csv(os.path.join(RNA_DIR, 'csv_rna_common_lusc.csv'))

rna_luad_xena['label'] = 1
rna_lusc_xena['label'] = 0
df_rna_xena = pd.concat([rna_luad_xena, rna_lusc_xena], axis=0)

# Copy-number variation: same labelling and stacking order (LUAD first, then LUSC).
cnv_luad_xena = pd.read_csv(os.path.join(CNV_DIR, 'csv_cnv_common_luad.csv'))
cnv_lusc_xena = pd.read_csv(os.path.join(CNV_DIR, 'csv_cnv_common_lusc.csv'))

cnv_luad_xena['label'] = 1
cnv_lusc_xena['label'] = 0
df_cnv_xena = pd.concat([cnv_luad_xena, cnv_lusc_xena], axis=0)

In [15]:
# Sanity check: (rows, columns) of the LUSC CNV frame; the column count includes the added 'label' column.
cnv_lusc_xena.shape

(483, 24777)

In [None]:
# Seed sets used in earlier runs (kept for reference, currently disabled):
#   [101, 103, 105, 106, 109, 119, 136, 154, 172, 175, 176]
#   [101, 103, 105, 106, 109, 119, 136, 154, 172, 175]
# Active setting: the ten consecutive seeds 0..9.
good_seeds = list(range(10))

In [None]:
for seed in good_seeds:

    # Shuffle both modalities with the SAME seed so that their rows are permuted
    # identically; otherwise the per-sample probabilities could not be fused.
    # NOTE(review): this presumes both frames list the same patients in the same
    # order before shuffling — verify against the input files.
    _df_rna_xena = df_rna_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)
    _df_cnv_xena = df_cnv_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)

    # XGBoost on RNA-seq, then on CNV
    results_rna = perform_classification(_df_rna_xena, seed)
    results_cnv = perform_classification(_df_cnv_xena, seed)

    print(f"RNA results for seed: {seed}:\t Acc: {results_rna['accuracy']} \t AUROC: {results_rna['auroc']}")
    print(f"CNV results for seed: {seed}:\t Acc: {results_cnv['accuracy']} \t AUROC: {results_cnv['auroc']}")

    fused_probs, fused_preds = compute_fusion_results(results_cnv, results_rna)

    # Ground truth is taken from the CNV run; the RNA run is expected to carry
    # the same labels in the same order.
    _y_true = results_cnv['y_true']
    fused_accuracy = accuracy_score(_y_true, fused_preds)
    fused_auroc = roc_auc_score(_y_true, fused_probs)
    fused_cm = confusion_matrix(_y_true, fused_preds)

    results_fusion = dict(accuracy=fused_accuracy, auroc=fused_auroc, cm=fused_cm)

    print(f"FUSION results for seed: {seed}:\t Acc: {results_fusion['accuracy']} \t AUROC: {results_fusion['auroc']}")

    # One CSV per modality plus one for the fusion
    for _fname, _res in (
        ('results_cnv', results_cnv),
        ('results_rna', results_rna),
        ('results_fusion', results_fusion),
    ):
        dump_all_results(_res, _fname, seed)

    print("--------------------------------------------------------------------------------")
    

In [None]:
for seed in good_seeds:

    # Shuffle both modalities with the SAME seed so that their rows are permuted
    # identically; otherwise the per-sample probabilities could not be fused.
    # NOTE(review): this presumes both frames list the same patients in the same
    # order before shuffling — verify against the input files.
    _df_rna_xena = df_rna_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)
    _df_cnv_xena = df_cnv_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)

    # TabNet on CNV first ...
    results_cnv = perform_tabnet(_df_cnv_xena, seed)
    print(f"Tabnet CNV results for seed: {seed}:\t Acc: {results_cnv['accuracy']} \t AUROC: {results_cnv['auroc']}")

    # ... then on RNA-seq
    results_rna = perform_tabnet(_df_rna_xena, seed)
    print(f"Tabnet RNA results for seed: {seed}:\t Acc: {results_rna['accuracy']} \t AUROC: {results_rna['auroc']}")

    fused_probs, fused_preds = compute_fusion_results(results_cnv, results_rna)

    # Ground truth is taken from the CNV run; the RNA run is expected to carry
    # the same labels in the same order.
    _y_true = results_cnv['y_true']
    fused_accuracy = accuracy_score(_y_true, fused_preds)
    fused_auroc = roc_auc_score(_y_true, fused_probs)
    fused_cm = confusion_matrix(_y_true, fused_preds)

    results_fusion = dict(accuracy=fused_accuracy, auroc=fused_auroc, cm=fused_cm)

    print(f"Tabnet FUSION results for seed: {seed}:\t Acc: {results_fusion['accuracy']} \t AUROC: {results_fusion['auroc']}")

    # One CSV per modality plus one for the fusion
    for _fname, _res in (
        ('results_cnv_tabnet', results_cnv),
        ('results_rna_tabnet', results_rna),
        ('results_fusion_tabnet', results_fusion),
    ):
        dump_all_results(_res, _fname, seed)

    print("--------------------------------------------------------------------------------")
    

In [None]:

for seed in good_seeds:

    # Shuffle both modalities with the SAME seed so that their rows are permuted
    # identically; otherwise the per-sample probabilities could not be fused.
    # NOTE(review): this presumes both frames list the same patients in the same
    # order before shuffling — verify against the input files.
    _df_rna_xena = df_rna_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)
    _df_cnv_xena = df_cnv_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)

    # SVC on RNA-seq, then on CNV
    results_rna = perform_classification_svc(_df_rna_xena, seed)
    results_cnv = perform_classification_svc(_df_cnv_xena, seed)

    print(f"SVC RNA results for seed: {seed}:\t Acc: {results_rna['accuracy']} \t AUROC: {results_rna['auroc']}")
    print(f"SVC CNV results for seed: {seed}:\t Acc: {results_cnv['accuracy']} \t AUROC: {results_cnv['auroc']}")

    fused_probs, fused_preds = compute_fusion_results(results_cnv, results_rna)

    # Ground truth is taken from the CNV run; the RNA run is expected to carry
    # the same labels in the same order.
    _y_true = results_cnv['y_true']
    fused_accuracy = accuracy_score(_y_true, fused_preds)
    fused_auroc = roc_auc_score(_y_true, fused_probs)
    fused_cm = confusion_matrix(_y_true, fused_preds)

    results_fusion = dict(accuracy=fused_accuracy, auroc=fused_auroc, cm=fused_cm)

    print(f"SVC FUSION results for seed: {seed}:\t Acc: {results_fusion['accuracy']} \t AUROC: {results_fusion['auroc']}")

    # One CSV per modality plus one for the fusion
    for _fname, _res in (
        ('results_cnv_svc', results_cnv),
        ('results_rna_svc', results_rna),
        ('results_fusion_svc', results_fusion),
    ):
        dump_all_results(_res, _fname, seed)

    print("--------------------------------------------------------------------------------")
    

In [None]:
from sklearn.neural_network import MLPClassifier

for seed in good_seeds:

    # Shuffle both modalities with the SAME seed so that their rows are permuted
    # identically; otherwise the per-sample probabilities could not be fused.
    # NOTE(review): this presumes both frames list the same patients in the same
    # order before shuffling — verify against the input files.
    _df_rna_xena = df_rna_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)
    _df_cnv_xena = df_cnv_xena.sample(frac=1, replace=False, ignore_index=True, random_state=seed)

    # MLP on RNA-seq, then on CNV
    results_rna = perform_classification_mlp(_df_rna_xena, seed)
    results_cnv = perform_classification_mlp(_df_cnv_xena, seed)

    print(f"MLP RNA results for seed: {seed}:\t Acc: {results_rna['accuracy']} \t AUROC: {results_rna['auroc']}")
    print(f"MLP CNV results for seed: {seed}:\t Acc: {results_cnv['accuracy']} \t AUROC: {results_cnv['auroc']}")

    fused_probs, fused_preds = compute_fusion_results(results_cnv, results_rna)

    # Ground truth is taken from the CNV run; the RNA run is expected to carry
    # the same labels in the same order.
    _y_true = results_cnv['y_true']
    fused_accuracy = accuracy_score(_y_true, fused_preds)
    fused_auroc = roc_auc_score(_y_true, fused_probs)
    fused_cm = confusion_matrix(_y_true, fused_preds)

    results_fusion = dict(accuracy=fused_accuracy, auroc=fused_auroc, cm=fused_cm)

    print(f"MLP FUSION results for seed: {seed}:\t Acc: {results_fusion['accuracy']} \t AUROC: {results_fusion['auroc']}")

    # One CSV per modality plus one for the fusion
    for _fname, _res in (
        ('results_cnv_mlp', results_cnv),
        ('results_rna_mlp', results_rna),
        ('results_fusion_mlp', results_fusion),
    ):
        dump_all_results(_res, _fname, seed)

    print("--------------------------------------------------------------------------------")
    