In [18]:
%cd ../src

/Users/ivamilojkovic/Breast-Cancer-Analysis/src


In [98]:
import pandas as pd
import pickle, os
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import FunctionTransformer

plt.style.use('ggplot')
sns.set_theme()
plt.rcParams["figure.figsize"] = (12, 4)

import warnings
warnings.filterwarnings('ignore')

In [99]:
from multilabel_metrics import semi_relaxed_accuracy, ordered_subset_accuracy, k_orders_subset_accuracy, relaxed_accuracy, partial_accuracy, secondary_accuracy
from sklearn.metrics import label_ranking_average_precision_score, average_precision_score
def print_all_scores(y_test, predictions, prob_predictions, label_orig, label_pam50, y_corr, txt_file_name=None, k=(1,)):
    """Print the multi-label test metrics and optionally save a summary to a text file.

    Parameters
    ----------
    y_test
        Ground-truth label indicators. Used as ``y_true`` for the sklearn
        metrics and as ``y_test_mcut`` for the ordered/secondary metrics.
    predictions
        Predicted label indicators, same layout as ``y_test``.
    prob_predictions
        Per-label scores used by the ranking / average-precision metrics and
        by the ordered metrics.
    label_orig, label_pam50
        Reference labels for the two "relaxed accuracy" variants.
    y_corr
        Forwarded as ``y_test_corr`` to the ordered/secondary metrics
        (presumably the per-label correlations - confirm against
        ``multilabel_metrics``).
    txt_file_name
        Path of a report file to (over)write, or ``None`` to only print.
    k
        Iterable of orders for which ``k_orders_subset_accuracy`` is printed.

    Notes
    -----
    The ordered-subset, order-k and secondary accuracies are printed only;
    they are not written to the report file.
    """
    # Compute scores on test set
    subset_acc = accuracy_score(y_test, predictions)
    relax_pam50_acc = relaxed_accuracy(label_pam50, predictions)
    relax_orig_acc = relaxed_accuracy(label_orig, predictions)
    partial_acc = partial_accuracy(y_test, predictions)
    hamm_loss = hamming_loss(y_test, predictions)
    rank_avg_prec = label_ranking_average_precision_score(y_test, prob_predictions)
    avg_prec = average_precision_score(y_test, prob_predictions)

    print('\nTest accuracy: {}'.format(subset_acc))
    print('Test relaxed accuracy (PAM50): {}'.format(relax_pam50_acc))
    print('Test relaxed accuracy (original): {}'.format(relax_orig_acc))
    print('Partial accuracy: {}'.format(partial_acc))
    print('Test Hamming loss: {}\n'.format(hamm_loss))
    print('Ranking average precision: ', rank_avg_prec)
    print('Average precision: ', avg_prec)

    # Precision / recall / F1 for every averaging mode, in reporting order.
    # zero_division=1 scores a label with no predicted/true samples as 1.
    averaged_scores = {}
    for average in ('weighted', 'macro', 'micro'):
        prec = precision_score(y_test, predictions,
                               average=average, zero_division=1)
        rec = recall_score(y_test, predictions,
                           average=average, zero_division=1)
        f1 = f1_score(y_test, predictions,
                      average=average, zero_division=1)
        averaged_scores[average] = (prec, rec, f1)

        print('Test precision ({}): {}'.format(average, prec))
        print('Test recall ({}): {}'.format(average, rec))
        print('Test f1 score ({}): {}\n'.format(average, f1))

    # Additional metrics (semi_relaxed_accuracy is imported but currently not reported)
    ordered = ordered_subset_accuracy(y_test_corr=y_corr, y_test_mcut=y_test,
                                      predictions=predictions, prob_predictions=prob_predictions)
    print('Ordered subset acc: {}'.format(ordered))

    for order in k:
        k_ordered = k_orders_subset_accuracy(y_test_mcut=y_test, predictions=predictions,
                                             y_test_corr=y_corr, prob_predictions=prob_predictions, k=order)
        print('Order {} accuracy: {}'.format(order, k_ordered))

    sec_acc = secondary_accuracy(y_test_mcut=y_test, predictions=predictions,
                                 y_test_corr=y_corr, prob_predictions=prob_predictions, k=2)
    print('\nSecondary accuracy: ', sec_acc)

    if txt_file_name is not None:
        with open(txt_file_name, 'w') as file:

            file.write('--- Test scores ---\n')
            file.write(f'Subset accuracy: {subset_acc}\n')
            file.write(f'Relaxed (PAM50) accuracy: {relax_pam50_acc}\n')
            file.write(f'Relaxed (original) accuracy: {relax_orig_acc}\n')
            file.write(f'Partial accuracy: {partial_acc}\n')
            file.write(f'Hamming loss: {hamm_loss}\n')
            file.write(f'Ranking average precision: {rank_avg_prec}\n')
            file.write(f'Average precision: {avg_prec}\n\n')

            # One " - Weighted/Macro/Micro scores -" section per averaging mode.
            for average, (prec, rec, f1) in averaged_scores.items():
                file.write(f' - {average.capitalize()} scores -\n')
                file.write(f'Precision: {prec}\n')
                file.write(f'Recall: {rec}\n')
                file.write(f'F1 score: {f1}\n\n')


In [100]:
from data_preprocessing import remove_extreme

# Load the BRCA multi-label dataset and rebuild the train/test split
# (the split is the same as in the training workflow).
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('../data/dataset_multilabel.pkl', 'rb') as file:
    data = pickle.load(file)
    label_values = ['Basal', 'Her2', 'LumA', 'LumB', 'Normal']
    # Feature matrix: everything except identifiers, subtype labels and the per-subtype columns
    X = data.drop(columns=['expert_PAM50_subtype', 'tcga_id',
                           'Subtype-from Parker centroids',	'MaxCorr',
                            'Basal', 'Her2', 'LumA', 'LumB', 'Normal'], inplace=False)
    y_orig = data.expert_PAM50_subtype
    y_pam50 = data['Subtype-from Parker centroids']
    
# Switch to 'CRIS' to replace the BRCA data loaded above with the CRIS dataset
DATA_TYPE = 'BRCA'

########################### Load the data ###########################
if DATA_TYPE == 'CRIS':
    label_values = ['CRIS.A', 'CRIS.B', 'CRIS.C', 'CRIS.D', 'CRIS.E']
    with open('../data/tcga_cris_raw_24356_620samples.pkl', 'rb') as file:
        data = pickle.load(file) 
    X = data.drop(columns=['Patient ID', 'Subtype-from Parker centroids'] + label_values, inplace=False)
    y_pam50 = data['Subtype-from Parker centroids']
    y_orig = data['Subtype-from Parker centroids'] # this is not important

# Remove extreme values (genes, samples) from initial preprocessing.
# Only the filtered X is used below; the other three return values are not used in this cell.
X, potential_samples_to_remove, \
    feat_to_remove, feat_to_keep = remove_extreme(X, change_X = True)

# Take the per-subtype columns on the whole dataset
# (presumably correlations with each subtype centroid - confirm against the dataset).
y_corr = data[label_values]
y_corr_non_neg = discard_negative_correlations(y_corr)

# M-cut strategy to assign labels on whole dataset
# (once with non_neg_values=True, once with non_neg_values=False)
y_mcut_labels, _ = m_cut_strategy_class_assignment(y_corr, non_neg_values=True)
y_mcut_labels_neg, _ = m_cut_strategy_class_assignment(y_corr, non_neg_values=False)

# Compute labels from two strategies (M-cut and 5th percentile)
y_mcut_5perc_labels, _ = create_mcut_nth_percentile_labels(
    m_cut_labels=y_mcut_labels,
    correlations=y_corr_non_neg,
    y=y_pam50,
    keep_primary=False,
    N=5
)

# Split features and every label variant in one call so they all share the same rows.
# 70/30 split, stratified on y_pam50, fixed random_state so the split is reproducible.
X_train, X_test, \
y_train_pam50, y_test_pam50, \
y_train_mcut, y_test_mcut, \
y_train_orig, y_test_orig, \
y_train_5perc, y_test_5perc, \
y_train_corr, y_test_corr = \
    train_test_split(X, y_pam50, y_mcut_labels, y_orig, 
                    y_mcut_5perc_labels, y_corr_non_neg, test_size=0.3, random_state=1, stratify=y_pam50)

# Data standardization | normalization:
# scale every sample (row) so its values sum to 1e6, then apply log_transform.
X_train = X_train.divide(X_train.sum(axis=1), axis=0) * 1e6
X_test = X_test.divide(X_test.sum(axis=1), axis=0) * 1e6
scaler = FunctionTransformer(log_transform)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): if log_transform returns a NumPy array, the sample index is replaced
# by a RangeIndex here; later cells re-attach y_test_5perc.index to predictions - confirm.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

# Load selected features (the file depends on the dataset chosen via DATA_TYPE)
if DATA_TYPE=='CRIS':
    with open('../data/cris/new2_without_corr_removed_feat_select_gt_40_perc_occur.pkl', 'rb') as file:
        selected_feat = pickle.load(file)
else:
    with open('../data/brca/without_corr_removed_feat_select_gt_50_perc_occur.pkl', 'rb') as file:
        selected_feat = pickle.load(file)

# Keep only the selected feature columns
X_train_scaled_selected = X_train_scaled[list(selected_feat)]
X_test_scaled_selected = X_test_scaled[list(selected_feat)]

# One-hot encoding of original and PAM50 labels
# NOTE(review): train and test are encoded separately, so their columns can differ
# if a class is missing from one of the splits - confirm.
y_train_orig = pd.get_dummies(y_train_orig)
y_test_orig = pd.get_dummies(y_test_orig)
y_train_pam50 = pd.get_dummies(y_train_pam50)
y_test_pam50 = pd.get_dummies(y_test_pam50)


There are 0 columns with more than 80% of Null values!
There are 22722 columns with more than 20% of count values greater than 4!


In [101]:
# Load the saved sample ids (presumably the test samples of the main workflow) and
# report (number of saved ids, number of them present in the current test split).
# The path is relative to the working directory set by `%cd ../src` (<repo>/src), so it
# points at the same file as the former absolute, machine-specific path.
# NOTE(review): the recorded output (186, 53) shows only a partial overlap with the
# current split - confirm which dataset/split the saved ids belong to.
with open('../test_samples.pkl', 'rb') as f:
    test_main_idx = pickle.load(f)
len(test_main_idx), len(set(y_test_5perc.index).intersection(set(test_main_idx)))

(186, 53)

In [102]:
from sklearn.metrics import accuracy_score

# Class index -> subtype name lookup used to decode single-label predictions
mapping = dict(enumerate(['Basal', 'Her2', 'LumA', 'LumB', 'Normal']))

In [103]:
path_brca_sl = '../final_results/BRCA/single-label/SUBTYPE_PAM50/feat_select_hybrid'


def _load_brca_sl_model(file_name):
    """Unpickle one single-label BRCA model stored under ``path_brca_sl``."""
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(os.path.join(path_brca_sl, file_name), 'rb') as f:
        return pickle.load(f)


def _predict_brca_subtype(model):
    """Predict on the selected test features and decode class ids via ``mapping``.

    The result is a Series that carries the index of ``y_test_5perc``.
    """
    class_ids = model.predict(X_test_scaled_selected)
    return pd.Series(class_ids, index=y_test_5perc.index).map(mapping)


# Loading the best models for each approach in single-label
brca_model_xgb = _load_brca_sl_model('bestmodel_XGBoost_run_28-08-2023_05:09:55.pkl')
brca_model_lr = _load_brca_sl_model('bestmodel_LogisticRegression_run_28-08-2023_13:08:54.pkl')
brca_model_rf = _load_brca_sl_model('bestmodel_RandomForest_run_28-08-2023_13:44:47.pkl')
brca_model_svm = _load_brca_sl_model('bestmodel_SVC_run_28-08-2023_14:00:35.pkl')
brca_model_adaboost = _load_brca_sl_model('bestmodel_AdaBoost_run_28-08-2023_16:33:41.pkl')
brca_model_dectree = _load_brca_sl_model('bestmodel_DecisionTree_run_28-08-2023_13:34:10.pkl')
brca_model_knn = _load_brca_sl_model('bestmodel_KNN_run_28-08-2023_13:33:43.pkl')
brca_model_lgbm = _load_brca_sl_model('bestmodel_LightGBM_run_28-08-2023_14:43:23.pkl')

# Compute predictions (subtype names, indexed like the test labels)
xgb_preds = _predict_brca_subtype(brca_model_xgb)
lr_preds = _predict_brca_subtype(brca_model_lr)
svm_preds = _predict_brca_subtype(brca_model_svm)
rf_preds = _predict_brca_subtype(brca_model_rf)
# ----------
ada_preds = _predict_brca_subtype(brca_model_adaboost)
knn_preds = _predict_brca_subtype(brca_model_knn)
dectree_preds = _predict_brca_subtype(brca_model_dectree)
lgbm_preds = _predict_brca_subtype(brca_model_lgbm)

In [104]:
path_brca_ml = '../final_results/BRCA/multi-label'
# Run directory holding the problem-transformation models (BR / CC / LP)
path_brca_ml_pt = os.path.join(path_brca_ml, '03-09-2023_18:41:10')
# Run directory holding the algorithm-adaptation and ensemble models
path_brca_ml_aa = os.path.join(path_brca_ml, '30-08-2023_03:55:43')


def _load_brca_ml_model(run_dir, model_name):
    """Unpickle ``<model_name>_mcut_5perc_bestmodel.pkl`` from ``run_dir``."""
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(os.path.join(run_dir, model_name + '_mcut_5perc_bestmodel.pkl'), 'rb') as file:
        return pickle.load(file)


# Logistic Regression - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_lr_br = _load_brca_ml_model(path_brca_ml_pt, 'BR_LRegression')
brca_ml_model_lr_cc = _load_brca_ml_model(path_brca_ml_pt, 'CC_LRegression')
brca_ml_model_lr_lp = _load_brca_ml_model(path_brca_ml_pt, 'LP_LRegression')

# XGBoost - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_xgb_br = _load_brca_ml_model(path_brca_ml_pt, 'BR_XGBoost')
brca_ml_model_xgb_cc = _load_brca_ml_model(path_brca_ml_pt, 'CC_XGBoost')
brca_ml_model_xgb_lp = _load_brca_ml_model(path_brca_ml_pt, 'LP_XGBoost')

# Random Forest - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_rf_br = _load_brca_ml_model(path_brca_ml_pt, 'BR_RForest')
brca_ml_model_rf_cc = _load_brca_ml_model(path_brca_ml_pt, 'CC_RForest')
brca_ml_model_rf_lp = _load_brca_ml_model(path_brca_ml_pt, 'LP_RForest')

# SVC - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_svm_br = _load_brca_ml_model(path_brca_ml_pt, 'BR_SVC')
brca_ml_model_svm_cc = _load_brca_ml_model(path_brca_ml_pt, 'CC_SVC')
brca_ml_model_svm_lp = _load_brca_ml_model(path_brca_ml_pt, 'LP_SVC')


############################ Algorithm Adaptation and Ensemble #############################
brca_ml_model_mlknn = _load_brca_ml_model(path_brca_ml_aa, 'MLkNN')
brca_ml_model_mlaram = _load_brca_ml_model(path_brca_ml_aa, 'MLARAM')

# Ensemble of Classifier Chains
brca_ml_model_ecc_xgb = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleCC_XGBoost')
brca_ml_model_ecc_rf = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleCC_RForest')
brca_ml_model_ecc_lr = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleCC_LRegression')
brca_ml_model_ecc_svm = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleCC_SVC')

# Ensemble RAkEL
brca_ml_model_er_xgb = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleRakel_XGBoost')
brca_ml_model_er_rf = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleRakel_RForest')
brca_ml_model_er_lr = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleRakel_LRegression')
brca_ml_model_er_svm = _load_brca_ml_model(path_brca_ml_aa, 'EnsembleRakel_SVC')

In [114]:
# Inspect the parameters of the loaded ensemble RAkEL XGBoost model
print(brca_ml_model_er_xgb.get_params())

{'memory': None, 'steps': [('classifier', RakelD(base_classifier=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.5,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=0.25,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=5,
                  

In [8]:
from sklearn.preprocessing import normalize

# Define a mapping dictionary (class index -> subtype name)
mapping = {0: 'Basal', 1: 'Her2', 2: 'LumA', 3: 'LumB', 4: 'Normal'}

## Get the predictions and probabilities for each model and set the original test indices
# To score on the whole dataset instead of the test split, rebind the inputs first:
# X_test_scaled_selected = X_scaled_selected
# y_test_5perc = y_mcut_5perc_labels


def _label_frame(values, dtype=None):
    """Wrap model output in a DataFrame with the columns and index of ``y_test_5perc``."""
    return pd.DataFrame(values, columns=y_test_5perc.columns, index=y_test_5perc.index, dtype=dtype)


def _predict_sparse(model, dense_method='toarray'):
    """Return ``(preds, prob_preds)`` for a model whose outputs need densifying.

    ``dense_method`` names the method called on the model output to get a
    dense array ('toarray' or 'todense').
    """
    preds = _label_frame(getattr(model.predict(X_test_scaled_selected), dense_method)(), dtype='int')
    prob_preds = _label_frame(getattr(model.predict_proba(X_test_scaled_selected), dense_method)())
    return preds, prob_preds


def _predict_ensemble_cc(model):
    """Return ``(preds, prob_preds)`` for an ensemble-of-classifier-chains model.

    ``predict_proba`` is indexed per label and column 1 of each entry is kept
    (presumably the positive-class probability - confirm against the model).
    The per-sample scores are then L1-normalised so that each row sums to 1.
    """
    preds = _label_frame(model.predict(X_test_scaled_selected), dtype='int')
    per_label = model.predict_proba(X_test_scaled_selected)
    n_labels = len(y_test_5perc.columns)
    positive = np.array([per_label[i][:, 1] for i in range(n_labels)]).transpose()
    prob_preds = _label_frame(normalize(positive, axis=1, norm='l1'))
    return preds, prob_preds


############################### XGBoost ###############################
xgb_br_preds, xgb_br_prob_preds = _predict_sparse(brca_ml_model_xgb_br)
xgb_cc_preds, xgb_cc_prob_preds = _predict_sparse(brca_ml_model_xgb_cc)

############################### Logistic Regression ###############################
lr_br_preds, lr_br_prob_preds = _predict_sparse(brca_ml_model_lr_br)
lr_cc_preds, lr_cc_prob_preds = _predict_sparse(brca_ml_model_lr_cc)

############################### SVM ###############################
svm_lp_preds, svm_lp_prob_preds = _predict_sparse(brca_ml_model_svm_lp)

############################### Random Forest ###############################
rf_lp_preds, rf_lp_prob_preds = _predict_sparse(brca_ml_model_rf_lp)

# NOT OPTIMAL
xgb_lp_preds, xgb_lp_prob_preds = _predict_sparse(brca_ml_model_xgb_lp)
lr_lp_preds, lr_lp_prob_preds = _predict_sparse(brca_ml_model_lr_lp)
svm_br_preds, svm_br_prob_preds = _predict_sparse(brca_ml_model_svm_br)
svm_cc_preds, svm_cc_prob_preds = _predict_sparse(brca_ml_model_svm_cc)
rf_br_preds, rf_br_prob_preds = _predict_sparse(brca_ml_model_rf_br)
rf_cc_preds, rf_cc_prob_preds = _predict_sparse(brca_ml_model_rf_cc)

############################ Algorithm Adaptation #############################
mlknn_preds, mlknn_prob_preds = _predict_sparse(brca_ml_model_mlknn)

# MLARAM is fed the raw NumPy values and its output is used without densifying
mlaram_preds = _label_frame(brca_ml_model_mlaram.predict(X_test_scaled_selected.values), dtype='int')
mlaram_prob_preds = _label_frame(brca_ml_model_mlaram.predict_proba(X_test_scaled_selected.values))

####################### Ensemble #########################
ecc_xgb_preds, ecc_xgb_prob_preds = _predict_ensemble_cc(brca_ml_model_ecc_xgb)
er_xgb_preds, er_xgb_prob_preds = _predict_sparse(brca_ml_model_er_xgb, dense_method='todense')

ecc_rf_preds, ecc_rf_prob_preds = _predict_ensemble_cc(brca_ml_model_ecc_rf)
er_rf_preds, er_rf_prob_preds = _predict_sparse(brca_ml_model_er_rf, dense_method='todense')

ecc_lr_preds, ecc_lr_prob_preds = _predict_ensemble_cc(brca_ml_model_ecc_lr)
er_lr_preds, er_lr_prob_preds = _predict_sparse(brca_ml_model_er_lr, dense_method='todense')

ecc_svm_preds, ecc_svm_prob_preds = _predict_ensemble_cc(brca_ml_model_ecc_svm)
er_svm_preds, er_svm_prob_preds = _predict_sparse(brca_ml_model_er_svm, dense_method='todense')

In [14]:
# Print all test metrics for the ensemble RAkEL XGBoost predictions (no report file),
# including the order-k subset accuracies for k = 1, 2 and 3.
print_all_scores(y_test_5perc, er_xgb_preds, er_xgb_prob_preds, y_test_orig, y_test_pam50, y_test_corr, txt_file_name=None, k=[1, 2, 3])



Test accuracy: 0.7943037974683544
Test relaxed accuracy (PAM50): 0.9715189873417721
Test relaxed accuracy (original): 0.939873417721519
Partial accuracy: 0.8924050632911392
Test Hamming loss: 0.043670886075949364

Ranking average precision:  0.9898206751054851
Average precision:  0.9694695433285533
Test precision (weighted): 0.9393440179395858
Test recall (weighted): 0.9136842105263158
Test f1 score (weighted): 0.9242453864492874

Test precision (macro): 0.9445071010860484
Test recall (macro): 0.8670346151482053
Test f1 score (macro): 0.8991986318204764

Test precision (micro): 0.9393939393939394
Test recall (micro): 0.9136842105263158
Test f1 score (micro): 0.9263607257203841

Ordered subset acc: 0.7246835443037974
Order 1 accuracy: 0.8829113924050633
Order 2 accuracy: 0.7278481012658228
Order 3 accuracy: 0.7246835443037974

Secondary accuracy:  0.740506329113924


# CRIS

In [115]:
# Load the CRIS dataset. From here on the shared names (data, X, y_*, X_train, ...)
# refer to CRIS, replacing the BRCA objects created earlier in the notebook.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
label_values = ['CRIS.A', 'CRIS.B', 'CRIS.C', 'CRIS.D', 'CRIS.E']
with open('../data/tcga_cris_raw_24356_620samples.pkl', 'rb') as file:
    data = pickle.load(file) 
# Feature matrix: everything except the patient id, the subtype column and the per-class columns
X = data.drop(columns=['Patient ID', 'Subtype-from Parker centroids'] + label_values, inplace=False)
y_pam50 = data['Subtype-from Parker centroids']
y_orig = data['Subtype-from Parker centroids'] # this is not important

# NOTE(review): unlike the BRCA preparation cell, remove_extreme(X, ...) is not applied
# here before splitting - confirm this matches the CRIS training workflow.

# Take the per-class columns on the whole dataset
# (presumably correlations with each class centroid - confirm against the dataset).
y_corr = data[label_values]
y_corr_non_neg = discard_negative_correlations(y_corr)

# M-cut strategy to assign labels on whole dataset
# (once with non_neg_values=True, once with non_neg_values=False)
y_mcut_labels, _ = m_cut_strategy_class_assignment(y_corr, non_neg_values=True)
y_mcut_labels_neg, _ = m_cut_strategy_class_assignment(y_corr, non_neg_values=False)

# Compute labels from two strategies (M-cut and 5th percentile)
y_mcut_5perc_labels, _ = create_mcut_nth_percentile_labels(
    m_cut_labels=y_mcut_labels,
    correlations=y_corr_non_neg,
    y=y_pam50,
    keep_primary=False,
    N=5
)

# Split features and every label variant in one call so they all share the same rows.
# 70/30 split, stratified on y_pam50, fixed random_state so the split is reproducible.
X_train, X_test, \
y_train_pam50, y_test_pam50, \
y_train_mcut, y_test_mcut, \
y_train_orig, y_test_orig, \
y_train_5perc, y_test_5perc, \
y_train_corr, y_test_corr = \
    train_test_split(X, y_pam50, y_mcut_labels, y_orig, 
                    y_mcut_5perc_labels, y_corr_non_neg, test_size=0.3, random_state=1, stratify=y_pam50)

# Data standardization | normalization:
# scale every sample (row) so its values sum to 1e6, then apply log_transform.
X_train = X_train.divide(X_train.sum(axis=1), axis=0) * 1e6
X_test = X_test.divide(X_test.sum(axis=1), axis=0) * 1e6
scaler = FunctionTransformer(log_transform)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): if log_transform returns a NumPy array, the sample index is replaced
# by a RangeIndex here - confirm.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

# Load selected features
with open('../data/cris/new2_without_corr_removed_feat_select_gt_40_perc_occur.pkl', 'rb') as file:
    selected_feat = pickle.load(file)

# Keep only the selected feature columns
X_train_scaled_selected = X_train_scaled[list(selected_feat)]
X_test_scaled_selected = X_test_scaled[list(selected_feat)]

# One-hot encoding of original and PAM50 labels
# NOTE(review): train and test are encoded separately, so their columns can differ
# if a class is missing from one of the splits - confirm.
y_train_orig = pd.get_dummies(y_train_orig)
y_test_orig = pd.get_dummies(y_test_orig)
y_train_pam50 = pd.get_dummies(y_train_pam50)
y_test_pam50 = pd.get_dummies(y_test_pam50)


In [116]:
# NOTE: the variable keeps its historical `brca` name but points at the CRIS results.
path_brca_sl = '../final_results/CRIS/single-label/01-14-52/'

# Class index -> CRIS class name. The PAM50 `mapping` (Basal, Her2, ...) must not be
# used here: it would label CRIS predictions with breast-cancer subtype names.
# Assumes the CRIS models encode classes 0..4 in this (alphabetical) order -
# TODO confirm against the label encoding used in the CRIS training workflow.
cris_mapping = {0: 'CRIS.A', 1: 'CRIS.B', 2: 'CRIS.C', 3: 'CRIS.D', 4: 'CRIS.E'}


def _load_cris_sl_model(file_name):
    """Unpickle one single-label CRIS model stored under ``path_brca_sl``."""
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(os.path.join(path_brca_sl, file_name), 'rb') as f:
        return pickle.load(f)


def _predict_cris_class(model):
    """Predict on the selected test features and decode class ids via ``cris_mapping``.

    The result is a Series that carries the index of ``y_test_5perc``.
    """
    class_ids = model.predict(X_test_scaled_selected)
    return pd.Series(class_ids, index=y_test_5perc.index).map(cris_mapping)


# Loading the best models for each approach in single-label
cris_model_xgb = _load_cris_sl_model('bestmodel_XGBoost_run_01-09-2023_03:49:46.pkl')
cris_model_lr = _load_cris_sl_model('bestmodel_LogisticRegression_run_01-09-2023_01:15:05.pkl')
cris_model_rf = _load_cris_sl_model('bestmodel_RandomForest_run_01-09-2023_01:42:48.pkl')
cris_model_svm = _load_cris_sl_model('bestmodel_SVC_run_01-09-2023_01:59:53.pkl')
cris_model_adaboost = _load_cris_sl_model('bestmodel_AdaBoost_run_01-09-2023_03:41:42.pkl')
cris_model_dectree = _load_cris_sl_model('bestmodel_DecisionTree_run_01-09-2023_01:37:16.pkl')
cris_model_knn = _load_cris_sl_model('bestmodel_KNN_run_01-09-2023_01:36:57.pkl')
cris_model_lgbm = _load_cris_sl_model('bestmodel_LightGBM_run_01-09-2023_02:21:39.pkl')

# Compute predictions (CRIS class names, indexed like the test labels)
xgb_preds = _predict_cris_class(cris_model_xgb)
lr_preds = _predict_cris_class(cris_model_lr)
svm_preds = _predict_cris_class(cris_model_svm)
rf_preds = _predict_cris_class(cris_model_rf)
# ----------
ada_preds = _predict_cris_class(cris_model_adaboost)
knn_preds = _predict_cris_class(cris_model_knn)
dectree_preds = _predict_cris_class(cris_model_dectree)
lgbm_preds = _predict_cris_class(cris_model_lgbm)

In [117]:
# NOTE: the `brca`-prefixed names are kept because later cells use them, but from
# here on they refer to the CRIS results and models (the BRCA ones are overwritten).
path_brca_ml = '../final_results/CRIS/multi-label'
path_brca_ml_pt = os.path.join(path_brca_ml, '01-09-2023_05:58:37')


def _load_cris_ml_model(run_dir, model_name):
    """Unpickle ``<model_name>_mcut_5perc_bestmodel.pkl`` from ``run_dir``."""
    # NOTE: pickle.load can execute arbitrary code - only load trusted model files.
    with open(os.path.join(run_dir, model_name + '_mcut_5perc_bestmodel.pkl'), 'rb') as file:
        return pickle.load(file)


# Logistic Regression - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_lr_br = _load_cris_ml_model(path_brca_ml_pt, 'BR_LRegression')
brca_ml_model_lr_cc = _load_cris_ml_model(path_brca_ml_pt, 'CC_LRegression')
brca_ml_model_lr_lp = _load_cris_ml_model(path_brca_ml_pt, 'LP_LRegression')

# XGBoost - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_xgb_br = _load_cris_ml_model(path_brca_ml_pt, 'BR_XGBoost')
brca_ml_model_xgb_cc = _load_cris_ml_model(path_brca_ml_pt, 'CC_XGBoost')
brca_ml_model_xgb_lp = _load_cris_ml_model(path_brca_ml_pt, 'LP_XGBoost')

# Random Forest - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_rf_br = _load_cris_ml_model(path_brca_ml_pt, 'BR_RForest')
brca_ml_model_rf_cc = _load_cris_ml_model(path_brca_ml_pt, 'CC_RForest')
brca_ml_model_rf_lp = _load_cris_ml_model(path_brca_ml_pt, 'LP_RForest')

# SVC - Binary Relevance / Classifier Chain / Label Powerset
brca_ml_model_svm_br = _load_cris_ml_model(path_brca_ml_pt, 'BR_SVC')
brca_ml_model_svm_cc = _load_cris_ml_model(path_brca_ml_pt, 'CC_SVC')
brca_ml_model_svm_lp = _load_cris_ml_model(path_brca_ml_pt, 'LP_SVC')

In [118]:
############################ Algorithm Adaptation and Ensemble #############################
path_brca_ml_aa = os.path.join(path_brca_ml, '01-09-2023_12:14:22')


def _load_aa_model(pickle_name):
    """Unpickle and return the object saved as `pickle_name` inside `path_brca_ml_aa`."""
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(os.path.join(path_brca_ml_aa, pickle_name), 'rb') as handle:
        return pickle.load(handle)


# Algorithm-adaptation models
brca_ml_model_mlknn = _load_aa_model('MLkNN_mcut_5perc_bestmodel.pkl')
brca_ml_model_mlaram = _load_aa_model('MLARAM_mcut_5perc_bestmodel.pkl')

# "EnsembleCC" models, one per base classifier
brca_ml_model_ecc_xgb = _load_aa_model('EnsembleCC_XGBoost_mcut_5perc_bestmodel.pkl')
brca_ml_model_ecc_rf = _load_aa_model('EnsembleCC_RForest_mcut_5perc_bestmodel.pkl')
brca_ml_model_ecc_lr = _load_aa_model('EnsembleCC_LRegression_mcut_5perc_bestmodel.pkl')
brca_ml_model_ecc_svm = _load_aa_model('EnsembleCC_SVC_mcut_5perc_bestmodel.pkl')

# "EnsembleRakel" models, one per base classifier
brca_ml_model_er_xgb = _load_aa_model('EnsembleRakel_XGBoost_mcut_5perc_bestmodel.pkl')
brca_ml_model_er_rf = _load_aa_model('EnsembleRakel_RForest_mcut_5perc_bestmodel.pkl')
brca_ml_model_er_lr = _load_aa_model('EnsembleRakel_LRegression_mcut_5perc_bestmodel.pkl')
brca_ml_model_er_svm = _load_aa_model('EnsembleRakel_SVC_mcut_5perc_bestmodel.pkl')

In [123]:
# Show the hyperparameters of the loaded Binary Relevance logistic-regression model.
print(brca_ml_model_lr_br.get_params())

{'classifier': LogisticRegression(C=5, random_state=4, solver='sag', tol=0.01), 'classifier__C': 5, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'auto', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': 4, 'classifier__solver': 'sag', 'classifier__tol': 0.01, 'classifier__verbose': 0, 'classifier__warm_start': False, 'require_dense': [True, True]}


In [14]:
from sklearn.preprocessing import normalize

# Define a mapping dictionary
mapping = {0: 'Basal', 1: 'Her2', 2: 'LumA', 3: 'LumB', 4: 'Normal'}


def _as_dense(result):
    """Return `result` as a dense ndarray if it exposes `toarray()`, else unchanged.

    Using `toarray()` everywhere (instead of a mix of `toarray()` and
    `todense()`) keeps all intermediate values plain ndarrays rather than
    `np.matrix` objects.
    """
    return result.toarray() if hasattr(result, 'toarray') else result


def _label_frame(values, dtype=None):
    """Wrap `values` in a DataFrame sharing the index and columns of `y_test_5perc`."""
    return pd.DataFrame(values, columns=y_test_5perc.columns,
                        index=y_test_5perc.index, dtype=dtype)


def _predict_frames(model, features, per_label_proba=False):
    """Return `(predictions, probabilities)` DataFrames for one multi-label model.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `predict_proba`.
    features : test features handed to the model as-is.
    per_label_proba : set to True when `predict_proba` yields one entry per
        label whose column 1 is taken as that label's score (this mirrors the
        indexing the ensemble classifier-chain models need); the scores are
        then L1-normalised row-wise.

    Returns
    -------
    tuple of (int-typed prediction DataFrame, probability DataFrame), both
    aligned with `y_test_5perc`.
    """
    hard = _label_frame(_as_dense(model.predict(features)), dtype='int')
    raw_proba = model.predict_proba(features)
    if per_label_proba:
        # Derive the label count from the target frame instead of hard-coding 5.
        n_labels = len(y_test_5perc.columns)
        positive = np.array([raw_proba[i][:, 1] for i in range(n_labels)]).transpose()
        soft = normalize(positive, axis=1, norm='l1')
    else:
        soft = _as_dense(raw_proba)
    return hard, _label_frame(soft)


############################### XGBoost ###############################
xgb_br_preds, xgb_br_prob_preds = _predict_frames(brca_ml_model_xgb_br, X_test_scaled_selected)
xgb_cc_preds, xgb_cc_prob_preds = _predict_frames(brca_ml_model_xgb_cc, X_test_scaled_selected)

# ############################### Logistic Regression ###############################
lr_br_preds, lr_br_prob_preds = _predict_frames(brca_ml_model_lr_br, X_test_scaled_selected)
lr_cc_preds, lr_cc_prob_preds = _predict_frames(brca_ml_model_lr_cc, X_test_scaled_selected)

# ############################### SVM ###############################
svm_lp_preds, svm_lp_prob_preds = _predict_frames(brca_ml_model_svm_lp, X_test_scaled_selected)

# ############################### Random Forest ###############################
rf_lp_preds, rf_lp_prob_preds = _predict_frames(brca_ml_model_rf_lp, X_test_scaled_selected)

# NOT OPTIMAL
xgb_lp_preds, xgb_lp_prob_preds = _predict_frames(brca_ml_model_xgb_lp, X_test_scaled_selected)
lr_lp_preds, lr_lp_prob_preds = _predict_frames(brca_ml_model_lr_lp, X_test_scaled_selected)
svm_br_preds, svm_br_prob_preds = _predict_frames(brca_ml_model_svm_br, X_test_scaled_selected)
svm_cc_preds, svm_cc_prob_preds = _predict_frames(brca_ml_model_svm_cc, X_test_scaled_selected)
rf_br_preds, rf_br_prob_preds = _predict_frames(brca_ml_model_rf_br, X_test_scaled_selected)
rf_cc_preds, rf_cc_prob_preds = _predict_frames(brca_ml_model_rf_cc, X_test_scaled_selected)

############################ Algorithm Adaptation #############################
mlknn_preds, mlknn_prob_preds = _predict_frames(brca_ml_model_mlknn, X_test_scaled_selected)
# MLARAM is fed the raw value array rather than the DataFrame.
mlaram_preds, mlaram_prob_preds = _predict_frames(brca_ml_model_mlaram, X_test_scaled_selected.values)

####################### Ensemble #########################
ecc_xgb_preds, ecc_xgb_prob_preds = _predict_frames(
    brca_ml_model_ecc_xgb, X_test_scaled_selected, per_label_proba=True)
er_xgb_preds, er_xgb_prob_preds = _predict_frames(brca_ml_model_er_xgb, X_test_scaled_selected)

ecc_rf_preds, ecc_rf_prob_preds = _predict_frames(
    brca_ml_model_ecc_rf, X_test_scaled_selected, per_label_proba=True)
er_rf_preds, er_rf_prob_preds = _predict_frames(brca_ml_model_er_rf, X_test_scaled_selected)

ecc_lr_preds, ecc_lr_prob_preds = _predict_frames(
    brca_ml_model_ecc_lr, X_test_scaled_selected, per_label_proba=True)
er_lr_preds, er_lr_prob_preds = _predict_frames(brca_ml_model_er_lr, X_test_scaled_selected)

ecc_svm_preds, ecc_svm_prob_preds = _predict_frames(
    brca_ml_model_ecc_svm, X_test_scaled_selected, per_label_proba=True)
er_svm_preds, er_svm_prob_preds = _predict_frames(brca_ml_model_er_svm, X_test_scaled_selected)

In [16]:
# Report the test-set metrics for the XGBoost Binary Relevance model,
# requesting the per-order accuracies for k = 1, 2 and 3.
print_all_scores(
    y_test_5perc,
    xgb_br_preds,
    xgb_br_prob_preds,
    y_test_orig,
    y_test_pam50,
    y_test_corr,
    txt_file_name=None,
    k=[1, 2, 3],
)



Test accuracy: 0.6397849462365591
Test relaxed accuracy (PAM50): 0.7849462365591398
Test relaxed accuracy (original): 0.7849462365591398
Partial accuracy: 0.7123655913978495
Test Hamming loss: 0.08924731182795699

Ranking average precision:  0.9310035842293908
Average precision:  0.8891493651745336
Test precision (weighted): 0.8712156003828593
Test recall (weighted): 0.7400881057268722
Test f1 score (weighted): 0.7981709343616163

Test precision (macro): 0.8628247141150368
Test recall (macro): 0.7171137886655129
Test f1 score (macro): 0.7811833324089993

Test precision (micro): 0.875
Test recall (micro): 0.7400881057268722
Test f1 score (micro): 0.8019093078758949

Ordered subset acc: 0.6129032258064516
Order 1 accuracy: 0.7634408602150538
Order 2 accuracy: 0.6129032258064516
Order 3 accuracy: 0.6129032258064516

Secondary accuracy:  0.7634408602150538
