# 0. Imports and Data Load

In [1]:
import plotly.graph_objects as go
import pandas as pd
from plot_functions import paper_boxplot_comparison_single_metric
from ast import literal_eval
import numpy as np

In [2]:
disgenet_proteins_indexes_df = pd.read_csv("../../data/processed/disgenet_protein_indexes_fp.csv", sep=',', header=0, index_col=0)
disgenet_proteins_indexes_conservative_df = pd.read_csv("../../data/processed/disgenet_protein_indexes_conservative_fp.csv", sep=',', header=0)
reactome_proteins_indexes_df = pd.read_csv("../../data/processed/reactome_protein_indexes_fp.csv", sep=',', header=0)

disgenet_proteins_indexes_df['fp_proteins'] = disgenet_proteins_indexes_df['fp_proteins'].apply(literal_eval)
disgenet_proteins_indexes_conservative_df['fp_proteins'] = disgenet_proteins_indexes_conservative_df['fp_proteins'].apply(literal_eval)
reactome_proteins_indexes_df['fp_proteins'] = reactome_proteins_indexes_df['fp_proteins'].apply(literal_eval)

disgenet_proteins_indexes_df['proteins_ids'] = disgenet_proteins_indexes_df['proteins_ids'].apply(literal_eval)
disgenet_proteins_indexes_conservative_df['proteins_ids'] = disgenet_proteins_indexes_conservative_df['proteins_ids'].apply(literal_eval)
reactome_proteins_indexes_df['proteins_ids'] = reactome_proteins_indexes_df['proteins_ids'].apply(literal_eval)

In [3]:
reactome_proteins_indexes_df['fp_proteins'] = reactome_proteins_indexes_df.apply(lambda row: list(set(row['fp_proteins'])^set(row['proteins_ids'])), axis=1)
disgenet_proteins_indexes_df['fp_proteins'] = disgenet_proteins_indexes_df.apply(lambda row: list(set(row['fp_proteins'])^set(row['proteins_ids'])), axis=1)
disgenet_proteins_indexes_conservative_df['fp_proteins'] = disgenet_proteins_indexes_conservative_df.apply(lambda row: list(set(row['fp_proteins'])^set(row['proteins_ids'])), axis=1)

##### Threshold Classifier

In [4]:
process_hypergeometric_threshold = pd.read_csv('../../models/false_positive/threshold/process_hypergeometric.csv')
process_closeness_threshold = pd.read_csv('../../models/false_positive/threshold/process_closeness.csv')
process_betweenness_threshold = pd.read_csv('../../models/false_positive/threshold/process_betweenness.csv')
process_rwr_threshold = pd.read_csv('../../models/false_positive/threshold/process_rwr.csv')
process_fraction_betweenness_threshold = pd.read_csv('../../models/false_positive/threshold/process_fraction_betweenness.csv')

disease_hypergeometric_threshold = pd.read_csv('../../models/false_positive/threshold/disease_hypergeometric.csv')
disease_closeness_threshold = pd.read_csv('../../models/false_positive/threshold/disease_closeness.csv')
disease_betweenness_threshold = pd.read_csv('../../models/false_positive/threshold/disease_betweenness.csv')
disease_rwr_threshold = pd.read_csv('../../models/false_positive/threshold/disease_rwr.csv')
disease_fraction_betweenness_threshold = pd.read_csv('../../models/false_positive/threshold/disease_fraction_betweenness.csv')

disease_hypergeometric_conservative_threshold = pd.read_csv('../../models/false_positive/threshold/disease_hypergeometric_conservative.csv')
disease_closeness_conservative_threshold = pd.read_csv('../../models/false_positive/threshold/disease_closeness_conservative.csv')
disease_betweenness_conservative_threshold = pd.read_csv('../../models/false_positive/threshold/disease_betweenness_conservative.csv')
disease_rwr_conservative_threshold = pd.read_csv('../../models/false_positive/threshold/disease_rwr_conservative.csv')
disease_fraction_betweenness_conservative_threshold = pd.read_csv('../../models/false_positive/threshold/disease_fraction_betweenness_conservative.csv')

##### Logistic Regression Classifier

In [5]:
process_hypergeometric = pd.read_csv('../../models/false_positive/probability/process_hypergeometric_lgr_proba.csv')
process_closeness = pd.read_csv('../../models/false_positive/probability/process_closeness_lgr_proba.csv')
process_betweenness = pd.read_csv('../../models/false_positive/probability/process_betweenness_lgr_proba.csv')
process_fraction_betweenness = pd.read_csv('../../models/false_positive/probability/process_fraction_betweenness_lgr_proba.csv')
process_rwr = pd.read_csv('../../models/false_positive/probability/process_rwr_lgr_proba.csv')

disease_hypergeometric = pd.read_csv('../../models/false_positive/probability/disease_hypergeometric_lgr_proba.csv')
disease_closeness = pd.read_csv('../../models/false_positive/probability/disease_closeness_lgr_proba.csv')
disease_betweenness = pd.read_csv('../../models/false_positive/probability/disease_betweenness_lgr_proba.csv')
disease_fraction_betweenness = pd.read_csv('../../models/false_positive/probability/disease_fraction_betweenness_lgr_proba.csv')
disease_rwr = pd.read_csv('../../models/false_positive/probability/disease_rwr_lgr_proba.csv')

disease_hypergeometric_conservative = pd.read_csv('../../models/false_positive/probability/disease_hypergeometric_lgr_proba_conservative.csv')
disease_closeness_conservative = pd.read_csv('../../models/false_positive/probability/disease_closeness_lgr_proba_conservative.csv')
disease_betweenness_conservative = pd.read_csv('../../models/false_positive/probability/disease_betweenness_lgr_proba_conservative.csv')
disease_fraction_betweenness_conservative = pd.read_csv('../../models/false_positive/probability/disease_fraction_betweenness_lgr_proba_conservative.csv')
disease_rwr_conservative = pd.read_csv('../../models/false_positive/probability/disease_rwr_lgr_proba_conservative.csv')

In [6]:
columns = ['tp_proteins', 'fp_proteins', 'fn_proteins']
threshold_clfs = [process_hypergeometric_threshold, process_closeness_threshold, process_betweenness_threshold, process_fraction_betweenness_threshold, process_rwr_threshold,
        disease_hypergeometric_threshold, disease_closeness_threshold, disease_betweenness_threshold, disease_fraction_betweenness_threshold, disease_rwr_threshold,
        disease_hypergeometric_conservative_threshold, disease_closeness_conservative_threshold, disease_betweenness_conservative_threshold, disease_fraction_betweenness_conservative_threshold, disease_rwr_conservative_threshold]
GAPMINE_clfs = [process_hypergeometric, process_closeness, process_betweenness, process_fraction_betweenness, process_rwr,
        disease_hypergeometric, disease_closeness, disease_betweenness, disease_fraction_betweenness, disease_rwr,
        disease_hypergeometric_conservative, disease_closeness_conservative, disease_betweenness_conservative, disease_fraction_betweenness_conservative, disease_rwr_conservative]
for clf_dfs in [threshold_clfs, GAPMINE_clfs]:
    for clf in clf_dfs:
        for column in columns:
            clf[column] = clf[column].apply(literal_eval)

# 1. Complete Network Results

In [7]:
prox_metrics = {
    'Threshold':{'Hypergeometric Test': {'P':process_hypergeometric_threshold, 'SCA':disease_hypergeometric_threshold, 'Cons.':disease_hypergeometric_conservative_threshold},
                'Closeness': {'P':process_closeness_threshold, 'SCA':disease_closeness_threshold, 'Cons.':disease_closeness_conservative_threshold},
                'Betweenness': {'P':process_betweenness_threshold, 'SCA':disease_betweenness_threshold, 'Cons.':disease_betweenness_conservative_threshold},
                'Fraction Betweenness': {'P':process_fraction_betweenness_threshold, 'SCA':disease_fraction_betweenness_threshold, 'Cons.':disease_fraction_betweenness_conservative_threshold},
                'Random Walk w/ Restart': {'P':process_rwr_threshold, 'SCA':disease_rwr_threshold, 'Cons.':disease_rwr_conservative_threshold}},
    'Logistic':{'Hypergeometric Test': {'P':process_hypergeometric, 'SCA':disease_hypergeometric, 'Cons.':disease_hypergeometric_conservative},
               'Closeness': {'P':process_closeness, 'SCA':disease_closeness, 'Cons.':disease_closeness_conservative},
               'Betweenness': {'P':process_betweenness, 'SCA':disease_betweenness, 'Cons.':disease_betweenness_conservative},
               'Fraction Betweenness': {'P':process_fraction_betweenness, 'SCA':disease_fraction_betweenness, 'Cons.':disease_fraction_betweenness_conservative}, 
               'Random Walk w/ Restart': {'P':process_rwr, 'SCA':disease_rwr, 'Cons.':disease_rwr_conservative}}}

process_protein_ids = {'P':reactome_proteins_indexes_df,
                'SCA':disgenet_proteins_indexes_df,
                'Cons.':disgenet_proteins_indexes_conservative_df}

performance_dict = {'Clf':[], 'Metric':[], 'Module':[], 'Noise Proteins':[], 'Nº Noise Proteins':[], 'False Positives':[], 'True Negatives':[], 'FP len':[], 'TN len':[], 'FP Ratio':[], 'TN Ratio':[]}


for clf, metric_dict in prox_metrics.items():
    for metric, method_dict in metric_dict.items():
        for method, df in method_dict.items():
            module_protein_ids = process_protein_ids[method]
            for i, row in df.iterrows():
                protein_ids = module_protein_ids.iloc[i,4]
                negatives = list(set(protein_ids)&set(row['tp_proteins'])) + list(set(protein_ids)&set(row['fn_proteins']))
                fp_captured = list(set(row['tp_proteins'])&set(negatives))
                tn_captured = list(set(row['fn_proteins'])&set(negatives))
                try:
                    fp_as_tp = (len(fp_captured)/len(negatives))*100
                    fp_as_fn = (len(tn_captured)/len(negatives))*100
                except ZeroDivisionError:
                    fp_as_fn = 0
                    fp_as_fn = 0
                performance_dict['Clf'].append(clf)
                performance_dict['Metric'].append(metric)
                performance_dict['Module'].append(method)
                performance_dict['Noise Proteins'].append(negatives)
                performance_dict['Nº Noise Proteins'].append(len(negatives))
                performance_dict['False Positives'].append(fp_captured)
                performance_dict['True Negatives'].append(tn_captured)
                performance_dict['FP len'].append(len(fp_captured))
                performance_dict['TN len'].append(len(tn_captured))
                try:
                    performance_dict['FP Ratio'].append(len(fp_captured)/len(negatives))
                    performance_dict['TN Ratio'].append(len(tn_captured)/len(negatives))
                except ZeroDivisionError:
                    performance_dict['FP Ratio'].append(0)
                    performance_dict['TN Ratio'].append(0)
performance_df = pd.DataFrame(performance_dict)


In [8]:
def clf_correction(clf, fp_performance):
    tp = clf['tp'] - fp_performance['FP len'][clf.name]
    fp = clf['fp'] + fp_performance['FP len'][clf.name]
    fn = clf['fn'] - fp_performance['TN len'][clf.name]
    tn = clf['tn'] + fp_performance['TN len'][clf.name]
    if tp + fp > 0:
        precision = tp/(tp+fp)
    else:
        precision = 0
    recall = tp/(tp+fn)
    f_measure = 2*((precision*recall)/(precision+recall))
    if np.isnan(f_measure):
        f_measure=0
    return tp, fp, fn, tn, precision, recall, f_measure

In [9]:
process_hypergeometric[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_hypergeometric.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='P')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
process_closeness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_closeness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='P')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
process_betweenness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_betweenness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='P')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
process_fraction_betweenness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_fraction_betweenness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='P')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
process_rwr[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_rwr.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='P')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')

disease_hypergeometric[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_hypergeometric.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
disease_closeness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_closeness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
disease_betweenness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_betweenness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
disease_fraction_betweenness[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_fraction_betweenness.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
disease_rwr[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_rwr.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')

disease_hypergeometric_conservative[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_hypergeometric_conservative.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
disease_closeness_conservative[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_closeness_conservative.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
disease_betweenness_conservative[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_betweenness_conservative.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
disease_fraction_betweenness_conservative[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_fraction_betweenness_conservative.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
disease_rwr_conservative[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_rwr_conservative.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Logistic')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')


invalid value encountered in double_scalars



In [10]:
process_hypergeometric_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_hypergeometric_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='P')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
process_closeness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_closeness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='P')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
process_betweenness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_betweenness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='P')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
process_fraction_betweenness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_fraction_betweenness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='P')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
process_rwr_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = process_rwr_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='P')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')

disease_hypergeometric_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_hypergeometric_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
disease_closeness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_closeness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
disease_betweenness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_betweenness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
disease_fraction_betweenness_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_fraction_betweenness_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
disease_rwr_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_rwr_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='SCA')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')

disease_hypergeometric_conservative_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_hypergeometric_conservative_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Hypergeometric Test')].reset_index()), axis=1, result_type='expand')
disease_closeness_conservative_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_closeness_conservative_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Closeness')].reset_index()), axis=1, result_type='expand')
disease_betweenness_conservative_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_betweenness_conservative_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Betweenness')].reset_index()), axis=1, result_type='expand')
disease_fraction_betweenness_conservative_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_fraction_betweenness_conservative_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Fraction Betweenness')].reset_index()), axis=1, result_type='expand')
disease_rwr_conservative_threshold[['tp', 'fp', 'fn', 'tn', 'precision', 'recall', 'f_measure']] = disease_rwr_conservative_threshold.apply(lambda row: clf_correction(row, performance_df[(performance_df['Clf']=='Threshold')&(performance_df['Module']=='Cons.')&(performance_df['Metric']=='Random Walk w/ Restart')].reset_index()), axis=1, result_type='expand')


invalid value encountered in double_scalars



In [11]:
prox_metrics = {
    'Threshold':{'HT': {'P':process_hypergeometric_threshold, 'SCA':disease_hypergeometric_threshold, 'Cons.':disease_hypergeometric_conservative_threshold},
                'C': {'P':process_closeness_threshold, 'SCA':disease_closeness_threshold, 'Cons.':disease_closeness_conservative_threshold},
                'B': {'P':process_betweenness_threshold, 'SCA':disease_betweenness_threshold, 'Cons.':disease_betweenness_conservative_threshold},
                'FB': {'P':process_fraction_betweenness_threshold, 'SCA':disease_fraction_betweenness_threshold, 'Cons.':disease_fraction_betweenness_conservative_threshold},
                'RWR': {'P':process_rwr_threshold, 'SCA':disease_rwr_threshold, 'Cons.':disease_rwr_conservative_threshold}},
    'GAP-MINE':{'HT': {'P':process_hypergeometric, 'SCA':disease_hypergeometric, 'Cons.':disease_hypergeometric_conservative},
               'C': {'P':process_closeness, 'SCA':disease_closeness, 'Cons.':disease_closeness_conservative},
               'B': {'P':process_betweenness, 'SCA':disease_betweenness, 'Cons.':disease_betweenness_conservative},
               'FB': {'P':process_fraction_betweenness, 'SCA':disease_fraction_betweenness, 'Cons.':disease_fraction_betweenness_conservative}, 
               'RWR': {'P':process_rwr, 'SCA':disease_rwr, 'Cons.':disease_rwr_conservative}}}

clf_dict = {'clf': [], 'metric': [], 'method': [], 'quality_metric': [], 'score': []}

for clf, metric_dict in prox_metrics.items():
    for metric, method_dict in metric_dict.items():
        for method, df in method_dict.items():
            for i, row in df.iterrows():
                for quality_metric, score in dict(row[4:11]).items():
                    clf_dict['clf'].append(clf)
                    clf_dict['metric'].append(metric)
                    clf_dict['method'].append(method)
                    clf_dict['quality_metric'].append(quality_metric)
                    clf_dict['score'].append(score)
df_plots = pd.DataFrame(clf_dict)

In [12]:
df = df_plots[(df_plots['metric'] == 'RWR') & (df_plots['quality_metric'].isin(['f_measure', 'precision', 'recall']))]
df.replace('f_measure', 'F', inplace=True)
df.replace('P', 'Process', inplace=True)
df.replace('precision', 'P', inplace=True)
df.replace('recall', 'R', inplace=True)
df.replace('Cons.', 'Conservative', inplace=True)
paper_boxplot_comparison_single_metric(df, 'lgr_threshold_comparison_rwr_fp', colors=['#59C3C3', '#FDA96D'], showlegend=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
from scipy.stats import wilcoxon

def hypothesis_testing(pairs, alternative, simple=False):
    if not simple:
        hypothesis_dict = {'f_measure':[], 'precision':[], 'recall':[], 'mcc':[], 'precision@15':[], 'precision@20':[], 'precision@n_positives':[]}
    if simple:
        hypothesis_dict = {'f_measure':[], 'precision':[], 'recall':[]}
    for keys, values in pairs.items():
        for column in hypothesis_dict.keys():
            U1, p = wilcoxon(values[0][column], values[1][column], alternative=alternative)
            hypothesis_dict[column].append(p)
    hypothesis_df = pd.DataFrame(hypothesis_dict, index=pairs.keys())
    return hypothesis_df.round(5)

In [14]:
pairs={'Hypergeometric':[process_hypergeometric, process_hypergeometric_threshold],
        'Closeness':[process_closeness, process_closeness_threshold],
        'Betweenness':[process_betweenness, process_betweenness_threshold],
        'Fraction Betweenness':[process_fraction_betweenness, process_fraction_betweenness_threshold],
        'RWR':[process_rwr, process_rwr_threshold]}

print('Logistic vs. Threshold (Process)')
hypothesis_testing(pairs, 'greater')

Logistic vs. Threshold (Process)


Unnamed: 0,f_measure,precision,recall,mcc,precision@15,precision@20,precision@n_positives
Hypergeometric,0.99671,0.00159,1.0,0.98913,0.91237,0.99416,0.99875
Closeness,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Betweenness,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Fraction Betweenness,1.0,0.97661,1.0,1.0,1.0,1.0,1.0
RWR,0.0,0.0,0.97829,0.0,0.0,0.0,0.0


In [15]:
pairs={'Hypergeometric':[disease_hypergeometric, disease_hypergeometric_threshold],
        'Closeness':[disease_closeness, disease_closeness_threshold],
        'Betweenness':[disease_betweenness, disease_betweenness_threshold],
        'Fraction Betweenness':[disease_fraction_betweenness, disease_fraction_betweenness_threshold],
        'RWR':[disease_rwr, disease_rwr_threshold]}

print('Logistic vs. Threshold (Disease SCA)')
hypothesis_testing(pairs, 'greater')

Logistic vs. Threshold (Disease SCA)


Unnamed: 0,f_measure,precision,recall,mcc,precision@15,precision@20,precision@n_positives
Hypergeometric,0.82153,0.5207,0.95278,0.74839,0.46279,0.714,0.50025
Closeness,0.74841,0.86013,0.46289,0.71377,0.74541,0.61179,0.72531
Betweenness,0.77024,0.48705,0.96622,0.396,0.75771,0.5894,0.41141
Fraction Betweenness,0.97015,0.44725,0.9759,0.90767,0.70919,0.72187,0.93501
RWR,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
pairs={'Hypergeometric':[disease_hypergeometric_conservative, disease_hypergeometric_conservative_threshold],
        'Closeness':[disease_closeness_conservative, disease_closeness_conservative_threshold],
        'Betweenness':[disease_betweenness_conservative, disease_betweenness_conservative_threshold],
        'Fraction Betweenness':[disease_fraction_betweenness_conservative, disease_fraction_betweenness_conservative_threshold],
        'RWR':[disease_rwr_conservative, disease_rwr_conservative_threshold]}

print('Logistic vs. Threshold (Disease Conservative)')
hypothesis_testing(pairs, 'greater')

Logistic vs. Threshold (Disease Conservative)


Unnamed: 0,f_measure,precision,recall,mcc,precision@15,precision@20,precision@n_positives
Hypergeometric,0.00253,0.00429,0.39832,0.00336,0.00178,0.0014,0.00316
Closeness,0.01596,0.07885,0.00291,0.03068,0.03249,0.00044,0.00058
Betweenness,1.0,0.99982,0.99997,0.90512,0.55809,0.80249,0.86536
Fraction Betweenness,1.0,1.0,0.11943,1.0,1.0,0.99999,0.99987
RWR,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
fp_df = pd.DataFrame(clf_dict)
fp_df.replace('B', 'Betweenness', inplace=True)
fp_df.replace('C', 'Closeness', inplace=True)
fp_df.replace('FB', 'Fraction Betweenness', inplace=True)
fp_df.replace('HT', 'Hypergeometric Test', inplace=True)
fp_df.replace('RWR', 'Random Walks with Restart', inplace=True)
fp_df.replace('Cons.', 'Disease Conservative', inplace=True)
fp_df.replace('P', 'Process', inplace=True)
fp_df.replace('SCA', 'Disease SCA', inplace=True)

fp_table = pd.DataFrame(columns = ['metric', 'method', 'clf', 'f_measure', 'f_measure_min', 'f_measure_max', 'precision', 'precision_min', 'precision_max', 'recall', 'recall_min', 'recall_max'])
fp_table[['metric', 'method', 'clf']] = fp_df[(fp_df['quality_metric'].isin(['f_measure']))].groupby(['metric', 'method', 'clf']).median().reset_index()[['metric', 'method', 'clf']]

fp_table[['f_measure_min', 'f_measure', 'f_measure_max']] = fp_df[(fp_df['quality_metric'].isin(['f_measure']))].groupby(['metric', 'method', 'clf']).quantile([0.25, 0.5,0.75]).round(3).unstack().values
fp_table[['precision_min', 'precision', 'precision_max']] = fp_df[(fp_df['quality_metric'].isin(['precision']))].groupby(['metric', 'method', 'clf']).quantile([0.25, 0.5,0.75]).round(3).unstack().values
fp_table[['recall_min', 'recall', 'recall_max']] = fp_df[(fp_df['quality_metric'].isin(['recall']))].groupby(['metric', 'method', 'clf']).quantile([0.25, 0.5,0.75]).round(3).unstack().values

fp_table['f_measure'] = fp_table.apply(lambda row: f"{row['f_measure']} ({row['f_measure_min']}-{row['f_measure_max']})", axis=1)
fp_table['precision'] = fp_table.apply(lambda row: f"{row['precision']} ({row['precision_min']}-{row['precision_max']})", axis=1)
fp_table['recall'] = fp_table.apply(lambda row: f"{row['recall']} ({row['recall_min']}-{row['recall_max']})", axis=1)

fp_table[['metric', 'method', 'clf', 'f_measure', 'precision', 'recall']].to_csv('../../reports/fp_table_table.csv', index=False)
fp_table[['metric', 'method', 'clf', 'f_measure', 'precision', 'recall']]

Unnamed: 0,metric,method,clf,f_measure,precision,recall
0,Betweenness,Disease Conservative,GAP-MINE,0.0 (0.0-0.01),0.0 (0.0-0.005),0.0 (0.0-0.059)
1,Betweenness,Disease Conservative,Threshold,0.007 (0.0-0.016),0.004 (0.0-0.009),0.05 (0.0-0.125)
2,Betweenness,Disease SCA,GAP-MINE,0.011 (0.0-0.024),0.005 (0.0-0.014),0.042 (0.0-0.239)
3,Betweenness,Disease SCA,Threshold,0.012 (0.0-0.023),0.006 (0.0-0.013),0.071 (0.0-0.266)
4,Betweenness,Process,GAP-MINE,0.024 (0.0-0.098),0.013 (0.0-0.077),0.083 (0.0-0.231)
5,Betweenness,Process,Threshold,0.006 (0.0-0.009),0.003 (0.0-0.005),1.0 (0.0-1.0)
6,Closeness,Disease Conservative,GAP-MINE,0.036 (0.0-0.08),0.026 (0.0-0.068),0.067 (0.0-0.143)
7,Closeness,Disease Conservative,Threshold,0.033 (0.0-0.077),0.024 (0.0-0.066),0.059 (0.0-0.118)
8,Closeness,Disease SCA,GAP-MINE,0.093 (0.062-0.134),0.088 (0.054-0.131),0.109 (0.062-0.167)
9,Closeness,Disease SCA,Threshold,0.098 (0.071-0.14),0.098 (0.064-0.13),0.116 (0.071-0.158)
