In [1]:
import pandas as pd
from lexas import evaluation
from lexas import prediction
symbols = prediction.symbols

# Preparation

In [13]:
path_to_csv = "./data/experiments_for_xgboost.csv"
train_dic = evaluation.generate_dic_for_eval(
    path_to_csv,
    1930,  #start year
    2018,  #end year
) 

dev_dic = evaluation.generate_dic_for_eval(path_to_csv,2019,2019)
test_dic = evaluation.generate_dic_for_eval(path_to_csv,2020,2023)

#Save the dictionaries
#The disctionaries are saved in ./eval/
evaluation.save_dic(train_dic,dev_dic,test_dic)

798it [00:00, 186226.82it/s]
798it [00:00, 428450.41it/s]
798it [00:00, 445798.43it/s]


# Evaluation

In [9]:
# Loading data
# When using dictionaries created above
# train_dic, dev_dic, test_dic = evaluation.load_dic(mode, input_dir="./eval")

# When using precomputed dictionaries that encompass all experiments from all articles
# These dictionaries were used for evaluation in the manuscript
train_dic, dev_dic, test_dic = evaluation.load_dic(input_dir="./eval_pre_computed")

print("Genes examined after PLK4 in the articles published before 2018")
print(train_dic["PLK4"]) 
print("\nGenes examined after PLK4 in the articles published in 2019")
print(dev_dic["PLK4"]) 
print("\nGenes examined after PLK4 in the articles published after 2020")
print(test_dic["PLK4"]) 

Genes examined after PLK4 in the articles published before 2018
['KIF2C', 'SLC9A3R1', 'TLR4', 'MARK1', 'CUL1', 'DLGAP5', 'MYBL2', 'DSN1', 'CLASP2', 'CENPT', 'CAMSAP3', 'NEK7', 'NIN', 'PXN', 'SKP2', 'TUBGCP6', 'GAPDH', 'TSC1', 'PTGS2', 'MTOR', 'RBM14', 'RAD21', 'MAS1', 'FOXO1', 'TMPRSS13', 'CDK5RAP2', 'CNTROB', 'SAP30', 'CENPW', 'PRC1', 'PAX6', 'KAT2A', 'CDC25C', 'TP53BP1', 'LCP2', 'ACIN1', 'RUNX2', 'CROCC', 'PRPF6', 'HSPB1', 'INCENP', 'NSL1', 'USP28', 'VIM', 'OFD1', 'SNAI2', 'RNF168', 'BUB1', 'CEP120', 'OTX2', 'ESPL1', 'CEP135', 'XRCC5', 'KIFC1', 'PPP1R35', 'PYURF', 'NCAPD2', 'HSPB3', 'CEP290', 'GOLGB1', 'KCNIP3', 'MDM2', 'PCM1', 'LIN28A', 'CEP68', 'BUB3', 'HSP90AA1', 'CCNB1', 'BRN3B', 'TSC2', 'TXN', 'ORC1', 'PCNT', 'RAC1', 'FOXM1', 'H2AX', 'SP1', 'CHEK1', 'CCNB2', 'PALM', 'JMJD7', 'FMNL1', 'SAP30BP', 'CD24', 'PHB', 'NFKB2', 'INS', 'LATS2', 'LATS1', 'CDKN1A', 'PARP1', 'SPICE1', 'GCC2', 'SASS6', 'MYLK', 'MITF', 'CEP63', 'PCNA', 'CEP350', 'CEP57', 'HDAC9', 'GCG', 'FHDC1', 'PTEN', 'AURKA'

In [4]:
# Genes examined after a query gene in the previous data are not considered false or true. 
# They are removed from evaluation.
# prev_dic stores genes examined previously to the answer data set

def get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic):   
    if eval_mode == "dev":
        prev_dic = train_dic.copy()
        answer_dic = dev_dic.copy()
    elif eval_mode == "test":
        prev_dic = {k: train_dic[k] + dev_dic[k] for k in train_dic}
        answer_dic = test_dic.copy()
    else:
        raise ValueError("Evaluation mode should be \"dev\" or \"test\"")
    
    return prev_dic, answer_dic

eval_mode = "test" #dev or test
prev_dic, answer_dic = get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic)

In [5]:
#Calculate AUC
model_name ="xgboost"
genes = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
result_dir = "./result/xgboost"
top_k = 100

import os
import tqdm

evaluation.calculate_auc_for_many_genes(result_dir, model_name, prev_dic, answer_dic, top_k, genes=genes)

100%|██████████| 5/5 [00:00<00:00, 13.11it/s]


Unnamed: 0,Symbol,AUC at 100 for xgboost
0,PLK4,0.585285
1,SASS6,0.663106
2,CEP152,0.644797
3,CEP192,0.701162
4,PCNT,0.558299


# Comparison with other tools 

In [11]:
import tqdm
import os
from lexas import evaluation

model_names =["xgboost","string_rwr","funcoup_rwr","gosemsim","string_raw","funcoup_raw"]
genes = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
top_k = 100
results = []

for model_name in model_names:
    result_dir = f"./result/{model_name}"
    result = evaluation.calculate_auc_for_many_genes(result_dir, model_name, prev_dic, answer_dic, top_k, genes=genes)
    results.append(result)

from functools import reduce
merged_df = reduce(lambda left, right: pd.merge(left, right, on='Symbol', how='inner'), results)
merged_df 

100%|██████████| 5/5 [00:00<00:00, 14.54it/s]
100%|██████████| 5/5 [00:00<00:00, 11.18it/s]
100%|██████████| 5/5 [00:00<00:00, 11.29it/s]
100%|██████████| 5/5 [00:00<00:00, 13.03it/s]
100%|██████████| 5/5 [00:00<00:00, 13.88it/s]
100%|██████████| 5/5 [00:00<00:00, 14.24it/s]


Unnamed: 0,Symbol,AUC at 100 for xgboost,AUC at 100 for string_rwr,AUC at 100 for funcoup_rwr,AUC at 100 for gosemsim,AUC at 100 for string_raw,AUC at 100 for funcoup_raw
0,PLK4,0.585285,0.575909,0.534329,0.55981,0.571395,0.573635
1,SASS6,0.663106,0.631155,0.593017,0.637505,0.6121,0.599347
2,CEP152,0.644797,0.626169,0.613997,0.626184,0.626296,0.607878
3,CEP192,0.701162,0.640055,0.640026,0.640094,0.633397,0.633251
4,PCNT,0.558299,0.553281,0.544564,0.547083,0.547124,0.545804


In [12]:
merged_df.mean()

AUC at 100 for xgboost        0.630530
AUC at 100 for string_rwr     0.605314
AUC at 100 for funcoup_rwr    0.585186
AUC at 100 for gosemsim       0.602135
AUC at 100 for string_raw     0.598062
AUC at 100 for funcoup_raw    0.591983
dtype: float64

# Mann-Whitney U test and 95%CI

In [7]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu

group1_data = merged_df["AUC at 100 for xgboost"]
group2_data = merged_df["AUC at 100 for string_rwr"]

# Perform a Mann-Whitney U test to compare the two groups (In the articles, with Bonferroni correction)
print("P value:", mannwhitneyu(group1_data, group2_data))

# Calculate the effect size (difference in means) between the two groups
original_effect = np.mean(group1_data) - np.mean(group2_data)

# Bootstrap resampling for estimating a 95% confidence interval of the effect size difference
bootstrap_samples = 10000
bootstrap_effects = []

for _ in range(bootstrap_samples):
    # Randomly sample with replacement from both groups' data
    sample_group1 = np.random.choice(group1_data, size=len(merged_df), replace=True)
    sample_group2 = np.random.choice(group2_data, size=len(merged_df), replace=True)

    # Calculate the effect size (difference in means) for the bootstrap sample
    effect = np.mean(sample_group1) - np.mean(sample_group2)
    bootstrap_effects.append(effect)

# Calculate the 95% confidence interval for the effect size difference
lower_bound = np.percentile(bootstrap_effects, 2.5)
upper_bound = np.percentile(bootstrap_effects, 97.5)

original_effect, lower_bound, upper_bound

P value: MannwhitneyuResult(statistic=7.0, pvalue=0.1481349357421432)


(0.025215876716281116, -0.02813302477232972, 0.08049244043995707)