In [4]:
import pandas as pd
from lexas import evaluation
from lexas import prediction
symbols = prediction.symbols
mode = "all_following" #all_following or just_next

# Preparation

In [5]:
path_to_csv = "./data/experiments_for_xgboost.csv"
train_dic = evaluation.generate_dic_for_eval(
    path_to_csv,
    1930,  #start year
    2018,  #end year
    mode) 

dev_dic = evaluation.generate_dic_for_eval(path_to_csv,2019,2019,mode)
test_dic = evaluation.generate_dic_for_eval(path_to_csv,2020,2023,mode)

#Save the dictionaries
#The disctionaries are saved as ./eval/dic_{mode}.csv
evaluation.save_dic(train_dic,dev_dic,test_dic,mode)

798it [00:00, 145619.08it/s]
798it [00:00, 378283.75it/s]
798it [00:00, 360518.59it/s]


# Evaluation

In [6]:
#Loading data
mode = "just_next"
mode = "all_following"

train_dic,dev_dic,test_dic = evaluation.load_dic(mode)

print(train_dic["PLK4"]) #Genes examined after PLK4 in the articles published before 2018
print(dev_dic["PLK4"]) #Genes examined after PLK4 in the articles published in 2019
print(test_dic["PLK4"]) #Genes examined after PLK4 in the articles published after 2020

['CEP295', 'CEP135', 'CEP192', 'PLK4', 'PCM1', 'SASS6', 'CCP110', 'NEK7', 'PCNA', 'CENPJ', 'CDH1', 'IFT88', 'STIL', 'CEP152', 'RBM14']
['PLK4', 'CEP192', 'SASS6', 'PCNT', 'STIL', 'CEP152']
['PLK4', 'CEP192', 'PLK1', 'SASS6', 'PCNT', 'CEP57', 'CDK5RAP2']


In [7]:
# Genes examined after a query gene in the previous data are not considered false or true. 
# They are removed from evaluation.
# prev_dic stores genes examined previously to the answer data set

def get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic):   
    if eval_mode == "dev":
        prev_dic = train_dic.copy()
        answer_dic = dev_dic.copy()
    elif eval_mode == "test":
        prev_dic = {k: train_dic[k] + dev_dic[k] for k in train_dic}
        answer_dic = test_dic.copy()
    else:
        raise ValueError("Evaluation mode should be \"dev\" or \"test\"")
    
    return prev_dic, answer_dic

eval_mode = "test" #dev or test
prev_dic, answer_dic = get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic)

In [8]:
#Calculate AUC
model_name ="xgboost"
genes = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
result_dir = "./result/xgboost"
top_k = 100

import os
import tqdm

evaluation.calculate_auc_for_many_genes(result_dir, model_name, prev_dic, answer_dic, top_k, genes=genes)

100%|██████████| 5/5 [00:00<00:00, 14.31it/s]


Unnamed: 0,Symbol,AUC at 100 for xgboost
0,PLK4,0.664963
1,SASS6,0.664955
2,CEP152,0.497394
3,CEP192,0.698439
4,PCNT,0.798661


# Comparison with other tools 

In [41]:
import tqdm
import os
from lexas import evaluation

model_names =["xgboost","string_rwr","funcoup_rwr","string_raw","funcoup_raw","gosemsim"]
genes = ["PLK4","SASS6","CEP152","CEP192","PCNT"]
top_k = 100
results = []

for model_name in model_names:
    result_dir = f"./result/{model_name}"
    result = evaluation.calculate_auc_for_many_genes(result_dir, model_name, prev_dic, answer_dic, top_k, genes=genes)
    results.append(result)

from functools import reduce
merged_df = reduce(lambda left, right: pd.merge(left, right, on='Symbol', how='inner'), results)
merged_df 

100%|██████████| 5/5 [00:00<00:00, 20.95it/s]
100%|██████████| 5/5 [00:00<00:00, 14.40it/s]
100%|██████████| 5/5 [00:00<00:00, 15.02it/s]
100%|██████████| 5/5 [00:00<00:00, 19.43it/s]
100%|██████████| 5/5 [00:00<00:00, 20.09it/s]
100%|██████████| 5/5 [00:00<00:00, 18.34it/s]


Unnamed: 0,Symbol,AUC at 100 for xgboost,AUC at 100 for string_rwr,AUC at 100 for funcoup_rwr,AUC at 100 for string_raw,AUC at 100 for funcoup_raw,AUC at 100 for gosemsim
0,PLK4,0.999217,0.914456,0.580925,0.999054,0.915867,0.998064
1,SASS6,0.999148,0.832095,0.664783,0.83188,0.747574,0.998134
2,CEP152,0.91579,0.748031,0.664783,0.748598,0.664947,0.915067
3,CEP192,0.898209,0.765123,0.798007,0.765202,0.731413,0.831728
4,PCNT,0.898543,0.797946,0.731293,0.798421,0.731252,0.831632


# Mann-Whitney U test and 95%CI

In [50]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu

group1_data = merged_df["AUC at 100 for xgboost"]
group2_data = merged_df["AUC at 100 for string_rwr"]

# Perform a Mann-Whitney U test to compare the two groups
print("P value:", mannwhitneyu(group1_data, group2_data))

# Calculate the effect size (difference in means) between the two groups
original_effect = np.mean(group1_data) - np.mean(group2_data)

# Bootstrap resampling for estimating a 95% confidence interval of the effect size difference
bootstrap_samples = 10000
bootstrap_effects = []

for _ in range(bootstrap_samples):
    # Randomly sample with replacement from both groups' data
    sample_group1 = np.random.choice(group1_data, size=len(merged_df), replace=True)
    sample_group2 = np.random.choice(group2_data, size=len(merged_df), replace=True)

    # Calculate the effect size (difference in means) for the bootstrap sample
    effect = np.mean(sample_group1) - np.mean(sample_group2)
    bootstrap_effects.append(effect)

# Calculate the 95% confidence interval for the effect size difference
lower_bound = np.percentile(bootstrap_effects, 2.5)
upper_bound = np.percentile(bootstrap_effects, 97.5)

original_effect, lower_bound, upper_bound

P value: MannwhitneyuResult(statistic=2.0, pvalue=0.01835692818135205)


(0.13065129243650853, 0.06356919228130574, 0.19415064029747217)