In [1]:
import pandas as pd
from lexas import evaluation
from lexas import prediction
symbols = prediction.symbols
mode = "all_following" #all_following or just_next

# Preparation

In [5]:
path_to_csv = "./data/experiments_for_xgboost.csv"
train_dic = evaluation.generate_dic_for_eval(
    path_to_csv,
    1930,  #start year
    2018,  #end year
    mode) 

dev_dic = evaluation.generate_dic_for_eval(path_to_csv,2019,2019,mode)
test_dic = evaluation.generate_dic_for_eval(path_to_csv,2020,2023,mode)

#Save the dictionaries
#The disctionaries are saved as ./eval/dic_{mode}.csv
evaluation.save_dic(train_dic,dev_dic,test_dic,mode)

793it [00:00, 157246.74it/s]
793it [00:00, 377021.43it/s]
793it [00:00, 492534.14it/s]


# Evaluation

In [2]:
#Loading data
mode = "just_next"
mode = "all_following"

train_dic,dev_dic,test_dic = evaluation.load_dic(mode)

print(train_dic["PLK4"]) #Genes examined after PLK4 in the articles published before 2018
print(dev_dic["PLK4"]) #Genes examined after PLK4 in the articles published in 2019
print(test_dic["PLK4"]) #Genes examined after PLK4 in the articles published after 2020

['SASS6', 'CENPJ', 'PCNA', 'CEP295', 'PCM1', 'STIL', 'CDH1', 'CEP135', 'CCP110', 'NEK7', 'CEP152', 'CEP192', 'RBM14', 'IFT88', 'PLK4']
['SASS6', 'PCNT', 'CEP152', 'STIL', 'CEP192', 'PLK4']
['SASS6', 'PLK1', 'PCNT', 'CDK5RAP2', 'CEP192', 'CEP57', 'PLK4']


In [4]:
# Genes examined after a query gene in previous data are not considered false, 
# so they are removed from evaluation.
# prev_dic stores genes examined previously to the answer data set

def get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic):   
    if eval_mode == "dev":
        prev_dic = train_dic.copy()
        answer_dic = dev_dic.copy()
    elif eval_mode == "test":
        prev_dic = {k: train_dic[k] + dev_dic[k] for k in train_dic}
        answer_dic = test_dic.copy()
    else:
        raise ValueError("Evaluation mode should be \"dev\" or \"test\"")
    
    return prev_dic, answer_dic

eval_mode = "test" #dev or test
prev_dic, answer_dic = get_dictionaries_for_evaluation(eval_mode, train_dic, dev_dic, test_dic)

In [5]:
#Calculate AUC
model_name ="xgboost"
genes = ["PLK4","SASS6","CEP152","CEP192","CEP63","PCNT"]
result_dir = "./result/xgboost"
top_k = 100

import os
import tqdm

evaluation.calculate_auc_for_many_genes(result_dir, model_name, prev_dic, answer_dic, top_k, genes=genes)

100%|██████████| 6/6 [00:00<00:00, 12.34it/s]


Unnamed: 0,Symbol,AUC at 100 for xgboost
0,PLK4,0.665227
1,SASS6,0.66522
2,CEP152,0.748757
3,CEP192,0.698681
4,CEP63,0.581517
5,PCNT,0.798869


In [12]:
!head "./data/HGNC.txt"

Approved symbol	Approved name	Previous symbols	Alias symbols	Chromosome	Alias names	Previous name
A1BG	alpha-1-B glycoprotein			19q13.43		
A1BG-AS1	A1BG antisense RNA 1	NCRNA00181, A1BGAS, A1BG-AS	FLJ23569	19q13.43		"non-protein coding RNA 181", "A1BG antisense RNA (non-protein coding)", "A1BG antisense RNA 1 (non-protein coding)"
A1CF	APOBEC1 complementation factor		ACF, ASP, ACF64, ACF65, APOBEC1CF	10q11.23		
A1S9T	symbol withdrawn, see [HGNC:12469](/data/gene-symbol-report/#!/hgnc_id/HGNC:12469)					
A2M	alpha-2-macroglobulin		FWP007, S863-7, CPAMD5	12p13.31		
A2M-AS1	A2M antisense RNA 1			12p13.31		"A2M antisense RNA 1 (non-protein coding)", "A2M antisense RNA 1"
A2ML1	alpha-2-macroglobulin like 1	CPAMD9	FLJ25179, p170	12p13.31		"C3 and PZP-like, alpha-2-macroglobulin domain containing 9"
A2ML1-AS1	A2ML1 antisense RNA 1			12p13.31		"A2ML1 antisense RNA 1 (non-protein coding)"
A2ML1-AS2	A2ML1 antisense RNA 2			12p13.31		"A2ML1 antisense RNA 2 (non-protein coding)"


In [39]:
import pandas as pd
import os

# 初期DataFrame
df = pd.DataFrame(index=symbols)
# ディレクトリ内のすべてのファイルをループ
for file in os.listdir("/mnt/a/nxml/src_231016/eval/test_100_just_next"):
    file_path = os.path.join("/mnt/a/nxml/src_231016/eval/test_100_just_next", file)
    
    # CSVファイルを読み込む
    new_df = pd.read_csv(file_path, index_col=1).drop("Unnamed: 0",axis=1)
    
    # dfとnew_dfを結合
    df = pd.merge(df, new_df, how="outer", left_index=True, right_index=True)
    df = df[[s in ps for s in df.index]]
    df.dropna(axis=0, how='all', inplace=True)
    df.fillna(0.5, inplace=True)

In [33]:
ps = set()
with open("./data/HGNC2.txt","r") as f:
    for line in f:
        ls = line.strip().split("\t")
        ps.add(ls[0])

In [41]:
!head ../Repository/feature/funcoup5_rwr.txt

ZNF691	AKT1	0.837
ZNF691	NEDD4L	0.297
ZNF691	AATF	0.282
ZNF691	HNRNPUL1	0.132
ZNF691	SETD1B	0.148
ZNF691	UGP2	0.51
ZNF691	SRRT	0.149
ZNF691	ING3	0.135
ZNF691	CCNL2	0.454
ZNF691	U2AF1L4	0.255


In [36]:
len(ps)

19393

In [34]:
df[[s in ps for s in df.index]]

Unnamed: 0,AUC at 1000 for STRING_raw,AUC at 1000 for LEXAS_plus,AUC at 1000 for GOSemSim,AUC at 1000 for FunCoup,AUC at 1000 for LEXAS_data,AUC at 1000 for LEXAS,AUC at 1000 for STRING,AUC at 1000 for FunCoup_raw
A4GALT,0.595679,0.694489,0.708095,0.534115,0.554442,0.665213,0.721574,0.037037
AAAS,0.560096,0.685694,0.493240,0.471669,0.534812,0.648628,0.769410,0.312687
AACS,0.840105,0.651872,0.535253,0.541998,0.559375,0.489956,0.687883,0.463166
AADAT,0.836190,0.712645,0.576923,0.574933,0.511722,0.511984,0.728707,0.345992
AAK1,0.726044,0.671242,0.656156,0.601296,0.566331,0.636978,0.632533,0.730363
...,...,...,...,...,...,...,...,...
ZXDA,0.353972,0.659424,0.522059,0.566443,0.525616,0.525623,0.561945,1.000000
ZXDB,0.347944,0.706390,0.588544,0.530479,0.536165,0.536175,0.529350,0.947368
ZYG11B,0.637755,0.619098,0.537099,0.592447,0.580135,0.519518,0.594368,0.988506
ZYX,0.630998,0.706351,0.592254,0.624380,0.594980,0.667162,0.705958,0.786142


In [40]:
df.mean()

AUC at 100 for STRING_raw     0.632327
AUC at 100 for LEXAS_plus     0.574065
AUC at 100 for GOSemSim       0.550851
AUC at 100 for FunCoup        0.532968
AUC at 100 for LEXAS_data     0.540475
AUC at 100 for LEXAS          0.569453
AUC at 100 for STRING         0.577940
AUC at 100 for FunCoup_raw    0.599732
dtype: float64

In [19]:
dfdf["AUC at 100 for STRING"]]

KeyError: "None of [Float64Index([0.5439282913083895, 0.4970717847616838, 0.6215594332501017,\n              0.5806982744718594,                nan,                nan,\n              0.6340045806906273, 0.8107689187623129, 0.5651194559088532,\n              0.5597443926594815,\n              ...\n              0.9962870569124558, 0.6846590358174801, 0.5425462975158716,\n              0.5525294083560294, 0.4970736512719477, 0.4970726334705234,\n              0.5573262156202855, 0.5959342189483745,  0.497073481687529,\n              0.7764202475154939],\n             dtype='float64', length=16549)] are in the [columns]"

In [23]:
missing_rows = df[df["AUC at 100 for STRING"].isna()]
missing_rows.index.tolist()

['A2MP1',
 'A3GALT2',
 'AATF',
 'ABCB11',
 'ABCC2',
 'ABCC4',
 'ABCC6P1',
 'ABCG4',
 'ABCG5',
 'ABHD11-AS1',
 'ABLIM1',
 'ACACA',
 'ACD',
 'ACKR2',
 'ACP6',
 'ACSBG2',
 'ACTB',
 'ACTG1',
 'ACTN3',
 'ADAM1A',
 'ADAM21',
 'ADAM3A',
 'ADAM5',
 'ADAM6',
 'ADAMTS9-AS2',
 'ADGRE4P',
 'ADGRG1',
 'ADH1C',
 'ADORA2A-AS1',
 'ADORA3',
 'ADRA2B',
 'AFAP1-AS1',
 'AFDN-DT',
 'AFF3',
 'AFTPH',
 'AGAP11',
 'AGBL2',
 'AGK',
 'AGS1',
 'AHSA2P',
 'AIRN',
 'AKAP2',
 'ALAS2',
 'ALDH16A1',
 'ALDH3B1',
 'ALDOA',
 'ALDOB',
 'ALG1L',
 'ALG6',
 'ALMS1',
 'ALOX12B',
 'ALX3',
 'ALX4',
 'AMELY',
 'AMER1',
 'AMZ2P1',
 'ANKRD20A5P',
 'ANKRD24',
 'ANKRD26P1',
 'ANKRD34A',
 'ANKRD40CL',
 'ANKS1A',
 'ANP32E',
 'ANTXRL',
 'ANXA2P2',
 'ANXA8',
 'ANXA8L1',
 'AOAH',
 'AOC4P',
 'AOX2P',
 'AP2B1',
 'APELA',
 'APLN',
 'APOC1',
 'APRG1',
 'AQP4-AS1',
 'AREG',
 'ARF2',
 'ARFRP1',
 'ARHGAP18',
 'ARHGAP23',
 'ARHGAP27P1',
 'ARHGAP35',
 'ARHGAP5-AS1',
 'ARHGEF15',
 'ARHGEF26-AS1',
 'ARL16',
 'ARPC1B',
 'ARSL',
 'ASAP1-IT1',
 'ASB1