In [1]:
from sys import path
path.append('../src')
from NLEval import graph, valsplit, label, model
from sklearn.metrics import roc_auc_score as auroc
import numpy as np
import pandas as pd

In [2]:
# Location of the input data, relative to this notebook.
data_path = '../data/' # path to data

# Read the weighted, undirected network from its edge list into a dense graph,
# then read the labelset collection from the GMT file.
g = graph.DenseGraph.DenseGraph.from_edglst(
    data_path + 'networks/STRING-EXP.edg',
    weighted=True,
    directed=False,
)
lsc = label.LabelsetCollection.SplitLSC.from_gmt(data_path + 'labels/KEGGBP.gmt')

# Set up the two models used below; both are constructed on the same graph.
SL_A = model.SupervisedLearning.LogReg(g, penalty='l2', solver='lbfgs')
LP_A = model.LabelPropagation.LP(g)

In [3]:
# Display label info: one row per labelset with its total size and its
# "effective" size, i.e. the number of members that are also IDs in the graph.
pd.options.display.max_rows = 999

# Build the graph ID set once; it does not change from label to label, so
# rebuilding it inside the comprehension repeated the same work per labelset.
graph_ids = set(g.IDmap.lst)

# Fetch each labelset once and reuse it for both size columns.
labelsets = [lsc.getLabelset(i) for i in lsc.labelIDlst]

df = pd.DataFrame({
    'Name': lsc.labelIDlst,
    'Size': [len(members) for members in labelsets],
    'Eff size': [len(members & graph_ids) for members in labelsets],
})
print(df)

                                                  Name  Size  Eff size
0                                      KEGG_CELL_CYCLE   128       118
1        KEGG_VASOPRESSIN_REGULATED_WATER_REABSORPTION    44        40
2                      KEGG_TGF_BETA_SIGNALING_PATHWAY    86        79
3                           KEGG_WNT_SIGNALING_PATHWAY   151       137
4                         KEGG_GLYCEROLIPID_METABOLISM    49        38
5                         KEGG_BETA_ALANINE_METABOLISM    22        20
6                          KEGG_GLUTATHIONE_METABOLISM    50        42
7                              KEGG_TASTE_TRANSDUCTION    52        25
8                                 KEGG_DNA_REPLICATION    36        34
9                      KEGG_CARDIAC_MUSCLE_CONTRACTION    80        66
10                          KEGG_RIBOFLAVIN_METABOLISM    16        12
11                       KEGG_ECM_RECEPTOR_INTERACTION    84        75
12                KEGG_DRUG_METABOLISM_CYTOCHROME_P450    72        49
13    

In [4]:
def example(label_index, numbers_of_top_genes, model, collection=None):
    """Train on one labelset and print its top-ranked genes.

    Prints the labelset ID, the `numbers_of_top_genes` highest-scoring IDs
    returned by the model, and the subset of those IDs that already belong
    to the labelset's positive set.

    Parameters
    ----------
    label_index : int
        Position of the labelset in ``collection.labelIDlst``.
    numbers_of_top_genes : int
        How many of the highest-scoring IDs to report.
    model : object
        Object exposing ``predict(positive_set, negative_set)`` that returns
        a mapping from ID to score.
    collection : object, optional
        Labelset collection providing ``labelIDlst``, ``getLabelset`` and
        ``getNegative``. Defaults to the notebook-level ``lsc``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if collection is None:
        # Fall back to the collection loaded earlier in the notebook.
        collection = lsc

    # get labelset ID
    labelID = collection.labelIDlst[label_index]
    print(labelID)

    # get positive and negative samples
    positive_set = collection.getLabelset(labelID)
    negative_set = collection.getNegative(labelID)

    # train and get prediction scores
    score_dict = model.predict(positive_set, negative_set)

    # rank IDs by descending score and keep the requested number
    top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]

    # Filter the ranked list instead of intersecting two sets: set iteration
    # order is arbitrary, which made the printed order of known genes
    # non-reproducible and dropped the ranking information.
    intersection = [gene for gene in top_list if gene in positive_set]

    print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
    print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

In [5]:
example(29, 50, SL_A)

KEGG_ENDOCYTOSIS
Top 50 genes: ['7157', '3320', '10273', '2064', '5710', '2534', '8850', '10075', '2547', '24148', '9351', '9040', '6426', '5052', '9092', '6720', '7525', '6389', '6446', '51639', '8766', '1453', '6727', '2308', '5170', '637', '10253', '5327', '9525', '3486', '26088', '330', '10193', '5584', '1643', '3561', '6135', '5717', '2956', '6154', '5528', '8650', '51529', '8867', '4773', '27', '6152', '374291', '10580', '64223']
Known genes in top 50: ['3561', '8766', '5584', '9525', '10193']
