In [1]:
from sys import path
path.append('../src')
from NLEval import graph, valsplit, label, model
from sklearn.metrics import roc_auc_score as auroc
import numpy as np
import pandas as pd

In [2]:
# load graph and lablset collection
data_path = '../data/' # path to data
# load graph and labelset collection
g = graph.DenseGraph.DenseGraph.from_edglst(data_path \
    + 'networks/STRING-EXP.edg', weighted=True, directed=False)
lsc = label.LabelsetCollection.SplitLSC.from_gmt(data_path + 'labels/KEGGBP.gmt')

# initialize models
SL_A = model.SupervisedLearning.LogReg(g, penalty='l2', solver='lbfgs')
LP_A = model.LabelPropagation.LP(g)

In [3]:
# display label info
pd.options.display.max_rows = 999
df = pd.DataFrame()
df['Name'] = lsc.label_ids
df['Size'] = [len(lsc.get_labelset(i)) for i in lsc.label_ids]
df['Eff size'] = [len(lsc.get_labelset(i) & set(g.IDmap.lst)) for i in lsc.label_ids]
print(df)

                                                  Name  Size  Eff size
0                                      KEGG_CELL_CYCLE   128       118
1        KEGG_VASOPRESSIN_REGULATED_WATER_REABSORPTION    44        40
2                      KEGG_TGF_BETA_SIGNALING_PATHWAY    86        79
3                           KEGG_WNT_SIGNALING_PATHWAY   151       137
4                         KEGG_GLYCEROLIPID_METABOLISM    49        38
5                         KEGG_BETA_ALANINE_METABOLISM    22        20
6                          KEGG_GLUTATHIONE_METABOLISM    50        42
7                              KEGG_TASTE_TRANSDUCTION    52        25
8                                 KEGG_DNA_REPLICATION    36        34
9                      KEGG_CARDIAC_MUSCLE_CONTRACTION    80        66
10                          KEGG_RIBOFLAVIN_METABOLISM    16        12
11                       KEGG_ECM_RECEPTOR_INTERACTION    84        75
12                KEGG_DRUG_METABOLISM_CYTOCHROME_P450    72        49
13    

In [4]:
# get labelset ID
label_id = lsc.label_ids[29]
print(label_id)

KEGG_ENDOCYTOSIS


In [5]:
# get positive and negative samples
positive_set = lsc.get_labelset(label_id)
negative_set = lsc.get_negative(label_id)
print('Positive:\n', positive_set, '\n')
print('Negative:\n', negative_set)

Positive:
 {'1173', '2870', '2868', '22905', '3310', '11031', '2149', '116987', '23362', '10059', '11059', '10617', '157', '9525', '27243', '6011', '128866', '998', '3579', '23550', '5868', '3303', '29082', '2321', '2060', '3133', '5156', '5338', '440073', '9146', '155', '161', '22841', '3480', '30011', '57132', '84552', '3559', '9230', '7037', '30845', '1436', '80230', '8853', '51534', '409', '5979', '3312', '5337', '8411', '131890', '8766', '160', '79643', '26286', '163', '79720', '2065', '28964', '3135', '83737', '867', '7189', '1785', '9135', '26119', '55048', '9101', '3134', '4734', '10193', '1956', '23396', '1211', '7251', '2066', '51652', '9815', '1234', '154', '2261', '3265', '5869', '1212', '6457', '4914', '6455', '1759', '84249', '408', '1601', '382', '23327', '3106', '23624', '56904', '5662', '4193', '10938', '156', '7852', '26052', '3791', '23527', '27183', '84313', '3560', '200576', '29924', '64750', '89853', '23096', '3107', '93343', '1213', '11267', '80223', '1950', '356

In [6]:
# train and get genome wide prediction scores
model = SL_A
score_dict = model.predict(positive_set, negative_set)

In [11]:
# print top ranked genes and its intersection with known ones
numbers_of_top_genes = 50
top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
intersection = list(set(top_list) & positive_set)
print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

Top 50 genes: ['7533', '6714', '3725', '867', '988', '30011', '1019', '3190', '7323', '6502', '1788', '1000', '8453', '3920', '1789', '2957', '7328', '59343', '6209', '3667', '7168', '572', '4914', '7414', '6389', '1453', '7037', '637', '6627', '4175', '1871', '7706', '699', '6227', '84313', '11337', '8743', '7508', '1615', '661', '9748', '25898', '10621', '5600', '55284', '5817', '79720', '81027', '6655', '4316']
Known genes in top 50: ['79720', '6714', '7037', '867', '4914', '84313', '30011']


In [12]:
# user input labelset
lsc.add_labelset(['7328', '59343', '6209', '3667', '7168', '572', '4914'], 'New')

In [14]:
print(lsc.get_labelset('New'))
print(lsc.get_negative('New'))

{'59343', '7328', '7168', '4914', '3667', '572', '6209'}
{'2870', '8643', '735', '1000', '7201', '1562', '2277', '55577', '27330', '2495', '248', '60496', '653888', '4051', '2327', '148327', '81888', '1789', '30011', '7037', '3725', '4157', '2986', '8677', '6257', '7533', '81027', '30837', '162466', '26007', '10344', '29851', '1394', '1583', '867', '7915', '5332', '3597', '5817', '4709', '810', '9942', '2651', '51715', '521', '4696', '988', '133121', '23673', '377677', '7424', '51144', '9179', '6627', '79837', '5372', '7706', '7414', '9133', '10621', '51166', '3996', '84313', '57733', '51091', '95', '6227', '5737', '6898', '79840', '30815', '637', '6502', '7450', '6487', '196743', '553', '57369', '410', '9791', '5600', '5161', '4242', '4756', '2958', '3293', '2134', '7043', '1366', '64170', '51067', '3577', '1355', '54657', '284217', '10747', '4697', '123283', '84612', '10946', '23495', '4843', '2523', '6358', '492', '79158', '81873', '5578', '5289', '2915', '83943', '6452', '10349', '

In [16]:
score_dict = model.predict(lsc.get_labelset('New'), lsc.get_negative('New'))

In [17]:
numbers_of_top_genes = 50
top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
intersection = list(set(top_list) & positive_set)
print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

Top 50 genes: ['8453', '1000', '2870', '22900', '29937', '2869', '735', '112464', '64399', '8454', '8065', '8643', '8451', '6923', '5727', '6921', '91653', '8452', '50937', '4522', '6569', '732', '22861', '7284', '156', '440068', '813', '10392', '9978', '1906', '10987', '55664', '733', '1448', '1003', '1908', '2767', '7428', '8533', '1013', '3091', '25828', '1447', '6532', '91662', '5663', '8450', '79139', '373509', '740']
Known genes in top 50: ['156', '2870', '2869']


In [None]:
def example(label_index, numbers_of_top_genes, model):
    # get labelset ID
    label_id = lsc.label_ids[label_index]
    print(label_id)
    
    # get positive and negative samples
    positive_set = lsc.get_labelset(label_id)
    negative_set = lsc.get_negative(label_id)
    
    # train and get genome wide prediction scores
    score_dict = model.predict(positive_set, negative_set)
    
    # print top ranked genes and its intersection with known ones
    top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
    intersection = list(set(top_list) & positive_set)
    print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
    print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

In [None]:
example(29, 50, SL_A)