In [35]:
from sys import path
path.append('../src')
from NLEval import graph, valsplit, label, model
from sklearn.metrics import roc_auc_score as auroc
import numpy as np
import pandas as pd

In [36]:
# Notes on the data directory:
#   - The data dir is added manually.
#   - The data folder is sent to Doug and Pat, who will add to it.
#   - The whole data folder is ignored in git and on GitHub.
data_path = '../data/'  # path to data

# Load the graph from the edge-list file (weighted, undirected).
network_file = data_path + 'networks/String_experiments.edg'
g = graph.DenseGraph.DenseGraph.from_edglst(
    network_file, weighted=True, directed=False)

# Load the labelset collection from the GMT file.
labels_file = data_path + 'labels/c2.cp.kegg.v6.1.entrez.BP.gsea-min10-max200-ovlppt7-jacpt5.nonred.gmt'
lsc = label.LabelsetCollection.SplitLSC.from_gmt(labels_file)

# Initialize models on top of the loaded graph.
SL_A = model.SupervisedLearning.LogReg(g, penalty='l2', solver='lbfgs')
# LP_A = model.LabelPropagation.LP(g)

In [20]:
# Display label info: one row per labelset with its total size and its
# "effective" size, i.e. the number of members that are also in the graph.
pd.options.display.max_rows = 999  # let print(df) show every labelset

# Build the set of graph IDs once; rebuilding it for every labelset repeats
# the same work len(lsc.label_ids) times.
graph_ids = set(g.IDmap.lst)
# Fetch each labelset once and reuse it for both size columns.
labelsets = [lsc.get_labelset(i) for i in lsc.label_ids]

df = pd.DataFrame({
    'Name': list(lsc.label_ids),
    'Size': [len(members) for members in labelsets],
    'Eff size': [len(members & graph_ids) for members in labelsets],
})
print(df)

                                                  Name  Size  Eff size
0                                      KEGG_CELL_CYCLE   128       118
1        KEGG_VASOPRESSIN_REGULATED_WATER_REABSORPTION    44        40
2                      KEGG_TGF_BETA_SIGNALING_PATHWAY    86        79
3                           KEGG_WNT_SIGNALING_PATHWAY   151       137
4                         KEGG_GLYCEROLIPID_METABOLISM    49        38
5                         KEGG_BETA_ALANINE_METABOLISM    22        20
6                          KEGG_GLUTATHIONE_METABOLISM    50        42
7                              KEGG_TASTE_TRANSDUCTION    52        25
8                                 KEGG_DNA_REPLICATION    36        34
9                      KEGG_CARDIAC_MUSCLE_CONTRACTION    80        66
10                          KEGG_RIBOFLAVIN_METABOLISM    16        12
11                       KEGG_ECM_RECEPTOR_INTERACTION    84        75
12                KEGG_DRUG_METABOLISM_CYTOCHROME_P450    72        49
13    

In [21]:
# Pick one labelset by its position in the collection and show its ID.
label_index = 29
label_id = lsc.label_ids[label_index]
print(label_id)

KEGG_ENDOCYTOSIS


In [22]:
# Fetch the positive samples (members of the chosen labelset) and the
# negative samples the collection provides for the same label.
positive_set, negative_set = (
    lsc.get_labelset(label_id),
    lsc.get_negative(label_id),
)
print('Positive:\n', positive_set, '\n')
print('Negative:\n', negative_set)

Positive:
 {'6457', '7037', '57403', '3134', '50807', '93343', '11311', '8766', '5584', '137492', '998', '30011', '5337', '3312', '155', '10015', '55738', '57132', '153', '116986', '163', '11267', '1950', '3559', '6714', '84249', '2066', '29924', '1213', '30846', '84612', '440073', '2060', '3303', '3561', '9101', '51160', '56904', '3304', '23527', '5878', '3560', '7189', '3949', '92421', '26286', '5979', '9922', '11031', '116983', '2261', '9230', '5867', '64145', '867', '57154', '84313', '3577', '116987', '10617', '1436', '200576', '83737', '23396', '3310', '5590', '3133', '382', '6456', '30845', '868', '2264', '5868', '84440', '116984', '5869', '23624', '22841', '161', '23096', '5338', '652614', '84552', '51028', '55616', '9829', '3815', '29082', '9135', '23362', '9146', '128866', '156', '8218', '89853', '154', '64744', '9525', '84364', '9727', '23550', '8853', '1956', '8395', '6455', '64411', '5156', '51100', '8027', '408', '3305', '51534', '2868', '9744', '3106', '51652', '3265', '2

In [25]:
# Train and get genome-wide prediction scores.
# Call SL_A directly instead of binding it to `model`: that name is the
# NLEval module imported at the top of the notebook, and rebinding it breaks
# re-running the model-initialization cell until the imports are re-executed.
score_dict = SL_A.predict(positive_set, negative_set)

In [26]:
# Print the top-ranked genes and which of them are already known positives.
numbers_of_top_genes = 50
# Rank gene IDs by descending prediction score and keep the best ones.
top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
# Filter the ranked list rather than intersecting sets: a set of string IDs
# has no stable order across kernel sessions, so the printed list would
# change between runs. This keeps the known genes in rank order.
intersection = [gene for gene in top_list if gene in positive_set]
print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

Top 50 genes: ['7334', '3190', '23291', '8655', '5241', '80143', '7184', '19', '1000', '5710', '653361', '5465', '996', '843', '5500', '8837', '2770', '85369', '2768', '8773', '6139', '9101', '5296', '6446', '5315', '701', '4478', '5695', '7171', '2697', '10454', '1161', '2690', '1540', '25759', '4967', '54512', '11235', '84313', '7381', '9474', '1514', '5694', '51606', '83443', '4216', '56915', '7327', '6124', '9739']
Known genes in top 50: ['9101', '84313']


In [37]:
# Register a user-supplied labelset under the ID 'New'.
new_gene_ids = ['7328', '59343', '6209', '3667', '7168', '572', '4914']
lsc.add_labelset(new_gene_ids, 'New')

In [28]:
# Show the members of the 'New' labelset, then its negative samples.
for fetch in (lsc.get_labelset, lsc.get_negative):
    print(fetch('New'))

{'59343', '3667', '572', '4914', '7168', '7328', '6209'}
{'1002', '9985', '701', '2618', '6696', '2137', '2815', '1949', '627', '3421', '1571', '4306', '7941', '9474', '1635', '3703', '50814', '9739', '51422', '5743', '51205', '64816', '1540', '23761', '7334', '7058', '11261', '2329', '7381', '1237', '259285', '22929', '79132', '7366', '83443', '4099', '956', '23291', '84313', '730', '7184', '8509', '4245', '10454', '5438', '7327', '6337', '64682', '6139', '4359', '134111', '728622', '2997', '51272', '129607', '4267', '1000', '913', '26290', '3578', '2770', '53833', '63826', '3993', '54926', '6742', '51465', '5209', '3190', '9071', '5465', '80143', '5315', '6124', '84720', '2697', '727947', '843', '800', '1288', '135152', '162466', '7171', '2069', '1655', '1514', '438', '537', '29926', '4967', '2946', '3690', '7363', '2690', '35', '56907', '3242', '8655', '2920', '3937', '374378', '5264', '5705', '83439', '901', '528', '5190', '898', '5361', '131965', '6484', '51366', '4357', '10535', 

In [38]:
# Train on the user-supplied 'New' labelset and score every gene.
# Call SL_A directly instead of binding it to `model`: that name is the
# NLEval module imported at the top of the notebook, and rebinding it breaks
# re-running the model-initialization cell until the imports are re-executed.
score_dict = SL_A.predict(lsc.get_labelset('New'), lsc.get_negative('New'))

In [32]:
# Print the top-ranked genes for the 'New' labelset and which of them are
# already members of that labelset.
numbers_of_top_genes = 50
# Rank gene IDs by descending prediction score and keep the best ones.
top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
# Compare against the 'New' labelset itself. The earlier `positive_set`
# variable still holds the labelset chosen via `label_id`, so using it here
# would report overlap with the wrong label.
known_genes = lsc.get_labelset('New')
# Filter the ranked list so the known genes are reported in rank order.
intersection = [gene for gene in top_list if gene in known_genes]
print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

Top 50 genes: ['701', '6696', '25847', '699', '991', '9184', '246184', '29882', '119504', '29945', '8697', '2815', '3972', '10403', '4085', '996', '25936', '64682', '8881', '25906', '10393', '51433', '26271', '3973', '928', '1082', '79003', '3486', '5054', '83540', '2160', '2153', '7450', '2147', '999', '134', '51434', '3491', '2243', '3053', '2811', '2488', '58157', '7056', '93659', '835', '2244', '10287', '94115', '3827']
Known genes in top 50: []


In [33]:
def example(label_index, numbers_of_top_genes, model):
    """Train on one labelset and print its top-ranked genes.

    Reads the notebook-level labelset collection ``lsc``.

    Args:
        label_index: Position of the labelset in ``lsc.label_ids``.
        numbers_of_top_genes: How many of the highest-scoring genes to report.
        model: Object whose ``predict(positive_set, negative_set)`` returns a
            mapping from gene ID to prediction score.

    Prints the labelset ID, the top-ranked gene IDs (best first), and the
    subset of those that are already members of the labelset. Returns None.
    """
    # get labelset ID
    label_id = lsc.label_ids[label_index]
    print(label_id)

    # get positive and negative samples
    positive_set = lsc.get_labelset(label_id)
    negative_set = lsc.get_negative(label_id)

    # train and get genome wide prediction scores
    score_dict = model.predict(positive_set, negative_set)

    # rank gene IDs by descending score and keep the best ones
    top_list = sorted(score_dict, key=score_dict.get, reverse=True)[:numbers_of_top_genes]
    # Filter the ranked list instead of intersecting sets: a set of string IDs
    # has no stable order across interpreter sessions, so the printed list
    # would vary between runs. This keeps the known genes in rank order.
    intersection = [gene for gene in top_list if gene in positive_set]
    print("Top %d genes: %s" % (numbers_of_top_genes, repr(top_list)))
    print("Known genes in top %d: %s" % (numbers_of_top_genes, repr(intersection)))

In [34]:
# Run the helper on the labelset at index 29, reporting the top 50 genes.
example(label_index=29, numbers_of_top_genes=50, model=SL_A)

KEGG_ENDOCYTOSIS
Top 50 genes: ['7334', '3190', '23291', '8655', '5241', '80143', '7184', '19', '1000', '5710', '653361', '5465', '996', '843', '5500', '8837', '2770', '85369', '2768', '8773', '6139', '9101', '5296', '6446', '5315', '701', '4478', '5695', '7171', '2697', '10454', '1161', '2690', '1540', '25759', '4967', '54512', '11235', '84313', '7381', '9474', '1514', '5694', '51606', '83443', '4216', '56915', '7327', '6124', '9739']
Known genes in top 50: ['9101', '84313']
