In [58]:
import pandas as pd
from igraph import Graph
import sys
import os, sys
sys.path.append('../dev/')
import geneplexus
from ast import literal_eval
from tqdm.notebook import tqdm

In [59]:
graph = Graph.Read_GML("../../../data/processed/graph_apid_huri")

In [60]:
edge_list = graph.get_edge_dataframe()
gene_list = graph.vs['name']

In [57]:
edge_list['source'] = edge_list['source'].apply(lambda row: gene_list[row])
edge_list['target'] = edge_list['target'].apply(lambda row: gene_list[row])

In [43]:
from gprofiler import GProfiler

gp = GProfiler(return_dataframe=True)
gp_convert_df = gp.convert(organism='hsapiens', query=graph.vs['name'], target_namespace='ENTREZGENE_ACC')
gp_convert_df = gp_convert_df[gp_convert_df['n_converted']==1]
gp_convert_dict = gp_convert_df[['incoming', 'converted']].set_index('incoming').to_dict(orient='dict')['converted']
edge_list['source'] = edge_list['source'].apply(lambda row: gp_convert_dict[row])
edge_list['target'] = edge_list['target'].apply(lambda row: gp_convert_dict[row])

In [62]:
edge_list.to_csv('../data/edge_list_apidhuri.txt', sep=' ', header=False, index=False)

In [63]:
"""
Convert :term:`edgelist` to node order.
The node order (NodeOrder) file is used to map gene IDs to rows in the data
repsentation matrix.
Args:
    edgelist_loc: Location of the edgelist
    data_dir: The directory to save the file
    net_name: The name of the network
    sep: The separation used in the edgelist file (default tab)
    skiplines: The number of lines to skip for header
"""

geneplexus.custom.edgelist_to_nodeorder(edgelist_loc='../data/edge_list_apidhuri.txt', data_dir='../data/', net_name='APID_HuRI', sep=' ')

In [64]:
"""
Convert :term:`edgelist` to an adjacency matrix or influence matrix.
Note:
    The NodeOrder file needs to be a single column text file. If not
    supplying custom GSC, the file needs to be in Entrez ID space.
Args:
    edgelist_loc: Location of the edgelist
    data_dir: The directory to save the file
    net_name: The name of the network
    features: Features for the networks (Adjacency or Influence, All)
    alpha: Restart parameter.
    sep: The separation used in the edgelist file (default tab)
    skiplines: The number of lines to skip for header
"""

geneplexus.custom.edgelist_to_matrix(edgelist_loc='../data/edge_list_apidhuri.txt', data_dir='../data/', net_name='APID_HuRI', features='Adjacency', sep=' ')

In [None]:
"""Select subset of data to download.
Args:
    data_dir: Location of data files.
    tasks: Task of interest, accept multiple selection as a list. Do all
        the tasks if set to "All".
    networks: Networks of interest, accept multiple selection as a list. Do
        all the networks if set to "All".
    features: Network features of interest, accept multiple selection as a
        list. Do all the features if set to "All".
    gscs: Gene set collection of interest, accept multiple selection as a
        list. Do all the GSC if set to "All".
    n_jobs: Number of concurrent downloading threads.
    retry: If set to True, then retry downloading any missing file.
"""

geneplexus.download.download_select_data(data_dir='../data/', tasks='OriginalGSCs', networks='STRING', features='Adjacency', gscs='All')

In [65]:
"""Subset :term:`GSC` to only include genes in the network.
Note:
    Use the :meth:`geneplexus.download.download_select_data` function to
    get the preprocessed GO and DisGeNet files first.
Args:
    data_dir: The directory to save the file
    net_name: The name of the network
    gsc_name: The name of the GSC
    max_size: Maximum geneset size.
    min_size: Minimum geneset size.
"""
import importlib
importlib.reload(geneplexus)
geneplexus.custom.subset_gsc_to_network('../data/', 'APID_HuRI', 'GO', 350, 50)

In [66]:
myclass = geneplexus.GenePlexus(file_loc='../data/', net_type="APID_HuRI", features="Adjacency", gsc="GO")

In [67]:
process_labels = pd.read_csv('../../../data/processed/reactome_labels_apid_huri.csv', header=None)
test_indices = pd.read_csv("../../../data/processed/fs/reactome_rwr_test_apid_huri.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1

  arr = arr.astype(dtype, copy=False)


In [68]:
disease_labels = pd.read_csv('../../../data/processed/disgenet_filtered_labels_apid_huri.csv', header=None)
disease_test_indices = pd.read_csv("../../../data/processed/fs/disease/disease_rwr_test_apid_huri.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1
disease_conservative_labels = pd.read_csv('../../../data/processed/disgenet_conservative_labels_apid_huri.csv', header=None)
disease_conservative_test_indices = pd.read_csv("../../../data/processed/fs/disease/disease_rwr_test_conservative_apid_huri.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1

  arr = arr.astype(dtype, copy=False)
  arr = arr.astype(dtype, copy=False)


In [39]:
clf_dict = {'auprc_random':[], 'precision_k_random':[]}
for i in tqdm(range(len(process_labels.columns))):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(process_labels[i], test_indices[i])
    print('AUPRC: {}'.format(auprc_random))
    print('Precision@K: {}'.format(precision_k_random))
    clf_dict['auprc_random'].append(auprc_random)
    clf_dict['precision_k_random'].append(precision_k_random)

clf_df = pd.DataFrame.from_dict(clf_dict)
clf_df.to_csv('../../../models/threshold_classifier/process/geneplexus.csv', index=False)

  0%|          | 0/429 [00:00<?, ?it/s]

  precision_k = sum(value_labels['label']


AUPRC: 5.372304820774928
Precision@K: 17.551020408163264


  precision_k = sum(value_labels['label']


AUPRC: 1.092324208507478
Precision@K: 0.0


  precision_k = sum(value_labels['label']


AUPRC: 0.9980574944158067
Precision@K: 0.0


  precision_k = sum(value_labels['label']


AUPRC: 20.529398234299578
Precision@K: 68.80000000000001


  precision_k = sum(value_labels['label']


AUPRC: 3.5217657256894612
Precision@K: 0.0


KeyboardInterrupt: 

In [None]:
disease_clf_dict = {'auprc_random':[], 'precision_k_random':[]}
for i in tqdm(range(len(disease_labels.columns))):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(disease_labels[i], disease_test_indices[i])
    disease_clf_dict['auprc_random'].append(auprc_random)
    disease_clf_dict['precision_k_random'].append(precision_k_random)

disease_clf_df = pd.DataFrame.from_dict(disease_clf_dict)
disease_clf_df.to_csv('../../../models/threshold_classifier/disease/geneplexus.csv', index=False)

  0%|          | 0/203 [00:00<?, ?it/s]

  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['

In [None]:
disease_conservative_clf_dict = {'auprc_random':[], 'precision_k_random':[]}
for i in tqdm(range(len(disease_conservative_labels.columns))):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(disease_conservative_labels[i], disease_conservative_test_indices[i])
    disease_conservative_clf_dict['auprc_random'].append(auprc_random)
    disease_conservative_clf_dict['precision_k_random'].append(precision_k_random)

disease_conservative_clf_df = pd.DataFrame.from_dict(disease_conservative_clf_dict)
disease_conservative_clf_df.to_csv('../../../models/threshold_classifier/disease/geneplexus_conservative.csv', index=False)

  0%|          | 0/301 [00:00<?, ?it/s]

  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['label']
  precision_k = sum(value_labels['