In [1]:
import pandas as pd
from igraph import Graph
import sys
import os, sys
sys.path.append('../dev/')
import geneplexus
from ast import literal_eval
from tqdm.notebook import tqdm

In [2]:
# Load the weighted STRING network from its GML file.
gml_path = "../../../data/processed/graph_string_weighted"
graph = Graph.Read_GML(gml_path)

In [128]:
# Vertex names, in vertex-index order, and the edge table of the graph.
gene_list = graph.vs['name']
edge_list = graph.get_edge_dataframe()

In [129]:
# Replace the values in both endpoint columns by the entry of gene_list at
# that position. NOTE: this cell is not idempotent -- re-running it on
# already-converted columns would index gene_list with names.
for endpoint in ('source', 'target'):
    edge_list[endpoint] = edge_list[endpoint].apply(lambda vertex_idx: gene_list[vertex_idx])

In [125]:
from gprofiler import GProfiler

# Query g:Profiler to convert every vertex name of the graph into the
# ENTREZGENE_ACC namespace; the result comes back as a DataFrame.
vertex_names = graph.vs['name']
gp = GProfiler(return_dataframe=True)
gp_convert_df = gp.convert(
    organism='hsapiens',
    query=vertex_names,
    target_namespace='ENTREZGENE_ACC',
)

In [119]:
# Keep only rows whose n_converted equals 1
# (presumably genes with a single, unambiguous conversion -- TODO confirm).
has_single_hit = gp_convert_df['n_converted'] == 1
gp_convert_df = gp_convert_df.loc[has_single_hit]

In [120]:
# Lookup table: value of the 'incoming' column -> value of the 'converted' column.
gp_convert_dict = gp_convert_df.set_index('incoming')['converted'].to_dict()


In [121]:
# Translate both endpoint columns through gp_convert_dict.
# The lookup table is built only from rows with n_converted == 1, so a plain
# dict lookup raises KeyError for every edge touching a gene that was filtered
# out. Series.map yields NaN for missing keys instead, which lets us drop the
# affected edges explicitly.
edge_list['source'] = edge_list['source'].map(gp_convert_dict)
edge_list['target'] = edge_list['target'].map(gp_convert_dict)

# Remove edges for which at least one endpoint has no entry in the lookup table.
n_edges_before = len(edge_list)
edge_list = edge_list.dropna(subset=['source', 'target'])
print(f"Dropped {n_edges_before - len(edge_list)} of {n_edges_before} edges with an unmapped endpoint")

In [130]:
# Write the edge list as a space-delimited text file without header or index.
edge_list_path = '../data/edge_list.txt'
edge_list.to_csv(edge_list_path, sep=' ', header=False, index=False)

In [131]:
"""
Convert :term:`edgelist` to node order.
The node order (NodeOrder) file is used to map gene IDs to rows in the data
representation matrix.
Args:
    edgelist_loc: Location of the edgelist
    data_dir: The directory to save the file
    net_name: The name of the network
    sep: The separation used in the edgelist file (default tab)
    skiplines: The number of lines to skip for header
"""

# Build the node-order file for the custom network from the saved,
# space-separated edge list.
geneplexus.custom.edgelist_to_nodeorder(
    edgelist_loc='../data/edge_list.txt',
    data_dir='../data/',
    net_name='STRING_GM',
    sep=' ',
)

In [132]:
"""
Convert :term:`edgelist` to an adjacency matrix or influence matrix.
Note:
    The NodeOrder file needs to be a single column text file. If not
    supplying custom GSC, the file needs to be in Entrez ID space.
Args:
    edgelist_loc: Location of the edgelist
    data_dir: The directory to save the file
    net_name: The name of the network
    features: Features for the networks (Adjacency or Influence, All)
    alpha: Restart parameter.
    sep: The separation used in the edgelist file (default tab)
    skiplines: The number of lines to skip for header
"""

# Build the 'Adjacency' feature matrix for the custom network from the same
# space-separated edge list.
geneplexus.custom.edgelist_to_matrix(
    edgelist_loc='../data/edge_list.txt',
    data_dir='../data/',
    net_name='STRING_GM',
    features='Adjacency',
    sep=' ',
)

In [97]:
"""Select subset of data to download.
Args:
    data_dir: Location of data files.
    tasks: Task of interest, accept multiple selection as a list. Do all
        the tasks if set to "All".
    networks: Networks of interest, accept multiple selection as a list. Do
        all the networks if set to "All".
    features: Network features of interest, accept multiple selection as a
        list. Do all the features if set to "All".
    gscs: Gene set collection of interest, accept multiple selection as a
        list. Do all the GSC if set to "All".
    n_jobs: Number of concurrent downloading threads.
    retry: If set to True, then retry downloading any missing file.
"""

# Download the 'OriginalGSCs' task files for the STRING network
# (Adjacency features, all gene set collections) into ../data/.
geneplexus.download.download_select_data(
    data_dir='../data/',
    tasks='OriginalGSCs',
    networks='STRING',
    features='Adjacency',
    gscs='All',
)



In [98]:
"""Subset :term:`GSC` to only include genes in the network.
Note:
    Use the :meth:`geneplexus.download.download_select_data` function to
    get the preprocessed GO and DisGeNet files first.
Args:
    data_dir: The directory to save the file
    net_name: The name of the network
    gsc_name: The name of the GSC
    max_size: Maximum geneset size.
    min_size: Minimum geneset size.
"""
import importlib

# Re-import the locally developed geneplexus package so recent edits are used.
# NOTE: importlib.reload re-executes only the module object it is given.
importlib.reload(geneplexus)

# Positional arguments follow the order documented above:
# data_dir, net_name, gsc_name, max_size, min_size.
geneplexus.custom.subset_gsc_to_network(
    '../data/',
    'STRING_GM',
    'GO',
    350,
    50,
)

In [2]:
# GenePlexus model configured for the custom STRING_GM network,
# Adjacency features and the GO gene set collection.
myclass = geneplexus.GenePlexus(
    file_loc='../data/',
    net_type="STRING_GM",
    features="Adjacency",
    gsc="GO",
)

In [2]:
# Load the Adjacency gene features of the STRING_GM network from ../data/.
data = geneplexus.util.load_gene_features(
    '../data/',
    'Adjacency',
    'STRING_GM',
)

In [None]:
# Reactome label table (no header row in the file).
process_labels = pd.read_csv(
    '../../../data/processed/reactome_labels_string.csv',
    header=None,
)

# Test-set indices: read the CSV, transpose it, cast to int and subtract 1
# (presumably converting 1-based indices to 0-based -- TODO confirm).
test_indices_table = pd.read_csv(
    "../../../data/processed/fs/reactome_rwr_test_string.csv",
    sep=',',
    header=0,
)
test_indices = test_indices_table.T.to_numpy(dtype='int') - 1

In [None]:
# DisGeNET label table (no header row) and its test-set indices.
# The index files are transposed, cast to int and shifted by -1
# (presumably 1-based -> 0-based -- TODO confirm).
disease_labels = pd.read_csv(
    '../../../data/processed/disgenet_sca_labels_string.csv',
    header=None,
)
disease_test_table = pd.read_csv(
    "../../../data/processed/fs/disease/disease_rwr_test_string.csv",
    sep=',',
    header=0,
)
disease_test_indices = disease_test_table.T.to_numpy(dtype='int') - 1

# Same pair of inputs for the "conservative" DisGeNET label set.
disease_conservative_labels = pd.read_csv(
    '../../../data/processed/disgenet_conservative_labels_string.csv',
    header=None,
)
disease_conservative_test_table = pd.read_csv(
    "../../../data/processed/fs/disease/disease_rwr_test_conservative_string.csv",
    sep=',',
    header=0,
)
disease_conservative_test_indices = disease_conservative_test_table.T.to_numpy(dtype='int') - 1

In [None]:
# Fit and evaluate GenePlexus once per column of process_labels, collecting
# the two reported metrics, then store them as a CSV.
n_process_terms = len(process_labels.columns)
auprc_scores = []
precision_k_scores = []
for i in tqdm(range(n_process_terms)):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(
        process_labels[i],
        test_indices[i],
    )
    auprc_scores.append(auprc_random)
    precision_k_scores.append(precision_k_random)

clf_dict = {'auprc_random': auprc_scores, 'precision_k_random': precision_k_scores}
clf_df = pd.DataFrame(clf_dict)
clf_df.to_csv('../../../models/threshold_classifier/STRING/geneplexus.csv', index=False)

In [5]:
# Fit and evaluate GenePlexus once per column of disease_labels and store the
# two reported metrics as a CSV.
clf_dict = {'auprc_random':[], 'precision_k_random':[]}
# The loop bound must come from disease_labels, the table that is indexed in
# the body. Using the column count of process_labels either skips disease
# columns or indexes past the end of the disease data.
for i in tqdm(range(len(disease_labels.columns))):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(disease_labels[i], disease_test_indices[i])
    clf_dict['auprc_random'].append(auprc_random)
    clf_dict['precision_k_random'].append(precision_k_random)

clf_df = pd.DataFrame.from_dict(clf_dict)
clf_df.to_csv('../../../models/threshold_classifier/STRING/disease_geneplexus.csv', index=False)

In [None]:
# Fit and evaluate GenePlexus once per column of disease_conservative_labels
# and store the two reported metrics as a CSV.
clf_dict = {'auprc_random':[], 'precision_k_random':[]}
# The loop bound must come from disease_conservative_labels, the table that is
# indexed in the body. Using the column count of process_labels either skips
# columns or indexes past the end of the conservative disease data.
for i in tqdm(range(len(disease_conservative_labels.columns))):
    mdl_weights, df_probs, avgps, auprc_random, precision_k_random = myclass.fit_and_predict(disease_conservative_labels[i], disease_conservative_test_indices[i])
    clf_dict['auprc_random'].append(auprc_random)
    clf_dict['precision_k_random'].append(precision_k_random)

clf_df = pd.DataFrame.from_dict(clf_dict)
clf_df.to_csv('../../../models/threshold_classifier/STRING/disease_conservative_geneplexus.csv', index=False)