In [14]:
import pandas as pd
import classifiers
import importlib
import os
import pickle
import itertools
from tqdm.notebook import tqdm
import numpy as np
from igraph import Graph
from joblib import Parallel, delayed

# GAP-MINE

In [40]:
# Output folders for the GAP-MINE runs, keyed by the kind of artefact stored in each.
directories = {
    'clf_results': '../../models/GAP-MINE/clf_results/',
    'cv_results': '../../models/GAP-MINE/cv_results/',
    'models': '../../models/GAP-MINE/models/',
}

In [41]:
# Input locations, relative to the notebook's working directory.
labels_dir: str = "../../data/processed/"           # label matrices and module tables
metrics_dir: str = "../../data/processed/metrics/"  # metric tables
fs_dir: str = "../../data/processed/fs/"            # "fs" tables (presumably feature selection -- confirm)

In [42]:
# Experiment grid: the loop below runs every (dataset, network, false-annotation) combination.
datasets = [
    'reactome',
    'disgenet_sca',
    'disgenet_conservative',
]
networks = ['apid_huri']
# 'yes' selects the "_fa_" input/output files, 'no' selects the plain ones.
false_annotations = ['yes', 'no']

In [44]:
# Reload so that edits to classifiers.py are picked up without restarting the kernel.
importlib.reload(classifiers)

# Create the output folders once, up front; exist_ok makes re-runs a no-op.
for out_dir in directories.values():
    os.makedirs(out_dir, exist_ok=True)

# One classifier run per (dataset, network, false-annotation) combination.
for dataset_label, network_label, fa in itertools.product(datasets, networks, false_annotations):

    ############### load data #############################
    if fa == 'no':
        fa_label = '_'
        fa_proteins = None

    elif fa == 'yes':
        fa_label = '_fa_'

        # SECURITY NOTE: `eval` executes whatever text is stored in these columns.
        # Only use it on trusted, locally generated files; if the cells are plain
        # Python literals, `ast.literal_eval` would be a safer converter.
        mod = pd.read_csv(
            labels_dir + f"{dataset_label}_modules_{network_label}.csv",
            converters={'protein_id': eval, 'protein_index': eval}
        )
        mod_fa = pd.read_csv(
            labels_dir + f"{dataset_label}_modules_fa_{network_label}.csv",
            converters={'protein_id': eval, 'protein_index': eval}
        )

        # Per module: proteins listed in the "_fa_" table but not in the original one.
        # Rows of the two tables are paired by position -- assumes both files list the
        # modules in the same order (TODO confirm).
        fa_proteins = pd.Series(
            [
                list(set(mod_fa.protein_id.iloc[i]) - set(mod.protein_id.iloc[i]))
                for i in range(mod.shape[0])
            ],
            index=mod.module_id,
        )

    else:
        # Previously an unknown value fell through and either raised NameError or
        # silently reused the label of the preceding iteration.
        raise ValueError(f"false_annotations entries must be 'yes' or 'no', got {fa!r}")

    # Metrics, fs and all output files share this file stem.
    run_name = f"{dataset_label}{fa_label}{network_label}"

    metrics = pd.read_csv(metrics_dir + f"{run_name}.csv", header=0, index_col=0)

    # The labels file has no header; it takes over the row/column labels of `metrics`
    # (pandas raises if the two shapes do not match).
    labels = pd.read_csv(labels_dir + f"{dataset_label}{fa_label}labels_{network_label}.csv", header=None)
    labels.columns = metrics.columns
    labels.index = metrics.index

    fs = pd.read_csv(fs_dir + f"{run_name}.csv")

    ####################### Prepare data #######################################################

    # The text after the first '.' in each fs column name is read as a 1-based
    # position into the metrics columns; convert it to 0-based and look up the name.
    fs_indices = [int(i) - 1 for i in fs.columns.str.split('.').str[1].to_list()]
    module_labels = metrics.columns[fs_indices].to_list()

    fs.columns = module_labels

    ###################### Run classifier ######################################

    clf_results, cv_results, clf_models = classifiers.logistic_classifier(
        metrics, fs, labels, module_labels, fa=fa_proteins
    )

    clf_results.to_csv(directories['clf_results'] + f'{run_name}.csv', index=False)
    cv_results.to_csv(directories['cv_results'] + f'{run_name}.csv', index=False)

    # The models are written back to back into a single file; reading them again
    # takes one pickle.load call per model.
    with open(directories['models'] + f"{run_name}.pckl", "wb") as f:
        for model in clf_models:
            pickle.dump(model, f)

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

# TEST GAP-MINE

In [20]:
# Output folders for the GAP-MINE test runs, keyed by the kind of artefact stored in each.
directories = {
    'clf_results': '../../models/GAP-MINE/clf_results/',
    'cv_results': '../../models/GAP-MINE/cv_results/',
    'models': '../../models/GAP-MINE/models/',
}

In [21]:
# Input locations, relative to the notebook's working directory.
labels_dir: str = "../../data/processed/"           # label matrices
metrics_dir: str = "../../data/processed/metrics/"  # metric tables
fs_dir: str = "../../data/processed/fs/"            # "fs" tables (presumably feature selection -- confirm)

In [48]:
# Experiment grid: the loop below runs every (dataset, network, false-annotation) combination.
datasets = [
    'reactome',
    'disgenet_sca',
    'disgenet_conservative',
]
networks = ['apid_huri']
# False -> outputs tagged "_"; True -> outputs tagged "_fa_" and the graph and
# shortest-path matrix are handed to the classifier.
false_annotations = [False, True]

# Network read from a GML file; only passed on when the false-annotation flag is True.
graph = Graph.Read_GML("../../data/processed/apid_huri_graph")

In [None]:
# Shortest-path lengths between the vertices of `graph`, held as a NumPy array.
shortest_paths = np.array(graph.distances())

In [49]:
# Reload so that edits to classifiers.py are picked up without restarting the kernel.
importlib.reload(classifiers)

# Create the output folders once, up front; exist_ok makes re-runs a no-op.
for out_dir in directories.values():
    os.makedirs(out_dir, exist_ok=True)

# One GAP-MINE run per (dataset, network, false-annotation flag) combination.
for dataset_label, network_label, fa in itertools.product(datasets, networks, false_annotations):

    ############### load data #############################
    # The flag only changes the output tag and whether the graph data is passed on;
    # the input files read below are the same for both values of `fa`.
    if fa:
        fa_label, g, sp = '_fa_', graph, shortest_paths
    else:
        fa_label, g, sp = '_', None, None

    metrics = pd.read_csv(metrics_dir + f"{dataset_label}_{network_label}.csv", header=0, index_col=0)

    labels = pd.read_csv(labels_dir + f"{dataset_label}_labels_{network_label}.csv", header=None)

    ####################### Prepare data #######################################################

    module_labels = metrics.columns.to_list()

    X = metrics.to_numpy()
    y = labels.to_numpy(dtype='int')

    # Column `module` of y is paired with column `module` of the metrics table below,
    # so a shape mismatch would mean misaligned targets; fail loudly instead.
    if X.shape != y.shape:
        raise ValueError(
            f"metrics {X.shape} and labels {y.shape} differ in shape "
            f"for {dataset_label}/{network_label}"
        )

    ###################### Compute GAP-MINE ######################################################
    # One job per target module; every job gets the full metrics matrix and that
    # module's label column.
    results = Parallel(n_jobs=-1)(
        delayed(classifiers.gapmine)(
            X, y[:, module], beta=0.5, false_annotations=fa, shortest_paths=sp, graph=g, random_state=0
        )
        for module in tqdm(range(len(module_labels)))
    )

    # Each job returns a triple: (cv result, classifier result, model).
    cv_results = pd.DataFrame([res[0] for res in results])
    cv_results['target_module'] = module_labels
    clf_results = pd.DataFrame([res[1] for res in results])
    clf_results['target_module'] = module_labels
    clf_models = [res[2] for res in results]

    ###################### Save Results ###########################################################
    # NOTE: same folders and file names as the "GAP-MINE" section above, so files
    # written there are overwritten.
    run_name = f"{dataset_label}{fa_label}{network_label}"
    clf_results.to_csv(directories['clf_results'] + f'{run_name}.csv', index=False)
    cv_results.to_csv(directories['cv_results'] + f'{run_name}.csv', index=False)

    # The models are written back to back into a single file; reading them again
    # takes one pickle.load call per model.
    with open(directories['models'] + f"{run_name}.pckl", "wb") as f:
        for model in clf_models:
            pickle.dump(model, f)

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

# Baseline Model

In [20]:
# Input locations, relative to the notebook's working directory.
labels_dir: str = "../../data/processed/"           # label matrices and module tables
metrics_dir: str = "../../data/processed/metrics/"  # metric tables
fs_dir: str = "../../data/processed/fs/"            # "fs" tables (presumably feature selection -- confirm)

In [21]:
# Experiment grid: the loop below runs every (dataset, network, false-annotation) combination.
datasets = [
    'reactome',
    'disgenet_sca',
    'disgenet_conservative',
]
networks = ['apid_huri']
# 'no' selects the plain input/output files, 'yes' selects the "_fa_" ones.
false_annotations = ['no', 'yes']

In [22]:
# Single output folder for the baseline results (a plain path string, not a dict).
directory: str = '../../models/baseline_model/'

In [23]:
# Reload so that edits to classifiers.py are picked up without restarting the kernel.
importlib.reload(classifiers)

# Create the output folder once, up front; exist_ok makes re-runs a no-op.
os.makedirs(directory, exist_ok=True)

# One baseline run per (dataset, network, false-annotation) combination.
for dataset_label, network_label, fa in itertools.product(datasets, networks, false_annotations):

    ############### load data #############################
    if fa == 'no':
        fa_label = '_'
        fa_proteins = None

    elif fa == 'yes':
        fa_label = '_fa_'

        # SECURITY NOTE: `eval` executes whatever text is stored in these columns.
        # Only use it on trusted, locally generated files; if the cells are plain
        # Python literals, `ast.literal_eval` would be a safer converter.
        mod = pd.read_csv(
            labels_dir + f"{dataset_label}_modules_{network_label}.csv",
            converters={'protein_id': eval, 'protein_index': eval}
        )
        mod_fa = pd.read_csv(
            labels_dir + f"{dataset_label}_modules{fa_label}{network_label}.csv",
            converters={'protein_id': eval, 'protein_index': eval}
        )

        # Per module: proteins listed in the "_fa_" table but not in the original one.
        # Rows of the two tables are paired by position -- assumes both files list the
        # modules in the same order (TODO confirm).
        fa_proteins = pd.Series(
            [
                list(set(mod_fa.protein_id.iloc[i]) - set(mod.protein_id.iloc[i]))
                for i in range(mod.shape[0])
            ],
            index=mod.module_id,
        )

    else:
        # Previously an unknown value fell through and either raised NameError or
        # silently reused the label of the preceding iteration.
        raise ValueError(f"false_annotations entries must be 'yes' or 'no', got {fa!r}")

    # Metrics, fs and the output file share this file stem.
    run_name = f"{dataset_label}{fa_label}{network_label}"

    metrics = pd.read_csv(metrics_dir + f"{run_name}.csv", header=0, index_col=0)

    # The labels file has no header; it takes over the row/column labels of `metrics`
    # (pandas raises if the two shapes do not match).
    labels = pd.read_csv(labels_dir + f"{dataset_label}{fa_label}labels_{network_label}.csv", header=None)
    labels.columns = metrics.columns
    labels.index = metrics.index

    # Only the column names of this table are used here.
    fs = pd.read_csv(fs_dir + f"{run_name}.csv")

    ####################### Prepare data #######################################################

    # The text after the first '.' in each fs column name is read as a 1-based
    # position into the metrics columns; convert it to 0-based and look up the name.
    fs_indices = [int(i) - 1 for i in fs.columns.str.split('.').str[1].to_list()]
    module_labels = metrics.columns[fs_indices].to_list()

    ###################### Run classifier ######################################

    clf_results = classifiers.baseline_classifier(metrics, labels, module_labels, fa=fa_proteins)

    clf_results.to_csv(directory + f'{run_name}.csv', index=False)

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

# TEST Baseline Model

In [None]:
# Input locations, relative to the notebook's working directory.
labels_dir: str = "../../data/processed/"           # label matrices
metrics_dir: str = "../../data/processed/metrics/"  # metric tables
fs_dir: str = "../../data/processed/fs/"            # "fs" tables (presumably feature selection -- confirm)

In [53]:
# Single output folder for the baseline results (a plain path string, not a dict).
directory: str = '../../models/baseline_model/'

In [None]:
# Experiment grid: the loop below runs every (dataset, network, false-annotation) combination.
datasets = [
    'reactome',
    'disgenet_sca',
    'disgenet_conservative',
]
networks = ['apid_huri']
# False -> outputs tagged "_"; True -> outputs tagged "_fa_" and the graph and
# shortest-path matrix are handed to the classifier.
false_annotations = [False, True]

# Network read from a GML file; only passed on when the false-annotation flag is True.
graph = Graph.Read_GML("../../data/processed/apid_huri_graph")

In [None]:
# Shortest-path lengths between the vertices of `graph`, held as a NumPy array.
shortest_paths = np.array(graph.distances())

In [56]:
# Reload so that edits to classifiers.py are picked up without restarting the kernel.
importlib.reload(classifiers)

# Create the output folder once, up front; exist_ok makes re-runs a no-op.
os.makedirs(directory, exist_ok=True)

# One baseline run per (dataset, network, false-annotation flag) combination.
for dataset_label, network_label, fa in itertools.product(datasets, networks, false_annotations):

    ############### load data #############################
    # The flag only changes the output tag and whether the graph data is passed on;
    # the input files read below are the same for both values of `fa`.
    if fa:
        fa_label, g, sp = '_fa_', graph, shortest_paths
    else:
        fa_label, g, sp = '_', None, None

    metrics = pd.read_csv(metrics_dir + f"{dataset_label}_{network_label}.csv", header=0, index_col=0)

    labels = pd.read_csv(labels_dir + f"{dataset_label}_labels_{network_label}.csv", header=None)

    ####################### Prepare data #######################################################

    module_labels = metrics.columns.to_list()

    X = metrics.to_numpy()
    y = labels.to_numpy(dtype='int')

    # Column `module` of X is paired with column `module` of y below, so a shape
    # mismatch would mean misaligned targets; fail loudly instead.
    if X.shape != y.shape:
        raise ValueError(
            f"metrics {X.shape} and labels {y.shape} differ in shape "
            f"for {dataset_label}/{network_label}"
        )

    ###################### Compute baseline ######################################################
    # One job per target module; each job sees only that module's own metric
    # column and label column.
    clf_results = Parallel(n_jobs=-1)(
        delayed(classifiers.baseline)(
            X[:, module], y[:, module], beta=0.5, false_annotations=fa, shortest_paths=sp, graph=g, random_state=0
        )
        for module in tqdm(range(len(module_labels)))
    )

    clf_results = pd.DataFrame(clf_results)
    clf_results['target_module'] = module_labels

    ###################### Save Results ###########################################################
    # `directory` is a plain path string, so the file name is appended to it
    # (indexing it with 'clf_results' raised TypeError).
    # NOTE: same folder and file names as the "Baseline Model" section above, so
    # files written there are overwritten.
    clf_results.to_csv(directory + f'{dataset_label}{fa_label}{network_label}.csv', index=False)

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/204 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]