### OBJETIVO: TRANSFORMAR ESTE NOTEBOOK NUM SCRIPT QUE POSSA SER USADO PARA QUALQUER REDE

In [1]:
import pandas as pd
from igraph import Graph
import numpy as np
from IPython.display import display
import os
from tqdm.notebook import tqdm, trange

# Functions

In [2]:
def check_dir(dir):
    """Make sure the directory ``dir`` exists, creating it if necessary.

    Missing parent directories are created as well. Calling the function on
    a directory that already exists is a no-op.

    Parameters
    ----------
    dir : str or path-like
        Path of the directory to ensure.

    Raises
    ------
    FileExistsError
        If ``dir`` exists but is not a directory.
    """
    # exist_ok=True replaces the previous "check, then create" sequence, which
    # could fail if another process created the directory between the two steps.
    os.makedirs(dir, exist_ok=True)

In [3]:
def get_labels(module_df, graph, min_size=50, max_size=300):
    """Build a binary protein x module membership table for a graph.

    Parameters
    ----------
    module_df : pandas.DataFrame
        Table with at least the columns ``protein_id`` and ``module_id``,
        one row per (protein, module) association.
    graph : igraph.Graph
        Graph whose ``vs['name']`` attribute lists the protein identifiers.
    min_size, max_size : int, optional
        Inclusive bounds on the number of distinct in-graph proteins a module
        must have to be kept (defaults: 50 and 300).

    Returns
    -------
    pandas.DataFrame
        Float table indexed by the graph's protein names (in ``vs['name']``
        order) with one column per retained module; 1.0 marks membership,
        0.0 absence.
    """
    graph_prots = graph.vs['name']

    # Keep only proteins present in the graph.
    module_df = module_df[module_df['protein_id'].isin(graph_prots)]
    # A repeated (protein, module) row would inflate the module size and put
    # a count > 1 in the table, which downstream `== 1` tests would miss.
    module_df = module_df.drop_duplicates(subset=['protein_id', 'module_id'])
    # Keep modules whose size lies within [min_size, max_size].
    module_df = module_df.groupby('module_id').filter(
        lambda group: min_size <= group.shape[0] <= max_size
    )

    modules = module_df['module_id'].unique()
    labels = pd.DataFrame(
        np.zeros((len(graph_prots), modules.shape[0])),
        index=graph_prots,
        columns=modules,
    )
    if module_df.empty:
        # No module survived the filters: nothing to mark.
        return labels

    crosstab = pd.crosstab(module_df['protein_id'], module_df['module_id'])
    labels.loc[crosstab.index.to_list(), crosstab.columns.to_list()] = crosstab

    return labels

In [4]:
def random_walks_restart(labels, graph):
    """Score every vertex against every module with personalized PageRank.

    For each column of ``labels`` the vertices whose entry equals 1 are used
    as the ``reset_vertices`` of ``graph.personalized_pagerank``; the returned
    scores replace that column in the result.

    Parameters
    ----------
    labels : pandas.DataFrame
        Membership table with one row per graph vertex (rows are assumed to
        follow the graph's vertex order) and one column per module.
    graph : igraph.Graph
        Graph on which the random walks are run.

    Returns
    -------
    pandas.DataFrame
        Table with the same index and columns as ``labels`` holding the
        PageRank scores. ``labels`` itself is not modified.
    """
    membership = labels.to_numpy()
    # Always a float buffer: reusing the labels' dtype would silently truncate
    # every score to 0 when the labels are stored as integers.
    scores = np.zeros(membership.shape, dtype=float)

    for col in range(membership.shape[1]):
        mod_prots = np.flatnonzero(membership[:, col] == 1)
        scores[:, col] = graph.personalized_pagerank(reset_vertices=mod_prots)

    return pd.DataFrame(scores, index=labels.index, columns=labels.columns)

In [5]:
def seed_component_algorithm(module, graph, adj_matrix):
    """Expand a module's seed set until its largest component stops growing.

    Starting from the vertices flagged with 1 in ``module``, each round scores
    every vertex of the graph by the summed sizes of the connected components
    (of the subgraph induced by the current set) it is adjacent to. While the
    best score exceeds the size of the largest component, the best-scoring
    vertex is added to the set. Ties are broken by the highest fraction of a
    candidate's neighbours that already belong to the set, then by lowest
    vertex id.

    Parameters
    ----------
    module : numpy.ndarray
        1-D array with one entry per vertex; entries equal to 1 are seeds.
    graph : igraph.Graph
        Graph providing ``induced_subgraph`` and ``connected_components``.
    adj_matrix : numpy.ndarray
        Square adjacency matrix indexed by vertex id.
        NOTE(review): the ``>= 1`` test below presumes 0/1 (unweighted)
        entries and the tie-break presumes symmetry -- confirm with the data.

    Returns
    -------
    sca_module : list of int
        Vertex ids of the largest connected component of the expanded set.
    conservative_module : list of int
        The seeds that belong to ``sca_module``.
    r : float
        Number of added vertices divided by the number of seeds.
        For a module without seeds the result is ``([], [], 0.0)``.
    """
    original_nodes = [int(node) for node in np.flatnonzero(module == 1)]
    mod_size = len(original_nodes)
    if mod_size == 0:
        # Nothing to expand; also avoids max() on an empty sequence and a
        # division by zero below.
        return [], [], 0.0

    idlist = list(original_nodes)
    added_nodes = []

    while True:
        # The induced subgraph numbers its vertices following the original id
        # order, so a single sorted list is used both to build the subgraph
        # and to translate subgraph indices back to graph vertex ids.
        ordered_ids = sorted(idlist)
        subgraph = graph.induced_subgraph(ordered_ids)
        components = subgraph.connected_components(mode='strong')

        # size of the largest connected component
        lcc = max(components.sizes())

        # columns: current members, in subgraph order; rows: every vertex
        comp_adj_matrix = adj_matrix[:, ordered_ids]

        max_addition_total = np.zeros(adj_matrix.shape[0])
        for comp in components:
            # number of links from every vertex into this component
            n_int = np.sum(comp_adj_matrix[:, comp], axis=1)
            # a vertex touching the component would bring in all of it
            n_int[n_int >= 1] = len(comp)
            max_addition_total = max_addition_total + n_int

        best = np.amax(max_addition_total)
        if not best > lcc:
            # no vertex can merge components into something larger
            break

        candidates = np.flatnonzero(max_addition_total == best)
        if candidates.shape[0] > 1:
            # neighbours in idlist / total neighbours
            cand_vals = np.array([
                np.sum(adj_matrix[idlist, cand]) / np.sum(adj_matrix[:, cand])
                for cand in candidates
            ])
            candidates = candidates[cand_vals == np.amax(cand_vals)]

        chosen = int(candidates[0])
        idlist.append(chosen)
        added_nodes.append(chosen)

    # `components` and `ordered_ids` still describe the final vertex set,
    # because the last round added nothing.
    final_component = max(list(components), key=len)

    sca_module = [ordered_ids[node] for node in final_component]
    conservative_module = list(set(original_nodes) & set(sca_module))

    r = len(added_nodes) / mod_size
    return sca_module, conservative_module, r

In [6]:
def get_sca(labels, graph, adj_matrix, add_threshold=False):
    """Run the seed component algorithm on every module of ``labels``.

    Parameters
    ----------
    labels : pandas.DataFrame
        Membership table, one row per vertex and one column per module.
    graph, adj_matrix
        Passed unchanged to ``seed_component_algorithm``.
    add_threshold : float or False, optional
        When truthy, only the expanded modules whose ratio of added vertices
        to seeds is strictly below this value are kept. The conservative
        table is never filtered.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The expanded ("sca") membership table and the conservative one, both
        indexed and labelled like ``labels``.
    """
    seed_matrix = labels.to_numpy()
    expanded = np.zeros_like(seed_matrix)
    conserved = np.zeros_like(seed_matrix)
    added_ratios = []

    for col in trange(seed_matrix.shape[1]):
        grown, kept, ratio = seed_component_algorithm(seed_matrix[:, col], graph, adj_matrix)
        expanded[grown, col] = 1
        conserved[kept, col] = 1
        added_ratios.append(ratio)

    frame_axes = {'index': labels.index, 'columns': labels.columns}
    sca_modules = pd.DataFrame(expanded, **frame_axes)
    conservative_modules = pd.DataFrame(conserved, **frame_axes)

    if not add_threshold:
        return sca_modules, conservative_modules

    # drop expanded modules whose added-vertex ratio reaches the threshold
    below_threshold = np.array(added_ratios) < add_threshold
    return sca_modules.loc[:, below_threshold], conservative_modules

# Metrics

In [7]:
# Root of the processed data and the two sub-folders read and written below.
parent_dir = '../data/processed/'
modules_dir = f'{parent_dir}modules/'
network_dir = f'{parent_dir}networks/'

In [8]:
# Networks to process; each one gets its own graph/metrics/labels/models
# folders, created on demand and remembered in `directories[net][kind]`.
networks = ['apid_huri', 'string']
directories = {}

for net in networks:
    directories[net] = {}
    for dir_ in ('graph', 'metrics', 'labels', 'models'):
        new_dir = f'{network_dir}{net}/{dir_}/'
        check_dir(new_dir)
        directories[net][dir_] = new_dir

In [9]:
# Module definitions, one CSV per annotation source, keyed by source name.
modules = {
    'reactome': pd.read_csv(modules_dir + 'reactome.csv'),
    'disgenet': pd.read_csv(modules_dir + 'disgenet.csv'),
}
# Preview the first two rows of each table.
display(modules['reactome'].head(2))
display(modules['disgenet'].head(2))

Unnamed: 0,module_id,protein_id
0,R-HSA-481007,A1BG
1,R-HSA-6798748,A1BG


Unnamed: 0,protein_id,module_id,score
0,A1BG,C0019209,0.3
1,A1BG,C0036341,0.3


In [10]:
# Per network: the graph read from GML and its stored adjacency matrix.
graphs = {
    net: Graph.Read_GML(directories[net]['graph'] + 'graph.gml')
    for net in networks
}
adj_matrices = {
    net: np.load(directories[net]['graph'] + 'adjacency_matrix.npy')
    for net in networks
}

In [11]:
# For every network and module source: build the label table, score it with
# random walks with restart, and save both tables as CSV.
for net in networks:
    metrics_dir = directories[net]['metrics']
    labels_dir = directories[net]['labels']

    for source, module_df in tqdm(modules.items()):
        labels = get_labels(module_df, graphs[net])
        metrics = random_walks_restart(labels, graphs[net])

        metrics.to_csv(metrics_dir + f'{source}_rwr.csv')
        labels.to_csv(labels_dir + f'{source}.csv')

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

## Seed Component Algorithm

In [12]:
# Reload the DisGeNET label table of every network (first CSV column is the index).
labels = {
    net: pd.read_csv(directories[net]['labels'] + 'disgenet.csv', index_col=0)
    for net in networks
}

In [13]:
# Expand the DisGeNET modules of every network with the seed component
# algorithm, then score and save both the expanded ("sca") and the
# conservative label tables.
for net in networks:
    graph, adj_matrix = graphs[net], adj_matrices[net]

    new_labels = get_sca(labels[net], graph, adj_matrix, add_threshold=0.4)

    for name, new_label in zip(('sca', 'conservative'), new_labels):
        metrics = random_walks_restart(new_label, graph)

        metrics.to_csv(directories[net]['metrics'] + f'disgenet_{name}_rwr.csv')
        # NOTE(review): the label file also carries the `_rwr` suffix, unlike
        # the plain `disgenet.csv` label file read above -- confirm the name
        # is intended before any downstream code relies on it.
        new_label.to_csv(directories[net]['labels'] + f'disgenet_{name}_rwr.csv')

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/301 [00:00<?, ?it/s]