In [1]:
import pandas as pd
from igraph import Graph
import disease_process_proteins
import metrics_functions
import importlib
import numpy as np
from ast import literal_eval
from tqdm.notebook import tqdm

In [2]:
# graph
graph = Graph.Read_GML("../../data/processed/graph_string")
adj_matrix = np.load("../../data/processed/string_adjacency_matrix.npy")

In [None]:
# modules
reactome_modules = pd.read_csv("../../data/processed/string_reactome_modules.csv", sep=',', header=0)
disgenet_modules = pd.read_csv("../../data/processed/string_disgenet_modules.csv", sep=',', header=0)

In [4]:
reactome_modules['protein_index'] = reactome_modules['protein_index'].apply(literal_eval)
disgenet_modules['protein_index'] = disgenet_modules['protein_index'].apply(literal_eval)
reactome_modules['proteins_ids'] = reactome_modules['proteins_ids'].apply(literal_eval)
disgenet_modules['proteins_ids'] = disgenet_modules['proteins_ids'].apply(literal_eval)
disgenet_modules['conservative_module'] = disgenet_modules['conservative_module'].apply(literal_eval)
disgenet_modules['added_nodes'] = disgenet_modules['added_nodes'].apply(literal_eval)
disgenet_modules['conservative_module_ids'] = disgenet_modules['conservative_module_ids'].apply(literal_eval)
disgenet_modules['added_nodes_ids'] = disgenet_modules['added_nodes_ids'].apply(literal_eval)
disgenet_modules['main_component'] = disgenet_modules['main_component'].apply(literal_eval)
disgenet_modules['main_component_ids'] = disgenet_modules['main_component_ids'].apply(literal_eval)

In [5]:
disgenet_conservative_modules = disgenet_modules[['process', 'conservative_module_ids', 'conservative_module', 'len_conservative']]
disgenet_conservative_modules.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

In [6]:
disgenet_sca_modules = disgenet_modules[disgenet_modules['increase']<0.4][['process', 'main_component_ids', 'main_component', 'len_sca']]
disgenet_sca_modules.columns = ['process', 'proteins_ids', 'protein_index', 'n_proteins']

In [7]:
reactome_labels_df = pd.read_csv("../../data/processed/reactome_labels_string.csv", sep=',', names=reactome_modules['process'].values)
reactome_labels_df['protein_id'] = graph.vs['name']
reactome_labels_df.set_index('protein_id', inplace=True)

disgenet_labels_df = pd.read_csv("../../data/processed/disgenet_sca_labels_string.csv", sep=',', names=disgenet_sca_modules['process'].values)
disgenet_labels_df['protein_id'] = graph.vs['name']
disgenet_labels_df.set_index('protein_id', inplace=True)

disgenet_labels_conservative_df = pd.read_csv('../../data/processed/disgenet_conservative_labels_string.csv', names=disgenet_conservative_modules['process'].values)
disgenet_labels_conservative_df['protein_id'] = graph.vs['name']
disgenet_labels_conservative_df.set_index('protein_id', inplace=True)

In [8]:
protein_list = set(list(disgenet_labels_df.sum(axis=1).replace(0, np.nan).dropna().index)+list(disgenet_labels_conservative_df.sum(axis=1).replace(0, np.nan).dropna().index)+list(reactome_labels_df.sum(axis=1).replace(0, np.nan).dropna().index))

In [3]:
sp = graph.distances(graph.vs['name'], protein_list)
sp_df = pd.DataFrame(sp, columns=list(protein_list))
sp_df.index = graph.vs['name']

NameError: name 'graph' is not defined

In [10]:
import math
from tqdm.notebook import tqdm
tqdm.pandas()
def fp_addition(protein_indexes, sp_df, graph):
    rng = np.random.default_rng(42)
    min_sp = sp_df[~sp_df.index.isin(protein_indexes)][protein_indexes].min(axis=1)
    degree_values = graph.degree(sp_df[~sp_df.index.isin(protein_indexes)].index)
    log_degree_values = [math.log10(x) for x in degree_values]
    min_sp = pd.DataFrame(min_sp, columns=['sp'])
    min_sp['degree'] = log_degree_values
    min_sp['weight'] = min_sp['degree']/(10**min_sp['sp'])
    min_sp['normalized_weight'] = min_sp['weight']/sum(min_sp['weight'])
    new_proteins = list(rng.choice(min_sp.index, int(len(protein_indexes)*0.1), p=min_sp['normalized_weight']))
    new_proteins = new_proteins+protein_indexes
    new_proteins_index = [int(graph.vs.find(name=x)['id']) for x in new_proteins]
    return new_proteins, new_proteins_index

In [11]:
reactome_modules[['fp_proteins', 'fp_proteins_index']] =\
      reactome_modules.progress_apply(
          lambda row: fp_addition(row['proteins_ids'], sp_df, graph), axis=1, result_type='expand'
          )

  0%|          | 0/244 [00:00<?, ?it/s]

In [12]:
disgenet_sca_modules[['fp_proteins', 'fp_proteins_index']] =\
      disgenet_sca_modules.progress_apply(
          lambda row: fp_addition(row['proteins_ids'], sp_df, graph), axis=1, result_type='expand'
          )

  0%|          | 0/301 [00:00<?, ?it/s]

In [13]:
disgenet_conservative_modules[['fp_proteins', 'fp_proteins_index']] =\
     disgenet_conservative_modules.progress_apply(
         lambda row: fp_addition(row['proteins_ids'], sp_df, graph), axis=1, result_type='expand'
         )

  0%|          | 0/301 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_conservative_modules[['fp_proteins', 'fp_proteins_index']] =\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disgenet_conservative_modules[['fp_proteins', 'fp_proteins_index']] =\


In [14]:
reactome_modules.to_csv('../../data/processed/string_reactome_modules_fp.csv', index=False)
disgenet_sca_modules.to_csv('../../data/processed/string_disgenet_sca_modules_fp.csv', index=False)
disgenet_conservative_modules.to_csv('../../data/processed/string_disgenet_conservative_modules_fp.csv', index=False)

# 1. Target Process Metrics

In [15]:
reactome_modules = pd.read_csv('../../data/processed/string_reactome_modules_fp.csv')
disgenet_sca_modules = pd.read_csv('../../data/processed/string_disgenet_sca_modules_fp.csv')
disgenet_conservative_modules = pd.read_csv('../../data/processed/string_disgenet_conservative_modules_fp.csv')

In [16]:
reactome_modules['fp_proteins'] = reactome_modules['fp_proteins'].apply(literal_eval)
disgenet_sca_modules['fp_proteins'] = disgenet_sca_modules['fp_proteins'].apply(literal_eval)
disgenet_conservative_modules['fp_proteins'] = disgenet_conservative_modules['fp_proteins'].apply(literal_eval)

reactome_modules['fp_proteins_index'] = reactome_modules['fp_proteins_index'].apply(literal_eval)
disgenet_sca_modules['fp_proteins_index'] = disgenet_sca_modules['fp_proteins_index'].apply(literal_eval)
disgenet_conservative_modules['fp_proteins_index'] = disgenet_conservative_modules['fp_proteins_index'].apply(literal_eval)

In [17]:
reactome_modules.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
reactome_modules = reactome_modules[['process', 'fp_proteins', 'fp_proteins_index']]
reactome_modules.columns = ['process', 'proteins_ids', 'protein_index']
reactome_modules['n_proteins'] = reactome_modules['proteins_ids'].apply(lambda row: len(row))

In [18]:
disgenet_conservative_modules.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
disgenet_conservative_modules = disgenet_conservative_modules[['process', 'fp_proteins', 'fp_proteins_index']]
disgenet_conservative_modules.columns = ['process', 'proteins_ids', 'protein_index']
disgenet_conservative_modules['n_proteins'] = disgenet_conservative_modules['proteins_ids'].apply(lambda row: len(row))

disgenet_sca_modules.drop(columns=['proteins_ids', 'protein_index'], inplace=True)
disgenet_sca_modules = disgenet_sca_modules[['process', 'fp_proteins', 'fp_proteins_index']]
disgenet_sca_modules.columns = ['process', 'proteins_ids', 'protein_index']
disgenet_sca_modules['n_proteins'] = disgenet_sca_modules['proteins_ids'].apply(lambda row: len(row))

## 1.1 Random Walks with Restart

In [19]:
importlib.reload(metrics_functions)
process_rwr = metrics_functions.random_walk_restart(graph, reactome_modules)
process_rwr_df = pd.DataFrame.from_dict(process_rwr)
process_rwr_df = process_rwr_df.rename(index=dict(zip(list(process_rwr_df.index),list(graph.vs['name']))))
process_rwr_df.to_csv('../../data/processed/metrics/string_process_rwr_fp.csv')
print(process_rwr_df.shape)
process_rwr_df.head()

  0%|          | 0/244 [00:00<?, ?it/s]

(19035, 244)


Unnamed: 0,R-HSA-1031716,R-HSA-112379,R-HSA-112385,R-HSA-1168640,R-HSA-1234159,R-HSA-141409,R-HSA-141422,R-HSA-141431,R-HSA-141439,R-HSA-141671,...,R-HSA-9633742,R-HSA-9634669,R-HSA-9648114,R-HSA-9660824,R-HSA-983140,R-HSA-983147,R-HSA-983156,R-HSA-983157,R-HSA-983259,R-HSA-983266
ARF5,8.3e-05,8.6e-05,8.6e-05,9.7e-05,0.000106,9.6e-05,9.6e-05,9.6e-05,9.6e-05,0.000104,...,0.000101,0.0001,0.000105,8.9e-05,9.1e-05,9e-05,9.1e-05,9.1e-05,0.000129,0.000129
PDE1C,3.5e-05,3.4e-05,3.4e-05,3.6e-05,3.6e-05,3.8e-05,3.8e-05,3.8e-05,3.8e-05,3.5e-05,...,3.4e-05,3.4e-05,4e-05,7.6e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,4.7e-05,4.7e-05
ERCC1,5.7e-05,0.000126,0.000126,5.3e-05,6.2e-05,6.8e-05,6.8e-05,6.8e-05,6.8e-05,5.3e-05,...,5.8e-05,5.5e-05,6.9e-05,5.8e-05,6.1e-05,6e-05,6.1e-05,6.1e-05,5.8e-05,5.8e-05
TLL1,3.2e-05,3e-05,3e-05,2.9e-05,3e-05,2.9e-05,2.9e-05,2.9e-05,2.9e-05,3e-05,...,3e-05,3e-05,3.1e-05,4.2e-05,3.3e-05,3.2e-05,3.3e-05,3.3e-05,3e-05,3e-05
PRSS22,3.6e-05,2.2e-05,2.2e-05,2.5e-05,2.5e-05,2.3e-05,2.3e-05,2.3e-05,2.3e-05,2.1e-05,...,2.2e-05,2.1e-05,2.3e-05,3e-05,2.5e-05,2.4e-05,2.5e-05,2.5e-05,4.1e-05,4.1e-05


In [20]:
importlib.reload(metrics_functions)
disease_rwr = metrics_functions.random_walk_restart(graph, disgenet_sca_modules)
disease_rwr_df = pd.DataFrame.from_dict(disease_rwr)
disease_rwr_df = disease_rwr_df.rename(index=dict(zip(list(disease_rwr_df.index),list(graph.vs['name']))))
disease_rwr_df.to_csv('../../data/processed/metrics/string_disease_rwr_fp.csv')
print(disease_rwr_df.shape)
disease_rwr_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
ARF5,8.6e-05,8.6e-05,9.3e-05,0.000107,9e-05,9.5e-05,9.3e-05,0.000107,9e-05,8.5e-05,...,9.2e-05,8.9e-05,9.2e-05,9.9e-05,0.000102,8.6e-05,0.000102,9e-05,9.9e-05,8.4e-05
PDE1C,4.2e-05,4.2e-05,4.2e-05,5.3e-05,6.2e-05,4.8e-05,4.8e-05,4.2e-05,4.1e-05,4.7e-05,...,6.4e-05,6.3e-05,6.4e-05,3.9e-05,4.9e-05,4.2e-05,4.9e-05,5.4e-05,4.5e-05,4.2e-05
ERCC1,5.2e-05,5.2e-05,7.3e-05,5.2e-05,5.4e-05,5.6e-05,5.8e-05,6.3e-05,6e-05,6.4e-05,...,5.6e-05,5.6e-05,5.6e-05,6.2e-05,5.9e-05,5.2e-05,5.9e-05,5.3e-05,0.002581,5.6e-05
TLL1,8.1e-05,8.1e-05,4.1e-05,5e-05,4.5e-05,4.3e-05,4.6e-05,3.6e-05,4.7e-05,4.7e-05,...,4.3e-05,4.3e-05,4.3e-05,3.5e-05,3.8e-05,8.1e-05,3.8e-05,8.7e-05,4.1e-05,5.4e-05
PRSS22,4.1e-05,4.1e-05,3.6e-05,3.1e-05,3.1e-05,3.5e-05,3.3e-05,2.6e-05,3.1e-05,3.7e-05,...,3.6e-05,3.2e-05,3.6e-05,2.9e-05,3e-05,4.1e-05,3e-05,2.7e-05,3.5e-05,3.5e-05


In [21]:
importlib.reload(metrics_functions)
disease_rwr_conservative = metrics_functions.random_walk_restart(graph, disgenet_conservative_modules)
disease_rwr_conservative_df = pd.DataFrame.from_dict(disease_rwr_conservative)
disease_rwr_conservative_df = disease_rwr_conservative_df.rename(index=dict(zip(list(disease_rwr_conservative_df.index),list(graph.vs['name']))))
disease_rwr_conservative_df.to_csv('../../data/processed/metrics/string_disease_rwr_conservative_fp.csv')
print(disease_rwr_conservative_df.shape)
disease_rwr_conservative_df.head()

  0%|          | 0/301 [00:00<?, ?it/s]

(19035, 301)


Unnamed: 0,C0000786,C0000822,C0001418,C0001787,C0001973,C0002152,C0002395,C0002736,C0003873,C0004096,...,C4317109,C4317123,C4505436,C4505456,C4552091,C4552766,C4704862,C4707243,C4721453,C4721507
ARF5,8.8e-05,8.8e-05,9.3e-05,0.000107,9e-05,9.5e-05,9.3e-05,0.000107,9.1e-05,8.5e-05,...,9.2e-05,8.9e-05,9.2e-05,9.9e-05,9.9e-05,8.8e-05,9.9e-05,9e-05,9.9e-05,8.4e-05
PDE1C,4.1e-05,4.1e-05,4.2e-05,5.3e-05,6.2e-05,4.8e-05,4.8e-05,4.2e-05,4.1e-05,4.8e-05,...,6.4e-05,6.3e-05,6.4e-05,3.9e-05,4.9e-05,4.1e-05,4.9e-05,5.4e-05,4.5e-05,4.2e-05
ERCC1,5.3e-05,5.3e-05,7.3e-05,5.2e-05,5.4e-05,5.6e-05,5.8e-05,6.3e-05,6e-05,6.1e-05,...,5.6e-05,5.6e-05,5.6e-05,6.2e-05,6.1e-05,5.3e-05,6.1e-05,5.3e-05,0.002621,5.6e-05
TLL1,7.8e-05,7.8e-05,4.1e-05,5e-05,4.5e-05,4.3e-05,4.6e-05,3.6e-05,4.7e-05,4.8e-05,...,4.3e-05,4.3e-05,4.3e-05,3.5e-05,3.9e-05,7.8e-05,3.9e-05,8.7e-05,4.8e-05,5.4e-05
PRSS22,4.1e-05,4.1e-05,3.6e-05,3.1e-05,3.1e-05,3.5e-05,3.3e-05,2.6e-05,3.1e-05,3.4e-05,...,3.6e-05,3.2e-05,3.6e-05,2.9e-05,3e-05,4.1e-05,3e-05,2.7e-05,3.8e-05,3.5e-05


In [22]:
reactome_labels = np.zeros(process_rwr_df.shape)
column = 0
for indexes in reactome_modules['protein_index'].values:
    for pos in indexes:
        reactome_labels[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/string_reactome_labels_fp.csv", reactome_labels, delimiter=",")

In [23]:
disgenet_labels = np.zeros((len(graph.vs['name']), len(disgenet_sca_modules)))
column = 0
for indexes in disgenet_sca_modules['protein_index'].values:
    for pos in indexes:
        disgenet_labels[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/string_disgenet_labels_fp.csv", disgenet_labels, delimiter=",")

In [24]:
disgenet_labels_conservative = np.zeros((len(graph.vs['name']), len(disgenet_conservative_modules)))
column = 0
for indexes in disgenet_conservative_modules['protein_index'].values:
    for pos in indexes:
        disgenet_labels_conservative[pos][column] = 1
    column += 1
np.savetxt("../../data/processed/string_disgenet_conservative_labels_fp.csv", disgenet_labels_conservative, delimiter=",")