# 0. Imports and Data Load

In [1]:
import pandas as pd
import classifiers
import importlib
from igraph import Graph
import igraph as ig
import numpy as np
from ast import literal_eval
import concurrent.futures
import matplotlib.pyplot as plt
# Silence NumPy floating-point warnings for division by zero and invalid
# operations for the rest of the session. As the last expression of the cell,
# the call's return value (the previous error settings) is displayed as the
# cell output.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Network read from GML; its vertex 'name' attribute is used by later cells
# as the protein_id index of several tables.
graph = Graph.Read_GML("../../data/processed/graph_string")

# Module tables; their 'process' column supplies the column names for the
# label and metric tables loaded in later cells.
reactome_modules_df = pd.read_csv(
    "../../data/processed/string_reactome_modules.csv",
    sep=',',
    header=0,
)
disgenet_modules_df = pd.read_csv(
    "../../data/processed/string_disgenet_modules.csv"
)

In [3]:
# Metric tables for the Reactome (cell process) modules. The first CSV column
# is used as the row index in all three files.
reactome_raw = pd.read_csv(
    "../../data/processed/metrics/process_raw_string.csv",
    sep=',', header=0, index_col=0,
)
reactome_genepanda = pd.read_csv(
    "../../data/processed/metrics/process_genepanda_string.csv",
    sep=',', header=0, index_col=0,
)

# The MaxLink table is re-indexed by the graph's vertex names, replacing the
# index read from the file.
# NOTE(review): assumes the rows are in the same order as graph.vs - confirm.
reactome_maxlink = (
    pd.read_csv(
        "../../data/processed/metrics/process_maxlink_string.csv",
        sep=',', header=0, index_col=0,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

In [4]:
# Metric tables for the DisGeNET disease modules (default label set).
# Where names= is passed together with header=0, the file's own header row is
# discarded and the DisGeNET 'process' names are used as column labels.
disease_raw = pd.read_csv(
    "../../data/processed/metrics/disease_raw_string.csv",
    sep=',', header=0, index_col=0,
)
disease_genepanda = pd.read_csv(
    "../../data/processed/metrics/disease_genepanda_string.csv",
    sep=',', header=0, index_col=0,
    names=disgenet_modules_df['process'],
)
# MaxLink tables are re-indexed by the graph's vertex names.
# NOTE(review): assumes the rows are in the same order as graph.vs - confirm.
disease_maxlink = (
    pd.read_csv(
        "../../data/processed/metrics/disease_maxlink_string.csv",
        sep=',', header=0, index_col=0,
        names=disgenet_modules_df['process'],
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

# Same three tables for the "conservative" disease variant.
disease_conservative_raw = pd.read_csv(
    "../../data/processed/metrics/disease_conservative_raw_string.csv",
    sep=',', header=0, index_col=0,
)
disease_conservative_genepanda = pd.read_csv(
    "../../data/processed/metrics/disease_conservative_genepanda_string.csv",
    sep=',', header=0, index_col=0,
    names=disgenet_modules_df['process'],
)
disease_conservative_maxlink = (
    pd.read_csv(
        "../../data/processed/metrics/disease_conservative_maxlink_string.csv",
        sep=',', header=0, index_col=0,
        names=disgenet_modules_df['process'],
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

In [5]:
# Label table for the Reactome modules. The file is read without a header row
# (names= is given and header is left at its default), with one column per
# Reactome 'process' name; rows are then indexed by the graph's vertex names.
# NOTE(review): assumes the rows are in the same order as graph.vs - confirm.
reactome_labels_df = (
    pd.read_csv(
        "../../data/processed/reactome_labels_string.csv",
        sep=',',
        names=reactome_modules_df['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

In [6]:
# Label tables for the DisGeNET modules. Both files are read without a header
# row, with columns named after the DisGeNET 'process' values, and are indexed
# by the graph's vertex names.
# NOTE(review): assumes the rows are in the same order as graph.vs - confirm.
disgenet_labels_df = (
    pd.read_csv(
        "../../data/processed/disgenet_sca_labels_string.csv",
        sep=',',
        names=disgenet_modules_df['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

# "Conservative" variant of the same label table.
disgenet_conservative_labels_df = (
    pd.read_csv(
        "../../data/processed/disgenet_conservative_labels_string.csv",
        sep=',',
        names=disgenet_modules_df['process'].values,
    )
    .assign(protein_id=graph.vs['name'])
    .set_index('protein_id')
)

In [7]:
# Test-set index arrays for the Reactome classifiers. Each CSV is transposed
# (file columns become array rows), cast to int, and shifted down by one.
# NOTE(review): the -1 presumably converts 1-based indices to 0-based - confirm.
# NOTE(review): the first path has no `reactome_` prefix or `_string` suffix,
# unlike the two below - confirm it is the intended STRING split.
reactome_raw_test_indices = (
    pd.read_csv("../../data/processed/fs/process_raw_test.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)
reactome_genepanda_test_indices = (
    pd.read_csv("../../data/processed/fs/reactome_genepanda_test_string.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)
reactome_maxlink_test_indices = (
    pd.read_csv("../../data/processed/fs/reactome_maxlink_test_string.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)

In [14]:
# Test-set index arrays for the disease classifiers. Each CSV is transposed
# (file columns become array rows), cast to int, and shifted down by one.
# NOTE(review): the -1 presumably converts 1-based indices to 0-based - confirm.
disease_raw_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_raw_test_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1
disease_genepanda_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_genepanda_test_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1
disease_maxlink_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_maxlink_test_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1

# Same three splits for the "conservative" disease variant.
disease_conservative_raw_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_raw_test_conservative_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1
disease_conservative_genepanda_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_genePANDA_test_conservative_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1
# Fix: this previously re-read the genePANDA conservative file (copy-paste of
# the line above), so the conservative MaxLink classifier was evaluated on
# genePANDA's test split instead of its own.
# NOTE(review): file name inferred from the sibling files - confirm the exact
# name/casing on disk (e.g. `maxlink` vs `maxLink`).
disease_conservative_maxlink_test_indices = pd.read_csv("../../data/processed/fs/disease/disease_maxlink_test_conservative_string.csv", sep=',', header=0).transpose().to_numpy(dtype='int')-1

# 1. Threshold Classifier - Cell Processes

In [11]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the raw Reactome metrics with
# op_metric='fmeasure', then save the returned table without its index.
raw_simple_clf = classifiers.threshold_classifier(
    reactome_raw,
    reactome_labels_df,
    reactome_raw_test_indices,
    op_metric='fmeasure',
)
raw_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/process_raw_simple.csv',
    index=False,
)

  0%|          | 0/244 [00:00<?, ?it/s]

In [7]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the genePANDA Reactome metrics with
# op_metric='fmeasure', then save the returned table without its index.
genepanda_simple_clf = classifiers.threshold_classifier(
    reactome_genepanda,
    reactome_labels_df,
    reactome_genepanda_test_indices,
    op_metric='fmeasure',
)
genepanda_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/process_genepanda_simple.csv',
    index=False,
)

  0%|          | 0/244 [00:00<?, ?it/s]

In [17]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the MaxLink Reactome metrics with
# op_metric='fmeasure', then save the returned table without its index.
maxlink_simple_clf = classifiers.threshold_classifier(
    reactome_maxlink,
    reactome_labels_df,
    reactome_maxlink_test_indices,
    op_metric='fmeasure',
)
maxlink_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/process_maxlink_simple.csv',
    index=False,
)

  0%|          | 0/244 [00:00<?, ?it/s]

# 2. Threshold Classifier - Diseases

In [13]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the raw disease metrics against the default
# DisGeNET labels with op_metric='fmeasure', then save the returned table
# without its index.
disease_raw_simple_clf = classifiers.threshold_classifier(
    disease_raw,
    disgenet_labels_df,
    disease_raw_test_indices,
    op_metric='fmeasure',
)
disease_raw_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_raw_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

In [39]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the genePANDA disease metrics against the
# default DisGeNET labels with op_metric='fmeasure', then save the returned
# table without its index.
disease_genepanda_simple_clf = classifiers.threshold_classifier(
    disease_genepanda,
    disgenet_labels_df,
    disease_genepanda_test_indices,
    op_metric='fmeasure',
)
disease_genepanda_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_genepanda_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

In [40]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the MaxLink disease metrics against the
# default DisGeNET labels with op_metric='fmeasure', then save the returned
# table without its index.
disease_maxlink_simple_clf = classifiers.threshold_classifier(
    disease_maxlink,
    disgenet_labels_df,
    disease_maxlink_test_indices,
    op_metric='fmeasure',
)
disease_maxlink_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_maxlink_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

In [15]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the raw conservative-disease metrics against
# the conservative DisGeNET labels with op_metric='fmeasure', then save the
# returned table without its index.
disease_conservative_raw_simple_clf = classifiers.threshold_classifier(
    disease_conservative_raw,
    disgenet_conservative_labels_df,
    disease_conservative_raw_test_indices,
    op_metric='fmeasure',
)
disease_conservative_raw_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_conservative_raw_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

In [41]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the genePANDA conservative-disease metrics
# against the conservative DisGeNET labels with op_metric='fmeasure', then
# save the returned table without its index.
disease_conservative_genepanda_simple_clf = classifiers.threshold_classifier(
    disease_conservative_genepanda,
    disgenet_conservative_labels_df,
    disease_conservative_genepanda_test_indices,
    op_metric='fmeasure',
)
disease_conservative_genepanda_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_conservative_genepanda_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

In [58]:
# Re-import the local `classifiers` module so edits made to it since the
# kernel started are picked up.
importlib.reload(classifiers)

# Run the threshold classifier on the MaxLink conservative-disease metrics
# against the conservative DisGeNET labels with op_metric='fmeasure', then
# save the returned table without its index.
disease_conservative_maxlink_simple_clf = classifiers.threshold_classifier(
    disease_conservative_maxlink,
    disgenet_conservative_labels_df,
    disease_conservative_maxlink_test_indices,
    op_metric='fmeasure',
)
disease_conservative_maxlink_simple_clf.to_csv(
    '../../models/threshold_classifier/STRING/disease_conservative_maxlink_simple.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]