# 0. Imports

In [48]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
from tqdm import tqdm_notebook
import classifiers
import importlib
import json
import warnings
warnings.filterwarnings("ignore")
import pickle
from ast import literal_eval

# 1. Data Load

In [50]:
# RWR fingerprint tables for the three module sets. Each CSV is parsed with its first row as the
# header and its first column as the index, then transposed and converted to a NumPy array.
process_rwr, disease_rwr, disease_rwr_conservative = (
    np.array(pd.read_csv(rwr_csv, sep=',', header=0, index_col=0).transpose())
    for rwr_csv in (
        "../../data/processed/metrics/string_process_rwr_fp.csv",
        "../../data/processed/metrics/string_disease_rwr_fp.csv",
        "../../data/processed/metrics/string_disease_rwr_conservative_fp.csv",
    )
)

In [51]:
# Raw (non-RWR) score tables for the three module sets, each indexed by its first CSV column.
process_raw, disease_raw, disease_raw_conservative = (
    pd.read_csv(raw_csv, index_col=0)
    for raw_csv in (
        "../../data/processed/metrics/process_raw_fp_string.csv",
        "../../data/processed/metrics/disease_raw_fp_string.csv",
        "../../data/processed/metrics/disease_conservative_raw_fp_string.csv",
    )
)

In [52]:
# RWR fingerprints for the process modules, kept as a DataFrame so that its column labels and
# index can be reused for the label table built right after.
process_rwr_df = pd.read_csv(
    "../../data/processed/metrics/string_process_rwr_fp.csv",
    sep=',',
    header=0,
    index_col=0,
)

# The label CSV carries no header row: `names=` supplies the column labels from the RWR table.
# Rows are then keyed by the RWR table's index under the name 'protein_id'.
# NOTE(review): this pairs rows purely by position — assumes both files share the same row order.
process_labels_df = (
    pd.read_csv(
        "../../data/processed/string_reactome_labels_fp.csv",
        sep=',',
        names=process_rwr_df.columns,
    )
    .assign(protein_id=process_rwr_df.index)
    .set_index('protein_id')
)

In [53]:
# RWR fingerprints for the disease modules, kept as a DataFrame so that its column labels and
# index can be reused for the label table built right after.
disease_rwr_df = pd.read_csv(
    "../../data/processed/metrics/string_disease_rwr_fp.csv",
    sep=',',
    header=0,
    index_col=0,
)

# The label CSV carries no header row: `names=` supplies the column labels from the RWR table.
# Rows are then keyed by the RWR table's index under the name 'protein_id'.
# NOTE(review): this pairs rows purely by position — assumes both files share the same row order.
disgenet_labels_df = (
    pd.read_csv(
        "../../data/processed/string_disgenet_labels_fp.csv",
        sep=',',
        names=disease_rwr_df.columns,
    )
    .assign(protein_id=disease_rwr_df.index)
    .set_index('protein_id')
)

In [54]:
# RWR fingerprints for the conservative disease modules, kept as a DataFrame so that its column
# labels and index can be reused for the label table built right after.
rwr_conservative_df = pd.read_csv(
    "../../data/processed/metrics/string_disease_rwr_conservative_fp.csv",
    sep=',',
    header=0,
    index_col=0,
)

# The label CSV carries no header row: `names=` supplies the column labels from the RWR table.
# Rows are then keyed by the RWR table's index under the name 'protein_id'.
# NOTE(review): this pairs rows purely by position — assumes both files share the same row order.
disgenet_labels_conservative_df = (
    pd.read_csv(
        '../../data/processed/string_disgenet_conservative_labels_fp.csv',
        names=rwr_conservative_df.columns,
    )
    .assign(protein_id=rwr_conservative_df.index)
    .set_index('protein_id')
)

In [55]:
# Feature-selection tables for the RWR classifiers (passed unchanged to
# `classifiers.multiple_fs_classifier_fp` further down).
process_rwr_fs, disease_rwr_fs, disease_conservative_rwr_fs = (
    pd.read_csv(fs_csv, sep=',', header=0)
    for fs_csv in (
        "../../data/processed/fs/reactome_rwr_fp_fs_string.csv",
        "../../data/processed/fs/disease/disease_rwr_fp_fs_string.csv",
        "../../data/processed/fs/disease/disease_rwr_fp_fs_conservative_string.csv",
    )
)

# Test-split index tables: transposed, cast to int and shifted down by one.
# NOTE(review): the `- 1` presumably converts 1-based indices to 0-based positions — confirm
# against whatever produced these files.
process_rwr_test, disease_rwr_test, disease_conservative_rwr_test = (
    pd.read_csv(test_csv, sep=',', header=0).T.to_numpy(dtype='int') - 1
    for test_csv in (
        "../../data/processed/fs/reactome_rwr_fp_test_string.csv",
        "../../data/processed/fs/disease/disease_rwr_fp_test_string.csv",
        "../../data/processed/fs/disease/disease_rwr_fp_test_conservative_string.csv",
    )
)

In [56]:
# Test-split index tables used by the raw-score (threshold) classifiers. Each table is
# transposed, cast to int and shifted down by one.
# NOTE(review): the `- 1` presumably converts 1-based indices to 0-based positions — confirm
# against whatever produced these files.
process_raw_test = (
    pd.read_csv("../../data/processed/fs/reactome_raw_fp_test_string.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)

# The non-conservative disease modules must be evaluated on the non-conservative split. This
# variable used to read the conservative split file (the same one as
# `disease_conservative_raw_test` below), which looks like a copy-paste slip.
# NOTE(review): if a dedicated `disease_raw_fp_test_string.csv` exists (mirroring the reactome
# raw split above), point to that file instead.
disease_raw_test = (
    pd.read_csv("../../data/processed/fs/disease/disease_rwr_fp_test_string.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)

# NOTE(review): the conservative raw split reuses the RWR conservative split file — confirm
# that sharing the split between the RWR and raw classifiers is intended.
disease_conservative_raw_test = (
    pd.read_csv("../../data/processed/fs/disease/disease_rwr_fp_test_conservative_string.csv", sep=',', header=0)
    .transpose()
    .to_numpy(dtype='int')
    - 1
)

In [57]:
# Module tables for the three module sets.
process_modules = pd.read_csv("../../data/processed/string_reactome_modules_fp.csv")
disease_modules = pd.read_csv("../../data/processed/string_disgenet_sca_modules_fp.csv")
disease_conservative_modules = pd.read_csv("../../data/processed/string_disgenet_conservative_modules_fp.csv")

# These four columns are stored in the CSVs as Python-literal strings; parse each cell back into
# the Python object it represents.
for modules_df in (process_modules, disease_modules, disease_conservative_modules):
    for literal_column in ('proteins_ids', 'protein_index', 'fp_proteins', 'fp_proteins_index'):
        modules_df[literal_column] = modules_df[literal_column].apply(literal_eval)

In [58]:
def _added_proteins(row):
    """Return, as a list, the entries of row['fp_proteins'] that are absent from row['proteins_ids'].

    The list comes from a set difference, so its element order is not meaningful.
    """
    return list(set(row['fp_proteins']) - set(row['proteins_ids']))


# Add the per-module 'added_proteins' column to every module table.
for modules_df in (process_modules, disease_modules, disease_conservative_modules):
    modules_df['added_proteins'] = modules_df.apply(_added_proteins, axis=1)


# 2. Classification Tasks

## 2.1. Complete Network

### 2.1.1. Process

In [44]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

clf = LogisticRegression(random_state=22)

# Wider hyper-parameter grid, kept for reference only (not used):
# parameters = [{'penalty':['l1','l2'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['liblinear'], 'max_iter':[10, 50, 100]},
#               {'penalty':['l2', 'none'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['sag', 'saga', 'newton-cg'], 'max_iter':[10, 50, 100]}]
#
# NOTE(review): scikit-learn ignores `C` when the penalty is 'none', so the `C` axis below
# presumably repeats the same fit three times — confirm before trimming it.
parameters = [
    {
        'penalty': ['none'],
        'C': [10, 1.0, 0.1],
        'solver': ['newton-cg'],
        'max_iter': [10, 50, 100],
    }
]

# Run the project's multi-feature-set classifier on the process (Reactome) RWR data.
rwr_lgr_proba_clf, rwr_cv, rwr_n_fs, rwr_models = classifiers.multiple_fs_classifier_fp(
    clf,
    parameters,
    process_rwr,
    process_rwr_test,
    process_rwr_fs,
    process_labels_df,
    process_modules['added_proteins'].values,
    jobs=6,
)

# Persist the four results: probability table, CV entries, n_fs entries and the models.
rwr_lgr_proba_clf.to_csv(
    '../../models/false_positive/STRING/probability/process_rwr_lgr_proba.csv',
    index=False,
)

# One "<first> <second>" line per CV entry, with no trailing newline.
with open('../../models/false_positive/STRING/cv_results/process_rwr.txt', 'w') as cv_file:
    cv_file.write('\n'.join('%s %s' % cv_entry for cv_entry in rwr_cv))

# One entry per line, each terminated by a newline.
with open("../../models/false_positive/STRING/n_fs/process_rwr.txt", "w") as n_fs_file:
    n_fs_file.writelines(str(n_fs_entry) + "\n" for n_fs_entry in rwr_n_fs)

# Models are pickled back-to-back into one file; reading them back takes one `pickle.load`
# call per model on the same open file handle.
with open("../../models/false_positive/STRING/models/process_rwr.pckl", "wb") as models_file:
    for model_obj in rwr_models:
        pickle.dump(model_obj, models_file)

  0%|          | 0/244 [00:00<?, ?it/s]

In [64]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

# Threshold-based classifier on the raw process scores.
# NOTE(review): `op_metric='fmeasure'` presumably selects the operating threshold by F-measure —
# confirm in `classifiers.threshold_classifier_fp`.
process_raw_clf = classifiers.threshold_classifier_fp(
    process_raw,
    process_labels_df,
    process_raw_test,
    process_modules['added_proteins'].values,
    op_metric='fmeasure',
)
process_raw_clf.to_csv(
    '../../models/false_positive/STRING/threshold/process_raw.csv',
    index=False,
)

  0%|          | 0/244 [00:00<?, ?it/s]

### 2.1.2. Disease Steiner Tree

In [46]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

clf = LogisticRegression(random_state=22)

# Wider hyper-parameter grid, kept for reference only (not used):
# parameters = [{'penalty':['l1','l2'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['liblinear'], 'max_iter':[10, 50, 100]},
#               {'penalty':['l2', 'none'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['sag', 'saga', 'newton-cg'], 'max_iter':[10, 50, 100]}]
#
# NOTE(review): scikit-learn ignores `C` when the penalty is 'none', so the `C` axis below
# presumably repeats the same fit three times — confirm before trimming it.
parameters = [
    {
        'penalty': ['none'],
        'C': [10, 1.0, 0.1],
        'solver': ['newton-cg'],
        'max_iter': [10, 50, 100],
    }
]

# Run the project's multi-feature-set classifier on the disease (DisGeNET) RWR data.
rwr_lgr_proba_clf, rwr_cv, rwr_n_fs, rwr_models = classifiers.multiple_fs_classifier_fp(
    clf,
    parameters,
    disease_rwr,
    disease_rwr_test,
    disease_rwr_fs,
    disgenet_labels_df,
    disease_modules['added_proteins'].values,
    jobs=6,
)

# Persist the four results: probability table, CV entries, n_fs entries and the models.
rwr_lgr_proba_clf.to_csv(
    '../../models/false_positive/STRING/probability/disease_rwr_lgr_proba.csv',
    index=False,
)

# One "<first> <second>" line per CV entry, with no trailing newline.
with open('../../models/false_positive/STRING/cv_results/disease_rwr.txt', 'w') as cv_file:
    cv_file.write('\n'.join('%s %s' % cv_entry for cv_entry in rwr_cv))

# One entry per line, each terminated by a newline.
with open("../../models/false_positive/STRING/n_fs/disease_rwr.txt", "w") as n_fs_file:
    n_fs_file.writelines(str(n_fs_entry) + "\n" for n_fs_entry in rwr_n_fs)

# Models are pickled back-to-back into one file; reading them back takes one `pickle.load`
# call per model on the same open file handle.
with open("../../models/false_positive/STRING/models/disease_rwr.pckl", "wb") as models_file:
    for model_obj in rwr_models:
        pickle.dump(model_obj, models_file)

  0%|          | 0/301 [00:00<?, ?it/s]

In [65]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

# Threshold-based classifier on the raw disease scores.
# NOTE(review): `op_metric='fmeasure'` presumably selects the operating threshold by F-measure —
# confirm in `classifiers.threshold_classifier_fp`.
disease_raw_clf = classifiers.threshold_classifier_fp(
    disease_raw,
    disgenet_labels_df,
    disease_raw_test,
    disease_modules['added_proteins'].values,
    op_metric='fmeasure',
)
disease_raw_clf.to_csv(
    '../../models/false_positive/STRING/threshold/disease_raw.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]

### 2.1.3. Disease Conservative Module

In [47]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

clf = LogisticRegression(random_state=22)

# Wider hyper-parameter grid, kept for reference only (not used):
# parameters = [{'penalty':['l1','l2'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['liblinear'], 'max_iter':[10, 50, 100]},
#               {'penalty':['l2', 'none'], 'C':[100, 10, 1.0, 0.1, 0.01],
#              'solver': ['sag', 'saga', 'newton-cg'], 'max_iter':[10, 50, 100]}]
#
# NOTE(review): scikit-learn ignores `C` when the penalty is 'none', so the `C` axis below
# presumably repeats the same fit three times — confirm before trimming it.
parameters = [
    {
        'penalty': ['none'],
        'C': [10, 1.0, 0.1],
        'solver': ['newton-cg'],
        'max_iter': [10, 50, 100],
    }
]

# Run the project's multi-feature-set classifier on the conservative disease RWR data.
(
    rwr_lgr_proba_clf_conservative,
    rwr_cv_conservative,
    rwr_n_fs_conservative,
    rwr_conservative_models,
) = classifiers.multiple_fs_classifier_fp(
    clf,
    parameters,
    disease_rwr_conservative,
    disease_conservative_rwr_test,
    disease_conservative_rwr_fs,
    disgenet_labels_conservative_df,
    disease_conservative_modules['added_proteins'].values,
    jobs=6,
)

# Persist the four results: probability table, CV entries, n_fs entries and the models.
rwr_lgr_proba_clf_conservative.to_csv(
    '../../models/false_positive/STRING/probability/disease_rwr_lgr_proba_conservative.csv',
    index=False,
)

# One "<first> <second>" line per CV entry, with no trailing newline.
with open('../../models/false_positive/STRING/cv_results/disease_rwr_conservative.txt', 'w') as cv_file:
    cv_file.write('\n'.join('%s %s' % cv_entry for cv_entry in rwr_cv_conservative))

# One entry per line, each terminated by a newline.
with open("../../models/false_positive/STRING/n_fs/disease_rwr_conservative.txt", "w") as n_fs_file:
    n_fs_file.writelines(str(n_fs_entry) + "\n" for n_fs_entry in rwr_n_fs_conservative)

# Models are pickled back-to-back into one file; reading them back takes one `pickle.load`
# call per model on the same open file handle.
with open("../../models/false_positive/STRING/models/disease_rwr_conservative.pckl", "wb") as models_file:
    for model_obj in rwr_conservative_models:
        pickle.dump(model_obj, models_file)

  0%|          | 0/301 [00:00<?, ?it/s]

In [66]:
# Reload the local `classifiers` module so that edits made to it are picked up without a kernel restart.
importlib.reload(classifiers)

# Threshold-based classifier on the raw conservative disease scores.
# NOTE(review): `op_metric='fmeasure'` presumably selects the operating threshold by F-measure —
# confirm in `classifiers.threshold_classifier_fp`.
disease_conservative_raw_clf = classifiers.threshold_classifier_fp(
    disease_raw_conservative,
    disgenet_labels_conservative_df,
    disease_conservative_raw_test,
    disease_conservative_modules['added_proteins'].values,
    op_metric='fmeasure',
)
disease_conservative_raw_clf.to_csv(
    '../../models/false_positive/STRING/threshold/disease_conservative_raw.csv',
    index=False,
)

  0%|          | 0/301 [00:00<?, ?it/s]