# Install packages and load libraries

Import common packages

In [None]:
%%capture
import argparse, cProfile, datetime
from functools import reduce
import glob
import itertools as it
from multiprocessing import cpu_count, Pool
import networkx as nx
import numpy as np
import os

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random, re, shutil
from scipy.stats import hmean
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import auc, average_precision_score, precision_recall_curve
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import subprocess as sp
import sys
import tempfile as tf
import time
from tqdm import tqdm as progressMonitor

Enable Google Colab, mount drives, and load proprietary modules...

In [None]:
%%capture
#Colab-only setup: import the Colab helper modules and mount Google Drive at
#/content/drive (the project paths used by this notebook live under it)
from google.colab import drive, files, output
#force_remount=True requests a fresh mount even if the drive is already mounted
drive.mount('/content/drive', force_remount=True)
#shell escape: remove the sample_data/ directory from the working directory
!rm -r sample_data/

In [None]:
#report how many CPUs this runtime exposes
print(cpu_count())
#snapshot of every name defined in the notebook namespace at this point; kept
#as a baseline to help monitor what is added later (and the space it takes)
sysVars = list(globals())

96


In [None]:
#Google Drive location that every project directory below is built from
rootDir = '/content/drive/My Drive/'
#phase 2 directory
workDir = '{0}{1}'.format(rootDir, 'elcfs_protein_complex_modeling/')
#modeling library
workDir_elcfs = '{0}{1}'.format(
    rootDir, 'Primary Research/proteinPairs_complexMaps/')
#phase 1 directory
workDir_ph1 = '{0}{1}'.format(rootDir, 'Primary Research/JLMwSCBC_notebook/')

#extend the module search path: the drive root goes to the front, the three
#project directories are appended to the end in the order listed
sys.path.insert(0, rootDir)
for p in (workDir, workDir_elcfs, workDir_ph1):
  sys.path.append(p)

In [None]:
#project-local modules; presumably resolved through the directories added to
#sys.path earlier in the notebook (util.modelEvaluating is reported as loaded
#from proteinPairs_complexMaps/util/ in a later cell's output)
from util import ppiPrediction_v2, dataProcessing, modelEvaluating
from utils import operations, reference, alertMe
#credentials passed to alertMe.statusCheck further down
#NOTE(review): these look like live Pushover credentials stored in plain text;
#consider rotating them and loading them from an environment variable or
#secret store instead of keeping them in the notebook
pushoverKey_user = 'uith8rmy2npjj1oqpjwcanow3un984'
pushoverAPI = 'aw4v3424kaznrw598r6qge9icddwg7'

# Recapitulate 5CV data fold structure

In [None]:
#secondary root dir: data partitions live under the phase 1 directory
#NOTE(review): the paths printed in the recorded output below contain an extra
#'integratedPairs_..._5CV/' component after dataPartitions/ -- confirm this
#value matches the directory layout actually in use
dataDir = '{0}{1}'.format(workDir_ph1, 'dataPartitions/')

In [None]:
#For each cell line, rebuild the per-fold test sets (pairs, labels, features)
#from the recalibrated cell-specific feature matrix, reusing the test-pair
#assignment stored with the existing 5CV partitions.
for cell in ['h322', 'mcf7', 'u251']:
  #specify dirs
  featMat_cellDir = dataDir + 'lmPairs_KD+LM+FGK+{0}SCBC/5CV/'.format(cell)
  featMat_cellDir_new = \
    dataDir + 'lmPairs_KD+LM+FGK+{0}SCBC_recal/'.format(cell)
  featMat_cellDir_newCV = featMat_cellDir_new + '5CV/'
  #exist_ok=True so the cell can be re-run without failing on directories
  #created by an earlier run
  os.makedirs(featMat_cellDir_newCV, exist_ok=True)

  #load featMat w/ new recalibrated cell-specific NCI-60 features
  featMat_cellSpec_calibratedFilename = \
    'lmPairs_KD+LM+AF+SCBC_NCI60 {0}-cellSpec featCalibr.pkl'.format(cell)
  #context manager closes the file handle once the pickle is loaded
  with open(featMat_cellDir_new + featMat_cellSpec_calibratedFilename,
            'rb') as featMat_file:
    featMat_cellSpec_calibrated = pickle.load(featMat_file)
  #normalize id column names and force string ids before re-freezing pairs
  featMat_cellSpec_calibrated.rename(
      columns={'idi': 'id1', 'idii': 'id2'}, inplace=True)
  featMat_cellSpec_calibrated.id1 = \
    featMat_cellSpec_calibrated.id1.astype('str')
  featMat_cellSpec_calibrated.id2 = \
    featMat_cellSpec_calibrated.id2.astype('str')
  #discard the stored pairsFrozen column and let freezePairs regenerate it
  #(per the recorded output it holds one tuple of the two ids per row)
  featMat_cellSpec_calibrated.drop(columns=['pairsFrozen'], inplace=True)
  featMat_cellSpec_calibrated = \
    operations.freezePairs(featMat_cellSpec_calibrated, pool=True)

  #QC output
  print(cell)
  print(featMat_cellDir)
  print(featMat_cellDir_new)
  print(featMat_cellSpec_calibrated.shape)
  print(featMat_cellSpec_calibrated.dtypes)
  print(featMat_cellSpec_calibrated.head())

  for fold in np.arange(5):
    #specify dirs
    featMat_foldDir = featMat_cellDir + 'fold={0}/'.format(fold)
    featMat_foldDir_new = featMat_cellDir_newCV + 'fold={0}/'.format(fold)
    os.makedirs(featMat_foldDir_new, exist_ok=True)

    #load test pairs corresponding to fold iter and copy to new feat mat dir
    #NOTE(review): output files carry a '.tar.bz2' suffix and pandas infers
    #compression from the extension; what is inferred for this suffix may
    #differ between pandas versions -- confirm readers use the same version
    testPairs = pd.read_csv(featMat_foldDir + 'testPairs.tsv',
                            sep='\t', header=0, names=['id1', 'id2'], dtype='str')
    testPairs.rename(columns={'id1': 'idi', 'id2': 'idii'}).to_csv(
        featMat_foldDir_new + 'testPairs.tsv.tar.bz2', sep='\t', index=False)
    testPairs = operations.freezePairs(testPairs, pool=True)

    #rows of the primary feature matrix whose pair belongs to this fold's
    #test set; computed once and shared by the labels and the features so
    #both files are guaranteed to cover the same rows in the same order
    testMask = featMat_cellSpec_calibrated.pairsFrozen.isin(
        testPairs.pairsFrozen.to_list())

    #generate new labels and feature matrix files from primary feature matrix
    featMat_cellSpec_calibrated.loc[testMask, 'label'].to_csv(
        featMat_foldDir_new + 'testLabels.tsv.tar.bz2',
        sep='\t', index=False)
    #skip the first four columns (id1, id2, pairsFrozen, label according to
    #the recorded dtypes output) so only feature columns are written
    featMat_new = featMat_cellSpec_calibrated.loc[testMask].iloc[:, 4:]
    featMat_new.to_csv(featMat_foldDir_new + 'testData.tsv.tar.bz2',
                       sep='\t', index=False)

    #QC output
    print(fold)
    print(featMat_foldDir)
    print(featMat_foldDir_new)
    print(testPairs.shape)
    print(testPairs.dtypes)
    print(testPairs.head())
    print(featMat_new.shape)
    print(featMat_new.dtypes)
    print(featMat_new.head())
    #True when every test pair is present in the primary feature matrix
    print(set(testPairs.pairsFrozen.to_list()).issubset(set(
        featMat_cellSpec_calibrated.pairsFrozen.to_list())))

pooling...


100%|██████████| 1037907/1037907 [01:37<00:00, 10678.74it/s]


h322
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/
(1037907, 285)
id1             object
id2             object
pairsFrozen     object
label            int64
0              float64
                ...   
fs1            float64
fs2            float64
fp1            float64
fp2            float64
fp3            float64
Length: 285, dtype: object
     id1    id2     pairsFrozen  label    0    1    2    3         4  \
0   1665  57187   (57187, 1665)      1  0.0  0.0  0.0  0.0  0.000924   
1  22916   1654   (22916, 1654)      1  NaN  NaN  NaN  NaN       NaN   
2  51729  79228  (79228, 51729)      1  NaN  NaN  NaN  NaN       NaN   
3   6158   7846    (6158, 7846)      1  NaN  NaN  NaN  NaN       NaN   
4  1

100%|██████████| 207582/207582 [00:20<00:00, 10289.16it/s]


0
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/fold=0/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/fold=0/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   1665  57187   (57187, 1665)
1   3376   4141    (3376, 4141)
2  10884  64978  (10884, 64978)
3   6598   8841    (6598, 8841)
4   6884   6874    (6874, 6884)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
0   0.000000  0.000000  0.0  0.000000  0.000924  0.057159  0.0 -0.064575   
7   0.

100%|██████████| 207582/207582 [00:19<00:00, 10431.84it/s]


1
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/fold=1/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/fold=1/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  27316   9939   (9939, 27316)
1  10450   3550   (10450, 3550)
2  23450  10155  (10155, 23450)
3   9343  94104   (9343, 94104)
4  22826   6634   (6634, 22826)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2         3         4         5    6         7         8  \
6        NaN  NaN  NaN       NaN       NaN       NaN  NaN       NaN       NaN

100%|██████████| 207581/207581 [00:20<00:00, 10300.96it/s]


2
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/fold=2/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/fold=2/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  51729  79228  (79228, 51729)
1   6222   1938    (1938, 6222)
2   9410  22916   (22916, 9410)
3  11052  57819  (57819, 11052)
4   3054   6879    (3054, 6879)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0        1    2         3         4       5    6         7       8  \
2        NaN      NaN  NaN       NaN       NaN     NaN  NaN       NaN     NaN

100%|██████████| 207581/207581 [00:20<00:00, 10210.39it/s]


3
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/fold=3/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/fold=3/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  22916   1654   (22916, 1654)
1  11017   6427   (6427, 11017)
2  10291  24148  (24148, 10291)
3   3608  11224   (3608, 11224)
4  51631  26156  (51631, 26156)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2    3        4         5    6         7    8    9  ...  \
1        NaN  NaN  NaN  NaN      NaN       NaN  NaN       NaN  NaN  NaN  ...  

100%|██████████| 207581/207581 [00:20<00:00, 10329.33it/s]


4
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC/5CV/fold=4/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+h322SCBC_recal/5CV/fold=4/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   6158   7846    (6158, 7846)
1   6168   2197    (6168, 2197)
2  25804  11052  (25804, 11052)
3   7756   8175    (7756, 8175)
4  54948  64983  (64983, 54948)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
3        NaN       NaN  NaN       NaN       NaN       NaN  NaN       NaN   
5   0.

100%|██████████| 1037907/1037907 [01:41<00:00, 10267.24it/s]


mcf7
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/
(1037907, 285)
id1             object
id2             object
pairsFrozen     object
label            int64
0              float64
                ...   
fs1            float64
fs2            float64
fp1            float64
fp2            float64
fp3            float64
Length: 285, dtype: object
     id1    id2     pairsFrozen  label    0    1    2    3         4  \
0   1665  57187   (57187, 1665)      1  0.0  0.0  0.0  0.0  0.000924   
1  22916   1654   (22916, 1654)      1  NaN  NaN  NaN  NaN       NaN   
2  51729  79228  (79228, 51729)      1  NaN  NaN  NaN  NaN       NaN   
3   6158   7846    (6158, 7846)      1  NaN  NaN  NaN  NaN       NaN   
4  1

100%|██████████| 207582/207582 [00:21<00:00, 9648.26it/s] 


0
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/fold=0/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/fold=0/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   1665  57187   (57187, 1665)
1   3376   4141    (3376, 4141)
2  10884  64978  (10884, 64978)
3   6598   8841    (6598, 8841)
4   6884   6874    (6874, 6884)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
0   0.000000  0.000000  0.0  0.000000  0.000924  0.057159  0.0 -0.064575   
7   0.

100%|██████████| 207582/207582 [00:21<00:00, 9559.30it/s] 


1
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/fold=1/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/fold=1/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  27316   9939   (9939, 27316)
1  10450   3550   (10450, 3550)
2  23450  10155  (10155, 23450)
3   9343  94104   (9343, 94104)
4  22826   6634   (6634, 22826)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2         3         4         5    6         7         8  \
6        NaN  NaN  NaN       NaN       NaN       NaN  NaN       NaN       NaN

100%|██████████| 207581/207581 [00:21<00:00, 9465.60it/s] 


2
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/fold=2/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/fold=2/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  51729  79228  (79228, 51729)
1   6222   1938    (1938, 6222)
2   9410  22916   (22916, 9410)
3  11052  57819  (57819, 11052)
4   3054   6879    (3054, 6879)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0        1    2         3         4       5    6         7       8  \
2        NaN      NaN  NaN       NaN       NaN     NaN  NaN       NaN     NaN

100%|██████████| 207581/207581 [00:23<00:00, 8923.31it/s] 


3
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/fold=3/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/fold=3/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  22916   1654   (22916, 1654)
1  11017   6427   (6427, 11017)
2  10291  24148  (24148, 10291)
3   3608  11224   (3608, 11224)
4  51631  26156  (51631, 26156)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2    3        4         5    6         7    8    9  ...  \
1        NaN  NaN  NaN  NaN      NaN       NaN  NaN       NaN  NaN  NaN  ...  

100%|██████████| 207581/207581 [00:24<00:00, 8623.43it/s] 


4
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC/5CV/fold=4/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+mcf7SCBC_recal/5CV/fold=4/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   6158   7846    (6158, 7846)
1   6168   2197    (6168, 2197)
2  25804  11052  (25804, 11052)
3   7756   8175    (7756, 8175)
4  54948  64983  (64983, 54948)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
3        NaN       NaN  NaN       NaN       NaN       NaN  NaN       NaN   
5   0.

100%|██████████| 1037907/1037907 [02:10<00:00, 7982.71it/s] 


u251
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/
(1037907, 285)
id1             object
id2             object
pairsFrozen     object
label            int64
0              float64
                ...   
fs1            float64
fs2            float64
fp1            float64
fp2            float64
fp3            float64
Length: 285, dtype: object
     id1    id2     pairsFrozen  label    0    1    2    3         4  \
0   1665  57187   (57187, 1665)      1  0.0  0.0  0.0  0.0  0.000924   
1  22916   1654   (22916, 1654)      1  NaN  NaN  NaN  NaN       NaN   
2  51729  79228  (79228, 51729)      1  NaN  NaN  NaN  NaN       NaN   
3   6158   7846    (6158, 7846)      1  NaN  NaN  NaN  NaN       NaN   
4  1

100%|██████████| 207582/207582 [00:21<00:00, 9827.52it/s] 


0
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/fold=0/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/fold=0/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   1665  57187   (57187, 1665)
1   3376   4141    (3376, 4141)
2  10884  64978  (10884, 64978)
3   6598   8841    (6598, 8841)
4   6884   6874    (6874, 6884)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
0   0.000000  0.000000  0.0  0.000000  0.000924  0.057159  0.0 -0.064575   
7   0.

100%|██████████| 207582/207582 [00:21<00:00, 9658.39it/s] 


1
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/fold=1/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/fold=1/
(207582, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  27316   9939   (9939, 27316)
1  10450   3550   (10450, 3550)
2  23450  10155  (10155, 23450)
3   9343  94104   (9343, 94104)
4  22826   6634   (6634, 22826)
(207582, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2         3         4         5    6         7         8  \
6        NaN  NaN  NaN       NaN       NaN       NaN  NaN       NaN       NaN

100%|██████████| 207581/207581 [00:21<00:00, 9686.30it/s] 


2
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/fold=2/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/fold=2/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  51729  79228  (79228, 51729)
1   6222   1938    (1938, 6222)
2   9410  22916   (22916, 9410)
3  11052  57819  (57819, 11052)
4   3054   6879    (3054, 6879)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0        1    2         3         4       5    6         7       8  \
2        NaN      NaN  NaN       NaN       NaN     NaN  NaN       NaN     NaN

100%|██████████| 207581/207581 [00:21<00:00, 9791.94it/s] 


3
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/fold=3/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/fold=3/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0  22916   1654   (22916, 1654)
1  11017   6427   (6427, 11017)
2  10291  24148  (24148, 10291)
3   3608  11224   (3608, 11224)
4  51631  26156  (51631, 26156)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0    1    2    3        4         5    6         7    8    9  ...  \
1        NaN  NaN  NaN  NaN      NaN       NaN  NaN       NaN  NaN  NaN  ...  

100%|██████████| 207581/207581 [00:21<00:00, 9798.42it/s] 


4
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC/5CV/fold=4/
/content/drive/My Drive/Primary Research/JLMwSCBC_notebook/dataPartitions/integratedPairs_KD+LM+FGK+SCBC_cellSpecific-NCI60Feats_5CV/lmPairs_KD+LM+FGK+u251SCBC_recal/5CV/fold=4/
(207581, 3)
id1            object
id2            object
pairsFrozen    object
dtype: object
     id1    id2     pairsFrozen
0   6158   7846    (6158, 7846)
1   6168   2197    (6168, 2197)
2  25804  11052  (25804, 11052)
3   7756   8175    (7756, 8175)
4  54948  64983  (64983, 54948)
(207581, 281)
0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
fs1    float64
fs2    float64
fp1    float64
fp2    float64
fp3    float64
Length: 281, dtype: object
           0         1    2         3         4         5    6         7  \
3        NaN       NaN  NaN       NaN       NaN       NaN  NaN       NaN   
5   0.

In [None]:
#load partitions' masks for each fold
modelDir = workDir_ph1 + 'models/'
partitionsSorted_fold = {}
for fold in np.arange(5):
  modelDir_fold = modelDir + 'models_lmPairs_KD+LM+FGK_5CV_{0}/'.format(fold)
  partitionsFilename = modelDir_fold + 'savedPartitions.pkl'
  #context manager closes each pickle file as soon as it has been read
  with open(partitionsFilename, 'rb') as partitionsFile:
    partitionsSorted_fold[fold] = pickle.load(partitionsFile)

#load models properties for each fold
#NOTE(review): the fold key is the position of the file in the sorted glob
#result, so this assumes exactly one '*Training Performance*' file per fold
#directory and that sorted order matches fold order -- verify on disk
modelsDir_patt = modelDir + 'models_lmPairs_KD+LM+FGK_5CV_*/'
modelsProperties = \
  {fold: pd.read_csv(f)
    for fold, f in enumerate(
        sorted(glob.glob(modelsDir_patt + '*Training Performance*')))}

In [None]:
#re-import modelEvaluating so edits made to the module file are picked up
#without restarting the runtime
import importlib
importlib.reload(modelEvaluating)

<module 'util.modelEvaluating' from '/content/drive/My Drive/Primary Research/proteinPairs_complexMaps/util/modelEvaluating.py'>

In [None]:
#Run modelEvaluating.evalPartitions on every (fold, cell line) test set and
#collect its four return values in dicts keyed as result[fold][cell].
candidatePartitions_fold = {}
candidatePartitions_predsFold = {}
candidatePartitions_predsProbs_fold = {}
modelsEval_properties_fold = {}
for fold in np.arange(5):
  print(fold)

  #per-cell results for this fold; fresh dicts so folds never share state
  candidatePartitions_cell = {}
  candidatePartitions_predsCell = {}
  candidatePartitions_predsProbs_cell = {}
  modelsEval_properties_cell = {}
  expName = 'lmPairs_KD+LM+FGK_5CV_{0}'.format(fold)
  for cell in ['h322', 'mcf7', 'u251']:
    print(cell)

    #specify dirs
    featMat_cellDir_new = \
      dataDir + \
        'lmPairs_KD+LM+FGK+{0}SCBC_recal/5CV/fold={1}/'.format(cell, fold)

    #load the recalibrated test features and labels written for this fold
    testData = \
      pd.read_csv(featMat_cellDir_new + 'testData.tsv.tar.bz2',
                  sep='\t', header=0)
    testLabels = pd.read_csv(featMat_cellDir_new + 'testLabels.tsv.tar.bz2',
                             sep='\t')

    #the fourth result goes into the per-cell dict (it was previously written
    #to modelsEval_properties_fold[cell], which left the per-cell dict empty
    #and made modelsEval_properties_fold[fold][cell] unavailable downstream)
    candidatePartitions_cell[cell], candidatePartitions_predsCell[cell], \
      candidatePartitions_predsProbs_cell[cell], \
        modelsEval_properties_cell[cell] = \
          modelEvaluating.evalPartitions(partitionsSorted_fold[fold],
                                         modelsProperties[fold],
                                         testData, testLabels,
                                         workDir_ph1, expName, storeModels=True)

  candidatePartitions_fold[fold] = candidatePartitions_cell
  candidatePartitions_predsFold[fold] = candidatePartitions_predsCell
  candidatePartitions_predsProbs_fold[fold] = candidatePartitions_predsProbs_cell
  modelsEval_properties_fold[fold] = modelsEval_properties_cell

In [None]:
#put this block after any cell expected to take a long time
cmdReport = alertMe.statusCheck(pushoverAPI, pushoverKey_user)
cmdReport.finishPush()

In [None]:
#Run modelEvaluating.evalTotal and evalSummary on every (fold, cell line)
#pair, collecting the evalTotal results in dicts keyed as result[fold][cell].
probsPos_acc_fold = {}
probsPos_weighted_fold = {}
testLabels_adjFold = {}
for fold in np.arange(5):
  print(fold)

  #per-cell results for this fold; probsPos_acc_cell must be created here as
  #well, otherwise it is undefined (or shared between folds)
  probsPos_acc_cell = {}
  probsPos_weighted_cell = {}
  testLabels_adjCell = {}
  expName = 'lmPairs_KD+LM+FGK_5CV_{0}'.format(fold)
  for cell in ['h322', 'mcf7', 'u251']:
    print(cell)

    #specify dirs
    featMat_cellDir_new = \
      dataDir + \
        'lmPairs_KD+LM+FGK+{0}SCBC_recal/5CV/fold={1}/'.format(cell, fold)

    #load the test labels written for this fold
    testLabels = pd.read_csv(featMat_cellDir_new + 'testLabels.tsv.tar.bz2',
                             sep='\t')

    probsPos_acc_cell[cell], probsPos_weighted_cell[cell], \
      testLabels_adjCell[cell] = \
        modelEvaluating.evalTotal(
            candidatePartitions_fold[fold][cell],
            candidatePartitions_predsFold[fold][cell],
            modelsEval_properties_fold[fold][cell],
            testLabels, workDir_ph1, expName)

    modelEvaluating.evalSummary(
        probsPos_acc_cell[cell],
        probsPos_weighted_cell[cell],
        testLabels_adjCell[cell],
        workDir_ph1, expName)

  probsPos_acc_fold[fold] = probsPos_acc_cell
  probsPos_weighted_fold[fold] = probsPos_weighted_cell
  testLabels_adjFold[fold] = testLabels_adjCell