#install packages and load libraries

Import common packages

In [1]:
import argparse, datetime, glob
import itertools as it
from multiprocessing import cpu_count, Pool
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random, re, requests
import seaborn as sns
import shutil
from scipy.stats import hmean
from sklearn.inspection import permutation_importance
import sklearn.metrics
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
import subprocess as sp
import sys
import tempfile as tf
import time
from tqdm import tqdm as progressMonitor

Enable Google Colab, mount drives, and load/define proprietary modules

In [2]:
%%capture
from google.colab import drive, files, output
drive.mount('/content/drive', force_remount=True)
!rm -r sample_data/

Define directories and principal object paths

In [3]:
#rootDir = '/content/drive/My Drive/'
rootDir = '/content/drive/My Drive/Colab Notebooks/'
workDir = rootDir + 'Primary Research/JLMwSCBC_notebook/'
#workDir_elcfs = workDir + 'proteinPairs_complexMaps/'
workDir_elcfs = rootDir + 'Primary Research/proteinPairs_complexMaps/'
workDir_other = rootDir + 'otherStudies/'

In [4]:
sys.path.insert(0, rootDir)
sys.path.append(workDir)
sys.path.append(workDir_elcfs)
sys.path.append(workDir_other)

#util functions

In [5]:
def flatten(lst):
  return [item for sublist in lst for item in sublist]

In [6]:
def freezePairs(df, col1, col2):
    df.insert(0, 'pairsFrozen',
              df.loc[:, [col1, col2]].apply(frozenset, axis=1))

    return df

In [7]:
def generatePairs_theo(prots):
  return [frozenset(pair) for pair in list(it.combinations(prots, 2))]

In [8]:
def loadComplexes(filename):
    complexes = []
    complexesFile = open(filename, 'r')
    for line in complexesFile.readlines():
        complexes.append(line.split())
    complexesFile.close()

    complexesFrozen = [frozenset(cplx) for cplx in complexes]

    return complexesFrozen

In [9]:
def findOverlap(comboList):
    pairsOverlapping = set.intersection(*[set(pairsSources[source])
                                          for source in list(comboList)])
    return pairsOverlapping

In [10]:
class CORUM(object):

  def __init__(self, corumCore_excelFile,
               nameCol='ComplexName', organismCol='Organism',
               subunitsCol='subunits(Entrez IDs)'):

    corumCols_select = [nameCol, organismCol, subunitsCol]
    self.nameCol, self.organismCol, self.subunitsCol = corumCols_select
    self.rawData = pd.read_excel(corumCore_excelFile, usecols=corumCols_select)
    self.humanData = \
      pd.read_excel(corumCore_excelFile, usecols=corumCols_select)

    self.allCplx_names, self.allCplx_frozen, self.allCplx_species, \
      self.allCplx_total, self.allProts, self.organisms = \
        self.formatDF(self.rawData)

    self.humanComplexes_names, self.humanCplx_frozen, _, self.humanCplx_total, \
      self.humanProts, _ = self.formatDF(
          self.rawData.loc[self.rawData[self.organismCol]=='Human', :])

    self.allPairs, self.allPairs_human = \
      self.generatePairs(self.allProts), self.generatePairs(self.humanProts)

  def dropEmpties_freeze2List(self, line):
    return frozenset([ele
                      for ele in line.replace('None', '').split(';') if ele])

  def generatePairs(self, prots):
    return [frozenset(pair) for pair in list(it.combinations(prots, 2))]

  def formatDF(self, df):
    cplxNames = [row[self.nameCol] for _, row in df.iterrows()]
    cplxFrozen = \
     [self.dropEmpties_freeze2List(row[self.subunitsCol])
     for _, row in df.iterrows()]
    cplxOrganisms = [row[self.organismCol] for _, row in df.iterrows()]
    numComplexes = sum([1 for _, row in df.iterrows()])
    prots = [str(ele) for ele in list(set().union(*
     [row[self.subunitsCol].replace('None', '').split(';')
     for _, row in df.iterrows()])) if ele]
    species = list(set([row[self.organismCol] for _, row in df.iterrows()]))

    return cplxNames, cplxFrozen, cplxOrganisms, numComplexes, prots, species


#sources

1.   [CORUM](http://mips.helmholtz-muenchen.de/corum/#download)

2.   [hu.MAP v1.0](http://hu.proteincomplexes.org/download) (HEK293T)
>    *(Obsolete)*
> *   BioPlex v1.0 - Huttlin 2015 (AP-MS)
> *   Hein 2015 (AP-MS)
> *   Wan 2015 (CF-MS)

3.   [hu.MAP 2.0](http://humap2.proteincomplexes.org/download) (HEK293T)
> *  BioPlex v2.0 - Huttlin 2017 (AP-MS)
> *  Boldt 2016 (AP-MS)
> *  Gupta 2015 (Proximity)
> *  Hein 2015 (AP-MS)
> *  Wan 2015 (AP-MS)
> *  Youn 2018 (Proximity)
> *  Treiber 2017; Mallam 2019 (RNA-Pulldown)

4.   [BioPlex](https://bioplex.hms.harvard.edu/data/BioPlex_BaitPreyPairs_noFilters_293T_10K_Dec_2019.tsv) v3.0 2021 (AP-MS)

5.   [Lugo-Martinez 2019](https://drive.google.com/drive/folders/191Y14LBVZnIxWJysmJ7I_UL451aStVvj?usp=sharing)
> *  HPA - Uniprot 2017
> *  HPA-DL - Ouyang 2019
> *  NCI-60 - Gholami 2013
> *  FANTOM - Forrest 2014
> *  GTEx - Yizhak 2019

6.   [SubCellBarCode](https://www.subcellbarcode.org/) 2019

7.   [STRING](https://string-db.org/cgi/download.pl)





key variables: proteins, pairs, complexes, labels, scores

##CORUM

In [None]:
corumData_2018Dir = workDir_elcfs + 'sourceData/corum/2018.09.03/corum.xlsx'
accessCORUM = CORUM(corumData_2018Dir)

In [None]:
humanProts = accessCORUM.humanProts
print(len(humanProts))

3375


In [None]:
pickle.dump(humanProts, open('./humanProts_CORUMlatest.pkl', 'wb'))

In [None]:
allPairs_corumLabeled = \
  pd.DataFrame(accessCORUM.allPairs_human, columns=['idi', 'idii'], dtype='str')
allPairs_corumLabeled.insert(2, 'pairsFrozen',
                             allPairs_corumLabeled.loc[:, ['idi', 'idii']].apply(frozenset, axis=1))
allPairs_corumLabeled.insert(3, 'label', 0)

posPairs_humanProt = \
  flatten(
      [[frozenset(pair) for pair in list(it.combinations(cplx, 2))]
       for cplx in accessCORUM.humanCplx_frozen])
allPairs_corumLabeled.loc[
    (allPairs_corumLabeled.pairsFrozen.isin(posPairs_humanProt)), 'label'] = 1

print('+/- Pairs DF dims: {0}'.format(allPairs_corumLabeled.shape))
print('+ Pairs DF dims: {0}'.format(
    allPairs_corumLabeled.loc[allPairs_corumLabeled.label==1, :].shape))
print('- Pairs DF dims: {0}'.format(
    allPairs_corumLabeled.loc[allPairs_corumLabeled.label==0, :].shape))

+/- Pairs DF dims: (5693625, 4)
+ Pairs DF dims: (38018, 4)
- Pairs DF dims: (5655607, 4)


In [None]:
allPairs_corumLabeled.to_csv(
    workDir_elcfs + 'sourceData/corum/2018.07.01/allPairs_corumLabeled.tsv',
    sep='\t', index=False)
pickle.dump(allPairs_corumLabeled,
            open(workDir_elcfs +
                 'sourceData/corum/2018.07.01/allPairs_corumLabeled.pkl', 'wb'))

##hu.MAP 1.0

(BioPlex 1.0 included)

In [None]:
humap1Dir = workDir + 'proteinComplexes/huMAP_1.0/'
humap1Data_files = glob.glob(humap1Dir + '*ppis*')

In [None]:
humap1Data_files

['/content/drive/My Drive/Colab Notebooks/Primary Research/JLMwSCBC_notebook/proteinComplexes/huMAP_1.0/test_neg_ppis.txt',
 '/content/drive/My Drive/Colab Notebooks/Primary Research/JLMwSCBC_notebook/proteinComplexes/huMAP_1.0/test_ppis.txt',
 '/content/drive/My Drive/Colab Notebooks/Primary Research/JLMwSCBC_notebook/proteinComplexes/huMAP_1.0/train_ppis.txt',
 '/content/drive/My Drive/Colab Notebooks/Primary Research/JLMwSCBC_notebook/proteinComplexes/huMAP_1.0/train_neg_ppis.txt']

In [None]:
humap1Data = \
  {key.split('/')[-1].split('_ppis.txt')[0]:
   pd.read_csv(key,
               engine='python', sep='\t| ', names=['idi', 'idii'], dtype='str')
   for key in humap1Data_files}
humap1Data = \
  {key: pd.concat([val, val.loc[:, ['idi', 'idii']].apply(frozenset, axis=1)],
                  axis=1)
  for key, val in humap1Data.items()}

humap1Data = \
  {key: val.rename(columns={0: 'pairsFrozen'})
  for key, val in humap1Data.items()}
humap1Data = \
  {key: pd.concat([val, pd.DataFrame([0 for i in val.index])], axis=1)
   if 'neg' in key
   else pd.concat([val, pd.DataFrame([1 for i in val.index])], axis=1)
   for key, val in humap1Data.items()}
humap1Data = \
  {key: val.rename(columns={0: 'label'}) for key, val in humap1Data.items()}

KeyboardInterrupt: ignored

In [None]:
humap1Pairs = \
  pd.concat([df for df in humap1Data.values()], axis=0, ignore_index=True)

In [None]:
humap1Prots = \
  list(set(humap1Pairs.idi.to_list()).union(set(humap1Pairs.idii.to_list())))

In [None]:
humap1Pairs

Unnamed: 0,idi,idii,pairsFrozen,label
0,26395,2004,"(26395, 2004)",0
1,24338,5361,"(5361, 24338)",0
2,67891,25686,"(25686, 67891)",0
3,23338,192663,"(192663, 23338)",0
4,75788,777775,"(777775, 75788)",0
...,...,...,...,...
5455116,6204,5429,"(5429, 6204)",0
5455117,354,66340,"(66340, 354)",0
5455118,29218,5430,"(29218, 5430)",0
5455119,67530,100009214,"(100009214, 67530)",0


Don't run this cell with less than 24G of free RAM

In [None]:
humap1Matrix = \
  pd.read_csv(
      humap1Dir + 'blake_bioplex_feature_revisitMerge_pairsOnly_preyMerge2_' + \
      'heinCollapseMerge_pairsOnly_preyMerge2.txt',
      sep=',', usecols=['geneid1', 'geneid2'], na_values=['NaN', np.nan])

  pd.read_csv(


In [None]:
humap1Matrix.loc[:, ['geneid1', 'geneid2']]

Unnamed: 0,geneid1,geneid2
0,1891,51228
1,55720,662
2,51633,7251
3,10059,3029
4,1399,1738


In [None]:
humap1Pairset_protsPairs = \
  pickle.load(open(humap1Dir + 'humap1Pairset_protsPairs.pkl', 'rb'))

humap1Matrix_prots = \
  pickle.load(open(humap1Dir + 'humap1Matrix_prots.pkl', 'rb'))
humap1Matrix_pairs = \
  pickle.load(open(humap1Dir + 'humap1Matrix_pairs.pkl', 'rb'))
humap1Matrix_pairsAll = \
  pickle.load(open(humap1Dir + 'humap1Matrix_pairsAll.pkl', 'rb'))

Don't run this cell with less than 24G of free RAM

In [None]:
humap1Matrix_featuresDir = \
  humap1Dir + 'blake_bioplex_feature_revisitMerge_pairsOnly_preyMerge2_' + \
    'heinCollapseMerge_pairsOnly_preyMerge2.txt'
humap1Matrix_features = \
  pd.read_csv(humap1Matrix_featuresDir,
              usecols=['geneid1', 'geneid2'],
              dtype={'geneid1': 'str', 'geneid2': 'str'})
humap1Matrix_features['pairsFrozen'] = \
  humap1Matrix_features.loc[:, ['geneid1', 'geneid2']].apply(frozenset, axis=1)
humap1Matrix_prots = \
  list(set(humap1Matrix_features.geneid1.to_list()).union(
      set(humap1Matrix_features.geneid2.to_list())))
humap1Matrix_pairs = \
  humap1Matrix_features.pairsFrozen.to_list()
humap1Matrix_pairsAll = \
  [frozenset(pair) for pair in list(it.combinations(humap1Matrix_prots, 2))]

##hu.MAP 2.0

(BioPlex 2.0 included)

In [None]:
humap2Dir = workDir + 'proteinComplexes/huMAP_2.0/'
humap2Data_files = glob.glob(humap2Dir + 'humap2*20200818*')
humap2PPIs_dir = humap2Dir + 'humap2_ppis_geneid_20200821.pairsWprob'
humap2Matrix_featuresDir = \
  humap2Dir + 'humap2_feature_matrix_20200820 2.featmat.gz'

In [None]:
humap2Data = \
  {key.split('/')[-1].split('humap2_')[-1].split('_geneid')[0]:
   pd.read_csv(key, sep='\t', dtype='str')
   for key in humap2Data_files if '_ppis' in key}
humap2Data = \
  {key:
   pd.concat([val,
              val.loc[:, ['IDi', 'IDii']].apply(frozenset, axis=1)], axis=1)
  for key, val in humap2Data.items()}
humap2Data = \
  {key: val.rename(columns={0: 'pairsFrozen'})
  for key, val in humap2Data.items()}
humap2Data = \
  {key:
   pd.concat([val, pd.DataFrame([0 for i in val.index])], axis=1)
   if 'neg' in key
   else pd.concat([val, pd.DataFrame([1 for i in val.index])], axis=1)
   for key, val in humap2Data.items()}
humap2Data = \
  {key: val.rename(columns={0: 'label'}) for key, val in humap2Data.items()}

In [None]:
humap2Pairs = \
  pd.concat([df for df in humap2Data.values()], axis=0, ignore_index=True)

In [None]:
humap2Prots = \
  list(set(humap2Pairs.IDi.to_list()).union(set(humap2Pairs.IDii.to_list())))

In [None]:
humap2Complexes_groundTruth = \
  {key.split('/')[-1].split('humap2_')[-1].split('_geneid')[0]:
   loadComplexes(key) for key in humap2Data_files if 'complex' in key}

In [None]:
humap2Prots = \
    list(set().union(*[list(set(val.loc[:, 'IDi'].to_list()).union(
        set(val.loc[:, 'IDii'].to_list()))) for val in humap2Data.values()]))
humap2Pairs = list(
    set().union(*[val.pairsFrozen.to_list() for val in humap2Data.values()]))

humap2Pairs_all = \
  [frozenset(pair) for pair in list(it.combinations(humap2Prots, 2))]

In [None]:
humap2PPIs = \
  pd.read_csv(humap2PPIs_dir, sep='\t', header=0,
              names=['IDi', 'IDii', 'score'],
              dtype={'IDi': 'str', 'IDii': 'str', 'score': 'float64'})
humap2PPIs.insert(2, 'pairsFrozen',
                  humap2PPIs.loc[:, ['IDi', 'IDii']].apply(frozenset, axis=1))
humap2Prots_allAvailable = list(set(
    humap2PPIs.loc[:, 'IDi'].to_list()).union(
        set(humap2PPIs.loc[:, 'IDii'].to_list())))

In [None]:
humap2Matrix_features = pd.read_csv(humap2Matrix_featuresDir, nrows=10)

In [None]:
humap2Matrix = \
  pd.read_csv(humap2Dir + 'humap2_feature_matrix_20200820 2.featmat.gz',
              na_values=['NaN', np.nan], usecols=['id1', 'id2'], dtype='str')

In [None]:
len(humap2Prots)

2993

##BioPlex 3.0 (not yet incorporated)

In [None]:
bioplex3Dir = workDir + 'proteinComplexes/bioplex 3.0/'

In [None]:
bioplex3Data = \
  bioplex3Dir + 'BioPlex_BaitPreyPairs_noFilters_293T_10K_Dec_2019.tsv'

In [None]:
dataMat_bioplex3 = pd.read_csv(bioplex3Data, sep='\t')

In [None]:
dataMat_bioplex3.head()

Unnamed: 0,CompPASS_ID,bait_symbol,bait_geneid,db_protein_id,symbol,prot_description,gene_id,ave_apsm,nwdscore,zscore,...,entropy,uPeps,ratio,total_psms,ratioTotalPSMs,UtoTratio,pWrongID,pNoInt,pInt,Unnamed: 20
0,106246,ADA,100,sp|P05387|RLA2_HUMAN,RPLP2,60S acidic ribosomal protein P2 OS=Homo sapie...,6181.0,5,0.03,-0.6,...,0.902393,6,0.999452,175120,6e-05,0.6,0.0,0.99922,0.00078,
1,106246,ADA,100,sp|P07737|PROF1_HUMAN,PFN1,Profilin-1 OS=Homo sapiens GN=PFN1 PE=1 SV=2,5216.0,5,0.03,-0.11,...,0.902393,6,0.992608,116064,9e-05,0.6,0.0,0.9998,0.0002,
2,106234,BEND7,222389,sp|P22626|ROA2_HUMAN,HNRNPA2B1,Heterogeneous nuclear ribonucleoproteins A2/B...,3181.0,41,0.09,2.89,...,0.965803,20,1.0,427487,0.00019,0.243902,0.0,0.98014,0.01986,
3,106234,BEND7,222389,sp|O95232|LC7L3_HUMAN,LUC7L3,Luc7-like protein 3 OS=Homo sapiens GN=LUC7L3...,51747.0,2,0.04,0.5,...,0.881291,3,0.471801,21336,0.00019,0.75,0.0,0.99993,7e-05,
4,106234,BEND7,222389,sp|Q04837|SSBP_HUMAN,SSBP1,"Single-stranded DNA-binding protein, mitochon...",6742.0,3,0.02,-0.99,...,1.0,3,0.993794,123907,5e-05,0.5,0.0,0.99988,0.00012,


In [None]:
dataMat_bioplex3.columns.to_list()

['CompPASS_ID',
 'bait_symbol',
 'bait_geneid',
 'db_protein_id',
 'symbol',
 'prot_description',
 'gene_id',
 'ave_apsm',
 'nwdscore',
 'zscore',
 'plate_zscore',
 'entropy',
 'uPeps',
 'ratio',
 'total_psms',
 'ratioTotalPSMs',
 'UtoTratio',
 'pWrongID',
 'pNoInt',
 'pInt',
 'Unnamed: 20']

In [None]:
dataMat_bioplex3 = \
  pd.read_csv(bioplex3Data, sep='\t', usecols=['bait_geneid', 'gene_id'],
              na_values=['nan'], dtype='str')

dataMat_bioplex3.rename(columns={'bait_geneid': 'id1', 'gene_id': 'id2'},
                        inplace=True)

protsBioplex3 = \
  list(set(dataMat_bioplex3.id1.to_list()).union(
      set(dataMat_bioplex3.id2.to_list())))

don't run without sufficient RAM

In [None]:
dataMat_bioplex3 = \
  pd.read_csv(bioplex3Data, sep='\t', usecols=['bait_geneid', 'gene_id'],
              dtype='str')

dataMat_bioplex3.rename(columns={'bait_geneid': 'id1', 'gene_id': 'id2'},
                        inplace=True)

dataMat_bioplex3.insert(2, 'pairsFrozen',
                        dataMat_bioplex3.loc[:, ['id1', 'id2']].apply(
                            frozenset, axis=1))

protsBioplex3 = \
  list(set(dataMat_bioplex3.id1.to_list()).union(
      set(dataMat_bioplex3.id2.to_list())))

pairsBioplex3_theoretical = \
  [frozenset(pair) for pair in list(it.combinations(protsBioplex3, 2))]
pairsBioplex3 = list(set(dataMat_bioplex3.pairsFrozen.to_list()))

In [None]:
pairsBioplex3_theoretical = \
  [frozenset(pair) for pair in list(it.combinations(protsBioplex3, 2))]

In [None]:
len(pairsBioplex3_theoretical)

got dups

In [None]:
len(protsBioplex3)

15094

In [None]:
dataMat_bioplex3.shape

(5851900, 3)

In [None]:
len(pairsBioplex3)

5714780

theoretical pairs from BioPlex 3.0

113,906,871

###Mapping hu.MAP features across releases 1->3

In [None]:
humap1Matrix_features = \
  pd.read_csv(
      humap1Dir + 'blake_bioplex_feature_revisitMerge_pairsOnly_preyMerge2_' + \
      'heinCollapseMerge_pairsOnly_preyMerge2.txt',
      sep=',', nrows=5)

In [None]:
humap1Matrix_features.columns.to_list()

In [None]:
bioplexColumns_humap2 = \
  [col for col in humap2Matrix_features.columns.to_list() if 'bioplex' in col]

In [None]:
featMap_humap1Col_humap2Col_bioplex = \
  {humap2Matrix_col:
   max([humap1Matrix_col
        for humap1Matrix_col in humap1Matrix_features.columns.to_list()
        if humap1Matrix_col in humap2Matrix_col])
   for humap2Matrix_col in bioplexColumns_humap2}

In [None]:
featMap_humap1Col_humap2Col_bioplex

{'nwdscore_bioplex2': 'nwdscore',
 'zscore_bioplex2': 'zscore',
 'plate_zscore_bioplex2': 'zscore',
 'entropy_bioplex2': 'entropy',
 'uPeps_bioplex2': 'uPeps',
 'ratio_bioplex2': 'ratio',
 'total_psms_bioplex2': 'total_psms',
 'ratioTotalPSMs_bioplex2': 'ratioTotalPSMs',
 'UtoTratio_bioplex2': 'ratio',
 'neg_ln_pval_bioplex2_Z4': 'neg_ln_pval',
 'pair_count_bioplex2_Z4': 'pair_count',
 'neg_ln_pval_bioplex2_Z2': 'neg_ln_pval',
 'pair_count_bioplex2_Z2': 'pair_count'}

In [None]:
dataMat_bioplex3.columns.to_list()[8:-1]

['nwdscore',
 'zscore',
 'plate_zscore',
 'entropy',
 'uPeps',
 'ratio',
 'total_psms',
 'ratioTotalPSMs',
 'UtoTratio',
 'pWrongID',
 'pNoInt',
 'pInt']

need to figure out what happened to pval(2x) and pair count(2x) features from BioPlex 2.0 to BioPlex 3.0

need to figure out what relationship, if any, Bioplex 3.0 features, pWrongID, pNoInt, and pInt have with Bioplex 2.0 features

In [None]:
len(protsBioplex3)

15094

In [None]:
len(humap2Prots)

2993

In [None]:
len(humap2Prots_allAvailable)

15433

In [None]:
set(humap2Prots).issuperset(set(protsBioplex3))

False

In [None]:
set(humap2Prots).issubset(set(protsBioplex3))

False

In [None]:
set(humap2Prots_allAvailable).issuperset(set(protsBioplex3))

False

In [None]:
set(humap2Prots_allAvailable).issubset(set(protsBioplex3))

False

In [None]:
len(set(humap2Prots).difference(set(protsBioplex3)))

184

In [None]:
len(set(protsBioplex3).difference(set(humap2Prots)))

12285

In [None]:
len(set(humap2Prots_allAvailable).difference(set(protsBioplex3)))

1917

In [None]:
len(set(protsBioplex3).difference(set(humap2Prots_allAvailable)))

1578

###Compare humap1, humap2 to CORUM (latest)

In [None]:
print(set(humap1Prots).issubset(set(accessCORUM.humanProts)))
print(len(set(accessCORUM.humanProts).difference(set(humap1Prots))))

False
1108


In [None]:
print(set(humap2Prots).issubset(set(accessCORUM.humanProts)))
print(len(set(accessCORUM.humanProts).difference(set(humap2Prots))))

False
584


In [None]:
humap1Prots_absentCORUM = \
  set(humap1Prots).difference(set(accessCORUM.humanProts))

print(humap1Prots_absentCORUM.issubset(set(accessCORUM.humanProts)))
print(len(humap1Prots_absentCORUM))

False
1568


In [None]:
humap2Prots_absentCORUM = \
  set(humap2Prots).difference(set(accessCORUM.humanProts))

print(humap2Prots_absentCORUM.issubset(set(accessCORUM.humanProts)))
print(len(humap2Prots_absentCORUM))

False
202


In [None]:
set(humap2Pairs.pairsFrozen.to_list()).issubset(set(allPairs_corumLabeled.pairsFrozen.to_list()))

False

In [None]:
cmpLabels_corumHumap2 = \
  pd.merge(humap2Pairs.loc[:, ['pairsFrozen', 'label']].rename(
      columns={'label': 'labelHumap2'}),
           allPairs_corumLabeled.loc[:, ['pairsFrozen', 'label']].rename(
              columns={'label': 'labelCORUM'}),
           on=['pairsFrozen'], how='inner')

In [None]:
cmpLabels_corumHumap2.loc[(
    (cmpLabels_corumHumap2.labelCORUM==1) &
     (cmpLabels_corumHumap2.labelHumap2==1)), :]

Unnamed: 0,pairsFrozen,labelHumap2,labelCORUM
0,"(6627, 51428)",1,1
1,"(3958, 6632)",1,1
2,"(8648, 5530)",1,1
3,"(51079, 4720)",1,1
4,"(5440, 5437)",1,1
...,...,...,...
1464514,"(121536, 23512)",1,1
1464515,"(6714, 2099)",1,1
1464516,"(8449, 7756)",1,1
1464517,"(4712, 4540)",1,1


In [None]:
cmpLabels_corumHumap2.loc[(
    (cmpLabels_corumHumap2.labelCORUM==0) &
     (cmpLabels_corumHumap2.labelHumap2==0)), :]

Unnamed: 0,pairsFrozen,labelHumap2,labelCORUM
6971,"(2982, 25929)",0,0
6972,"(79155, 2260)",0,0
6973,"(29127, 6224)",0,0
6974,"(11154, 7334)",0,0
6975,"(123720, 23016)",0,0
...,...,...,...
3031817,"(9092, 81027)",0,0
3031818,"(1654, 192111)",0,0
3031819,"(6231, 3655)",0,0
3031820,"(64963, 7283)",0,0


In [None]:
cmpLabels_corumHumap2.loc[(
    ((cmpLabels_corumHumap2.labelCORUM==1) &
     (cmpLabels_corumHumap2.labelHumap2==0)) |
      ((cmpLabels_corumHumap2.labelCORUM==0) &
       (cmpLabels_corumHumap2.labelHumap2==1))), :]

Unnamed: 0,pairsFrozen,labelHumap2,labelCORUM
134,"(1642, 9343)",1,0
157,"(203068, 9343)",1,0
202,"(7862, 23338)",1,0
235,"(203068, 4436)",1,0
241,"(3551, 4792)",1,0
...,...,...,...
2954993,"(2932, 3320)",0,1
2980659,"(5747, 5605)",0,1
2984388,"(8473, 10445)",0,1
2988118,"(2648, 4149)",0,1


In [None]:
corumComplexes_posPairs = \
  [cplx for cplx in accessCORUM.humanCplx_frozen
   if cmpLabels_corumHumap2.loc[2954993, 'pairsFrozen'].issubset(cplx)]

In [None]:
corumComplexes_posPairs

[frozenset({'2932', '3320', '5315'})]

In [None]:
corumComplexes_posPairs = \
  [cplx for cplx in accessCORUM.humanCplx_frozen
   if cmpLabels_corumHumap2.loc[134, 'pairsFrozen'].issubset(cplx)]

In [None]:
corumComplexes_posPairs

[]

##Lugo-Martinez
*   HPA
*   NCI-60
*   FANTOM
*   gTEX
*   HPA-microscopy deep-learning driven location classification

In [None]:
lmData_dir = workDir + 'proteinComplexes/Lugo-Martinez/'
lmExpanded_pairs = \
  pickle.load(open(lmData_dir + 'expandedPairset_protsPairs.pkl', 'rb'))['pairs']

featMat_lmExpanded2_pklDir = lmData_dir + 'lmExpanded_FANTOM+gTEX+HPA-DL/' + \
  'lmIntegrated2_MM+TSCS+Exp+Abun+Loc+Kaggle_featuresMatrix.pkl'
featMat_lmExpanded2_pairs = pickle.load(open(featMat_lmExpanded2_pklDir, 'rb'))
featMat_lmExpanded2_pairs = \
  list(set(featMat_lmExpanded2_pairs.pairsFrozen.to_list()))

In [None]:
lmData_dir = workDir + 'proteinComplexes/Lugo-Martinez/'

lmExpanded_protsPairs = \
  pickle.load(open(lmData_dir + 'expandedPairset_protsPairs.pkl', 'rb'))
lmExpanded_prots = lmExpanded_protsPairs['prots']
lmExpanded_pairs = lmExpanded_protsPairs['pairs']
lmExpanded_pairsAll = lmExpanded_protsPairs['allPairs']

featMat_lmExpanded2_dir = lmData_dir + 'lmExpanded_FANTOM+gTEX+HPA-DL/' + \
  'lmIntegrated2_MM+TSCS+Exp+Abun+Loc+Kaggle_featuresMatrix.tsv'
featMat_lmExpanded2_pklDir = lmData_dir + 'lmExpanded_FANTOM+gTEX+HPA-DL/' + \
  'lmIntegrated2_MM+TSCS+Exp+Abun+Loc+Kaggle_featuresMatrix.pkl'
featMat_lmExpanded2 = pickle.load(open(featMat_lmExpanded2_pklDir, 'rb'))
featMat_lmExpanded2_prots = \
  list(set(featMat_lmExpanded2.loc[:, 'idi'].to_list()).union(
      set(featMat_lmExpanded2.loc[:, 'idii'])))
featMat_lmExpanded2_pairs = \
  list(set(featMat_lmExpanded2.pairsFrozen.to_list()))
featMat_lmExpanded2_allPairs = \
  [frozenset(pair)
  for pair in list(it.combinations(featMat_lmExpanded2_prots, 2))]

In [None]:
len(featMat_lmExpanded2_pairs)

1826385

###HPA

In [None]:
hpaData_path = lmData_dir + 'hpaPerTissue.tsv'
hpaData = pd.read_csv(hpaData_path, sep='\t')

In [None]:
hpaData.insert(1, 'GeneID_singleEntry',
    [list(filter(None, entry.split(' '))) if ' ' in str(entry) else entry for entry in hpaData.GeneID.to_list()])
hpaData_geneID_noSplits = hpaData.explode('GeneID_singleEntry')
hpaProts = \
  [x for x in hpaData_geneID_noSplits.loc[hpaData_geneID_noSplits.GeneID_singleEntry.notnull(),
                                          'GeneID_singleEntry'].to_list()]

###NCI-60

In [None]:
protExp_cellLine_nci60Path = lmData_dir + 'nci60_all_abundanceLFQPerCellLineProteome.tsv'
protExp_tissueLevel_nci60Path = lmData_dir + 'nci60_all_abundanceLFQPerTissueDeep.tsv'

protExp_cellLine_nci60 = pd.read_csv(protExp_cellLine_nci60Path, sep='\t')
protExp_tissueLevel_nci60 = pd.read_csv(protExp_tissueLevel_nci60Path, sep='\t')

In [None]:
protsNCI60_cellLevel = list(set(protExp_cellLine_nci60.loc[protExp_cellLine_nci60.GeneID.notnull(), 'GeneID'].to_list()))
protsNCI60_tissueLevel = list(set(protExp_tissueLevel_nci60.loc[protExp_tissueLevel_nci60.GeneID.notnull(), 'GeneID'].to_list()))

###load, save, update results

In [None]:
pickle.dump({'humap1Pairs': humap1Pairs,
             'humap2Pairs': humap2Pairs,
             'bioplex3Pairs': bioplex3Pairs},
            open('./availablePairs_humap1+humap2+bioplex3.pkl', 'wb'))

In [None]:
pickle.dump({'lmExpanded_pairs': lmExpanded_pairs,
             'lmExpanded2_pairs': featMat_lmExpanded2_pairs},
            open('./availablePairs_expandedPairset+expandedPairset2.pkl', 'wb'))

In [None]:
pickle.dump({'stringPairs': stringPairs},
            open('./availablePairs_string.pkl', 'wb'))

##SubCellBarCode

In [None]:
scbcDir = workDir + 'proteinComplexes/scbc/'
scbcData = pd.read_excel(scbcDir + 'scbc_quantitative_ms(S1).xlsx', sheet_name=None)
scbcGenename_subunitID_mapping = pd.read_excel(scbcDir + 'genename->geneid_08092023.xlsx', sheet_name=None)

In [None]:
scbcProts = \
  scbcGenename_subunitID_mapping['genename->geneid'].loc[
      scbcGenename_subunitID_mapping['genename->geneid'].GeneID.notnull(), :].GeneID.unique()
scbcProts_pairsAll = \
  [frozenset(pair) for pair in list(it.combinations(scbcProts, 2))]

In [None]:
len(scbcProts)

12389

In [None]:
len(scbcProts_pairsAll)

76737466

###generate mapping file

In [None]:
subunitsSCBC_genename = \
  list(set().union(*
   [val.loc[:, 'Protein'].to_list()
   for name, val in scbcData.items()
   if name in ['A431', 'MCF7', 'H322', 'HCC827', 'U251']]))

In [None]:
subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID = \
  pd.read_excel(scbcDir + 'subunitsSCBC_genename->uniprotkb_08092023.xlsx',
              na_values=['None'], dtype='str', sheet_name=None)

In [None]:
for dfName, df in subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID.items():
  print(dfName)
  print(df.dtypes)
  cols2Drop = [col for col in df.columns.to_list() if 'Unnamed' in col]
  print(cols2Drop)
  subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID[dfName].drop(
      columns=cols2Drop, inplace=True)

In [None]:
subunitsSCBC_genename2GeneID_noMerge_mapping = \
  subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['genename->Uniprotkb'].copy()

subunitsSCBC_genename2GeneID_noMerge_mapping['GeneID'] = ''

In [None]:
subunitsSCBC_genename2GeneID_noMerge_mapping.loc[(
    (subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB.isin(
        subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbNOConverts->GeneID-0+1'].UniProtKB.to_list())) |
    (subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB.isna())), 'GeneID'] = np.nan

In [None]:
subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot = \
  subunitsSCBC_genename2GeneID_noMerge_mapping.loc[(
      (~subunitsSCBC_genename2GeneID_noMerge_mapping.duplicated(subset=['UniProtKB'], keep=False)) &
       (subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB.notnull()) &
        (subunitsSCBC_genename2GeneID_noMerge_mapping.GeneID.notnull())), :].UniProtKB.to_list()

In [None]:
subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot_mapper = \
  subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbConverts->GeneID-0+1'].loc[(
      (subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbConverts->GeneID-0+1'].UniProtKB.isin(
          subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot)) &
           (~subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbConverts->GeneID-0+1'].duplicated(subset=['UniProtKB'], keep=False))), :].set_index(
              'UniProtKB').to_dict()

In [None]:
subunitsSCBC_genename2GeneID_noMerge_mapping.loc[subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB.isin(
    subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot), 'GeneID'] = \
    subunitsSCBC_genename2GeneID_noMerge_mapping.loc[subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB.isin(
        subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot), 'UniProtKB'].map(
            subunitIDs_nonnullUniprot_nonnullGeneid_nonduplicatedUniprot_mapper['GeneID'])

In [None]:
for i, subunitID in enumerate(subunitsSCBC_genename2GeneID_noMerge_mapping.loc[(
    (subunitsSCBC_genename2GeneID_noMerge_mapping.GeneID.notnull()) &
     (subunitsSCBC_genename2GeneID_noMerge_mapping.GeneID=='') &
      (subunitsSCBC_genename2GeneID_noMerge_mapping.duplicated(
          subset=['UniProtKB'], keep=False))), 'UniProtKB'].to_list()):
          subunitsSCBC_genename2GeneID_noMerge_mapping.loc[
              subunitsSCBC_genename2GeneID_noMerge_mapping.UniProtKB==subunitID, 'GeneID'] = \
            subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbConverts->GeneID-0+1'].loc[
                subunitsSCBC_genename2Uniprotkb_uniprotkb2GeneID['uniprotkbConverts->GeneID-0+1'].UniProtKB==subunitID, 'GeneID'].to_list()

##STRING

In [None]:
stringDir = workDir + 'proteinComplexes/STRING/'
data = pd.read_csv(stringDir + 'stringPPIs_geneidMapping.tsv', sep='\t',
                   usecols=['geneid1', 'geneid2'], na_values=['NaN', np.nan],
                   dtype='str')
data = data.loc[((data.geneid1.notnull()) & (data.geneid2.notnull())), :].copy()

In [None]:
data.loc[:, ['geneid1', 'geneid2']]

Unnamed: 0,geneid1,geneid2
0,381,5137
1,381,5062
2,381,9609
3,381,2889
5,381,23527
...,...,...
11938493,219952,390142
11938494,219952,441608
11938495,219952,337969
11938496,219952,219983


In [None]:
stringPairs = data.loc[:, ['geneid1', 'geneid2']].apply(frozenset, axis=1).to_list()

In [None]:
pickle.dump({'stringPairs': stringPairs},
            open('./availablePairs_string.pkl', 'wb'))

In [None]:
len(stringPairs)

11938498

In [None]:
stringPairs

In [None]:
stringDir = workDir + 'proteinComplexes/STRING/'
stringPPIs = pd.read_csv(stringDir + '9606.protein.links.detailed.v11.5.txt', sep=' ')
stringGeneid_mapping = \
  pd.read_excel(stringDir + 'string->geneID_08092023.xlsx', sheet_name=None)

stringProts = list(set(stringPPIs.protein1.to_list()).union(
    set(stringPPIs.protein2.to_list())))
stringProts_geneIDs = \
  list(stringGeneid_mapping['string->GeneID'].GeneID.unique())
stringPairs_all = list(it.combinations(stringProts_geneIDs, 2))

In [None]:
len(stringProts)

19385

In [None]:
stringPPIs

Unnamed: 0,protein1,protein2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,0,0,0,54,0,0,144,155
1,9606.ENSP00000000233,9606.ENSP00000314067,0,0,0,0,180,0,61,197
2,9606.ENSP00000000233,9606.ENSP00000263116,0,0,0,62,152,0,101,222
3,9606.ENSP00000000233,9606.ENSP00000361263,0,0,0,0,161,0,64,181
4,9606.ENSP00000000233,9606.ENSP00000409666,0,0,0,82,213,0,72,270
...,...,...,...,...,...,...,...,...,...,...
11938493,9606.ENSP00000485678,9606.ENSP00000354800,0,0,0,213,0,0,0,213
11938494,9606.ENSP00000485678,9606.ENSP00000308270,0,0,0,152,0,0,0,151
11938495,9606.ENSP00000485678,9606.ENSP00000335660,0,0,0,182,0,0,0,181
11938496,9606.ENSP00000485678,9606.ENSP00000300127,0,0,0,155,0,0,0,154


 187,879,420 (19,385 choose 2 -- All Pairs) - 11,938,498 (Present Pairs) = 175,940,922

In [None]:
len(stringGeneid_mapping['string->GeneID'].GeneID.unique())

18286

167,161,470 (18,285 choose 2 -- GeneID All Pairs) - 11,169,978 (Gene ID Present Pairs) = 155,991,492



In [None]:
stringPPIs.dtypes

protein1          object
protein2          object
neighborhood       int64
fusion             int64
cooccurence        int64
coexpression       int64
experimental       int64
database           int64
textmining         int64
combined_score     int64
dtype: object

#Overlap

*   CORUM (latest)
*   BioPlex 3.0
*   hu.MAP 1.0
*   hu.MAP 2.0
*   Expanded pairset, v2
*   SCBC
*   STRING



##Proteins

In [None]:
proteinsSources_all = \
 {'corumProts': accessCORUM.humanProts,
  'humap1Prots': humap1Prots,
  'humap2Prots': humap2Prots,
  'humap1Prots_labeled': humap1Matrix_prots,
  'humap2Prots_labeled': humap2Prots_allAvailable,
  'expandedPairset_v1': lmExpanded_prots,
  'expandedPairset_v2': featMat_lmExpanded2_prots,
  'hpaProts': hpaProts,
  'nci60_cell': protsNCI60_cellLevel,
  'nci60_tissue': protsNCI60_tissueLevel,
  'scbcProts': scbcProts,
  'stringProts': stringProts_geneIDs,
  'bioplex3prots': protsBioplex3}

pickle.dump(proteinsSources_all, open(workDir + 'proteinComplexes/' + \
                    'proteinSources_humap1+humap1Train+humap2+humap2Train' + \
                    '+expPairs+expPairs2+hpa+nci60+scbc+string+bioplex3.pkl', 'wb'))

nonCORUM_sources = \
  ['humap1Prots', 'humap2Prots', 'humap1Prots_labeled', 'humap2Prots_labeled',
   'expandedPairset_v1', 'expandedPairset_v2', 'hpa', 'nci60_cell', 'nci60_tissue',
   'scbcProts', 'stringProts', 'bioplex3Prots']

In [11]:
proteinsSources_all = \
  pickle.load(open(workDir + 'proteinComplexes/' + \
                    'proteinSources_humap1+humap1Train+humap2+humap2Train' + \
                    '+expPairs+expPairs2+hpa+nci60+scbc+string+bioplex3.pkl', 'rb'))

In [None]:
[(key, type(val), len(val)) for key, val in proteinsSources_all.items()]

[('corumProts', list, 3375),
 ('humap1Prots', list, 13276),
 ('humap2Prots', list, 2993),
 ('humap1Prots_labeled', list, 3835),
 ('humap2Prots_labeled', list, 2993),
 ('expandedPairset_v1', list, 1679),
 ('expandedPairset_v2', list, 2163),
 ('scbcProts', list, 12389),
 ('stringProts', list, 18286),
 ('bioplex3Prots', list, 15093)]

In [None]:
sourceOverlap = \
  {combSize: {
      'proteins':
       {proteinSources: set.intersection(*[set(proteinsSources_all[source])
                                           for source in list(proteinSources)])
       for proteinSources in list(it.combinations(nonCORUM_sources, combSize))},
      'counts':
       {proteinSources: len(set.intersection(*[set(proteinsSources_all[source])
                                               for source in list(proteinSources)]))
       for proteinSources in list(it.combinations(nonCORUM_sources, combSize))}
  }
   for combSize in np.arange(2, len(proteinsSources_all))}
pickle.dump(sourceOverlap,
            open(workDir + 'proteinComplexes/sourcesOverlapping_' + \
                  'humap1+humap1Train+humap2+humap2Train' + \
                  '+expPairs+expPairs2+scbc+string+bioplex3.pkl', 'wb'))

In [None]:
sourceOverlap_corum = \
  {combSize: {
      'proteins':
       {proteinSources:
        set(proteinsSources_all['corumProts']).intersection(
            set.intersection(*[set(proteinsSources_all[source])
                               for source in list(proteinSources)]))
        for proteinSources in list(it.combinations(nonCORUM_sources, combSize))},
      'counts':
       {proteinSources:
        len(set(proteinsSources_all['corumProts']).intersection(
            set.intersection(*[set(proteinsSources_all[source])
                               for source in list(proteinSources)])))
        for proteinSources in list(it.combinations(nonCORUM_sources, combSize))}
  }
   for combSize in np.arange(2, len(proteinsSources_all)-1)}
pickle.dump(sourceOverlap_corum,
            open(workDir + 'proteinComplexes/sourcesOverlapping_' + \
                  'corum+humap1+humap1Train+humap2+humap2Train' + \
                  '+expPairs+expPairs2+scbc+string+bioplex3.pkl', 'wb'))

#Pairs

In [None]:
pairsSources_all = \
  {key: generatePairs(val) for key, val in proteinsSources.items()}

pickle.dump(pairsSources_all,
            open(workDir + 'proteinComplexes/pairsSources_' + \
                 'corum+humap1+humap1Train+humap2+humap2Train' + \
                 '+expPairs+expPairs2+scbc+string+bioplex3.pkl', 'wb'))

In [None]:
pairsSources_all = \
  pickle.load(open(workDir + 'proteinComplexes/pairsSources_' + \
                    'corum+humap1+humap1Train+humap2+humap2Train' + \
                   '+expPairs+expPairs2+scbc+string.pkl', 'rb'))

In [None]:
pairsSources_allOverlapping = \
  {combSize: {
      'pairs':
       {proteinSources: set.intersection(*[set(pairsSources_all[source])
                                           for source in list(proteinSources)])
       for proteinSources in list(it.combinations(nonCORUM_sources, combSize))},
      'counts':
       {proteinSources: len(set.intersection(*[set(pairsSources_all[source])
                                               for source in list(proteinSources)]))
       for proteinSources in list(it.combinations(nonCORUM_sources, combSize))}
  }
   for combSize in np.arange(2, len(pairsSources_all))}

pickle.dump(pairsSources_allOverlapping,
            open(workDir + 'proteinComplexes/pairs-sourcesOverlapping_' + \
                  'corum+humap1+humap1Train+humap2+humap2Train' + \
                  '+expPairs+expPairs2+scbc+string+bioplex3.pkl', 'wb'))

more efficient perhaps using parallelization

In [None]:
pool = mp.Pool(mp.cpu_count()-1)
pairsSources_overlapping = dict()
for combSize in np.arange(2, len(pairsSources)):
    print('combSize: {0} \n'.format(combSize))
    pairsSources_overlapping[combSize] = dict()
    combinations = \
        [comb for comb in list(it.combinations(nonCORUM_sources, combSize)) if 'bioplex3Prots' in comb]
    overlappingPairs_combinations = \
        [list(progressMonitor(pool.imap(findOverlap, combinations), total=len(combinations)))]
    pairsSources_overlapping[combSize]['pairs'] = dict(zip(combinations, overlappingPairs_combinations))
    pairsSources_overlapping[combSize]['counts'] = dict(zip(combinations, [len(combo) for combo in overlappingPairs_combinations]))
pool.close()

In [None]:
pairsSources_allOverlapping = \
  pickle.load(open(workDir + 'proteinComplexes/pairs-sourcesOverlapping_' + \
                  'corum+humap1+humap1Train+humap2+humap2Train' + \
                  '+expPairs+expPairs2+scbc+string+bioplex3.pkl', 'rb'))

#*investigational

In [None]:
#use dataIntegrator to organize and consolidate datasets
class dataIntegrator(object):
  def __init__(self, ):

    self.data = None

    self.sources = dict()
    self.sourceInfo = dict()
    self.modules = dict()

    self.proteins = dict()
    self.pairs = dict()

  def addData(self, name, releaseDate, dataType, input='man'):

    self.sourceInfo[name] = \
     {'released': releaseDate, 'format': dataType}

    return self.test

In [None]:
bioplex2Dir = \
  workDir_elcfs + 'sourceData/bioplex2/BioPlex_interactionList_v4a.tsv'

In [None]:
bioplex2Data = pd.read_csv(bioplex2Dir, sep='\t', dtype)

In [None]:
bioplex2Data.dtypes

GeneA                  int64
GeneB                  int64
UniprotA              object
UniprotB              object
SymbolA               object
SymbolB               object
p(Wrong)             float64
p(No Interaction)    float64
p(Interaction)       float64
dtype: object

In [None]:
geneidPairset = freezePairs(featMat_drew2021, 'id1', 'id2')

In [None]:
geneidPairset.loc[((geneidPairset.id1=='817') | (geneidPairset.id2=='817')), :]

In [None]:
geneidPairset
pickle.dump(geneidPairset_addCol_frozenPairs, open('/Users/wilkinsbusiness/Library/CloudStorage/GoogleDrive-gwilkins@andrew.cmu.edu/My Drive/Colab Notebooks/Primary Research/proteinPairs_complexMaps/sourceData/pairsets/drew2017.pkl', 'wb'))