#install packages and load libraries

In [1]:
%%capture
!sudo apt-get update
!sudo apt-get install mcl
!mcl --version

Import common packages

In [2]:
%%capture
import argparse, cProfile, datetime
from functools import reduce
import glob
import itertools as it
from multiprocessing import cpu_count, Pool
import networkx as nx
import numpy as np
import os

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random, re, shutil
from scipy.stats import hmean
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.metrics import auc, average_precision_score, precision_recall_curve
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns
import subprocess as sp
import sys
import tempfile as tf
import time
from tqdm import tqdm as progressMonitor

Enable Google Colab, mount drives, and load proprietary modules...

In [3]:
%%capture
from google.colab import drive, files, output
drive.mount('/content/drive', force_remount=True)
!rm -r sample_data/

In [4]:
#useful to keep track of sys vars so as to better monitor space remaining
print(cpu_count())
sysVars = list(globals().keys())

2


In [5]:
rootDir = '/content/drive/My Drive/'
workDir = rootDir + 'otherStudies/'
workDir_ph2 = rootDir + 'elcfs_protein_complex_modeling/'  #phase 2 directory
elcfsDir = rootDir + 'Primary Research/proteinPairs_complexMaps/' #modeling library
workDir_alternate = rootDir + 'Primary Research/JLMwSCBC_notebook/' #phase 1 directory
ribosomeGuide_dir = workDir + 'ribosomeGuided_gridSearch/'

sys.path.insert(0, rootDir)
for p in workDir, workDir_ph2, elcfsDir, workDir_alternate, ribosomeGuide_dir: sys.path.append(p)

In [6]:
from util import ppiPrediction_v2, dataProcessing, modelEvaluating
from utils import operations, reference, alertMe
pushoverKey_user = 'uith8rmy2npjj1oqpjwcanow3un984'
pushoverAPI = 'aw4v3424kaznrw598r6qge9icddwg7'

In [7]:
# made slight alterations to the python file so as to accommodate both python3
# and facilitate execution on Google Colab
# file taken from local wilkinsbusiness drive, no numerical suffix; changes
# include fnc: map() -> list(map()) and tentative changes for pseudocount
# (0.00001 vs 1) => (0.29 vs 0.34)
import complex_comparison_single_gold_standard5 as cc

In [None]:
numWorkers = cpu_count()-1
with Pool(numWorkers) as pool:
  parameterGrid_analysisObjects = \
    list(progressMonitor(pool.imap(cc.ComplexComparison,
     ['/content/gold_standard_subset.txt']), total=1))

100%|██████████| 1/1 [00:01<00:00,  1.83s/it]


In [None]:
with Pool(numWorkers) as pool:
  weightedF1scores_plusAnalysis = list(
      progressMonitor(
          pool.imap(worker, parameterGrid_analysisObjects),
          total=1))
weightedF1scores = [pair[1] for pair in weightedF1scores_plusAnalysis]

  0%|          | 0/1 [00:00<?, ?it/s]

# of clusters (by size)
46
precision list
[0.9970595525112097, 0.996406970477274, 0.9897225361704994, 0.9719321009976415, 0.8070685482986493, 0.7043954573066746, 0.5322089725870512, 0.5062103569455715, 0.5034697605055772, 0.5013687706692642, 0.5030459921283434, 0.5006254698840821, 0.5005151551120592, 0.5004829890193037, 0.5005151551120592, 0.5006254074153283, 0.5008655323366012, 0.5013690437557614, 0.5024831122535783, 0.5051867176866918, 0.5124792907244555, 0.5342005010810001, 0.6002484246242846, 0.7518233683726915, 0.9101328086571417, 0.96591672263264, 0.9687500000000002, 0.96875, 0.96875]
recall list
[0.27778253418930776, 0.3909069243016549, 0.6800946395110026, 0.7742672310012634, 0.8534597166924814, 0.6209723937344983, 0.5174380875543338, 0.5079080585016519, 0.5039618642704765, 0.5013689072125502, 0.5008653595124501, 0.500625344946559, 0.5005152065796349, 0.5004829890193037, 0.5005151551120592, 0.5006254698840821, 0.5008656187486319, 0.5013690437557614, 0.5024833594056572, 0.5051851

100%|██████████| 1/1 [00:29<00:00, 29.30s/it]


In [None]:
sum([0.15593220338983052, 0.15593220338983052, 0.15593220338983052, 0.15593220338983052, 0.10847457627118644, 0.08135593220338982, 0.05084745762711865, 0.023728813559322035, 0.01694915254237288, 0.013559322033898305, 0.006779661016949152, 0.006779661016949152, 0.006779661016949152, 0.006779661016949152, 0.006779661016949152, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576, 0.003389830508474576])

0.9999999999999992

In [None]:
weightedF1scores

[0.7089355612501773]

In [None]:
twoStage = pickle.load(open(workDir + 'intersectionSet_drew2021_expandedPairs-set/twoStage_clusterGenerators.pkl', 'rb'))

In [None]:
twoStage['elcfsPredictions_twoStage_clusterGenerator'][frozenset({0.28, 0.24})].predictedComplexes

In [None]:
params = \
  pd.read_csv(workDir + 'intersectionSet_drew2021_expandedPairs-set/' +
              'bestPerformers_params+measures.tsv', sep='\t')
params.rename(columns={'Unnamed: 0': 'mappingIdx'}, inplace=True)

In [None]:
complexPredictions = pickle.load(
    open(workDir + 'intersectionSet_drew2021_expandedPairs-set/' +
         'clusterONE_MCL_predictions_sharedPairs_gridSearchDF.pkl',
         'rb'))

In [None]:
complexPredictions.density.unique()

array([0.1, 0.2, 0.3, 0.4])

In [None]:
complexPredictions2 = pickle.load(
    open(workDir + 'intersectionSet_drew2021_expandedPairs-set/' +
         'clusterONE+MCL_predictions_sharedPairs_gridSearch_DICT.pkl', 'rb'))

In [None]:
complexPredictions3 = pickle.load(open(workDir + 'intersectionSet_drew2021_expandedPairs-set/' +
                                       'clusterONE+MCL_predictions_sharedPairs_gridSearch_DICT_protocol2.pkl', 'rb'))

In [None]:
complexPredictions3['ELCFS_t0.28'].density.unique()

array([0.1, 0.2, 0.3, 0.4])

In [None]:
complexPredictions2.keys()

dict_keys(['ELCFS_t0.28', 'ELCFS_t0.29333333333333333', 'ELCFS_t0.3066666666666667', 'ELCFS_t0.32', 'ELCFS_tNone', 'huMAP_t0.28', 'huMAP_t0.29333333333333333', 'huMAP_t0.3066666666666667', 'huMAP_t0.32', 'huMAP_tNone'])

In [None]:
i=0
for idx in params.index:
  paramSet = params.loc[idx,
   ['study', 'threshold', 'density', 'overlap', 'inflation (MCL)']]
  i+=len(complexPredictions.loc[
      ((complexPredictions.study==paramSet[0]) &
      (complexPredictions.threshold==paramSet[1]) &
        (complexPredictions.density==paramSet[2]) &
        (complexPredictions.overlap==paramSet[3]) &
          (complexPredictions['inflation (MCL)']==paramSet[4])),
          'clusterONE+MCL clusters'])

print(i)

46


In [None]:
sum([val.shape[0] for val in complexPredictions.values()])

In [None]:
[11, 31, 286, 401, 461]

In [None]:
complexPredictions['ELCFS_t0.28'].columns.to_list()

In [None]:
def flatten(lst):
  return [item for sublist in lst for item in sublist]

In [None]:
def loadClusters(filepath):
  clusters = []
  clustersFile = open(filepath, "r")
  for line in clustersFile.readlines():
      clusters.append(line.split())
  clustersFile.close()

  clusters = [clust for clust in clusters if clust]

  return clusters

In [None]:
def generateComplexes_clusterONE(ppiFilename,
                                 delete,
                                 sizeSweep, densitySweep, maxOverlap_sweep,
                                 seedMethod_sweep, clusteroneJar, resultsDir):

  # format: id\t id\t weight\n
  with open(ppiFilename, "r") as inputFile:
    networkList = inputFile.readlines()

  ppi_scores = dict()
  for ppi in networkList:
      ppi_scores[frozenset([ppi.split()[0], ppi.split()[1]])] = \
        float(ppi.split()[2])


  parameterGrid = []
  for i, parameters in enumerate(
      it.product(sizeSweep, densitySweep, maxOverlap_sweep, seedMethod_sweep)):
    size, density, overlap, seedMethod = parameters

    parameterComb = dict()
    parameterComb['filename'] = ppiFilename.split('/')[-1].split('.txt')[0]
    parameterComb['clustoneJar'] = clustoneJar
    parameterComb['dir'] = resultsDir
    parameterComb['delete'] = delete

    parameterComb['i'] = str(i)
    parameterComb['networkList'] = networkList

    parameterComb['size'] = str(size)
    parameterComb['density'] = str(density)
    parameterComb['overlap'] = str(overlap)
    parameterComb['seedMethod'] = seedMethod

    parameterGrid.append(parameterComb)

  numWorkers = cpu_count()-1
  with Pool(numWorkers) as pool:
    clusterAssembly = list(
        progressMonitor(
            pool.imap(clusterHelper, parameterGrid), total=len(parameterGrid)))

  return ppi_scores

In [None]:
def clusterHelper(parameterDict):
  clustoneJar = parameterDict['clustoneJar']
  outputDir = parameterDict['dir']
  delete = parameterDict['delete']
  exists = os.makedirs(outputDir, exist_ok=True) if not os.path.exists(outputDir) else None
  for f in glob.glob(outputDir + '*'): os.remove(f) if exists else None

  i = parameterDict['i']
  filename = parameterDict['filename']
  networkList = parameterDict['networkList']
  size = parameterDict['size']
  density = parameterDict['density']
  overlap = parameterDict['overlap']
  seedMethod = parameterDict['seedMethod']
  suffix = '_'.join([filename, i, seedMethod, size, density, overlap]) + '.txt'

  fileTemp = tf.NamedTemporaryFile(delete=delete, dir=outputDir, mode='w')
  tempFile = open(fileTemp.name + '.txt', 'w')
  fileTemp.close()

  for ppi in networkList: tempFile.write(ppi)
  tempFile.close()

  print('clusterONE beginning')
  clusterONE_output = \
    sp.run(['java', '-jar', clustoneJar, fileTemp.name + '.txt',
            '-s', size, '-d', density, '--max-overlap', overlap,
            '--seed-method', seedMethod],
           capture_output=True)
  os.remove(fileTemp.name + '.txt')

  print('cluster line')
  clusters = []
  for line in clusterONE_output.stdout.decode().split('\n'):
    clusters.append(line.split('\t'))

  print('cluster file generation')
  with open(outputDir + 'clusterONE_predictions_' + suffix, 'w') as outfileName:
    for pred in clusters:
      outLine = ' '.join(pred) + '\n'
      outfileName.writelines(outLine)
  outfileName.close()

  print('clusterONE complete')

In [None]:
def analyzeClusterONE_predictions(batchFiles_dir):

  batchFiles = glob.glob(batchFiles_dir + '*nodes*txt')
  batchFiles.sort()

  paramVals = \
    [val.split('nodes_')[-1].split('.txt')[0].split('_') for val in batchFiles]

  parameterGrid_batchResults = \
    pd.DataFrame(
        {param: paramVal
        for param, paramVal
        in zip(
            ['size', 'density', 'overlap'],
            [[lst[j] for lst in paramVals] for j in range(3)])})

  parameterGrid_batchResults = \
    parameterGrid_batchResults.astype(
        {'size': 'int64', 'density': 'float64', 'overlap': 'float64'})

  parameterGrid_batchResults['clusterONE predictions filename'] = \
    [filename for filename in batchFiles]

  #create complex comparison objects using Pool for each parameter combination
  numWorkers = cpu_count()-1
  with Pool(numWorkers) as pool:
    parameterGrid_analysisObjects = \
      list(progressMonitor(
          pool.imap(
              cc.ComplexComparison,
              parameterGrid_batchResults['clusterONE predictions filename'].to_list()),
              total=len(parameterGrid_batchResults)))

  # calculate clique-weighted F1-score for each parameter combination
  with Pool(numWorkers) as pool:
    weightedF1scores_plusAnalysis = list(
        progressMonitor(
            pool.imap(worker, parameterGrid_analysisObjects),
            total=len(parameterGrid_batchResults)))
  weightedF1scores = [pair[1] for pair in weightedF1scores_plusAnalysis]

  parameterGrid_batchResults.insert(
      3, 'total clusterONE clusters',
      [len(loadClusters(filename)) for filename in
      parameterGrid_batchResults.loc[:, 'clusterONE predictions filename']])

  parameterGrid_batchResults.insert(
      4, 'clusterONE clusters weighted F1-score', weightedF1scores)

  parameterGrid_batchResults.insert(
      5, 'clusterONE clusters',
      [loadClusters(filename)
      for filename in
      parameterGrid_batchResults.loc[:,
                                      'clusterONE predictions filename'].to_list(
                                      ) if loadClusters(filename)])

  with Pool(numWorkers) as pool:
    clusterONE_predictionsSimilarity = \
      list(
          progressMonitor(
              pool.imap(
                  calculateJaccard,
                  parameterGrid_batchResults['clusterONE clusters']),
                  total=len(parameterGrid_batchResults['clusterONE clusters'])))

  clusterONE_meanSimilarity = \
    [arr[2] for arr in clusterONE_predictionsSimilarity]
  clusterONE_maxSimilarity = \
    [arr[3] for arr in clusterONE_predictionsSimilarity]
  parameterGrid_batchResults['clusterONE jaccardMean'] = \
    clusterONE_meanSimilarity
  parameterGrid_batchResults['clusterONE jaccardMax'] = \
    clusterONE_maxSimilarity

  with Pool(numWorkers) as pool:
    ribosomalSubunit_matches = \
      list(
          progressMonitor(pool.imap(
              countMatches_ribosomalSubunits,
              parameterGrid_batchResults['clusterONE clusters']),
              total=len(parameterGrid_batchResults['clusterONE clusters'])))

  parameterGrid_batchResults_ribosomalSubunit = \
    pd.concat(
        [pd.DataFrame(
            [row],
            columns=['clusterONE_srRec', 'clusterONE_srPrec', 'clusterONE_srF1',
                     'clusterONE_lrRec', 'clusterONE_lrPrec', 'clusterONE_lrF1'])
        for row in ribosomalSubunit_matches], axis=0, ignore_index=True)

  parameterGrid_batchResults = \
    pd.concat([parameterGrid_batchResults,
               parameterGrid_batchResults_ribosomalSubunit], axis=1)

  parameterGrid_batchResults.insert(
      6, 'clusterONE analysisOBJ',
      [pair[0] for pair in weightedF1scores_plusAnalysis])

  pickle.dump(parameterGrid_batchResults,
            open(batchFiles_dir + 'parameterGrid_batchResults_preMCL.pkl', 'wb'))

  pickle.dump(
      parameterGrid_batchResults.loc[:,
                                     ['size', 'density', 'overlap',
                                      'total clusterONE clusters',
                                      'clusterONE clusters weighted F1-score',
                                      'clusterONE clusters',
                                      'clusterONE jaccardMean',
                                      'clusterONE jaccardMax',
                                      'clusterONE_srRec', 'clusterONE_srPrec',
                                      'clusterONE_srF1',
                                      'clusterONE_lrRec', 'clusterONE_lrPrec',
                                      'clusterONE_lrF1',
                                      'clusterONE predictions filename']
                                    ].copy(),
              open(batchFiles_dir +
                   'parameterGrid_batchResults_preMCL_noCC.pkl', 'wb'))

  return parameterGrid_batchResults

In [None]:
def gridSearch_addInflation_mcl(df, pairsScored, parameterSweep,
                                outputDir, delete):
  exists = os.makedirs(outputDir, exist_ok=True) if not os.path.exists(outputDir) else None
  numWorkers = cpu_count()-1

  print('preparing DF')
  df.insert(7, 'inflation (MCL)', [parameterSweep for row in df.index])
  df = df.explode('inflation (MCL)', ignore_index=True).copy()

  print('generating clusterONE-MCL hybrid prediction filenames')
  prefix = outputDir + 'clusterONE+MCL_predictions_'
  filenameSuffixes = ['_{0}.txt'.format(inflation)
    for inflation in df.loc[:, 'inflation (MCL)'].to_list()]

  mclOutput_filenames = \
    [prefix +
     filename.split('clusterONE_predictions_')[-1].split('.txt')[0] +
     suffix for filename, suffix in zip(
         df.loc[:, 'clusterONE predictions filename'].to_list(),
         filenameSuffixes)]

  print('preparing inputs for mcl clustering')
  inputsList = \
    [(str(df.loc[row, 'inflation (MCL)']), pairsScored,
      df.loc[row, 'clusterONE clusters'], mclOutput_filenames[row],
      outputDir, delete)
    for row in df.index]

  print('submitting jobs for mcl clustering')
  with Pool(numWorkers) as pool:
    clusterAssembly = \
      list(progressMonitor(pool.imap(clusterHelper_mcl, inputsList),
                           total=len(inputsList)))

  print('submitting jobs for mcl clustering analysis')
  with Pool(numWorkers) as pool:
    analysisObjects = \
      list(progressMonitor(pool.imap(cc.ComplexComparison, mclOutput_filenames),
                           total=len(mclOutput_filenames)))

  print('submitting jobs for mcl clustering evaluation')
  with Pool(numWorkers) as pool:
    weightedF1scores_plusAnalysis = \
      list(progressMonitor(pool.imap(worker, analysisObjects),
          total=len(mclOutput_filenames)))

  with Pool(numWorkers) as pool:
    clusterONEplusMCL_predictionsSimilarity = \
      list(progressMonitor(
          pool.imap(calculateJaccard, df['clusterONE clusters']),
          total=len(df['clusterONE clusters'])))

  clusterONEplusMCL_meanSimilarity = \
    [arr[2] for arr in clusterONEplusMCL_predictionsSimilarity]
  clusterONEplusMCL_maxSimilarity = \
    [arr[3] for arr in clusterONEplusMCL_predictionsSimilarity]
  df['clusterONE+MCL jaccardMean'] = clusterONEplusMCL_meanSimilarity
  df['clusterONE+MCL jaccardMax'] = clusterONEplusMCL_maxSimilarity

  print('finishing DF additions')
  weightedF1scores = [pair[1] for pair in weightedF1scores_plusAnalysis]
  df.insert(8, 'total clusterONE+MCL clusters',
            [len(loadClusters(filename)) for filename in mclOutput_filenames])

  df.insert(9, 'clusterONE+MCL clusters weighted F1-score', weightedF1scores)
  df.insert(10, 'clusterONE+MCL clusters', [loadClusters(filename)
      for filename in mclOutput_filenames])

  with Pool(numWorkers) as pool:
    ribosomalSubunit_matches = \
      list(progressMonitor(pool.imap(
          countMatches_ribosomalSubunits, df['clusterONE+MCL clusters']),
          total=len(df['clusterONE+MCL clusters'])))

  df_ribosomalSubunit = \
    pd.concat(
        [pd.DataFrame([row],
                      columns=['clusterONE+MCL_srRec', 'clusterONE+MCL_srPrec',
                               'clusterONE+MCL_srF1',
                               'clusterONE+MCL_lrRec', 'clusterONE+MCL_lrPrec',
                               'clusterONE+MCL_lrF1'])
        for row in ribosomalSubunit_matches], axis=0, ignore_index=True)

  df = pd.concat([df, df_ribosomalSubunit], axis=1)

  df.insert(11, 'clusterONE+MCL analysisOBJ',
      [pair[0] for pair in weightedF1scores_plusAnalysis])

  df.insert(12, 'clusterONE+MCL predictions filename', mclOutput_filenames)

  pickle.dump(
      df, open(outputDir + 'parameterGrid_batchResults_mclAdded.pkl', 'wb'))

  pickle.dump(
      df.loc[:, ['size', 'density', 'overlap',
                 'total clusterONE clusters',
                 'clusterONE clusters weighted F1-score',
                 'clusterONE clusters','inflation (MCL)',
                 'total clusterONE+MCL clusters',
                 'clusterONE+MCL clusters weighted F1-score',
                 'clusterONE+MCL clusters',
                 'clusterONE+MCL jaccardMean', 'clusterONE+MCL jaccardMax',
                 'clusterONE+MCL_srRec', 'clusterONE+MCL_srPrec',
                 'clusterONE+MCL_srF1',
                 'clusterONE+MCL_lrRec', 'clusterONE+MCL_lrPrec',
                 'clusterONE+MCL_lrF1',
                 'clusterONE+MCL predictions filename',
                 'clusterONE predictions filename']].copy(),
              open(outputDir +
                   'parameterGrid_batchResults_noCC_mclAdded.pkl', 'wb'))

  return df

In [None]:
def clusterHelper_mcl(args):
  inflation, ppiScores, predictedClusters, outName, outputDir, delete = args

  print('MCL beginning')

  print(outName)

  mclClusters = []
  for clust in predictedClusters:
    tfIn = tf.NamedTemporaryFile(delete=True, dir=outputDir, mode='w')
    tfOut = tf.NamedTemporaryFile(delete=True, dir=outputDir, mode='w')
    tempInfile = tfIn.name
    tempOutfile = tfOut.name
    tfIn.close()
    tfOut.close()

    with open(tempInfile, 'w') as f:
      for prot1, prot2 in list(it.combinations(clust, 2)):
        try:
          score = ppiScores[frozenset([prot1, prot2])]
        except KeyError:
          score = 0.0
        ppi = "%s\t%s\t%s\n" % (prot1, prot2, score)
        f.write(ppi)

    mclOutput = \
      sp.run(['mcl', tempInfile, '--abc', '-o', tempOutfile, '-I', inflation])

    with open(tempOutfile, 'r') as f:
      for line in f.readlines():
        mclClusters.append(line.split())
    os.remove(tempInfile)
    os.remove(tempOutfile)

  with open(outName, 'w') as outfile:
    for pred in mclClusters:
      outLine = ' '.join(pred) + '\n'
      outfile.write(outLine)
  print('MCL complete')

In [None]:
def worker(analysis):
  wccmm = analysis.clique_comparison_metric_mean(weighted=True)
  clique_weighted_pr_mean = wccmm['precision_mean']
  clique_weighted_re_mean = wccmm['recall_mean']
  clique_weighted_hmean = hmean([wccmm['precision_mean'], wccmm['recall_mean']])

  return (analysis, clique_weighted_hmean)

In [None]:
def countMatches_ribosomalSubunits(assembly):
  assembly = set([frozenset(clust) for clust in assembly if clust])

  srRec = \
    max([len(clust.intersection(list(ribosomalData['elcfs'])[1]))/len(list(ribosomalData['elcfs'])[1])
    for clust in list(assembly)])
  lrRec = \
    max([len(clust.intersection(list(ribosomalData['elcfs'])[0]))/len(list(ribosomalData['elcfs'])[0])
    for clust in list(assembly)])

  srPrec = \
    max([len(clust.intersection(list(ribosomalData['elcfs'])[1]))/len(clust)
    for clust in list(assembly)])
  lrPrec = \
    max([len(clust.intersection(list(ribosomalData['elcfs'])[0]))/len(clust)
    for clust in list(assembly)])

  srF1 = (2*srRec*srPrec)/(srRec+srPrec)
  lrF1 = (2*lrRec*lrPrec)/(lrRec+lrPrec)

  return [srRec, srPrec, srF1, lrRec, lrPrec, lrF1]

In [None]:
def calculateJaccard(assembly):
  assembly = set([frozenset(clust) for clust in assembly if clust])

  countsMatrix_intersection = \
    np.array([[len(clustA.intersection(clustB)) for clustA in list(assembly)]
              for clustB in list(assembly)])

  countsMatrix_union = \
    np.array([[len(clustA.union(clustB)) for clustA in list(assembly)]
              for clustB in list(assembly)])

  jaccardSimilarity = \
    np.tril(countsMatrix_intersection/countsMatrix_union, -1).flatten()
  jaccardSimilarity_min = np.amin(jaccardSimilarity)
  jaccardSimilarity_mean = np.mean(jaccardSimilarity)
  jaccardSimilarity_max = np.amax(jaccardSimilarity)

  return [jaccardSimilarity,
          jaccardSimilarity_min, jaccardSimilarity_mean, jaccardSimilarity_max]

In [None]:
def checkRecapit(predictionsFile):
  predictions = set(
      [frozenset(clust) for clust in loadClusters(predictionsFile)])

  return all(
      [ref in predictions for ref
       in list(ribosomalData['elcfs'])])

In [None]:
def apply_filteringThresholds(outputDir, filesList, filenamesList):

  exists = os.makedirs(outputDir) if not os.path.exists(outputDir) else None

  for baseProj, baseName in zip(filesList, filenamesList):

    with open(baseProj, "r") as inputFile:
      print(baseProj)
      networkList = inputFile.readlines()

    for thresh in ppiThreshold_sweep:
      print(thresh)
      if not thresh:
        filtLines = \
          [idx for idx, line in enumerate(networkList)
          if float(line.split()[2]) >= thresh]
      else:
        filtLines = [idx for idx, line in enumerate(networkList)]

      supProj = \
        outputDir + 'overlappingPairs_' + baseName + '_t{0}.txt'.format(thresh)

      with open(supProj, "w") as outputFile:
        for filteredLine in filtLines:
          outputFile.write(networkList[filteredLine])
      print(supProj)

In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearchDF_path = workDir + \
  'intersectionSet_drew2021_expandedPairs-set/' + \
  'clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_overlap+jaccardSimilarity_expandedParamGrid.tsv'

clusterONE_MCL_predictions_sharedPairs_gridSearchDF = pd.read_csv(
    clusterONE_MCL_predictions_sharedPairs_gridSearchDF_path, sep='\t')

In [None]:
paramsList = ['threshold', 'density', 'overlap']
paramsList_mcl = ['threshold', 'density', 'overlap', 'inflation (MCL)']

In [None]:
paramCombs = \
  list(set([frozenset(row[paramsList].to_list())
    for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='huMAP', :].iterrows()]))

In [None]:
paramCombs_mcl = \
  list(set([frozenset(row[paramsList_mcl].to_list())
    for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='huMAP', :].iterrows()]))

In [None]:
paramCombs_index = \
  {paramComb: idx for idx, paramComb in enumerate(paramCombs)}
paramCombs_mcl_index = \
  {paramCombs_mcl: idx for idx, paramComb in enumerate(paramCombs_mcl)}

In [None]:
paramCombs_indexELCFS = \
  {paramCombs_index[frozenset(row[paramsList].to_list())]:
    row['clusterONE clusters weighted F1-score']
      for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[
          clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='ELCFS',
          :].iterrows()
            if frozenset(row[paramsList].to_list()) in paramCombs_index.keys()}
paramCombs_indexHumap = \
  {paramCombs_index[frozenset(row[paramsList].to_list())]:
    row['clusterONE clusters weighted F1-score']
      for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[
          clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='huMAP',
          :].iterrows()
            if frozenset(row[paramsList].to_list()) in paramCombs_index.keys()}

paramCombs_elcfsHumap_index = pd.concat(
    [pd.DataFrame.from_dict(paramCombs_indexELCFS, orient='index').rename(
        columns={0: 'elcfs_weightedF1'}),
     pd.DataFrame.from_dict(paramCombs_indexHumap, orient='index').rename(
         columns={0: 'humapF1_weightedF1'})], axis=1).sort_index()

In [None]:
paramCombs_mcl_indexELCFS = \
  {paramCombs_mcl_index[frozenset(row[paramsList_mcl].to_list())]:
    row['clusterONE+MCL clusters weighted F1-score']
      for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[
          clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='ELCFS',
          :].iterrows()
            if frozenset(row[paramsList_mcl].to_list()) in paramCombs_mcl_index.keys()}
paramCombs_mcl_indexHumap = \
  {paramCombs_mcl_index[frozenset(row[paramsList_mcl].to_list())]:
    row['clusterONE+MCL clusters weighted F1-score']
      for _, row in clusterONE_MCL_predictions_sharedPairs_gridSearchDF.loc[
          clusterONE_MCL_predictions_sharedPairs_gridSearchDF.study=='huMAP',
          :].iterrows()
            if frozenset(row[paramsList_mcl].to_list()) in paramCombs_mcl_index.keys()}

paramCombs_mcl_elcfsHumap_index = pd.concat(
    [pd.DataFrame.from_dict(paramCombs_mcl_indexELCFS, orient='index').rename(
        columns={0: 'elcfs_mcl_weightedF1'}),
     pd.DataFrame.from_dict(paramCombs_mcl_indexHumap, orient='index').rename(
         columns={0: 'humap_mcl_weightedF1'})], axis=1).sort_index()

In [None]:
pickle.dump({'clusterONE': paramCombs_elcfsHumap_index,
             'clusterONE+mcl': paramCombs_mcl_elcfsHumap_index},
              open(workDir + 'intersectionSet_drew2021_expandedPairs-set/' +
                   'paramCombs_v_weightedF1.pkl', 'wb'))

#Expanded pairs' and huMAP's intersection set

##Their clustering

huMAP and ELCFS predictions

In [None]:
filename = workDir + 'intersectionSet_drew2021_expandedPairs-set/' + \
          'clusterONE_MCL_predictions_sharedPairs_gridSearch_' + \
          'ribosomalSubunit_overlap+jaccardSimilarity_expandedParamGrid.tsv'

In [None]:
analysis = pd.read_csv(filename, sep='\t')

In [None]:
analysis.loc[analysis.study=='ELCFS', :].sort_values(by='clusterONE+MCL clusters weighted F1-score', ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,...,clusterONE+MCL_srRec,clusterONE+MCL_srPrec,clusterONE+MCL_srF1,clusterONE+MCL_lrRec,clusterONE+MCL_lrPrec,clusterONE+MCL_lrF1,clusterONE jaccardMax,clusterONE jaccardMean,clusterONE+MCL jaccardMax,clusterONE+MCL jaccardMean
460,ELCFS,0.28,2,0.025,0.2,58,0.498300,2,436,0.356167,...,1.000000,0.500,0.666667,1.000000,0.325,0.490566,0.168000,0.002661,0.168000,0.002661
465,ELCFS,0.28,2,0.025,0.3,86,0.470387,2,482,0.348920,...,1.000000,0.500,0.666667,1.000000,0.325,0.490566,0.333333,0.002862,0.333333,0.002862
405,ELCFS,0.28,2,0.050,0.4,112,0.414489,2,481,0.310341,...,1.000000,0.325,0.490566,1.000000,0.500,0.666667,0.443850,0.003277,0.443850,0.003277
410,ELCFS,0.28,2,0.050,0.5,146,0.385473,2,541,0.308953,...,1.000000,0.325,0.490566,1.000000,0.500,0.666667,0.515152,0.003507,0.515152,0.003507
470,ELCFS,0.28,2,0.050,0.2,59,0.450851,2,392,0.303611,...,1.000000,0.500,0.666667,1.000000,0.325,0.490566,0.233438,0.002955,0.233438,0.002955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,ELCFS,0.28,2,0.025,0.2,58,0.498300,7,2279,0.041524,...,0.166667,1.000,0.285714,0.256410,1.000,0.408163,0.168000,0.002661,0.168000,0.002661
473,ELCFS,0.28,2,0.050,0.2,59,0.450851,7,1646,0.040822,...,0.166667,1.000,0.285714,0.256410,1.000,0.408163,0.233438,0.002955,0.233438,0.002955
474,ELCFS,0.28,2,0.050,0.2,59,0.450851,9,1787,0.037530,...,0.066667,1.000,0.125000,0.128205,1.000,0.227273,0.233438,0.002955,0.233438,0.002955
464,ELCFS,0.28,2,0.025,0.2,58,0.498300,9,2391,0.034950,...,0.066667,1.000,0.125000,0.128205,1.000,0.227273,0.168000,0.002661,0.168000,0.002661


In [None]:
analysis.loc[analysis.study=='ELCFS', 'clusterONE+MCL clusters weighted F1-score'].max()

0.3561670189043763

In [None]:
analysis.loc[analysis.study=='huMAP', 'clusterONE+MCL clusters weighted F1-score'].max()

In [None]:
ribosomeData = pickle.load(open('/content/ribosomalSubunits_multipleIDs.pkl', 'rb'))

In [None]:
ribosomeData.keys()

dict_keys(['smallRibosomal_subunitGeneid', 'largeRibosomal_subunitGeneid', 'smallRibosomal_subunitUniprotid', 'largeRibosomal_subunitUniprotid', 'smallRibosomal_subunitSTRINGid', 'largeRibosomal_subunitSTRINGid'])

In [None]:
smallRibosome_prots = ribosomeData['smallRibosomal_subunitGeneid']
largeRibosome_prots = ribosomeData['largeRibosomal_subunitGeneid']

In [None]:
for c in complexes:
  tp = sum([p for p in list(c)])
  prec = tp/(tp+fp)
  rec = tp/(tp+fn)

In [None]:
analysis.loc[analysis.study=='huMAP', :].sort_values(by='clusterONE+MCL clusters weighted F1-score', ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,...,clusterONE+MCL_srRec,clusterONE+MCL_srPrec,clusterONE+MCL_srF1,clusterONE+MCL_lrRec,clusterONE+MCL_lrPrec,clusterONE+MCL_lrF1,clusterONE jaccardMax,clusterONE jaccardMean,clusterONE+MCL jaccardMax,clusterONE+MCL jaccardMean
305,huMAP,0.3066666666666667,2,0.3,0.7,194,0.279689,2,213,0.315805,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.714286,0.000598,0.857143,0.000600
295,huMAP,0.3066666666666667,2,0.2,0.7,188,0.292503,2,216,0.310636,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.666667,0.000414,0.923077,0.000591
261,huMAP,0.29333333333333333,2,0.3,0.6,188,0.291691,4,266,0.309684,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.571429,0.000438,0.875000,0.000563
345,huMAP,0.32,2,0.3,0.7,194,0.296115,2,213,0.309360,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.714286,0.000598,0.857143,0.000600
301,huMAP,0.3066666666666667,2,0.3,0.6,188,0.296650,4,266,0.306644,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.571429,0.000438,0.875000,0.000563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,huMAP,,2,0.4,0.7,213,0.274648,6,308,0.262511,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.777778,0.000993,0.833333,0.000818
279,huMAP,0.29333333333333333,2,0.4,0.7,213,0.259851,9,328,0.262428,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.777778,0.000993,0.833333,0.000828
201,huMAP,0.28,2,0.1,0.6,191,0.311710,4,276,0.261752,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.604938,0.000266,0.928571,0.000321
239,huMAP,0.28,2,0.4,0.7,213,0.286296,9,328,0.260235,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.777778,0.000993,0.833333,0.000828


In [None]:
noPrefiltering = analysis.loc[analysis['threshold']=='None', :].copy()

In [None]:
noPrefiltering.sort_values(by=['clusterONE clusters weighted F1-score'], ascending=False)

Index(['study', 'threshold', 'size', 'density', 'overlap',
       'total clusterONE clusters', 'clusterONE clusters weighted F1-score',
       'inflation (MCL)', 'total clusterONE+MCL clusters',
       'clusterONE+MCL clusters weighted F1-score', 'clusterONE_srRec',
       'clusterONE_srPrec', 'clusterONE_srF1', 'clusterONE_lrRec',
       'clusterONE_lrPrec', 'clusterONE_lrF1', 'clusterONE+MCL_srRec',
       'clusterONE+MCL_srPrec', 'clusterONE+MCL_srF1', 'clusterONE+MCL_lrRec',
       'clusterONE+MCL_lrPrec', 'clusterONE+MCL_lrF1', 'clusterONE jaccardMax',
       'clusterONE jaccardMean', 'clusterONE+MCL jaccardMax',
       'clusterONE+MCL jaccardMean'],
      dtype='object')

In [None]:
elcfsScores_overlappingPairs = pd.read_csv(workDir + 'intersectionSet_drew2021_expandedPairs-set/overlappingPairs_ELCFS.txt', sep=' ', header=None, dtype={0: 'str', 1: 'str', 2: 'float64'})
humapScores_overlappingPairs = pd.read_csv(workDir + 'intersectionSet_drew2021_expandedPairs-set/overlappingPairs_huMAP.txt', sep=' ', header=None, dtype={0: 'str', 1: 'str', 2: 'float64'})

In [None]:
plt.scatterplot()

Unnamed: 0,0,1,2
685668,6429,81608,0.268957
685669,3958,6204,0.001042
685670,2720,8841,0.023543
685671,6229,1457,0.006665
685672,4035,6156,0.00375


##Our clustering

Drew 2021 predictions

In [None]:
filename = workDir + 'twoStage_clusteringHumap/' + \
           'overlappingPairs_predictionsHumapconsolidatedComplexes_detailed.txt'

In [None]:
analysis = pd.read_csv(filename, sep='\t')

In [None]:
analysis.head()

Unnamed: 0,complex,complexFrozen,score,score_pairsAvg-derived
0,"23049,23339,4087,4088,55823,57617,64601,64750,...","frozenset({'7415', '55823', '9135', '64601', '...",0.050508,0.456323
1,"10163,2891,2932,4089,4436,55802,7186,7353,8202...","frozenset({'7186', '8861', '7353', '8202', '10...",0.033169,0.033169
2,"10054,10093,10213,10236,10284,10471,10492,1052...","frozenset({'10213', '8726', '7283', '23607', '...",0.041172,0.041172
3,"10016,10262,10284,10286,10419,10421,10450,1046...","frozenset({'23476', '8829', '1207', '3146', '8...",0.023488,0.023488
4,"1025,10523,10629,10902,10914,10946,11100,11714...","frozenset({'10946', '23476', '55729', '8812', ...",0.014692,0.136043


ELCFS Predictions

In [None]:
filename = workDir + 'expandedPairs-set/twoStage/' + \
           'overlappingPairs_predictionsELCFSconsolidatedComplexes_detailed.txt'

In [None]:
analysis = pd.read_csv(filename, sep='\t')

In [None]:
analysis.head()

Unnamed: 0,complex,complexFrozen,score,score_pairsAvg-derived
0,"2956,3551,4174,51530,6235,6599,6929,8417,8773,...","frozenset({'8773', '6929', '4174', '6235', '84...",0.304483,0.304483
1,"22944,23759,2965,2966,2967,2968,54994,7090,853...","frozenset({'2968', '8533', '54994', '7090', '2...",0.715032,0.715032
2,"10026,10248,23013,3326,3688,6141,6400,84447,87...","frozenset({'10026', '10248', '8751', '23013', ...",0.388854,0.388854
3,"1947,24144,3678,3692,6184,6772,81572,8291,8829...","frozenset({'1947', '3678', '24144', '9126', '3...",0.343512,0.343512
4,"10521,1938,23339,23513,55723,55823,57617,64601...","frozenset({'23513', '6636', '55723', '23339', ...",0.340931,0.340931


#Expanded pairs' set

##Their clustering

In [None]:
filename = workDir + \
  'elcfsMean_expandedSet_t028/parameterGrid_batchResults_noCC_mclAdded.pkl'

In [None]:
analysis = pickle.load(open(filename, 'rb'))

In [None]:
analysis.head()

Unnamed: 0,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,clusterONE clusters,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,clusterONE+MCL clusters,clusterONE+MCL jaccardMean,clusterONE+MCL jaccardMax,clusterONE+MCL_srRec,clusterONE+MCL_srPrec,clusterONE+MCL_srF1,clusterONE+MCL_lrRec,clusterONE+MCL_lrPrec,clusterONE+MCL_lrF1,clusterONE+MCL predictions filename,clusterONE predictions filename
0,2,0.025,0.2,64,0.440476,"[[57804, 79813, 9328, 6877, 6632, 10376, 1153,...",2,771,0.17777,"[[55746, 11097, 81929, 57122, 8021, 79902, 790...",0.0017,0.25,1.0,1.0,1.0,1.0,0.5,0.666667,/content/drive/My Drive/otherStudies/elcfsMean...,/content/drive/My Drive/otherStudies/elcfsMean...


##Our clustering

In [None]:
#

#Union pairs' set

##Their clustering

In [None]:
#ELCFS Dominant
filename = workDir + 'unionPairs-set/' + \
          'pairsWprobs_humap_expandedSet_elcfsDom_t028' + \
          'parameterGrid_batchResults_noCC_mclAdded.pkl'

#huMAP Dominant
filename = workDir + 'unionPairs-set/' + \
          'pairsWprobs_humap_expandedSet_humapDom_t028' + \
          'parameterGrid_batchResults_noCC_mclAdded.pkl'

##Our clustering

In [None]:
#ELCFS Dominant
filename = workDir + 'unionPairs-set/' + \
          'pairsWprobs_humap_expandedSet_elcfsDom_t028/' + \
          'pairsWprobs_humap_expandedSet_elcfsDom_' + \
          'predictionsconsolidatedComplexes_detailed.txt'

#huMAP Dominant
filename = workDir + 'unionPairs-set/' + \
          'pairsWprobs_humap_expandedSet_humapDom_t028/' + \
          'pairsWprobs_humap_expandedSet_humapDom_' + \
          'predictionsconsolidatedComplexes_detailed.txt'

#Generate and evaluate clusterONE complexes

Define directories and principal object paths

In [None]:
clustoneJar = workDir + 'cluster_one-1.0.jar'
ribosomeRecapit = \
  ribosomeGuide_dir + 'ribosomalSubunit_possibleRecapitulation_methods.pkl'
ribosomalData = pickle.load(open(ribosomeRecapit, 'rb'))

In [None]:
trainingComplexes = elcfsDir + 'sourceData/humap1/complexes/train_complexes.txt'
testComplexes = elcfsDir + 'sourceData/humap1/complexes/test_complexes.txt'

In [None]:
humap1Predictions = elcfsDir + 'sourceData/humap1/results/clusters.txt'
humap1Analysis = cc.ComplexComparison(humap1Predictions)
humap1Results = worker(humap1Analysis)
print(list(humap1Results)[1])

In [None]:
#filtering thresholds
ppiThreshold_sweep = [0.28]  #['None'] + list(np.linspace(0.28, 0.32, 4))

#ClusterONE
delete = True
randomSeed = 42
sizeSweep = [2]
densitySweep = [0.025]  #[0.025, 0.05]  #[0.05, 0.075, 0.1, 0.2, 0.3, 0.4]
maxOverlap_sweep = [0.2]  #[0.2, 0.3]  #[0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
seedMethod_sweep = ['nodes']  #nodes, cliques, unused_nodes, edges

#MCL
inflationSweep = [2]  #[2, 4, 6, 7, 9]

In [None]:
def genCplx_cellSpec_diff(params, filenames, cellLine_pairs=[]):
  if not cellLine_pairs:
    cellLine_pairs = list(it.combinations(['h322', 'mcf7', 'u251'], 2))

  threshold = params['threshold']
  generateComplexes_clusterONE(params, filenames)

In [None]:
cplxPreds_elcfsPPIs_cellSpecific_expPairhumap_intersection.sort_values(by=['pairs_lowScoring'], ascending=False)

Unnamed: 0,complexesStrings,complexesFrozen,complexesLengths,cell line pair,allPairs,allPairs_frozen,relPairs,pairs_lowScoring,pairs_highScoring,pairs_lowScore,pairs_highScore
3,219771100114170,"(219771, 10011, 4170)",3,"(ELCFS_h322Specific, ELCFS_mcf7Specific)","[(219771, 10011), (219771, 4170), (10011, 4170)]","{(219771, 10011), (10011, 4170), (219771, 4170)}","[(219771, 10011), (10011, 4170)]",0.836548,0.900696,"[(10011, 4170)]","[(219771, 10011)]"
26,219771100114170,"(219771, 10011, 4170)",3,"(ELCFS_mcf7Specific, ELCFS_u251Specific)","[(219771, 10011), (219771, 4170), (10011, 4170)]","{(219771, 10011), (10011, 4170), (219771, 4170)}","[(219771, 10011), (10011, 4170)]",0.805115,0.89032,"[(219771, 10011)]","[(219771, 10011)]"
16,219771100114170,"(219771, 10011, 4170)",3,"(ELCFS_h322Specific, ELCFS_u251Specific)","[(219771, 10011), (219771, 4170), (10011, 4170)]","{(219771, 10011), (10011, 4170), (219771, 4170)}","[(219771, 10011), (10011, 4170)]",0.805115,0.900696,"[(219771, 10011)]","[(219771, 10011)]"
2,7456451385469459220988,"(7456, 4513, 8546, 9459, 220988)",5,"(ELCFS_h322Specific, ELCFS_mcf7Specific)","[(7456, 4513), (7456, 8546), (7456, 9459), (74...","{(7456, 9459), (7456, 8546), (9459, 4513), (22...","[(7456, 8546), (220988, 9459), (8546, 220988),...",0.021874,0.999695,"[(8546, 9459)]","[(7456, 220988)]"
15,7456451385469459220988,"(7456, 4513, 8546, 9459, 220988)",5,"(ELCFS_h322Specific, ELCFS_u251Specific)","[(7456, 4513), (7456, 8546), (7456, 9459), (74...","{(7456, 9459), (7456, 8546), (9459, 4513), (22...","[(7456, 8546), (220988, 9459), (8546, 220988),...",0.021874,0.999695,"[(8546, 9459)]","[(7456, 220988)]"
25,7456451385469459220988,"(7456, 4513, 8546, 9459, 220988)",5,"(ELCFS_mcf7Specific, ELCFS_u251Specific)","[(7456, 4513), (7456, 8546), (7456, 9459), (74...","{(7456, 9459), (7456, 8546), (9459, 4513), (22...","[(7456, 8546), (220988, 9459), (8546, 220988),...",0.021874,0.999695,"[(8546, 9459)]","[(7456, 220988)]"
7,6873578146432247,"(6873, 5781, 4643, 2247)",4,"(ELCFS_h322Specific, ELCFS_mcf7Specific)","[(6873, 5781), (6873, 4643), (6873, 2247), (57...","{(6873, 2247), (4643, 2247), (6873, 5781), (57...","[(6873, 4643), (6873, 5781), (4643, 5781), (57...",0.013751,1.0,"[(4643, 5781)]","[(5781, 2247)]"
0,1123458842742954973,"(11234, 5884, 27429, 54973)",4,"(ELCFS_h322Specific, ELCFS_mcf7Specific)","[(11234, 5884), (11234, 27429), (11234, 54973)...","{(5884, 54973), (11234, 54973), (5884, 27429),...","[(5884, 54973), (11234, 54973), (5884, 27429),...",0.00375,0.999695,"[(5884, 54973)]","[(5884, 27429)]"
22,1123458842742954973,"(11234, 5884, 27429, 54973)",4,"(ELCFS_mcf7Specific, ELCFS_u251Specific)","[(11234, 5884), (11234, 27429), (11234, 54973)...","{(5884, 54973), (11234, 54973), (5884, 27429),...","[(5884, 54973), (11234, 54973), (5884, 27429),...",0.00375,0.999695,"[(11234, 54973)]","[(5884, 27429)]"
13,1123458842742954973,"(11234, 5884, 27429, 54973)",4,"(ELCFS_h322Specific, ELCFS_u251Specific)","[(11234, 5884), (11234, 27429), (11234, 54973)...","{(5884, 54973), (11234, 54973), (5884, 27429),...","[(5884, 54973), (11234, 54973), (5884, 27429),...",0.00375,0.999695,"[(11234, 54973)]","[(5884, 27429)]"


In [None]:
random.seed(randomSeed)

In [None]:
#ppiELCFS = workDir + 'thresholdedScores_files/overlappingPairs_ELCFS_t0.28.txt'
#ppihuMAP = workDir + 'thresholdedScores_files/overlappingPairs_huMAP_t0.28.txt'

projects = \
  glob.glob(ribosomeGuide_dir + 'thresholdedScores_files/*overlapping*0.28*')
projects.sort()

projectsNames = [
    proj.split('/')[-1].split('.txt')[0].split('overlappingPairs_')[1] for proj in projects]

projectsDirs = [
    workDir + 'ribosomeGuided_gridSearch/{0}/'.format(proj)
    for proj in projectsNames]

In [None]:
sizeSweep, densitySweep, maxOverlap_sweep, seedMethod_sweep = params

#Calculate cell-type-specific complexes

In [None]:
projects = \
  ['/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_h322Specific.txt',
 '/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_mcf7Specific.txt',
 '/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_u251Specific.txt',
 '/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_nonSpecific.txt']

projectsNames = [
    proj.split('/')[-1].split('.txt')[0] for proj in projects]

projectsDirs = [
    workDir + 'ribosomeGuided_gridSearch/{0}/'.format(proj)
    for proj in projectsNames]

In [None]:
for ppiFilename, resultsDir in zip(projects, projectsDirs):
  print(ppiFilename)
  generateComplexes_clusterONE(ppiFilename,
                               delete,
                               sizeSweep, densitySweep, maxOverlap_sweep,
                               seedMethod_sweep, clustoneJar, resultsDir)

/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_h322Specific.txt


  0%|          | 0/1 [00:00<?, ?it/s]

clusterONE beginning
cluster line
cluster file generation
clusterONE complete


100%|██████████| 1/1 [11:18:01<00:00, 40681.21s/it]


/content/drive/My Drive/otherStudies/ribosomeGuided_gridSearch/thresholdedScores_files/expandedPairsset_ELCFS_t0.28_mcf7Specific.txt


  0%|          | 0/1 [00:00<?, ?it/s]

clusterONE beginning


In [None]:
ppi_scores = dict()
for ppi in networkList:
    ppi_scores[frozenset([ppi.split()[0], ppi.split()[1]])] = \
      float(ppi.split()[2])

In [None]:
[1 for ele in parameterGrid]

[1]

In [None]:
clusterHelper(parameterGrid[0])

In [None]:
def generateComplexes_clusterONE(ppiFilename,
                                delete,
                                sizeSweep, densitySweep, maxOverlap_sweep,
                                seedMethod_sweep, clusteroneJar, resultsDir):

# format: id\t id\t weight\n
with open(projects[0], "r") as inputFile:
  networkList = inputFile.readlines()

ppi_scores = dict()
for ppi in networkList:
    ppi_scores[frozenset([ppi.split()[0], ppi.split()[1]])] = \
      float(ppi.split()[2])


parameterGrid = []
for i, parameters in enumerate(
    it.product(sizeSweep, densitySweep, maxOverlap_sweep, seedMethod_sweep)):
  size, density, overlap, seedMethod = parameters

  parameterComb = dict()
  parameterComb['filename'] = ppiFilename.split('/')[-1].split('.txt')[0]
  parameterComb['clustoneJar'] = clustoneJar
  parameterComb['dir'] = resultsDir
  parameterComb['delete'] = delete

  parameterComb['i'] = str(i)
  parameterComb['networkList'] = networkList

  parameterComb['size'] = str(size)
  parameterComb['density'] = str(density)
  parameterComb['overlap'] = str(overlap)
  parameterComb['seedMethod'] = seedMethod

  parameterGrid.append(parameterComb)

numWorkers = cpu_count()-1
with Pool(numWorkers) as pool:
  clusterAssembly = list(
      progressMonitor(
          pool.imap(clusterHelper, parameterGrid), total=len(parameterGrid)))

In [None]:
pairsScored = {}
results_stage1 = {}
results_stage2 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  pairsScored[name] = \
    generateComplexes_clusterONE(ppiFilename,
                                 delete,
                                 sizeSweep, densitySweep, maxOverlap_sweep,
                                 seedMethod_sweep, clustoneJar, resultsDir)
  results_stage1[name] = analyzeClusterONE_predictions(resultsDir)
  results_stage2[name] = \
    gridSearch_addInflation_mcl(results_stage1[name], pairsScored[name],
                                inflationSweep, resultsDir, delete)

'''
pairsScored = \
  generateComplexes_clusterONE(ppiFilename,
                               delete,
                               sizeSweep, densitySweep, maxOverlap_sweep,
                               seedMethod_sweep, clustoneJar, resultsDir)
results_stage1 = analyzeClusterONE_predictions(resultsDir)
results_stage2 = \
  gridSearch_addInflation_mcl(results_stage1, pairsScored,
                              inflationSweep, resultsDir, delete)
'''

In [None]:
pairsScored = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  pairsScored[name] = \
    generateComplexes_clusterONE(ppiFilename,
                                 delete,
                                 sizeSweep, densitySweep, maxOverlap_sweep,
                                 seedMethod_sweep, clustoneJar, resultsDir)

In [None]:
results_stage1 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage1[name] = analyzeClusterONE_predictions(resultsDir)

In [None]:
results_stage2 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage2[name] = \
    gridSearch_addInflation_mcl(results_stage1[name], pairsScored[name],
                                inflationSweep, resultsDir, delete)

##Results--Intersection Set (Drew 2021 Clustering Method)

Highest performing parameter combination: t: 0.28, s: 2, d: 0.025, o: 0.2, i: 2.

In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch.sort_values(
    by=['clusterONE clusters weighted F1-score'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,clusterONE clusters,clusterONE analysisOBJ,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,clusterONE+MCL clusters,clusterONE+MCL analysisOBJ,clusterONE+MCL predictions filename,clusterONE predictions filename
245,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,230,0.282018,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
246,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,4,291,0.274981,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
247,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,6,315,0.274763,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
248,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,7,326,0.280829,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
249,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,9,342,0.283581,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,"[[80746, 1956, 3685, 4758, 3655, 3688, 56655],...",<complex_comparison_single_gold_standard5.Comp...,4,1107,0.145999,"[[1956, 3685, 3655, 3688, 56655], [80746, 4758...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
65,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,"[[80746, 1956, 3685, 4758, 3655, 3688, 56655],...",<complex_comparison_single_gold_standard5.Comp...,2,733,0.187111,"[[1956, 3685, 3655, 3688, 56655], [80746, 4758...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
69,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,"[[80746, 1956, 3685, 4758, 3655, 3688, 56655],...",<complex_comparison_single_gold_standard5.Comp...,9,1442,0.075213,"[[1956, 3655, 3688], [80746, 4758], [3685, 566...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
68,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,"[[80746, 1956, 3685, 4758, 3655, 3688, 56655],...",<complex_comparison_single_gold_standard5.Comp...,7,1366,0.077612,"[[1956, 3655, 3688], [80746, 4758], [3685, 566...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch.sort_values(
    by=['clusterONE+MCL clusters weighted F1-score'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,clusterONE clusters,clusterONE analysisOBJ,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,clusterONE+MCL clusters,clusterONE+MCL analysisOBJ,clusterONE+MCL predictions filename,clusterONE predictions filename
305,huMAP,0.3066666666666667,2,0.3,0.7,194,0.279689,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,213,0.315805,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
295,huMAP,0.3066666666666667,2,0.2,0.7,188,0.292503,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,216,0.310636,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
261,huMAP,0.29333333333333333,2,0.3,0.6,188,0.291691,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,4,266,0.309684,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
345,huMAP,0.32,2,0.3,0.7,194,0.296115,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,213,0.309360,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
301,huMAP,0.3066666666666667,2,0.3,0.6,188,0.296650,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,4,266,0.306644,"[[10978, 51585], [55234, 3550], [6117, 6118], ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,ELCFS,,2,0.2,0.6,288,0.183372,"[[8828, 8754, 80746, 1956, 3685, 4758, 3915, 3...",<complex_comparison_single_gold_standard5.Comp...,9,1398,0.064456,"[[8828, 8754, 1956, 3482, 3688, 3911, 2734], [...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
54,ELCFS,0.29333333333333333,2,0.2,0.6,288,0.200931,"[[8828, 8754, 80746, 1956, 3685, 4758, 3915, 3...",<complex_comparison_single_gold_standard5.Comp...,9,1398,0.063844,"[[8828, 8754, 1956, 3482, 3688, 3911, 2734], [...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
129,ELCFS,0.32,2,0.1,0.7,239,0.308469,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.063429,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
84,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,"[[322, 351, 2783, 80781, 8828, 857, 8754, 5555...",<complex_comparison_single_gold_standard5.Comp...,9,1476,0.062654,"[[80781, 8828, 8754, 55558, 1956, 3685, 3482, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch.sort_values(
    by=['total clusterONE+MCL clusters'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,clusterONE clusters,clusterONE analysisOBJ,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,clusterONE+MCL clusters,clusterONE+MCL analysisOBJ,clusterONE+MCL predictions filename,clusterONE predictions filename
129,ELCFS,0.32,2,0.1,0.7,239,0.308469,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.063429,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
49,ELCFS,0.29333333333333333,2,0.1,0.7,239,0.310499,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.066515,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
169,ELCFS,,2,0.1,0.7,239,0.309058,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.068617,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
89,ELCFS,0.3066666666666667,2,0.1,0.7,239,0.302932,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.068711,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
9,ELCFS,0.28,2,0.1,0.7,239,0.304852,"[[351, 80781, 8828, 857, 8754, 55558, 1956, 36...",<complex_comparison_single_gold_standard5.Comp...,9,1700,0.065808,"[[80781, 8828, 8754, 55558, 1956, 3482, 3688, ...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,huMAP,0.3066666666666667,2,0.3,0.6,188,0.296650,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,206,0.287620,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
260,huMAP,0.29333333333333333,2,0.3,0.6,188,0.291691,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,206,0.302284,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
380,huMAP,,2,0.3,0.6,188,0.292943,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,206,0.305906,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...
340,huMAP,0.32,2,0.3,0.6,188,0.290244,"[[10978, 51585], [55234, 3550], [6117, 7520, 5...",<complex_comparison_single_gold_standard5.Comp...,2,206,0.286932,"[[10978, 51585], [55234, 3550], [7520, 5591, 2...",<complex_comparison_single_gold_standard5.Comp...,/content/drive/My Drive/otherStudies/ribosomeG...,/content/drive/My Drive/otherStudies/ribosomeG...


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.loc[clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches['study']=='ELCFS', ['study', 'threshold', 'size', 'density', 'overlap',
       'total clusterONE clusters', 'clusterONE clusters weighted F1-score',
       'inflation (MCL)', 'total clusterONE+MCL clusters',
       'clusterONE+MCL clusters weighted F1-score', 'clusterONE jaccardMax',
       'clusterONE+MCL jaccardMax', 'clusterONE jaccardMean',
       'clusterONE+MCL jaccardMean'
       ]].sort_values(by=['clusterONE clusters weighted F1-score'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,clusterONE jaccardMax,clusterONE+MCL jaccardMax,clusterONE jaccardMean,clusterONE+MCL jaccardMean
80,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,2,550,0.237359,0.625000,0.947368,0.003591,0.001360
84,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,9,1476,0.062654,0.625000,0.857143,0.003591,0.000267
83,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,7,1377,0.073358,0.625000,0.857143,0.003591,0.000305
82,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,6,1289,0.086279,0.625000,0.857143,0.003591,0.000347
81,ELCFS,0.3066666666666667,2,0.1,0.6,190,0.319446,4,981,0.152367,0.625000,0.937500,0.003591,0.000596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,2,733,0.187111,0.714286,0.833333,0.001639,0.000969
66,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,4,1107,0.145999,0.714286,0.833333,0.001639,0.000572
69,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,9,1442,0.075213,0.714286,0.800000,0.001639,0.000359
67,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,6,1308,0.089824,0.714286,0.833333,0.001639,0.000424


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.sort_values(by=['clusterONE clusters weighted F1-score'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,...,clusterONE+MCL_srRec,clusterONE+MCL_srPrec,clusterONE+MCL_srF1,clusterONE+MCL_lrRec,clusterONE+MCL_lrPrec,clusterONE+MCL_lrF1,clusterONE jaccardMax,clusterONE+MCL jaccardMax,clusterONE jaccardMean,clusterONE+MCL jaccardMean
245,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,2,230,0.282018,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.695652,0.923077,0.000417,0.000525
246,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,4,291,0.274981,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.695652,0.928571,0.000417,0.000420
247,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,6,315,0.274763,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.695652,0.928571,0.000417,0.000402
248,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,7,326,0.280829,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.695652,0.928571,0.000417,0.000376
249,huMAP,0.29333333333333333,2,0.1,0.7,197,0.325528,9,342,0.283581,...,0.833333,0.367647,0.510204,0.948718,0.544118,0.691589,0.695652,0.923077,0.000417,0.000359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,4,1107,0.145999,...,0.933333,0.500000,0.651163,0.974359,1.000000,0.987013,0.714286,0.833333,0.001639,0.000572
65,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,2,733,0.187111,...,1.000000,0.500000,0.666667,1.000000,0.500000,0.666667,0.714286,0.833333,0.001639,0.000969
69,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,9,1442,0.075213,...,0.066667,1.000000,0.125000,0.128205,1.000000,0.227273,0.714286,0.800000,0.001639,0.000359
68,ELCFS,0.29333333333333333,2,0.3,0.7,481,0.156540,7,1366,0.077612,...,0.166667,1.000000,0.285714,0.256410,1.000000,0.408163,0.714286,0.833333,0.001639,0.000393


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.to_csv(
    workDir + 'ribosomeGuided_gridSearch/clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_overlap+jaccardSimilarity.tsv', sep='\t', index=False)

In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.sort_values(by=['lrF1', 'srF1'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,srRec,srPrec,lrRec,lrPrec,srF1,lrF1
200,huMAP,0.28,2,0.1,0.6,191,0.311710,2,219,0.301683,0.833333,0.367647,0.948718,0.544118,0.510204,0.691589
201,huMAP,0.28,2,0.1,0.6,191,0.311710,4,276,0.261752,0.833333,0.367647,0.948718,0.544118,0.510204,0.691589
202,huMAP,0.28,2,0.1,0.6,191,0.311710,6,299,0.270733,0.833333,0.367647,0.948718,0.544118,0.510204,0.691589
203,huMAP,0.28,2,0.1,0.6,191,0.311710,7,310,0.284217,0.833333,0.367647,0.948718,0.544118,0.510204,0.691589
204,huMAP,0.28,2,0.1,0.6,191,0.311710,9,324,0.287964,0.833333,0.367647,0.948718,0.544118,0.510204,0.691589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,ELCFS,,2,0.3,0.7,481,0.164687,2,733,0.181254,1.000000,0.250000,1.000000,0.325000,0.400000,0.490566
186,ELCFS,,2,0.3,0.7,481,0.164687,4,1107,0.144693,1.000000,0.250000,1.000000,0.325000,0.400000,0.490566
187,ELCFS,,2,0.3,0.7,481,0.164687,6,1308,0.084579,1.000000,0.250000,1.000000,0.325000,0.400000,0.490566
188,ELCFS,,2,0.3,0.7,481,0.164687,7,1366,0.079754,1.000000,0.250000,1.000000,0.325000,0.400000,0.490566


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.sort_values(
    by=['srPrec', 'srRec'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,srRec,srPrec,lrRec,lrPrec
200,huMAP,0.28,2,0.1,0.6,191,0.311710,2,219,0.301683,0.833333,0.367647,0.948718,0.544118
201,huMAP,0.28,2,0.1,0.6,191,0.311710,4,276,0.261752,0.833333,0.367647,0.948718,0.544118
202,huMAP,0.28,2,0.1,0.6,191,0.311710,6,299,0.270733,0.833333,0.367647,0.948718,0.544118
203,huMAP,0.28,2,0.1,0.6,191,0.311710,7,310,0.284217,0.833333,0.367647,0.948718,0.544118
204,huMAP,0.28,2,0.1,0.6,191,0.311710,9,324,0.287964,0.833333,0.367647,0.948718,0.544118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,ELCFS,,2,0.3,0.7,481,0.164687,2,733,0.181254,1.000000,0.250000,1.000000,0.325000
186,ELCFS,,2,0.3,0.7,481,0.164687,4,1107,0.144693,1.000000,0.250000,1.000000,0.325000
187,ELCFS,,2,0.3,0.7,481,0.164687,6,1308,0.084579,1.000000,0.250000,1.000000,0.325000
188,ELCFS,,2,0.3,0.7,481,0.164687,7,1366,0.079754,1.000000,0.250000,1.000000,0.325000


In [None]:
clusterONE_MCL_predictions_sharedPairs_gridSearch_ribosomalSubunit_matches.sort_values(
    by=['lrRec', 'lrPrec'], ascending=False)

Unnamed: 0,study,threshold,size,density,overlap,total clusterONE clusters,clusterONE clusters weighted F1-score,inflation (MCL),total clusterONE+MCL clusters,clusterONE+MCL clusters weighted F1-score,srRec,srPrec,lrRec,lrPrec
30,ELCFS,0.28,2,0.4,0.6,486,0.177359,2,610,0.197618,1.000000,0.291262,1.000000,0.378641
31,ELCFS,0.28,2,0.4,0.6,486,0.177359,4,854,0.151525,1.000000,0.291262,1.000000,0.378641
32,ELCFS,0.28,2,0.4,0.6,486,0.177359,6,991,0.087543,1.000000,0.291262,1.000000,0.378641
33,ELCFS,0.28,2,0.4,0.6,486,0.177359,7,1047,0.078198,1.000000,0.291262,1.000000,0.378641
34,ELCFS,0.28,2,0.4,0.6,486,0.177359,9,1105,0.067496,1.000000,0.291262,1.000000,0.378641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,huMAP,,2,0.4,0.7,213,0.274648,2,220,0.276327,0.833333,0.367647,0.948718,0.544118
396,huMAP,,2,0.4,0.7,213,0.274648,4,282,0.271114,0.833333,0.367647,0.948718,0.544118
397,huMAP,,2,0.4,0.7,213,0.274648,6,308,0.262511,0.833333,0.367647,0.948718,0.544118
398,huMAP,,2,0.4,0.7,213,0.274648,7,320,0.280378,0.833333,0.367647,0.948718,0.544118


##Results--Union Set (Drew 2021 Clustering Method)

 t: 0.28, s: 2, d: 0.025, o: 0.2, i: 2.

In [None]:
#ClusterONE
delete = True
randomSeed = 42
sizeSweep = [2]
densitySweep = [0.025]
maxOverlap_sweep = [0.2]  #[0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
seedMethod_sweep = ['nodes']  #nodes, cliques, unused_nodes, edges

#MCL
inflationSweep = [2]

In [None]:
random.seed(randomSeed)

In [None]:
clustoneJar = '/content/cluster_one-1.0.jar'
ribosomeRecapit = \
  ribosomeGuide_dir + 'ribosomalSubunit_possibleRecapitulation_methods.pkl'
ribosomalData = pickle.load(open(ribosomeRecapit, 'rb'))

In [None]:
projects = \
  ['/content/pairsWprobs_humap_expandedSet_elcfsDom_t0.28.txt',
   '/content/pairsWprobs_humap_expandedSet_humapDom_t0.28.txt']

projectsNames = \
  ['pairsWprobs_humap_expandedSet_elcfsDom_t028',
   'pairsWprobs_humap_expandedSet_humapDom_t028']

projectsDirs = \
  [workDir + 'unionPairs-set/pairsWprobs_humap_expandedSet_elcfsDom_t028',
  workDir + 'unionPairs-set/pairsWprobs_humap_expandedSet_humapDom_t028']

In [None]:
pairsScored = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  pairsScored[name] = \
    generateComplexes_clusterONE(ppiFilename,
                                 delete,
                                 sizeSweep, densitySweep, maxOverlap_sweep,
                                 seedMethod_sweep, clustoneJar, resultsDir)

In [None]:
results_stage1 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage1[name] = analyzeClusterONE_predictions(resultsDir)

In [None]:
results_stage2 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage2[name] = \
    gridSearch_addInflation_mcl(results_stage1[name], pairsScored[name],
                                inflationSweep, resultsDir, delete)

##Results--Expanded Pairs Set (Drew 2021 Clustering Method)

 t: 0.28, s: 2, d: 0.025, o: 0.2, i: 2.

In [None]:
#ClusterONE
delete = True
randomSeed = 42
sizeSweep = [2]
densitySweep = [0.025]
maxOverlap_sweep = [0.2]
seedMethod_sweep = ['nodes']

#MCL
inflationSweep = [2]

In [None]:
random.seed(randomSeed)

In [None]:
clustoneJar = workDir + 'cluster_one-1.0.jar'
ribosomeRecapit = \
  ribosomeGuide_dir + 'ribosomalSubunit_possibleRecapitulation_methods.pkl'
ribosomalData = pickle.load(open(ribosomeRecapit, 'rb'))

In [None]:
projects = \
  ['/content/elcfsMean_expandedPairset_t0.28.txt']

projectsNames = \
  ['elcfsMean_expandedPairs-set_t0.28']

projectsDirs = \
  [workDir + 'expandedPairs-set/twoStage/']

In [None]:
pairsScored = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  pairsScored[name] = \
    generateComplexes_clusterONE(ppiFilename,
                                 delete,
                                 sizeSweep, densitySweep, maxOverlap_sweep,
                                 seedMethod_sweep, clustoneJar, resultsDir)

  0%|          | 0/1 [00:00<?, ?it/s]

clusterONE beginning


In [None]:
results_stage1 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage1[name] = analyzeClusterONE_predictions(resultsDir)

In [None]:
results_stage2 = {}
for ppiFilename, resultsDir, name in zip(projects, projectsDirs, projectsNames):
  results_stage2[name] = \
    gridSearch_addInflation_mcl(results_stage1[name], pairsScored[name],
                                inflationSweep, resultsDir, delete)

In [None]:
i = str(0)
filename = '/content/elcfsMean_expandedPairset_t0.28'

with open(ppiFilename, "r") as inputFile:
  networkList = inputFile.readlines()

size = str(sizeSweep[0])
density = str(densitySweep[0])
overlap = str(maxOverlap_sweep[0])
seedMethod = seedMethod_sweep[0]
suffix = '_'.join([filename, i, seedMethod, size, density, overlap]) + '.txt'

fileTemp = tf.NamedTemporaryFile(delete=delete, dir=projectsDirs[0], mode='w')
tempFile_name = fileTemp.name + '.txt'
tempFile = open(tempFile_name, 'w')
fileTemp.close()

for ppi in networkList: tempFile.write(ppi)
tempFile.close()

print('clusterONE beginning')
clusterONE_output = \
  sp.run(['java', '-jar', clustoneJar, tempFile_name,
          '-s', size, '-d', density, '--max-overlap', overlap,
          '--seed-method', seedMethod],
          capture_output=True)

clusterONE beginning


#Generate MCL clusters (archive)

In [None]:
#top20Clusters_select.insert(7, 'inflation (MCL)', ...)
#top20Clusters_select.insert(8, 'total clusterONE+MCL clusters', ...)
#top20Clusters_select.insert(9, 'clusterONE+MCL clusters weighted F1-score', ...)
#top20Clusters_select.insert(10, 'clusterONE+MCL clusters', ...)
#top20Clusters_select.insert(11, 'clusterONE+MCL analysisOBJ', ...)

#parameterGrid_batchResults.insert(7, 'inflation (MCL)', ...)
#parameterGrid_batchResults.insert(8, 'total clusterONE+MCL clusters, ...)
#parameterGrid_batchResults.insert(9, 'clusterONE+MCL clusters weighted F1-score', ...)
#parameterGrid_batchResults.insert(10, 'clusterONE+MCL clusters', ...)
#parameterGrid_batchResults.insert(11, 'clusterONE+MCL analysisOBJ', ...)

In [None]:
top20Clusters_select_mclAdded = \
  gridSearch_addInflation_mcl(
      parameterGrid_batchResults, ppi_scores, inflationSweep, resultsDir, delete)

In [None]:
parameterGrid_batchResults

In [None]:
parameterGrid_batchResults_f1Sorted = \
  parameterGrid_batchResults.sort_values(
      by=['clusterONE clusters weighted F1-score'], ascending=False).copy()

parameterGrid_batchResults_numClusters_byF1_sorted = \
  parameterGrid_batchResults.sort_values(
      by=['total clusterONE clusters', 'clusterONE clusters weighted F1-score'],
      ascending=False).copy()

In [None]:
top10Clusters_f1 = parameterGrid_batchResults_f1Sorted.iloc[:10, :]
top10Clusters_numClusters_byF1 = \
  parameterGrid_batchResults_numClusters_byF1_sorted.iloc[:10, :]
top20Clusters_select = \
  pd.concat([top10Clusters_f1, top10Clusters_numClusters_byF1],
            axis=0, ignore_index=True, verify_integrity=True)

In [None]:
pickle.dump(top20Clusters_select,
            open(batchFiles_dir +
                 'parameterGrid_batchResults_top20Clusters_select.pkl', 'wb'))

pickle.dump(
    top20Clusters_select.loc[:, ['size', 'density', 'overlap',
                                 'total clusterONE clusters',
                                 'clusterONE clusters weighted F1-score',
                                 'clusterONE clusters',
                                 'clusterONE predictions filename']].copy(),
            open(batchFiles_dir +
                 'parameterGrid_batchResults_noCC_top20Clusters_select.pkl',
                 'wb'))

In [None]:
top20Clusters_select = \
  pickle.load(open(batchFiles_dir +
                   'parameterGrid_batchResults_top20Clusters_select.pkl', 'rb'))

In [None]:
elcfs_top20_mclAdded = pickle.load(open('/content/drive/My Drive/otherStudies/elcfsMean_clusterONE_narrower/parameterGrid_batchResults_noCC_top20Clusters_select_mclAdded.pkl', 'rb'))

In [None]:
elcfs_top20_mclAdded

In [None]:
top20Clusters_select_mclAdded

In [None]:
featMat_path = \
  rootDir + 'Primary Research/proteinPairs_complexMaps/sourceData/humap2/pairs/orig9k_bioplex2_hygeo_bioid_hygeo_boldt_apms_hygeo_treiber_hygeo_wgt2_youn_hygeo_trimCols_groupbyMean.featmat.bz2'

humap2Featmat = pd.read_csv(featMat_path)

#generate file for complexes to evaluate predictions against using latest CORUM release (2018.09.03)

In [None]:
def purgeNAs(line):
  return [str(s) for s in line if s!='None']

In [None]:
corum = pd.read_excel('./corum.xlsx', na_values=['None'])

In [None]:
corum.insert(7, 'entrezIDs_list',
             [purgeNAs(line.split(';')) if not isinstance(line, float) else line
              for line in corum['subunits(Entrez IDs)'].to_list()])

In [None]:
entrezIDs_all = \
  set().union(*corum.loc[
      corum.entrezIDs_list.notnull(), 'entrezIDs_list'].to_list())
print(len(entrezIDs_all))

5419


In [None]:
all([int(x) for x in list(entrezIDs_all)])

True

In [None]:
entrezIDs_human = \
  set().union(*corum.loc[(
      (corum.entrezIDs_list.notnull()) &
      (corum.Organism=='Human')),
                         'entrezIDs_list'].to_list())
print(len(entrezIDs_human))

3375


In [None]:
corumComplexes_entrezIDs = \
  corum.loc[corum.entrezIDs_list.notnull(), 'entrezIDs_list'].to_list()
corumComplexes_entrezIDs_human = \
  [cplx
   for cplx in corum.loc[(
      (corum.entrezIDs_list.notnull()) &
       (corum.Organism=='Human')), 'entrezIDs_list'].to_list() if len(cplx) > 2]

In [None]:
corumComplexes_2018Sep = \
  elcfsDir + 'sourceData/corum/corumComplexes(len>2)_2018Sep_eval.txt'
with open(corumComplexes_2018Sep, 'w') as f:
  for cplx in corumComplexes_entrezIDs_human:
    line = ' '.join(cplx) + '\n'
    f.write(line)