# Meta

In [2]:
# Autoreload modules without having to restart the notebook kernel.
import networkx as nx
% load_ext autoreload
% autoreload 2


# Plotting code from Georg's notebook.
import matplotlib.pyplot as plt

% matplotlib inline
font = {'family': 'DejaVu Sans',
        'weight': 'bold',
        'size': 32}
plt.rc('font', **font)

# Personal libraries
import lib.cluster
import lib.graph
import lib.files
import lib.constants

# Constants

In [None]:
INFLATION = 2  # The parameter controlling Markov Clustering

# Analysis

### ICP55 connected component with inviable proteins with inflation = 2

In [3]:
network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
cluster_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.nodes.csv')
cluster_df_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.dataframe.csv')

network = lib.graph.read_weighted_edgelist(network_filepath)
mcl_data = lib.cluster.run_mcl(network, INFLATION)

# UNCOMMENTING BELOW WILL OVERWRITE THE EXISTING FILES
# lib.cluster.generate_and_save_dataframe(network, mcl_data.clusters, cluster_df_filepath)
# lib.cluster.write_to_file(cluster_filepath, mcl_data.clusters)

### ICP55 connected component with inviable proteins with inflation = 1.4
When inflation = 2, ICP55 exists in a disconnected cluster of size 2.
One method of handling this was to "uncluster" nodes like this (i.e. assign them singleton clusters)
Another method is to coarsen the clusters.

In [2]:
network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
cluster_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.nodes.csv')
cluster_df_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.dataframe.csv')

network = lib.graph.read_weighted_edgelist(network_filepath)
import time

start = time.time()
mcl_data = lib.cluster.run_mcl(network, inflation=1.4)
end = time.time()
print(f"Time taken {end - start / 60}")

Time taken 1605751980.4159539


In [3]:
start = time.time()
df = lib.cluster.generate_dataframe(network, mcl_data.clusters)
end = time.time()
print(f"Time taken {end - start / 60}")
icp55_cluster_index = lib.cluster.cluster_idxs_with_protein(mcl_data.clusters, lib.constants.ICP55)[0]
pim1_cluster_index = lib.cluster.cluster_idxs_with_protein(mcl_data.clusters, lib.constants.PIM1)[0]
print(f"Number of clusters: {len(df)}")
print(f"Percentage of connected clusters: {len(df[df['is_connected']]) / len(df)}")
print(f"ICP55 cluster size: {df.iloc[icp55_cluster_index]['size']}")
print(f"ICP55 cluster is connected: {df.iloc[icp55_cluster_index]['is_connected']}")
print(f"PIM1 cluster size: {df.iloc[pim1_cluster_index]['size']}")
print(f"PIM1 cluster is connected: {df.iloc[pim1_cluster_index]['is_connected']}")

Time taken 1605751989.9316916
Number of clusters: 179
Percentage of connected clusters: 0.9776536312849162
ICP55 cluster size: 98
ICP55 cluster is connected: True
PIM1 cluster size: 4
PIM1 cluster is connected: True


In [6]:
network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
cluster_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.nodes.csv')
cluster_df_filepath = lib.files.make_filepathto_clusters(f'mcl.{network_name}.dataframe.csv')

network = lib.graph.read_weighted_edgelist(network_filepath)
import time

start = time.time()
clusters = lib.cluster.run_pc2p(network)
end = time.time()
print(f"Time taken {(end - start) / 60}")

KeyboardInterrupt: 

In [1]:
%load_ext autoreload
%autoreload 2
import PC2P.Analysis.PredictedClusters_Analysis as pc2p_analysis
import lib.cluster

predicted = lib.cluster.read_csv(lib.files.make_filepath_to_mcl_clusters('icp55-cc-900-inv.mcl-1.1.csv'))
predicted = list(map(set, predicted))

reference = lib.cluster.read_yhtp2008()
reference = list(map(set, reference))

pc2p_analysis.F_measure_Jaccard(reference, predicted)

0

In [7]:
network = lib.graph.read_weighted_edgelist(lib.files.make_filepath_to_networks('icp55-cc-900-inv.txt'))
df = pd.read_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.csv'), header=0, index_col=0)
for i in range(len(df)): # For every clustering
    filepath = df.iloc[i]['filepath']
    clusters = lib.cluster.read_csv(filepath)
    proteins = lib.cluster.proteins(clusters)
    if len(proteins) != len(network):
        continue

KeyboardInterrupt: 

In [15]:
# This cell adds modularity into the mcl.global dataframe
import pandas as pd
import networkx as nx

network = lib.graph.read_weighted_edgelist(lib.files.make_filepath_to_networks('icp55-cc-900-inv.txt'))
df = pd.read_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.csv'), header=0, index_col=0)
df.head()
for i in range(len(df)):
    try:
        filepath = df.iloc[i]['filepath']
        clusters = lib.cluster.read_csv(filepath)
        modularity = nx.algorithms.community.quality.modularity(network, clusters, weight=None)
        df.at[i, 'modularity'] = modularity
        print(modularity)
    except:
        df.at[i, 'modularity'] = modularity
        continue

0.6282559237129133
0.6251728695595921
0.6219184562143253
0.6411649392359039
0.672179733135548
0.6543600070212295
0.6388689228876372
0.6230460545558822
0.6061377073300279
0.5962254294221696
0.5793437111509285
0.5693210590386254
0.5611486278070613
0.5493786880059304
0.5435808044444772
0.5281864970773844
0.5171246340662592
0.5012991870467537
0.48980272173251244
0.4761654955445798
0.3983916201527433
0.3779111204845565
0.36041937845261507
0.33391118810970455


In [15]:
# This cell adds percentage of clusters greater than size >5, >10, >15, >20, >25
import pandas as pd
import networkx as nx

network = lib.graph.read_weighted_edgelist(lib.files.make_filepath_to_networks('icp55-cc-900-inv.txt'))
df = pd.read_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.csv'), header=0, index_col=0)
df.head()

keys = []
p5 = []
p10 = []
p15 = []
p20 = []
p25 = []
for i in range(len(df)):
    filepath = df.iloc[i]['filepath']
    keys.append(filepath) # Used to join the dataframes on.
    clusters = lib.cluster.read_csv(filepath)
    p5.append(len([c for c in clusters if len(c) > 5]) / len(clusters))
    p10.append(len([c for c in clusters if len(c) > 10]) / len(clusters))
    p15.append(len([c for c in clusters if len(c) > 15]) / len(clusters))
    p20.append(len([c for c in clusters if len(c) > 20]) / len(clusters))
    p25.append(len([c for c in clusters if len(c) > 25]) / len(clusters))

df2 = pd.DataFrame.from_records(list(zip(keys, p5, p10, p15, p20, p25)),
                                columns=['filepath',
                                         'gt_5',
                                         'gt_10',
                                         'gt_15',
                                         'gt_20',
                                         'gt_25'])

df3 = df.merge(df2, on='filepath')

In [17]:
df3.to_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.concat.csv'))

In [17]:
df.to_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.concat.csv'))

In [None]:
for i in range(len(df)): # For every clustering
    filepath = df.iloc[i]['filepath']
    clusters = lib.cluster.read_csv(filepath)
    modularity = nx.algorithms.community.quality.modularity(network, clusters, weight=None)
    df.iloc[i]['modularity'] = modularity
df.head()

In [25]:
# This was a test of pc2p on a smaller network.

%load_ext autoreload
%autoreload 2
import networkx as nx
import lib.cluster
network = nx.fast_gnp_random_graph(300, 1/2)
clusters = lib.cluster.run_pc2p_parallel(network)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
end of round:  1
end of round:  2
end of round:  3
end of round:  4


In [5]:
%load_ext autoreload
%autoreload 2

import time
import numpy as np
import pandas as pd
import networkx as nx

import lib.graph
import lib.cluster
import PC2P.Analysis.PredictedClusters_Analysis as pc2p_analysis

network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
network = lib.graph.read_weighted_edgelist(network_filepath)

# These are the two gold standard validation sets to use
# We need to convert each cluster from a list to a set for pc2p_analysis
yhtp2008 = lib.cluster.read_yhtp2008()
yhtp2008 = list(map(set, yhtp2008))
sgd = lib.cluster.read_sgd()
sgd = list(map(set, sgd))

# These are the metrics to calculate.
# Anything with a suffix _yhtp will use the yhtp2008 validation set.
# Anything with a suffix _sgd will use the sgd validation set.
# These are the independant metrics
percentage_connected = []
modularity = []
num_clusters = []
# YHTP metrics
sensitivity_yhtp = []
positive_predicted_value_yhtp = []
accuracy_yhtp = []
fraction_matched_yhtp = []
separation_yhtp = []
precision_jaccard_yhtp = []
recall_jaccard_yhtp = []
f_measure_jaccard_yhtp = []
# SGD metrics
sensitivity_sgd = []
positive_predicted_value_sgd = []
accuracy_sgd = []
fraction_matched_sgd = []
separation_sgd = []
precision_jaccard_sgd = []
recall_jaccard_sgd = []
f_measure_jaccard_sgd = []

# These are the different inflation values we will use.
inflations = np.linspace(3.1, 4, num=10, endpoint=True)

# These are the times taken for different inflation values
times = []

# These are filepaths to each dataset
filepaths = []

# Calculate total time taken
time_0 = time.time()

# for inflation in inflations:
for i in range(1):
    start = time.time()

    # Run markov clustering
    # mcl_data = lib.cluster.run_mcl(network, inflation)
    # clusters = mcl_data.clusters

    # Run PC2P clustering O(n^2 max_deg^2)
    # Because of the issue of hub nodes, I'm going to remove all nodes with high degree
    nodes_to_remove = [node for node in network if network.degree()[node] >= 50]
    network.remove_nodes_from(nodes_to_remove)
    network = lib.graph.get_largest_cc_with_node(network, lib.constants.ICP55)
    print(f"Numbewr of nodes {len(network)}")
    clusters = lib.cluster.run_pc2p(network)

    # We need to convert each cluster from a list to a set for pc2p_analysis.
    clusters = list(map(set, clusters))

    # Save the clusters to a file
    # filepath = lib.files.make_filepath_to_mcl_clusters(f"{network_name}.mcl-{inflation}.csv")
    filepath = lib.files.make_filepath_to_clusters(f"{network_name}.pc2p.csv")
    filepaths.append(filepath)
    lib.cluster.write_to_file(filepath, clusters)

    # # Compute the metrics for each clustering.
    # modularity.append(mcl_data.modularity)


    num_clusters.append(len(clusters))
    percentage_connected.append(len(
        [cluster for cluster in clusters if nx.is_connected(network.subgraph(cluster))]
    ) / len(clusters))

    # Compute yhtp metrics
    sensitivity_yhtp.append(pc2p_analysis.clusteringwise_sensitivity(yhtp2008, clusters))
    positive_predicted_value_yhtp.append(pc2p_analysis.positive_predictive_value(yhtp2008, clusters))
    accuracy_yhtp.append(pc2p_analysis.accuracy(yhtp2008, clusters))
    fraction_matched_yhtp.append(pc2p_analysis.fraction_matched(yhtp2008, clusters))
    separation_yhtp.append(pc2p_analysis.clusteringwise_separation(yhtp2008, clusters))
    precision_jaccard_yhtp.append(pc2p_analysis.precision_Jaccard(yhtp2008, clusters))
    recall_jaccard_yhtp.append(pc2p_analysis.recall_Jaccard(yhtp2008, clusters))
    f_measure_jaccard_yhtp.append(pc2p_analysis.F_measure_Jaccard(yhtp2008, clusters))

    # Compute sgd metrics
    sensitivity_sgd.append(pc2p_analysis.clusteringwise_sensitivity(sgd, clusters))
    positive_predicted_value_sgd.append(pc2p_analysis.positive_predictive_value(sgd, clusters))
    accuracy_sgd.append(pc2p_analysis.accuracy(sgd, clusters))
    fraction_matched_sgd.append(pc2p_analysis.fraction_matched(sgd, clusters))
    separation_sgd.append(pc2p_analysis.clusteringwise_separation(sgd, clusters))
    precision_jaccard_sgd.append(pc2p_analysis.precision_Jaccard(sgd, clusters))
    recall_jaccard_sgd.append(pc2p_analysis.recall_Jaccard(sgd, clusters))
    f_measure_jaccard_sgd.append(pc2p_analysis.F_measure_Jaccard(sgd, clusters))
    #
    end = time.time()
    seconds = end - start
    times.append(seconds)

# Make a dataframe with each row representing a clustering.
# Save the dataframe to a file.
df2 = pd.DataFrame.from_records(
    list(zip(
        # inflations,
        # modularity,
        percentage_connected,
        sensitivity_yhtp,
        positive_predicted_value_yhtp,
        accuracy_yhtp,
        fraction_matched_yhtp,
        separation_yhtp,
        precision_jaccard_yhtp,
        recall_jaccard_yhtp,
        f_measure_jaccard_yhtp,
        sensitivity_sgd,
        positive_predicted_value_sgd,
        accuracy_sgd,
        fraction_matched_sgd,
        separation_sgd,
        precision_jaccard_sgd,
        recall_jaccard_sgd,
        f_measure_jaccard_sgd,
        filepaths,
        num_clusters
    )),
    columns=[
        # "inflation",
        # "modularity",
        "percentage_connected",
        "sensitivity_yhtp",
        "positive_predicted_value_yhtp",
        "accuracy_yhtp",
        "fraction_matched_yhtp",
        "separation_yhtp",
        "precision_jaccard_yhtp",
        "recall_jaccard_yhtp",
        "f_measure_jaccard_yhtp",
        "sensitivity_sgd",
        "positive_predicted_value_sgd",
        "accuracy_sgd",
        "fraction_matched_sgd",
        "separation_sgd",
        "precision_jaccard_sgd",
        "recall_jaccard_sgd",
        "f_measure_jaccard_sgd",
        "filepath",
        "num_clusters"
    ]
)

print(f"Seconds: {(time.time()- time_0) / 60}")

df_filepath = lib.files.make_filepath_to_clusters(f"{network_name}.pc2p.global.csv")
# df = pd.read_csv(df_filepath)
df2.to_csv(df_filepath)
# df3 = pd.concat([df, df2], ignore_index=True)
#
# df3_filepath = lib.files.make_filepath_to_clusters(f"{network_name}.mcl.global.concat.csv")
# df3.to_csv(df3_filepath)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Numbewr of nodes 3903
Components 1, iteration: 1
Line 204
Components 2, iteration: 2
Line 204
Components 2, iteration: 3
Line 204
Components 2, iteration: 4
Line 204
Components 3, iteration: 5
Line 204
Components 3, iteration: 6
Line 204
Components 3, iteration: 7
Line 204
Components 3, iteration: 8
Line 204
Components 3, iteration: 9
Line 204
Components 5, iteration: 10
Line 204
Components 6, iteration: 11
Line 204
Components 6, iteration: 12
Line 204
Components 6, iteration: 13
Line 204
Components 7, iteration: 14
Line 204
Components 7, iteration: 15
Line 204
Components 7, iteration: 16
Line 204
Components 7, iteration: 17
Line 204
Components 7, iteration: 18
Line 204
Components 7, iteration: 19
Line 204
Components 10, iteration: 20
Line 204
Components 14, iteration: 21
Line 204
Components 14, iteration: 22
Line 204
Components 14, iteration: 23
Line 204
Components 14, iteration: 24
Line 204
Compon

KeyboardInterrupt: 

In [21]:
# This cell adds sizes to the dataframe. TODO: Make fn add_col and add_row to dataframe files.
df = pd.read_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.concat.csv'))
filepaths = list(df['filepath'].array)

filepaths_2 = []
sizes = []
for path in filepaths:
    clusters = lib.cluster.read_csv(path)
    filepaths_2.append(path)
    sizes.append(len(clusters))

df2 = pd.DataFrame.from_records(list(zip(filepaths_2, sizes)), columns=['filepath', 'num_clusters'])
df3 = df.merge(df2, on='filepath')
df3.head()
df3.to_csv(lib.files.make_filepath_to_clusters('icp55-cc-900-inv.mcl.global.concat2.csv'))


### ICP55 connected component no inviable proteins

In [None]:
lib.cluster.run_mcl_and_write_to_file(
    graph=lib.graph.read_edgelist(lib.files.make_filepath_to_networks('icp55-cc-900-noInv.txt')),
    filepath=lib.files.make_filepath_to_mcl_clusters("icp55-cc-inviable.csv"),
    inflation=INFLATION
)