The first section of this code was used to compute the various centrality scores for our Markov clusters. The second section finds the proportion of essential proteins in each quartile. This part was not mentioned in the presentation.

In [7]:
# Autoreload modules without having to restart the notebook kernel.
# hi bilbo
%load_ext autoreload
%autoreload 2


import sys
import os
import matplotlib.pyplot as plt

# Introduce the project directory to your system's path to make data/lib folders visible.
sys.path.insert(1, "..")

# Plotting code stolen from Georg's notebook.
%matplotlib inline
font = {'family': 'DejaVu Sans',
        'weight': 'bold',
        'size': 32}
plt.rc('font', **font)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Personal libraries
import lib.cluster
import lib.graph
import lib.files
import pandas as pd
import networkx as nx
import numpy as np

In [9]:
def centrality_df(centrality_function, centrality_name, cluster_filepath, network_filepath):
    # reading the clusters and network into nx
    clusters = lib.cluster.read_csv(cluster_filepath)
    network = lib.graph.read_weighted_edgelist(network_filepath)
    clusters = [network.subgraph(cluster) for cluster in clusters]
    
    # finding the centrality 
    protein_centrality = []
    for cluster in clusters:
        cent = centrality_function(cluster)
        list_cent = list(cent.items())
        protein_centrality.extend(list_cent)
        
    df = pd.DataFrame(protein_centrality)
    df.rename(columns = {0: "protein", 1: centrality_name} , inplace=True)
    nodes = pd.read_csv(cluster_filepath, index_col= 0)
    
    # append the centrality measures to the original markov clusters in the dataframe
    centrality_df = nodes.merge(df, on = 'protein')
    
    return centrality_df

In [10]:
# obtaining the paths for our network
network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
cluster_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.nodes.csv')

In [11]:
# plugging into the function to calculate the centralities and combining them into one df
centrality_measures = ((nx.betweenness_centrality, "Betweenness Centrality"),
                (nx.degree_centrality, "Degree Centrality"),
                (nx.eigenvector_centrality, "Eigenvector Centrality"))

cent_df = centrality_df(nx.subgraph_centrality, "Subgraph Centrality", cluster_filepath, network_filepath)


for i,j in centrality_measures:
    cent_df = cent_df.merge(centrality_df(i, j, cluster_filepath, network_filepath), on =('protein','cluster'))
    

In [12]:
# outputting as a csv file into the mcl folder
cent_df.to_csv(lib.files.make_filepath_to_mcl_clusters("mcl.icp55-cc-900-inv-centralities.csv"))

## Finding Proteins That Are Essential From The Centrality Measures and Quartiles

Defining the function `inviable()` which takes in a single argument for the path of a file for which we want to check which proteins are inviable. It adds a new column named `inviable` in the dataframe which takes in boolean values.

In [13]:
def inviable_wo_4932(filepath):
    inviable_path = lib.files.make_filepath_to_data("inviable_proteins.csv") # the path of the csv with data for which proteins are inviable
    inviable_proteins = list(pd.read_csv(inviable_path,header=None)[1]) # reading the inviable proteins csv in as a df
    df = pd.read_csv(filepath, index_col = 0) # reads in the file for which we want to check which proteins are inviable
    df['inviable'] = True
    proteins = df['protein']
    for i in range(len(proteins)): # it checks to see whether or not a protein is inviable and adds a new column called inviable. If the protein is inviable, it will be True, otherwise it will be False
        protein = df.iloc[i,1]
        if protein not in inviable_proteins:
            df.iloc[i,-1] = False
    return df

### Third Quartile

In [36]:
file_names = ['betweenness_centrality_quantile_0.5_to_0.75',
              'degree_centrality_quantile_0.5_to_0.75',
              'eigenvector_centrality_quantile_0.5_to_0.75']
#establishes the path to the MCL folder for each of the above file names
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_3q = inviable_wo_4932(file_paths[0])
deg_3q = inviable_wo_4932(file_paths[1])
egv_3q = inviable_wo_4932(file_paths[2])
inviable_dfs_3q = [btw_3q,deg_3q,egv_3q]
centrality_measures = ['betweenness','degree','eigenvector']

In [37]:
for i, df in enumerate(inviable_dfs_3q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the third quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 548 essential proteins found in the third quantile for betweenness out of 2467 proteins. Proportion: 0.22
There are 568 essential proteins found in the third quantile for degree out of 2372 proteins. Proportion: 0.24
There are 515 essential proteins found in the third quantile for eigenvector out of 2164 proteins. Proportion: 0.24


In [19]:
# this function is for turning these dataframes into csv files in the mcl folder. It takes in a list of dataframes
# as its argument and it drops the inviable column retaining only those inviable proteins.
def make_inviable_csv(list_of_inviable_dfs):
    for i, df in enumerate(list_of_inviable_dfs):
        df2 = df[df['inviable'] == True]
        df2 = df2.drop('inviable',axis=1)
        df2.to_csv(lib.files.make_filepath_to_mcl_clusters(f'inviable_{file_names[i]}'))

In [76]:
make_inviable_csv(inviable_dfs_3q)

### Fourth Quartile

In [22]:
file_names = ['betweenness_centrality_quantile_0.75_to_1',
              'degree_centrality_quantile_0.75_to_1',
              'eigenvector_centrality_quantile_0.75_to_1']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_4q = inviable_wo_4932(file_paths[0])
deg_4q = inviable_wo_4932(file_paths[1])
egv_4q = inviable_wo_4932(file_paths[2])
inviable_dfs_4q = [btw_4q,deg_4q,egv_4q]
centrality_measures = ['betweenness','degree','eigenvector']

In [23]:
btw_4q

Unnamed: 0,cluster,protein,degree,betweenness_centrality,eigenvector_centrality,degree_centrality,inviable
0,0,Q0297,6,0.286,0.403,0.857,False
1,0,Q0010,6,0.119,0.417,0.857,False
2,0,Q0143,6,0.119,0.417,0.857,False
3,1,YCL017C,11,0.009,0.303,1.000,True
4,1,YKL040C,11,0.009,0.303,1.000,False
...,...,...,...,...,...,...,...
2104,685,YCL012C,1,0.000,0.707,1.000,False
2105,686,YNL033W,3,0.250,0.500,0.750,False
2106,686,YNL019C,3,0.250,0.500,0.750,False
2107,687,YIL088C,1,0.000,0.707,1.000,False


In [24]:
for i, df in enumerate(inviable_dfs_4q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the fourth quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 573 essential proteins found in the fourth quantile for betweenness out of 2109 proteins. Proportion: 0.27
There are 598 essential proteins found in the fourth quantile for degree out of 2140 proteins. Proportion: 0.28
There are 564 essential proteins found in the fourth quantile for eigenvector out of 2027 proteins. Proportion: 0.28


One thing to note is that for both the third and fourth quantiles, the overall number of proteins for the eigenvector one seems to be less compared to betweenness and degree. The proportion of essential proteins is indeed higher for the fourth quantile compared to the third, but by not so much. Let us also have a look at the second quantile later.

In [77]:
make_inviable_csv(inviable_dfs_4q)

### Second Quartile

In [25]:
file_names = ['betweenness_centrality_quantile_0.25_to_0.5',
              'degree_centrality_quantile_0.25_to_0.5',
              'eigenvector_centrality_quantile_0.25_to_0.5']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_2q = inviable_wo_4932(file_paths[0])
deg_2q = inviable_wo_4932(file_paths[1])
egv_2q = inviable_wo_4932(file_paths[2])
inviable_dfs_2q = [btw_2q,deg_2q,egv_2q]
centrality_measures = ['betweenness','degree','eigenvector']

In [26]:
egv_2q

Unnamed: 0,cluster,protein,degree,eigenvector_centrality,betweenness_centrality,degree_centrality,inviable
0,0,Q0182,5,0.391,0.000,0.714,False
1,0,Q0032,5,0.391,0.000,0.714,False
2,1,YJR122W,10,0.283,0.002,0.909,False
3,1,YLL027W,10,0.283,0.002,0.909,False
4,1,YOR196C,10,0.283,0.002,0.909,False
...,...,...,...,...,...,...,...
2130,686,YNL034W,2,0.408,0.056,0.500,False
2131,686,YFL051C,2,0.408,0.056,0.500,False
2132,686,YNL018C,2,0.408,0.056,0.500,False
2133,687,YIL088C,1,0.707,0.000,1.000,False


In [27]:
for i, df in enumerate(inviable_dfs_2q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the second quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 484 essential proteins found in the second quantile for betweenness out of 2528 proteins. Proportion: 0.19
There are 471 essential proteins found in the second quantile for degree out of 2327 proteins. Proportion: 0.2
There are 428 essential proteins found in the second quantile for eigenvector out of 2135 proteins. Proportion: 0.2


In [78]:
make_inviable_csv(inviable_dfs_2q)

It doesn't seem like the second quartile has that much less essential proteins compared to the third. The fourth quartile would probably be still similar, but let us have a look anyways. Also, the number of total proteins for the second quartile for eigenvector centrality is still a lot less for some reasons. It probably is all in the fourth quantile.

### First Quantile

In [29]:
file_names = ['betweenness_centrality_quantile_0_to_0.25',
              'degree_centrality_quantile_0_to_0.25',
              'eigenvector_centrality_quantile_0_to_0.25']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_1q = inviable_wo_4932(file_paths[0])
deg_1q = inviable_wo_4932(file_paths[1])
egv_1q = inviable_wo_4932(file_paths[2])
inviable_dfs_1q = [btw_1q,deg_1q,egv_1q]
centrality_measures = ['betweenness','degree','eigenvector']

In [30]:
egv_2q

Unnamed: 0,cluster,protein,degree,eigenvector_centrality,betweenness_centrality,degree_centrality,inviable
0,0,Q0182,5,0.391,0.000,0.714,False
1,0,Q0032,5,0.391,0.000,0.714,False
2,1,YJR122W,10,0.283,0.002,0.909,False
3,1,YLL027W,10,0.283,0.002,0.909,False
4,1,YOR196C,10,0.283,0.002,0.909,False
...,...,...,...,...,...,...,...
2130,686,YNL034W,2,0.408,0.056,0.500,False
2131,686,YFL051C,2,0.408,0.056,0.500,False
2132,686,YNL018C,2,0.408,0.056,0.500,False
2133,687,YIL088C,1,0.707,0.000,1.000,False


In [31]:
for i, df in enumerate(inviable_dfs_1q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the first quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 406 essential proteins found in the first quantile for betweenness out of 2357 proteins. Proportion: 0.17
There are 379 essential proteins found in the first quantile for degree out of 2154 proteins. Proportion: 0.18
There are 365 essential proteins found in the first quantile for eigenvector out of 2066 proteins. Proportion: 0.18


In [79]:
make_inviable_csv(inviable_dfs_1q)

Compared to the second quartile there is a slight difference in number of essential proteins detected. Overall there is a slight trend in that the larger the quartile, the more essential proteins were found. For some reasons the eigenvector centrality still only has around 1800 proteins total.