The first section of this code was used to compute the various centrality scores for our Markov clusters. The second section finds the proportion of essential proteins in each quartile. This part was not mentioned in the presentation.

In [4]:
# Autoreload modules without having to restart the notebook kernel.
# hi bilbo
%load_ext autoreload
%autoreload 2


import sys
import os
import matplotlib.pyplot as plt

# Introduce the project directory to your system's path to make data/lib folders visible.
sys.path.insert(1, "..")

# Plotting code stolen from Georg's notebook.
%matplotlib inline
font = {'family': 'DejaVu Sans',
        'weight': 'bold',
        'size': 32}
plt.rc('font', **font)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Personal libraries
import lib.plot
import lib.graph
import lib.files
import pandas as pd
import networkx as nx
import numpy as np

In [7]:
def centrality_df(centrality_function, centrality_name, cluster_filepath, network_filepath):
    # reading the clusters and network into nx
    clusters = lib.cluster.read_csv(cluster_filepath)
    network = lib.graph.read_weighted_edgelist(network_filepath)
    clusters = [network.subgraph(cluster) for cluster in clusters]
    
    # finding the centrality 
    protein_centrality = []
    for cluster in clusters:
        cent = centrality_function(cluster)
        list_cent = list(cent.items())
        protein_centrality.extend(list_cent)
        
    df = pd.DataFrame(protein_centrality)
    df.rename(columns = {0: "protein", 1: centrality_name} , inplace=True)
    nodes = pd.read_csv(cluster_filepath, index_col= 0)
    
    # append the centrality measures to the original markov clusters in the dataframe
    centrality_df = nodes.merge(df, on = 'protein')
    
    return centrality_df

In [8]:
# obtaining the paths for our network
network_name = 'icp55-cc-900-inv'
network_filepath = lib.files.make_filepath_to_networks(f'{network_name}.txt')
cluster_filepath = lib.files.make_filepath_to_clusters(f'mcl.{network_name}.nodes.csv')

In [9]:
# plugging into the function to calculate the centralities and combining them into one df
centrality_measures = ((nx.betweenness_centrality, "Betweenness Centrality"),
                (nx.degree_centrality, "Degree Centrality"),
                (nx.eigenvector_centrality, "Eigenvector Centrality"))

cent_df = centrality_df(nx.subgraph_centrality, "Subgraph Centrality", cluster_filepath, network_filepath)


for i,j in centrality_measures:
    cent_df = cent_df.merge(centrality_df(i, j, cluster_filepath, network_filepath), on =('protein','cluster'))
    

In [144]:
# outputting as a csv file into the mcl folder
cent_df.to_csv(lib.files.make_filepath_to_mcl_clusters("mcl.icp55-cc-900-inv-centralities.csv"))

## Finding Proteins That Are Essential From The Centrality Measures and Quartiles

Defining the function `inviable()` which takes in a single argument for the path of a file for which we want to check which proteins are inviable. It adds a new column named `inviable` in the dataframe which takes in boolean values.

In [82]:
def inviable(filepath):
    inviable_path = lib.files.make_filepath_to_data("inviable_proteins.csv") # the path of the csv with data for which proteins are inviable
    inviable_proteins = list(pd.read_csv(inviable_path,header=None)[1]) # reading the inviable proteins csv in as a df
    df = pd.read_csv(filepath, index_col = 0) # reads in the file for which we want to check which proteins are inviable
    df['protein'] = df['protein'].apply(lambda x: x[5:]) #removes the 4932. string at the start of each protein code
    df['inviable'] = True
    proteins = df['protein']
    for i in range(len(proteins)): # it checks to see whether or not a protein is inviable and adds a new column called inviable. If the protein is inviable, it will be True, otherwise it will be False
        protein = df.iloc[i,1]
        if protein not in inviable_proteins:
            df.iloc[i,-1] = False
    return df

### Third Quartile

In [83]:
file_names = ['betweenness_centrality_quantile_0.5_to_0.75',
              'degree_centrality_quantile_0.5_to_0.75',
              'eigenvector_centrality_quantile_0.5_to_0.75']
#establishes the path to the MCL folder for each of the above file names
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_3q = inviable(file_paths[0])
deg_3q = inviable(file_paths[1])
egv_3q = inviable(file_paths[2])
inviable_dfs_3q = [btw_3q,deg_3q,egv_3q]
centrality_measures = ['betweenness','degree','eigenvector']

In [84]:
for i, df in enumerate(inviable_dfs_3q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the third quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 495 essential proteins found in the third quantile for betweenness out of 2271 proteins. Proportion: 0.22
There are 503 essential proteins found in the third quantile for degree out of 2116 proteins. Proportion: 0.24
There are 439 essential proteins found in the third quantile for eigenvector out of 1922 proteins. Proportion: 0.23


In [85]:
# this function is for turning these dataframes into csv files in the mcl folder. It takes in a list of dataframes
# as its argument and it drops the inviable column retaining only those inviable proteins.
def make_inviable_csv(list_of_inviable_dfs):
    for i, df in enumerate(list_of_inviable_dfs):
        df2 = df[df['inviable'] == True]
        df2 = df2.drop('inviable',axis=1)
        df2.to_csv(lib.files.make_filepath_to_mcl_clusters(f'inviable_{file_names[i]}'))

In [76]:
make_inviable_csv(inviable_dfs_3q)

### Fourth Quartile

In [53]:
# exact same function as inviable() but with the line commented below being different
def inviable_wo_4932(filepath):
    inviable_path = lib.files.make_filepath_to_data("inviable_proteins.csv")
    inviable_proteins = list(pd.read_csv(inviable_path,header=None)[1])
    df = pd.read_csv(filepath, index_col = 0)
#    df['protein'] = df['protein'].apply(lambda x: x[5:]) #for some reasons the naming of the proteins changed
    df['inviable'] = True
    proteins = df['protein']
    for i in range(len(proteins)):
        protein = df.iloc[i,1]
        if protein not in inviable_proteins:
            df.iloc[i,-1] = False
    return df

In [54]:
file_names = ['betweenness_centrality_quantile_0.75_to_1.0',
              'degree_centrality_quantile_0.75_to_1.0',
              'eigenvector_centrality_quantile_0.75_to_1.0']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_4q = inviable_wo_4932(file_paths[0])
deg_4q = inviable_wo_4932(file_paths[1])
egv_4q = inviable_wo_4932(file_paths[2])
inviable_dfs_4q = [btw_4q,deg_4q,egv_4q]
centrality_measures = ['betweenness','degree','eigenvector']

In [55]:
btw_4q

Unnamed: 0,cluster,protein,degree,betweenness_centrality,eigenvector_centrality,degree_centrality,inviable
0,0,Q0143,6,0.119,0.417,0.857,False
1,0,Q0010,6,0.119,0.417,0.857,False
2,0,Q0297,6,0.286,0.403,0.857,False
3,1,YOR196C,13,0.020,0.237,0.812,False
4,1,YPR067W,16,0.042,0.283,1.000,False
...,...,...,...,...,...,...,...
2057,568,YNL034W,0,0.000,0.577,0.000,False
2058,568,YNL018C,0,0.000,0.577,0.000,False
2059,568,YFL051C,0,0.000,0.577,0.000,False
2060,569,YNL033W,0,0.000,0.707,0.000,False


In [57]:
for i, df in enumerate(inviable_dfs_4q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the fourth quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 523 essential proteins found in the fourth quantile for betweenness out of 2062 proteins. Proportion: 0.25
There are 577 essential proteins found in the fourth quantile for degree out of 2019 proteins. Proportion: 0.29
There are 534 essential proteins found in the fourth quantile for eigenvector out of 1899 proteins. Proportion: 0.28


One thing to note is that for both the third and fourth quantiles, the overall number of proteins for the eigenvector one seems to be less compared to betweenness and degree. The proportion of essential proteins is indeed higher for the fourth quantile compared to the third, but by not so much. Let us also have a look at the second quantile later.

In [77]:
make_inviable_csv(inviable_dfs_4q)

### Second Quartile

In [59]:
file_names = ['betweenness_centrality_quantile_0.25_to_0.5',
              'degree_centrality_quantile_0.25_to_0.5',
              'eigenvector_centrality_quantile_0.25_to_0.5']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_2q = inviable_wo_4932(file_paths[0])
deg_2q = inviable_wo_4932(file_paths[1])
egv_2q = inviable_wo_4932(file_paths[2])
inviable_dfs_2q = [btw_2q,deg_2q,egv_2q]
centrality_measures = ['betweenness','degree','eigenvector']

In [60]:
egv_2q

Unnamed: 0,cluster,protein,degree,eigenvector_centrality,betweenness_centrality,degree_centrality,inviable
0,0,Q0182,5,0.391,0.000,0.714,False
1,0,Q0032,5,0.391,0.000,0.714,False
2,0,Q0017,5,0.391,0.000,0.714,False
3,1,YDL120W,13,0.249,0.005,0.812,True
4,1,YPL135W,13,0.249,0.006,0.812,False
...,...,...,...,...,...,...,...
1890,568,YNL034W,0,0.577,0.000,0.000,False
1891,568,YNL018C,0,0.577,0.000,0.000,False
1892,568,YFL051C,0,0.577,0.000,0.000,False
1893,569,YNL033W,0,0.707,0.000,0.000,False


In [62]:
for i, df in enumerate(inviable_dfs_2q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the second quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 467 essential proteins found in the second quantile for betweenness out of 2530 proteins. Proportion: 0.18
There are 417 essential proteins found in the second quantile for degree out of 2128 proteins. Proportion: 0.2
There are 384 essential proteins found in the second quantile for eigenvector out of 1895 proteins. Proportion: 0.2


In [78]:
make_inviable_csv(inviable_dfs_2q)

It doesn't seem like the second quartile has that much less essential proteins compared to the third. The fourth quartile would probably be still similar, but let us have a look anyways. Also, the number of total proteins for the second quartile for eigenvector centrality is still a lot less for some reasons. It probably is all in the fourth quantile.

### First Quantile

In [65]:
file_names = ['betweenness_centrality_quantile_0.0_to_0.25',
              'degree_centrality_quantile_0.0_to_0.25',
              'eigenvector_centrality_quantile_0.0_to_0.25']
file_paths = list(map(lambda file: lib.files.make_filepath_to_mcl_clusters(file), file_names))
btw_1q = inviable_wo_4932(file_paths[0])
deg_1q = inviable_wo_4932(file_paths[1])
egv_1q = inviable_wo_4932(file_paths[2])
inviable_dfs_1q = [btw_1q,deg_1q,egv_1q]
centrality_measures = ['betweenness','degree','eigenvector']

In [66]:
egv_2q

Unnamed: 0,cluster,protein,degree,eigenvector_centrality,betweenness_centrality,degree_centrality,inviable
0,0,Q0182,5,0.391,0.000,0.714,False
1,0,Q0032,5,0.391,0.000,0.714,False
2,0,Q0017,5,0.391,0.000,0.714,False
3,1,YDL120W,13,0.249,0.005,0.812,True
4,1,YPL135W,13,0.249,0.006,0.812,False
...,...,...,...,...,...,...,...
1890,568,YNL034W,0,0.577,0.000,0.000,False
1891,568,YNL018C,0,0.577,0.000,0.000,False
1892,568,YFL051C,0,0.577,0.000,0.000,False
1893,569,YNL033W,0,0.707,0.000,0.000,False


In [70]:
for i, df in enumerate(inviable_dfs_1q):
   print('There are', df['inviable'].sum(), 'essential proteins found in the first quantile for',
         centrality_measures[i],'out of', len(df), 'proteins.', 'Proportion:', round(df['inviable'].sum()/len(df),2))

There are 369 essential proteins found in the first quantile for betweenness out of 2256 proteins. Proportion: 0.16
There are 315 essential proteins found in the first quantile for degree out of 1937 proteins. Proportion: 0.16
There are 299 essential proteins found in the first quantile for eigenvector out of 1849 proteins. Proportion: 0.16


In [79]:
make_inviable_csv(inviable_dfs_1q)

Compared to the second quartile there is a slight difference in number of essential proteins detected. Overall there is a slight trend in that the larger the quartile, the more essential proteins were found. For some reasons the eigenvector centrality still only has around 1800 proteins total.