In [27]:
import networkx as nx

from networkx.algorithms import community
from community import community_louvain

import markov_clustering as mc

import numpy as np
import scipy as sp
import pandas as pd



In [57]:
# Load the full yeast dataset as pandas.
# The file is whitespace-delimited. The separator is written as a raw string
# because '\s' is not a valid Python string escape and a non-raw literal
# triggers an invalid-escape warning on recent Python versions.
df = pd.read_csv('4932_protein_links_full_v12_0.txt', sep=r'\s+')

# Preview the two protein columns next to a selection of the score columns.
df[
    [
        "protein1",
        "protein2",
        "experiments",
        "experiments_transferred",
        "database",
        "database_transferred",
        "textmining",
        "textmining_transferred",
    ]
].head(5)

Unnamed: 0,protein1,protein2,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred
0,4932.Q0010,4932.YMR207C,0,0,0,245,0,99
1,4932.Q0010,4932.YCR051W,0,180,0,0,0,71
2,4932.Q0010,4932.YPR002W,0,116,0,0,0,117
3,4932.Q0010,4932.YGR117C,0,151,0,0,0,48
4,4932.Q0010,4932.YML056C,0,134,0,0,0,61


In [29]:
# Get the edges whose score in every requested category is at or above its threshold
def full_yeast_cleaner(filename: str, attributes: list[str], score: list[int]):
    """Read a whitespace-delimited table and keep the rows passing every threshold.

    Parameters
    ----------
    filename : str
        Path of the whitespace-separated file, read with ``pd.read_csv``.
    attributes : list[str]
        Names of the columns to filter on.
    score : list[int]
        Minimum accepted value per column; ``score[i]`` applies to
        ``attributes[i]``.

    Returns
    -------
    pandas.DataFrame
        The rows for which every listed column is ``>=`` its threshold, with
        their original index labels. When ``attributes`` is empty no filter
        is applied and every row is returned.

    Raises
    ------
    ValueError
        If ``attributes`` and ``score`` do not have the same length.
    """
    # Fail fast on mismatched inputs instead of silently ignoring extra
    # thresholds or hitting an IndexError halfway through the filtering.
    if len(attributes) != len(score):
        raise ValueError(
            f"attributes and score must have the same length "
            f"(got {len(attributes)} and {len(score)})"
        )

    # Raw string: '\s' is not a valid Python escape sequence.
    df = pd.read_csv(filename, sep=r'\s+')

    # Start from "keep everything" so an empty attribute list is well defined,
    # then AND in one condition per (column, threshold) pair.
    keep = pd.Series(True, index=df.index, dtype=bool)
    for column, threshold in zip(attributes, score):
        keep &= df[column] >= threshold

    return df[keep]

In [30]:
# Keep only the edges whose 'experiments' score is at least 200.
# Each entry of `score` is the threshold for the same-position entry of `attributes`.
attributes, score = ['experiments'], [200]

df = full_yeast_cleaner(
    '4932_protein_links_full_v12_0.txt',
    attributes,
    score,
)

In [31]:
# Get the dataframe of the unweighted edges: just the two endpoint columns,
# selected by name so the result does not depend on the column order of `df`.
network = df[['protein1', 'protein2']].copy()

# Remove the "4932." prefix. The pattern is anchored at the start of the
# string, so an identifier that does not carry the prefix is left unchanged
# instead of losing its first five characters to a blind slice.
network['protein1'] = network['protein1'].str.replace(r'^4932\.', '', regex=True)
network['protein2'] = network['protein2'].str.replace(r'^4932\.', '', regex=True)

network.head(2)

Unnamed: 0,protein1,protein2
1114,Q0045,YNR018W
1128,Q0045,YEL047C


In [33]:
# Store the edges in a plain-text file: one space-separated protein pair per
# line, with neither a header row nor the dataframe index written out.
yeast_file = "yeast_network.txt"

network.to_csv(
    yeast_file,
    sep=' ',
    mode='w',
    header=False,
    index=False,
)

In [34]:
# Read the saved edge list back as a graph with string node labels,
# skipping any line that starts with "#".
G0 = nx.read_edgelist(
    yeast_file,
    comments="#",
    nodetype=str,
)

# Check that the graph forms a single connected component.
print(nx.is_connected(G0))


True


In [58]:
# Number of edges in the graph
G0.number_of_edges()

162725

In [59]:
# Number of nodes in the graph
G0.number_of_nodes()

5853