In [28]:
# Do all imports

import pandas as pd
import networkx as nx

import numpy as np
import scipy as sp

import collections
import itertools
import math

# some basic settings for plotting figures
import matplotlib.pyplot as plt
%matplotlib inline 
font = {'family' : 'DejaVu Sans',
        'weight' : 'bold',
        'size'   : 32}

plt.rc('font', **font)

# Download raw data

In [2]:
# Download all raw data files
!curl -O https://raw.githubusercontent.com/jqxcz/MATH3888/master/4932.protein.links.detailed.v11.5.txt
!curl -O https://raw.githubusercontent.com/jqxcz/MATH3888/master/4932.protein.info.v11.5.txt
!curl -O https://raw.githubusercontent.com/jqxcz/MATH3888/master/uniprot.tsv
!curl -O https://raw.githubusercontent.com/jqxcz/MATH3888/master/essential.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 90.0M  100 90.0M    0     0  4644k      0  0:00:19  0:00:19 --:--:-- 6648k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1900k  100 1900k    0     0  1121k      0  0:00:01  0:00:01 --:--:-- 1120k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1957k  100 1957k    0     0  1173k      0  0:00:01  0:00:01 --:--:-- 1173k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 95667  100 95667    0     0   275k      0 --:--:-- --:--:-- --:--:--  275k


# Preprocessing uniprot data

In [6]:
df = pd.read_table('uniprot.tsv',delimiter='\t')

## Create columns that match for subcellular location

We search the column `'Subcellular location [CC]'` for a substring match for each of the locations:

 - Mitochondrial protein
 - Cytoplasm protein
 - Membrane protein
 - Nucleus protein
 - Ribosomal protein

In [7]:
attributes = ['mitochon', 'cytoplasm', 'membrane', 'nucleus']

for attr in attributes:
    df[attr] = df['Subcellular location [CC]'].str.contains(attr, case=False)

df['ribosomal'] = df['Protein names'].str.contains('ribosomal', case=False).fillna(False)
df['ribosomal'] |= df['Protein names'].str.contains('ribosome', case=False).fillna(False)


We drop the extra column

In [8]:
df = df.drop(columns='Subcellular location [CC]')

Now, we create a column to be the number of locations each protein is in.
 - Mitochondrial protein
 - Cytoplasm protein
 - Membrane protein
 - Nucleus protein

In [9]:
df['location_match'] = 0
for attr in attributes:
    df['location_match'] += df[attr]
df['location_match'] = df['location_match'].fillna(0)

Save dataframe as a csv file.

In [10]:
df.to_csv('uniprot_processed.csv')

# Main analysis

## Reading in and doing initial processing of data

In [11]:
# Generate common name for yeast proteins
temp_df = pd.read_table("4932.protein.info.v11.5.txt",delimiter='\t')
common_name = dict(temp_df.iloc[:,:2].values)

In [12]:
# Read in preprocessed uniprot data
uniprot_df = pd.read_csv('uniprot_processed.csv')

In [13]:
# Read in essential node data
essential_df = pd.read_csv('essential.csv', header=None)

Track studied proteins and their yeast name.

In [14]:
# Let's keep track of our studied proteins
studied_proteins = {
    'DIC1': '4932.YLR348C',
    'RGT2': '4932.YDL138W',
    'CBF5': '4932.YLR175W',
    'EST2': '4932.YLR318W'
}

In [15]:
# Read in dataframe for detailed protein links
G0_dataframe = pd.read_table("./4932.protein.links.detailed.v11.5.txt", delimiter=' ')

Reweight interactions. We:
- Double the weight of experimental evidence
- Halve the weight of textmining evidence

In [16]:
# Reweight edges of protein links
keep_columns = [
    'neighborhood', 'fusion', 'cooccurence', 
    'coexpression', 'experimental', 'database', 
    'textmining'
]

# Double weight of experimental evidence
G0_dataframe['experimental'] *= 2

# Half weight of textmining
G0_dataframe['textmining'] //=2

# Calculate combined score
G0_dataframe.combined_score = G0_dataframe[keep_columns].max(axis=1)

# Drop edges with combined score of zero
G0_dataframe = G0_dataframe.drop(G0_dataframe[G0_dataframe.combined_score == 0].index)

We remove all edges with combined score <= 600

In [17]:
# Drop edges with combined score of <= threshold score
threshold_score = 600
G0_dataframe = G0_dataframe.drop(
    G0_dataframe[G0_dataframe.combined_score <= threshold_score].index
    )

In [18]:
# Sanity check of data

# G0_dataframe

Now, we create our networkx graph

In [19]:
# Create nx graph

G0 = nx.from_pandas_edgelist(G0_dataframe, 
    source='protein1', 
    target='protein2', 
    edge_attr=True)

Now, we remove proteins that are not in cellular location of interest.

We also remove ribosomal proteins.

In [20]:
# Ignore certain nodes in the protein network, and also combine uniprot data

# remove proteins that are not in a location that we are targeting
# (we target membrane, mitochrondrial, cytoplasm and nucleus proteins)
irrelevant_proteins = []

# remove proteins that are ribosomal
ribosomal_proteins = []

row_index = {}

for node in G0.nodes:
    # Get row index of matching row for a protein
    node_name = node.split('.')[1]
    node_matches = uniprot_df['Gene Names'].str.contains(node_name).fillna(False)
    node_uniprot = uniprot_df.index[node_matches][0]

    if uniprot_df['location_match'][node_uniprot] == 0:
        irrelevant_proteins.append(node)

    elif uniprot_df['ribosomal'][node_uniprot] == True:
        ribosomal_proteins.append(node)

    else:
        row_index[node] = node_uniprot

print(f"{len(irrelevant_proteins)=}, {len(ribosomal_proteins)=}")
print(f"{len(G0.nodes)=}")
G0.remove_nodes_from(irrelevant_proteins)
G0.remove_nodes_from(ribosomal_proteins)
print(f"{len(G0.nodes)=}")

len(irrelevant_proteins)=1334, len(ribosomal_proteins)=312
len(G0.nodes)=5912
len(G0.nodes)=4266


We now remove essential proteins.

In [21]:
# We remove essential proteins, but keep proteins we want to study
essential_proteins_raw = essential_df[1].values
essential_proteins = set(map(lambda x: '4932.' + x, essential_proteins_raw))
essential_proteins -= set(studied_proteins.values())
print(f"{len(G0.nodes)=}")
G0.remove_nodes_from(list(essential_proteins))
print(f"{len(G0.nodes)=}")

len(G0.nodes)=4266
len(G0.nodes)=3271


We also remove proteins that are not in the largest connected component.

In [22]:
# Let's only keep the largest connected component
largest_component = max(list(nx.connected_components(G0)), key=len)
G0 = G0.subgraph(largest_component)
print(f"{len(G0.nodes)=}")

len(G0.nodes)=3172


In [23]:
# Sanity check - 
#   Check proteins we are interested in are still in the network

for key, value in studied_proteins.items():
    if value not in G0.nodes:
        print(f"ERROR: {key} was deleted from the network!")

We add the location of each protein into the node.

In [26]:
# Track location of nodes
node_location = {}

for node in G0.nodes:
    if uniprot_df['mitochon'][row_index[node]] == 1:
        node_location[node] = 'mitochondria'
    elif uniprot_df['cytoplasm'][row_index[node]] == 1:
        node_location[node] = 'cytoplasm'
    elif uniprot_df['membrane'][row_index[node]] == 1:
        node_location[node] = 'membrane'
    elif uniprot_df['nucleus'][row_index[node]] == 1:
        node_location[node] = 'nucleus'
    else:
        node_location[node] = 'other'

nx.set_node_attributes(G0, values=node_location, name='location')

We define the direction of interaction to be those we are interested in.

In [24]:
direction = {
    'mitochondria': {'cytoplasm', 'mitochondria'},
    'cytoplasm': {'nucleus', 'mitochondria', 'cytoplasm'},
    'nucleus': {'nucleus'},
    'membrane': {'mitochondria', 'membrane'}
}

Construct our directed graph.

In [27]:
GX = nx.DiGraph()
for edge in G0.edges:
    if G0.nodes[edge[1]]['location'] in direction[G0.nodes[edge[0]]['location']]:
        GX.add_edge(edge[0], edge[1])
    if G0.nodes[edge[0]]['location'] in direction[G0.nodes[edge[1]]['location']]:
        GX.add_edge(edge[1], edge[0])

Define our code for getting all paths between start and end.

We allow for the abiity to set a mininum threshold for the least frequent protein amongst all paths returned.

In [31]:
def get_paths(start, end, func, min_count=1, **kwargs):
    paths = list(func(GX, 
            studied_proteins[start], 
            studied_proteins[end], **kwargs))
    all_occurrences = collections.Counter(itertools.chain.from_iterable(paths))
    return (p for p in paths if min(set(map(all_occurrences.get, p))) > min_count)

In [32]:
def print_paths(start, end, func, reverse=False, **kwargs):
    out = lambda x: ', '.join(map(common_name.get, x))
    if reverse:
        out = lambda x: ', '.join(reversed(list(map(common_name.get, x))))
    print('\n'.join(sorted(map(out, get_paths(start, end, func, **kwargs)))))

In [36]:
for start in ['DIC1', 'RGT2']:
    for end in ['CBF5', 'EST2']:
        try:
            print(start, 'to', end)
            num = sum(1 for _ in get_paths(start, end, nx.all_simple_paths, min_count=1, cutoff=5))
            print(num)
            print(sum(1 for _ in get_paths(start, end, nx.all_simple_paths, 
                min_count=math.sqrt(num)/2-2, cutoff=5)))
        except Exception as e:
            print(e)


DIC1 to CBF5
79515
51174
DIC1 to EST2
34607
10957
RGT2 to CBF5
142
115
RGT2 to EST2
17
22
