# Part 1. Protein-Protein Interactions in Arabidopsis Data
The purpose of this notebook is to answer the question of how networks genereated using phenotypic-text similarity based approaches through either embedding, vocabulary presence, or ontology annotation compare to or relate to networks that specify known protein-protein interactions. The hypothesis that these networks are potentially related is based on the idea that if two proteins interact, they are likely to be acting in a common pathway with a common biological function. If the phenotypic outcome of this pathway is observable and documented, then similarites between text describing the mutant phenotype for these genes may coincide with direct protein-protein interactions.

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import itertools
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.datasets.string import get_stringdb_information
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import write_annotations_to_tsv_file, read_annotations_from_tsv_file
from oats.graphs.pairwise import pairwise_edgelist_doc2vec, pairwise_edgelist_counting, pairwise_edgelist_annotations
from oats.graphs.pairwise import merge_edgelists, subset_edgelist_with_ids
from oats.graphs.pairwise import remove_self_loops

mpl.rcParams["figure.dpi"] = 400
warnings.simplefilter('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Nested dictionary to summarize output with shape dict[method][(tag,metric)] --> value
TAG = "protein-protein"
OUTPUT = defaultdict(dict)

## 1. Does the graph recapitulate known protein-protein interactions?
The different sections in this notebook correspond to different ways of determining if the graphs based on similarity between text descriptions, encodings of text descriptions, or annotations derived from text descriptions at all correspond to known protein-protein interactions in this dataset. The knowledge source about the protein-protein interactions for genes in this dataset is the STRING database (https://string-db.org/). The available entries in the whole dataset are subset to include only the genes that correspond to proteins that are atleast mentioned in the STRING database. This ways if a protein-protein interaction is not specified between two of the remaining genes, it is not because no interactions at all are documented either of those genes. The following cells focus on setting up a dataframe which specifies edge lists specific to each similarity method, and also a protein-protein interaction score for the genes which correspond to those two given nodes in the graphs.

In [3]:
# Reading in the entire dataset, subsetting for Arabidopsis and all annotation types.
dataset = load_from_pickle("../data/pickles/full_dataset.pickle")
dataset.describe()
dataset.filter_by_species("ath")
dataset.collapse_by_all_gene_names()
dataset.filter_has_description()
dataset.filter_has_annotation()
dataset.describe()

Number of rows in the dataframe: 47151
Number of unique IDs:            47151
Number of unique descriptions:   16504
Number of unique gene name sets: 47151
Number of species represented:   6
Number of rows in the dataframe: 5972
Number of unique IDs:            5972
Number of unique descriptions:   3635
Number of unique gene name sets: 5972
Number of species represented:   1


In [4]:
# Reduce size of the dataset by removing genes not mentioned in the STRING.
string_database_file = "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt"
string_df, string_id_list = get_stringdb_information(string_database_file, dataset.get_name_to_id_dictionary())
dataset.filter_with_ids(string_id_list)
dataset.filter_random_k(1000)
dataset.describe()

Number of rows in the dataframe: 1000
Number of unique IDs:            1000
Number of unique descriptions:   880
Number of unique gene name sets: 1000
Number of species represented:   1


In [6]:
# Objects and dictionaries needed to build the list of edges for the full graph.
doc2vec_model_filename = "../gensim/enwiki_dbow/doc2vec.bin"
doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_filename)
ontology_filename = "../ontologies/mo.obo"
ontology = Ontology(ontology_filename)
descriptions = dataset.get_description_dictionary()
annotations = dataset.get_annotations_dictionary()
vocabulary = ontology.get_all_tokens_as_ordered_vocabulary()

# Generating the pairwise edgelist for some vanilla methods.
name_to_df_mapping = {}
name_to_df_mapping["doc2vec"] = pairwise_edgelist_doc2vec(doc2vec_model, descriptions, metric="cosine")
name_to_df_mapping["bagofwords"] = pairwise_edgelist_counting(descriptions, binary=False, metric="cosine") 
#name_to_df_mapping["setofwords"] = pairwise_edgelist_counting(descriptions, binary=True, metric="cosine")
#name_to_df_mapping["ontology"] = pairwise_edgelist_annotations(annotations, ontology, binary=True, metric="cosine")
print(len(name_to_df_mapping))

# Generating the pairwise edgelists for some additional methods.
#name_to_df_mapping["bag_w12gram"] = pairwise_edgelist_counting(descriptions, metric="cosine", binary=False, analyzer="word", ngram_range=(1,2))
#name_to_df_mapping["bag_c36gram"] = pairwise_edgelist_counting(descriptions, metric="cosine", binary=False, analyzer="char", ngram_range=(3,6))
#name_to_df_mapping["bag_reduced"] = pairwise_edgelist_counting(descriptions, metric="cosine", binary=False, vocabulary=vocabulary)
#name_to_df_mapping["set_reduced"] = pairwise_edgelist_counting(descriptions, metric="cosine", binary=True, vocabulary=vocabulary)
print(len(name_to_df_mapping))

2
2


In [7]:
# Merging all of the edgelist dataframes together.
df = merge_edgelists(name_to_df_mapping, default_value=0.000)
df = remove_self_loops(df)
print(df.head(10))
print(df.shape[0])

    from     to   doc2vec  bagofwords
1   1537  26576  0.608391    0.984615
2   1537  16679  0.529723    1.000000
3   1537   1070  0.493390    0.915366
4   1537    618  0.543314    1.000000
5   1537  21936  0.508886    0.926200
6   1537    519  0.518746    0.972697
7   1537   1237  0.630269    0.881972
8   1537   6585  0.445488    0.865385
9   1537   9007  0.639517    1.000000
10  1537   1596  0.535616    1.000000
499500


In [8]:
# Merging information from the protein-protein interaction database with this dataset.
df = df.merge(right=string_df, how="left", on=["from","to"])
df.fillna(value=0,inplace=True)
print(df[["from","to","doc2vec","combined_score"]].head(10))
print(df.shape)

   from     to   doc2vec  combined_score
0  1537  26576  0.608391             0.0
1  1537  16679  0.529723             0.0
2  1537   1070  0.493390             0.0
3  1537    618  0.543314             0.0
4  1537  21936  0.508886             0.0
5  1537    519  0.518746             0.0
6  1537   1237  0.630269             0.0
7  1537   6585  0.445488             0.0
8  1537   9007  0.639517             0.0
9  1537   1596  0.535616             0.0
(500968, 5)


### 1.1 Do the edges joining interacting proteins come from a different distribution?
This section uses a statistical test (Kolmogorov-Smirnov) to see if the distributions of edge weights which correspond to edges representing a known protein interaction come from a different distribution than the edge weights for edges that do not. This test was chosen because the sizes of the two samples can be different and there is no assumption of normality for the underlying distributions. 

In [None]:
# Use KS test to see if protein-protein interaction edges come from a unique distribution.
METHODS = name_to_df_mapping.keys()
ppi_pos_dict = {name:(df[df["combined_score"] > 0.00][name].values) for name in METHODS}
ppi_neg_dict = {name:(df[df["combined_score"] == 0.00][name].values) for name in METHODS}
results = {}
for name in METHODS:
    stat,p = ks_2samp(ppi_pos_dict[name],ppi_neg_dict[name])
    pos_mean = np.average(ppi_pos_dict[name])
    neg_mean = np.average(ppi_neg_dict[name])
    pos_n = len(ppi_pos_dict[name])
    neg_n = len(ppi_neg_dict[name])
    results[name] = {"mean_1":"{:.4f}".format(pos_mean),
                       "mean_0":"{:.4f}".format(neg_mean),
                       "n_1":"{:.0f}".format(pos_n),
                       "n_0":"{:.0f}".format(neg_n),
                       "ks":"{:.0f}".format(stat),
                       "pval":"{:.0f}".format(p)}
                                                
    OUTPUT[name].update({(TAG,"mean_1"):pos_mean, (TAG,"mean_0"):neg_mean, (TAG,"n_1"):stat, (TAG,"n_0"):stat})
    OUTPUT[name].update({(TAG,"ks"):stat, (TAG,"pval"):stat, (TAG,"n_1"):stat, (TAG,"n_0"):stat})
    
print(pd.DataFrame(results).transpose())

### 1.2 What do those distributions of edge weights in the graph look like?
Visualization of the densities of the distributions that are tested in the previous cell. This is a check to see if the differences between the distribution of weights based on text-similarity corresponding to known protein-protein interactions are distinct enough those that do not to be practically useful in predicting interactions based on text-similarity.

In [None]:
num_plots, plots_per_row, row_width, row_height = (len(METHODS), 4, 14, 3)
fig,axs = plt.subplots(math.ceil(num_plots/plots_per_row), plots_per_row, squeeze=False)
for name,ax in zip(METHODS,axs.flatten()):
    ax.set_title(name)
    ax.set_xlabel("value")
    ax.set_ylabel("density")
    sns.kdeplot(ppi_pos_dict[name], color="black", shade=False, alpha=1.0, ax=ax)
    sns.kdeplot(ppi_neg_dict[name], color="black", shade=True, alpha=0.1, ax=ax) 
    
fig.set_size_inches(row_width, row_height*math.ceil(num_plots/plots_per_row))
fig.tight_layout()
fig.show()

### 1.3 Can we practically use the graph to predict known protein-protein interactions?
This is a different question than above, because the false positive rate is a limiting factor in how practically useful it would be to generate predictions about real protein interactions. In the case of the statistical test above, the sample distribution could be significantly different than the whole distribution of edge values even if there are many high valued edges which look like they could come from the sample distribution. In other words, the sample distribution could be distinct because of a lack of low edge weight values, not there could still be many high edge weight values that are not positive edges in this case, which represent false positives).

In [None]:
# Generate the targe class values, 1 indicating interaction and 0 indicating no interaction.
y_true_dict = {name:(np.where(df["combined_score"] > 0.000, 1, 0)) for name in METHODS}
y_prob_dict = {name:(1 - df[name].values) for name in METHODS}
results = {}
num_plots, plots_per_row, row_width, row_height = (len(METHODS), 4, 14, 3)
fig,axs = plt.subplots(math.ceil(num_plots/plots_per_row), plots_per_row, squeeze=False)
for name,ax in zip(METHODS, axs.flatten()):
    
    # Obtaining the values and metrics.
    y_true, y_prob = y_true_dict[name], y_prob_dict[name]
    n_pos, n_neg = Counter(y_true)[1], Counter(y_true)[0]
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    baseline = Counter(y_true)[1]/len(y_true) 
    area = auc(recall, precision)
    auc_to_baseline_auc_ratio = area/baseline
    results[name] = {"auc":"{:.4f}".format(area),
                       "baseline":"{:.4f}".format(baseline),
                       "n_int":"{:.0f}".format(n_pos),
                       "n_not":"{:.0f}".format(n_neg)}
    OUTPUT[name].update({(TAG,"auc"):area, (TAG,"auc_bl"):auc_to_baseline_auc_ratio})
    
    # Producing the precision recall curve.
    step_kwargs = ({'step': 'post'} if 'step' in signature(plt.fill_between).parameters else {})
    ax.step(recall, precision, color='black', alpha=0.2, where='post')
    ax.fill_between(recall, precision, alpha=0.7, color='black', **step_kwargs)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.0])
    ax.set_title("{0} PR Curve (Baseline={1:0.3f})".format(name, baseline))
    
# Report the results and show the precision recall curves.
print(pd.DataFrame(results).transpose())
fig.set_size_inches(row_width, row_height*math.ceil(num_plots/plots_per_row))
fig.tight_layout()
fig.show()

### 1.4 Can the graph be queried with one gene to successfully return other genes that it interacts with?
This question is similar (potentially overlapping) with a question asked above, which is to ask if genes in the network are connected by highly weighted edges to genes that gene interacts with in the protein-protein interaction data. If this is true, than we can query (return the nodes in order of greatest similarity to the query node) the network with one gene and get back genes that are likely to interact with it. The difference in how this is tested is that only the genes with atleast one known interaction partner are used as the queried proteins. This is mainly just a difference in the visualization? What's actually shown is where the interacting proteins are ranking against all the proteins in the dataset (which is just those proteins atleast mentioned in STRING for this species). When binning the ranks, the bin with the greatest count is the top ranks (1-10), but the ranks that fall in this bin are still a tiny fraction of the total number of returned ranks, so the answer to this question is effectively no.

In [15]:
# Testing to see whether the number of binding partners returned in k closest nodes is greater than expected.
dft = df[["from","to","doc2vec","bagofwords","combined_score"]]


# Unfortunately fastest way is to generate redundant directed version of the dataframe.
# This is so that the "from" column can be used to quickly get all edges to nodes joining it
# and won't miss the ones that were only specified from j,i instead of i,j.
flipped_nodes = dft[["to","from","doc2vec", "bagofwords", "combined_score"]]
flipped_nodes.columns = ["from","to", "doc2vec", "bagofwords", "combined_score"]
dft = pd.concat([dft, flipped_nodes])
dft.drop_duplicates(keep="first", inplace=True)

print(dft)


# What's the number of partners ranked k or higher for each gene?
k = 10
dft[["doc2vec","bagofwords"]] = dft.groupby("from")["doc2vec","bagofwords"].rank()
#print(dft)
meanranks_pos = dft[dft["combined_score"]>0].groupby("from")["doc2vec","bagofwords"].count()
meanranks_all = dft.groupby("from")["doc2vec","bagofwords"].count()



print(meanranks_pos.head(15))
print(meanranks_all.head(15))






#dft["newrank"] = dft.groupby(by=["from"])["bagofwords"].rank()
#a = dft.groupby(by=["from"])["newrank"].mean()
#b = dft.groupby(by=["from"])["newrank"]

        from     to   doc2vec  bagofwords  combined_score
0       1537  26576  0.608391    0.984615             0.0
1       1537  16679  0.529723    1.000000             0.0
2       1537   1070  0.493390    0.915366             0.0
3       1537    618  0.543314    1.000000             0.0
4       1537  21936  0.508886    0.926200             0.0
...      ...    ...       ...         ...             ...
500963  2007   2203  0.438923    0.982771             0.0
500964    25   2203  0.447021    0.960677             0.0
500965  2007    315  0.549328    0.678902             0.0
500966    25    315  0.539851    1.000000             0.0
500967    25   2007  0.492661    1.000000             0.0

[1001936 rows x 5 columns]
      doc2vec  bagofwords
from                     
16         76          76
22         54          54
24         53          53
25         31          31
30        110         110
32         78          78
35        106         106
36         21          21
38         46   

In [None]:
# Get the list of gene IDs involved in interation with atleast one other protein in this dataset.
genes_with_partners = pd.unique(string_df[["from","to"]].dropna().values.ravel('K'))
partner_ranks_dict = {}

# For each method, obtain a list of the rankings in terms of similarity as measured by that method, using each gene
# iteratively as the query gene. The method of accessing the ranks is to first subset the edgelist dataframe to 
# contain only the rows where an edge involving the node of the query gene is specified, then the edge list is 
# sorted by the specific similarity score for that method, so that the indices of edges where the protein-protein
# interaction score is non-zero will correspond directly to ranks.
for name in METHODS:
    partner_ranks = []
    for gene_id in genes_with_partners:
        merged_df = pd.concat([df[df["from"]==gene_id],df[df["to"]==gene_id]],ignore_index=True)
        merged_df = merged_df.sort_values(by=[name]).reset_index(drop=True)
        indices = merged_df.query("combined_score > 0.00").index.tolist()
        partner_ranks.extend(indices)
    partner_ranks = [rank+1 for rank in partner_ranks]
    partner_ranks_dict[name] = partner_ranks
    print("finished with {}".format(name))

In [None]:
# Set these values to be able to make the large (rightmost) bin an average of all encompassed smaller bins.
# This way the size of that column can be reasonably compared to the columns to its left.
max_rank = len(pd.unique(df[["from","to"]].values.ravel('K')))
bins = [1,10,20,30,40,50,max_rank]
small_bin_size = 10
large_bin_size = (max_rank-50)
ratio = large_bin_size/small_bin_size

# Generate the barplots for each method.
num_plots, plots_per_row, row_width, row_height = (len(METHODS), 4, 14, 3)
fig,axs = plt.subplots(math.ceil(num_plots/plots_per_row), plots_per_row, squeeze=False)
for name,ax in zip(METHODS,axs.flatten()):
    partner_ranks = partner_ranks_dict[name]
    ax.set_title(name)
    ax.set_xlabel("rank bins")
    ax.set_ylabel("rank quantity")
    counts,bin_edges = np.histogram(partner_ranks, bins)
    bin_labels = ["{}-{}".format(bin_edges[i],bin_edges[i+1]) for i in range(len(bin_edges)-1)]
    counts[-1] = counts[-1]/ratio    # Adjust the large bin column to reflect average of encompassed smaller bins.
    bin_labels[-1] = "higher\nmean"  # Change the label of that bin to reflect this change.
    sns.barplot(x=bin_labels,y=counts,linewidth=1.5,facecolor="white",alpha=0.8,errcolor=".2",edgecolor="black",ax=ax)

fig.set_size_inches(row_width, row_height*math.ceil(num_plots/plots_per_row))
fig.tight_layout()
fig.show()

### 1.5 Summarizing the results for this notebook

In [None]:
results = pd.DataFrame(OUTPUT).transpose()
print(results)