# Part 2. Biochemical Pathways in Phenotypic Description Data
The purpose of this notebook is to answer the question of how networks generated using phenotypic-text similarity based approaches, through either embedding, vocabulary presence, or ontology annotation, compare to or relate to networks that specify known protein-protein interactions. The hypothesis that these networks are potentially related is based on the idea that if two proteins interact, they are likely to be acting in a common pathway with a common biological function. If the phenotypic outcome of this pathway is observable and documented, then similarities between text describing the mutant phenotype for these genes may coincide with direct protein-protein interactions.

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import itertools
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.datasets.string import get_stringdb_information
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import write_annotations_to_tsv_file, read_annotations_from_tsv_file
from oats.graphs.pairwise import pairwise_edgelist_doc2vec, pairwise_edgelist_counting, pairwise_edgelist_annotations
from oats.graphs.pairwise import merge_edgelists, subset_edgelist_with_ids
from oats.graphs.pairwise import remove_self_loops

# Notebook-wide configuration: suppress warning output, render figures at high
# resolution, and raise the pandas display limits used when printing dataframes.
warnings.simplefilter('ignore')
mpl.rcParams["figure.dpi"] = 400
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# OUTPUT is a nested summary dictionary with intended shape OUTPUT[method][(tag, metric)] --> value.
# TAG is presumably the `tag` component that marks this notebook's entries in those keys.
OUTPUT = defaultdict(dict)
TAG = "pathways"

### 2.1 Reading in dataset and subsetting based on membership in biochemical pathways
Note that the subsetting is based on whether the gene is mapped to at least one pathway; it is not restricted to genes that map to a pathway that has at least two genes mapped to it. This is the same strategy taken with the STRING data, where we included genes in the subset data if they were mentioned in STRING at all; they didn't have to be shown to be interacting with another protein in the dataset. This allows for NAs when finding the within-group mean similarity, because some groups might only have one gene in them, which is also the case here. That is fine, as long as it is accounted for.


In [None]:
# Reading in the entire dataset, subsetting for Arabidopsis and all annotation types.
# describe() is called once before and once after the filters so the two summaries can be compared.
dataset = load_from_pickle("../data/pickles/full_dataset.pickle")
dataset.describe()
dataset.filter_by_species("ath")  # "ath" is presumably the Arabidopsis species code — confirm
dataset.collapse_by_all_gene_names()
dataset.filter_has_description()
dataset.filter_has_annotation()
dataset.describe()

In [None]:
# Subsetting the dataset to include only those genes that map to at least one group from some
# classification source. A gene that belongs to exactly one pathway must be kept, so the
# membership test is "more than zero groups" (a `> 1` test would require two or more pathways).
groups = load_from_pickle(path="../data/pickles/pmn_pathways.pickle")
id_to_group_ids = groups.get_forward_dict(dataset.get_gene_dictionary())
group_mapped_ids = [gene_id for (gene_id, mapped_groups) in id_to_group_ids.items() if len(mapped_groups) > 0]
dataset.filter_with_ids(group_mapped_ids)
# NOTE(review): this keeps a random sample of 400 genes and no seed is visible here, so the
# subset (and everything computed from it) presumably differs between runs — confirm.
dataset.filter_random_k(400)
dataset.describe()



In [None]:
# Rebuild both directions of the gene <-> pathway mapping now that the dataset has been subset.
id_to_group_ids = groups.get_forward_dict(dataset.get_gene_dictionary())
group_id_to_ids = groups.get_reverse_dict(dataset.get_gene_dictionary())


def _num_member_genes(group_item):
    """Return the number of gene IDs in a (group ID, gene IDs) pair."""
    return len(group_item[1])


# Pathways ordered from most to fewest mapped genes.
sorted_group_tuples = sorted(group_id_to_ids.items(), key=_num_member_genes, reverse=True)

# Print a fixed-width table of the ten largest pathways.
row_template = "{:<20}{:<20}{:<20}"
print(row_template.format("Num Genes Mapped", "Pathway ID", "Pathway Name"))
for pathway_id, member_ids in sorted_group_tuples[:10]:
    print(row_template.format(len(member_ids), pathway_id, groups.get_long_name(pathway_id)))

### 2.2 Generating edgelists specifying the pairwise similarities from this dataset
This includes things like whether or not to do capitalization removal, lemmatization, stemming, etc. on the descriptions present in the dataset. This could also include things like scrambling the contents of each description to establish a baseline performance measure. This could also include things like reducing the vocabulary size through the preprocessing methods given here, but also through additional means such as providing a reduced (more specialized) vocabulary dictionary to the vectorizing functions so that only those words which are most likely to have meaning have positions within those vectors. We should also test other vectorization methods such as term-frequency inverse-document-frequency for weighting. We can also change how the feature selection is done for those vectors by altering whether the *n*-grams are based on words or characters, and what the range of *n* is.

TODO: also add the approach of combining the term annotations and the text when using the bag-of-words approach.

In [None]:
# Make a dictionary of descriptions with same quantity of words but randomly selected from the vocabulary.
# NOTE: every statement below the import is commented out, so this cell currently only imports
# word_tokenize; the GO/PO annotation subsets and the scrambled descriptions are not built.
from nltk.tokenize import word_tokenize
#descriptions = dataset.get_description_dictionary()
#annotations = dataset.get_annotations_dictionary()
#go_annotations = {k:[term for term in v if term[0:2]=="GO"] for k,v in annotations.items()}
#po_annotations = {k:[term for term in v if term[0:2]=="PO"] for k,v in annotations.items()}
#tokens = [w for w in itertools.chain.from_iterable(word_tokenize(desc) for desc in descriptions.values())]
#scrambled_descriptions = {k:" ".join(np.random.choice(tokens,len(word_tokenize(v)))) for k,v in descriptions.items()}

In [None]:
# Resources needed to build the list of edges for the full graph: a pretrained Doc2Vec model,
# the ontology, and the per-gene descriptions and annotations from the dataset.
doc2vec_model_filename = "../gensim/enwiki_dbow/doc2vec.bin"
ontology_filename = "../ontologies/mo.obo"
doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_filename)
ontology = Ontology(ontology_filename)
descriptions = dataset.get_description_dictionary()
annotations = dataset.get_annotations_dictionary()
vocabulary = ontology.get_all_tokens_as_ordered_vocabulary()

# Baseline methods. Insertion order is kept by the dict and determines the order of later plots.
name_to_df_mapping = {
    "doc2vec": pairwise_edgelist_doc2vec(doc2vec_model, descriptions, metric="cosine"),
    "bagofwords": pairwise_edgelist_counting(descriptions, binary=False, metric="cosine"),
    "setofwords": pairwise_edgelist_counting(descriptions, binary=True, metric="cosine"),
    "ontology": pairwise_edgelist_annotations(annotations, ontology, binary=True, metric="cosine"),
}
print("{} methods completed".format(len(name_to_df_mapping)))

# Variations on the counting approach: word 1-2 grams, character 3-6 grams, and count/binary
# vectors restricted to the vocabulary of tokens taken from the ontology.
name_to_df_mapping.update({
    "bag_w12gram": pairwise_edgelist_counting(
        descriptions, metric="cosine", binary=False, analyzer="word", ngram_range=(1,2)),
    "bag_c36gram": pairwise_edgelist_counting(
        descriptions, metric="cosine", binary=False, analyzer="char", ngram_range=(3,6)),
    "bag_reduced": pairwise_edgelist_counting(
        descriptions, metric="cosine", binary=False, vocabulary=vocabulary),
    "set_reduced": pairwise_edgelist_counting(
        descriptions, metric="cosine", binary=True, vocabulary=vocabulary),
})
print("{} methods completed".format(len(name_to_df_mapping)))

In [None]:
# Combine the per-method edgelists into a single dataframe and drop self loops.
# 0.000 is passed as default_value (presumably the fill for edges a method lacks — confirm).
methods = list(name_to_df_mapping)
df = remove_self_loops(merge_edgelists(name_to_df_mapping, default_value=0.000))

# Preview the first rows and report the number of edges (rows).
print(df.head(10))
num_edges = df.shape[0]
print(num_edges)

### 2.3 Merging the dataset with information about biochemical pathway membership

In [None]:
def _shares_a_pathway(edge):
    """Return True when the two genes of an edge have at least one pathway in common."""
    source_groups = set(id_to_group_ids[edge["from"]])
    target_groups = set(id_to_group_ids[edge["to"]])
    return len(source_groups & target_groups) > 0


# Flag each edge with 1 if its two genes share a pathway and 0 otherwise.
# Multiplying the boolean result by 1 turns it into integers.
df["common"] = df[["from","to"]].apply(_shares_a_pathway, axis=1) * 1
print(df.head(5))
print(Counter(df["common"].values))

### 2.4 Do the edges joining genes that share at least one pathway come from a different distribution?

In [None]:
# NOTE(review): IndexedGraph is used below but is not imported anywhere else in this notebook.
# The module path is assumed from the layout of the other oats.graphs imports — confirm it
# against the installed oats package.
from oats.graphs.indexed import IndexedGraph

# Materialize the method names once so the same ordered list is reused everywhere below.
METHODS = list(name_to_df_mapping.keys())

# Approach 1, using the column generated for the dataframe.
# The two boolean masks are the same for every method, so they are built once.
shares_pathway = df["common"] > 0.00
shares_no_pathway = df["common"] == 0.00
ppi_pos_dict = {name: df.loc[shares_pathway, name].values for name in METHODS}
ppi_neg_dict = {name: df.loc[shares_no_pathway, name].values for name in METHODS}


# Approach 2, using the multi-indexed dataframe graph object.
group_id_to_ids = groups.get_reverse_dict(dataset.get_gene_dictionary())
group_ids = list(group_id_to_ids.keys())
graph = IndexedGraph(df)
# Ordered gene pairs inside each pathway; these do not depend on the method, so build them once.
within_pairs_by_group = {
    group: list(itertools.permutations(group_id_to_ids[group], 2)) for group in group_ids
}
within_weights_dict = defaultdict(list)
all_weights_dict = {}
for method in METHODS:
    all_weights_dict[method] = df[method].values
    for group in group_ids:
        within_weights_dict[method].extend(graph.get_values(within_pairs_by_group[group], kind=method))


# Display the plots, using either the distributions found with approach 1 or 2 above.
# As written, the plots use approach 1: the solid line is the density for edges whose genes
# share a pathway, and the shaded curve is the density for edges whose genes do not.
num_plots, plots_per_row, row_width, row_height = (len(METHODS), 4, 14, 3)
num_rows = math.ceil(num_plots/plots_per_row)
fig, axs = plt.subplots(num_rows, plots_per_row, squeeze=False)
for name, ax in zip(METHODS, axs.flatten()):
    ax.set_title(name)
    ax.set_xlabel("value")
    ax.set_ylabel("density")
    sns.kdeplot(ppi_pos_dict[name], color="black", shade=False, alpha=1.0, ax=ax)
    sns.kdeplot(ppi_neg_dict[name], color="black", shade=True, alpha=0.1, ax=ax)
fig.set_size_inches(row_width, row_height*num_rows)
fig.tight_layout()
fig.show()

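### 2.5 Which individual biochemical pathways are the most 'phenotypically visible' in this dataset?
This is a method of sorting the individual pathways by how similar the phenotype descriptions of their member genes are to one another: for each method, pathways are ranked by their mean within-pathway distance, and those ranks are then averaged across methods.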

In [None]:
# Get the mean within-group similarity value for each method for each particular pathway.
# NOTE(review): IndexedGraph is used below but is not imported anywhere else in this notebook.
# The module path is assumed from the layout of the other oats.graphs imports — confirm it
# against the installed oats package.
from oats.graphs.indexed import IndexedGraph

METHODS = list(name_to_df_mapping.keys())
group_id_to_ids = groups.get_reverse_dict(dataset.get_gene_dictionary())
group_ids = list(group_id_to_ids.keys())
graph = IndexedGraph(df)
# Ordered gene pairs inside each pathway; these do not depend on the method, so build them once.
# A pathway with fewer than two genes has no pairs at all.
within_pairs_by_group = {
    group: list(itertools.permutations(group_id_to_ids[group], 2)) for group in group_ids
}
# within_weights_dict[method][group] --> a single mean value (a scalar, not a list).
within_weights_dict = defaultdict(dict)
all_weights_dict = {}
for method in METHODS:
    all_weights_dict[method] = df[method].values
    for group in group_ids:
        # np.mean of an empty sequence is NaN, which is presumably what single-gene pathways
        # produce here — confirm what graph.get_values returns for an empty pair list.
        within_weights_dict[method][group] = np.mean(graph.get_values(within_pairs_by_group[group], kind=method))

In [None]:
def _pathway_size(group_id):
    """Return how many genes in the dataset are mapped to the given pathway."""
    return len(group_id_to_ids[group_id])


# Rank the pathways within each method by mean within-group value (smallest value gets rank 1),
# then average those ranks across the methods and order the pathways by that average.
ranks = pd.DataFrame(within_weights_dict).rank()
ranks["average"] = ranks.mean(axis=1)
ranks = ranks.sort_values(by="average").reset_index()

# The former index (the pathway IDs) is now the "index" column; add readable columns next to it.
ranks["group_id"] = ranks["index"]
ranks["full_name"] = ranks["group_id"].apply(groups.get_long_name)
ranks["n"] = ranks["group_id"].apply(_pathway_size)
ranks = ranks[["group_id", "n", "average", "full_name"]]
ranks.head(10)

### Predicting whether two genes share or do not share a functional classification or pathway

In [None]:
# Generate the target class values, 1 indicating that the two genes of an edge share at least
# one group (common function) and 0 indicating that they share none.
shared_flags = []
for id1, id2 in zip(df["from"].values, df["to"].values):
    shared_groups = set(id_to_group_ids[id1]) & set(id_to_group_ids[id2])
    shared_flags.append(int(len(shared_groups) > 0))
df.loc[:,"class"] = shared_flags
print(df.head(8))

In [None]:
# Every method is scored against the same labels. The method columns are subtracted from 1
# so that a smaller value in the column becomes a larger score for the positive class.
y_true_dict = {name: df["class"] for name in METHODS}
y_prob_dict = {name: (1 - df[name].values) for name in METHODS}
results = {}

# Grid of subplots with one panel per method, four panels per row.
num_plots, plots_per_row, row_width, row_height = (len(METHODS), 4, 14, 3)
num_rows = math.ceil(num_plots/plots_per_row)
fig, axs = plt.subplots(num_rows, plots_per_row, squeeze=False)

# Only pass `step` to fill_between if the installed matplotlib accepts it; this does not
# depend on the method, so it is decided once.
if 'step' in signature(plt.fill_between).parameters:
    step_kwargs = {'step': 'post'}
else:
    step_kwargs = {}

for method, ax in zip(METHODS, axs.flatten()):
    # Obtaining the values and metrics.
    y_true = y_true_dict[method]
    y_prob = y_prob_dict[method]
    class_counts = Counter(y_true)
    n_pos, n_neg = class_counts[1], class_counts[0]
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # Baseline is the fraction of positive labels among all labels.
    baseline = n_pos/len(y_true)
    area = auc(recall, precision)
    auc_to_baseline_auc_ratio = area/baseline
    results[method] = {
        "auc": "{:.4f}".format(area),
        "baseline": "{:.4f}".format(baseline),
        "n_shared": "{:.0f}".format(n_pos),
        "n_not": "{:.0f}".format(n_neg),
    }
    # Producing the precision recall curve.
    ax.step(recall, precision, color='black', alpha=0.2, where='post')
    ax.fill_between(recall, precision, alpha=0.7, color='black', **step_kwargs)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_ylim([0.0, 1.05])
    ax.set_xlim([0.0, 1.0])
    ax.set_title("PR {0} (Baseline={1:0.3f})".format(method, baseline))

# One row per method in the printed summary table.
print(pd.DataFrame(results).transpose())
fig.set_size_inches(row_width, row_height*num_rows)
fig.tight_layout()
fig.show()

### Predicting which functional group or pathway a specific gene belongs to

In [None]:
# Can we learn associations between biochemical pathways and text descriptions?
# The non-ML ways of doing this would be:
#    1. Create a representative datapoint for each pathway by first dropping the specific sample and then mapping.
#       Need to implement a function for dropping k samples from the matrix calculation and then classifying them?
#    2. Just use the mean similarity to the other members of that group or pathway.
#    3 The thing 