In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import argparse
import shlex
import random
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp, hypergeom, pearsonr, spearmanr
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from statsmodels.sandbox.stats.multicomp import multipletests
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
from gensim.utils import simple_preprocess
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.cluster import AgglomerativeClustering
from nltk.corpus import brown, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# Quietly fetch each NLTK resource that the tokenizers and corpora imported above rely on.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'brown', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource, quiet=True)

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration, remove_duplicates_retain_order
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder
from oats.distances import pairwise as pw
from oats.nlp.vocabulary import get_overrepresented_tokens, get_vocab_from_tokens
from oats.nlp.vocabulary import reduce_vocab_connected_components, reduce_vocab_linares_pontes

from _utils import Method
from _utils import IndexedGraph


# Notebook-wide display settings: figure resolution, warning suppression, and pandas table limits.
warnings.simplefilter('ignore')
mpl.rcParams.update({"figure.dpi": 400})
for pandas_option, pandas_value in (('display.max_rows', 500), ('display.max_columns', 500), ('display.width', 1000)):
    pd.set_option(pandas_option, pandas_value)

Warming up PyWSD (takes ~10 secs)... took 6.746164798736572 secs.


In [2]:
# Create and name an output directory according to when the notebook or script was run.
# The directory name is the run name, a timestamp, and a random four-digit suffix.
name = "topic_modeling"
OUTPUT_DIR = os.path.join("../outputs","{}_{}_{}".format(name,datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'),random.randrange(1000,9999)))
# makedirs also creates the parent "../outputs" folder when it does not exist yet (os.mkdir would raise).
# An already existing run directory still raises FileExistsError, as before.
os.makedirs(OUTPUT_DIR)

In [3]:
# Paths to different datasets containing gene names, text descriptions, and/or ontology term annotations.
# All paths in this cell are relative, so they resolve against the directory the notebook is run from.
plant_dataset_path = "../../plant-data/genes_texts_annots.csv"
clinvar_dataset_path = "../data/clinvar/clinvar_diseases.csv"
snpedia_snippets_dataset_path = "../data/snpedia/snpedia_snippets.csv"
snpedia_contexts_dataset_path = "../data/snpedia/snpedia_contexts.csv"

# Paths to datasets of sentence or description pairs.
# NOTE(review): "datset" is misspelled in the variable name below; it is left unchanged because
# other cells or scripts may reference it by this name.
paired_phenotypes_path = "../data/paired_sentences/plants/scored.csv"
biosses_datset_path = "../data/paired_sentences/biosses/cleaned_by_me.csv"

# Paths to files for data about how genes can be grouped into biochemical pathways, etc.
kegg_pathways_path = "../../plant-data/reshaped_data/kegg_pathways.csv" 
plantcyc_pathways_path = "../../plant-data/reshaped_data/plantcyc_pathways.csv" 
lloyd_meinke_subsets_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets.csv" 
lloyd_meinke_classes_path = "../../plant-data/reshaped_data/lloyd_meinke_classes.csv" 

# Paths to files that contain mappings from the identifiers used by those groups to full name strings.
kegg_pathways_names_path = "../../plant-data/reshaped_data/kegg_pathways_name_map.csv"
plantcyc_pathways_names_path = "../../plant-data/reshaped_data/plantcyc_pathways_name_map.csv"
lloyd_meinke_subsets_names_path = "../../plant-data/reshaped_data/lloyd_meinke_subsets_name_map.csv"
lloyd_meinke_classes_names_path = "../../plant-data/reshaped_data/lloyd_meinke_classes_name_map.csv"

# Paths to other files including the ortholog edgelist from Panther, and cleaned files from the two papers.
# lloyd_function_hierarchy_path is read later to order the Lloyd and Meinke subset columns.
pppn_edgelist_path = "../../plant-data/papers/oellrich_walls_et_al_2015/supplemental_files/13007_2015_53_MOESM9_ESM.txt"
ortholog_file_path = "../../plant-data/databases/panther/PlantGenomeOrthologs_IRB_Modified.txt"
lloyd_function_hierarchy_path = "../../plant-data/papers/lloyd_meinke_2012/versions_cleaned_by_me/192393Table_S1_Final.csv"

In [4]:
# Paths to the plain-text corpus files that are used in this analysis.
background_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/background.txt"
phenotypes_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/phenotypes_all.txt"

In [5]:
# Paths to pretrained or saved models used for embeddings with Word2Vec or Doc2Vec.
doc2vec_plants_path = "../models/plants_dbow/doc2vec.model"
doc2vec_wikipedia_path = "../models/enwiki_dbow/doc2vec.bin"
word2vec_plants_path = "../models/plants_sg/word2vec.model"
word2vec_wikipedia_path = "../models/wiki_sg/word2vec.bin"

# Paths to BioBERT models.
biobert_pmc_path = "../models/biobert_v1.0_pmc/pytorch_model"                                  
biobert_pubmed_path = "../models/biobert_v1.0_pubmed/pytorch_model"                                 
biobert_pubmed_pmc_path = "../models/biobert_v1.0_pubmed_pmc/pytorch_model"      

# Word2Vec models available pretrained from Pyysalo et al.
# http://bio.nlplab.org/#doc-tools
# http://evexdb.org/pmresources/vec-space-models/
word2vec_bio_pmc_path = "../models/bio_nlp_lab/PMC-w2v.bin"
word2vec_bio_pubmed_path = "../models/bio_nlp_lab/PubMed-w2v.bin"
word2vec_bio_pubmed_and_pmc_path = "../models/bio_nlp_lab/PubMed-and-PMC-w2v.bin"
word2vec_bio_wikipedia_pubmed_and_pmc_path = "../models/bio_nlp_lab/wikipedia-pubmed-and-PMC-w2v.bin"

<a id="part_8"></a>
# Part 8. Clustering
The purpose of this section is to look at different ways that the embeddings obtained for the dataset of phenotype descriptions can be used to cluster or organize the genes to which those phenotypes are mapped into subgroups or representations. These approaches include generating topic models from the data, and doing agglomerative clustering to find clusters to which each gene belongs.

<a id="clustering"></a>
### Approach 2: Agglomerative clustering and comparison to predefined groups
This clustering approach uses agglomerative clustering to cluster the genes into a fixed number of clusters based on the distances between their embedding representations using all of the above methods. Clustering into a fixed number of clusters allows for clustering into a similar number of groups as are present in some existing grouping of the data, such as phenotype categories or biochemical pathways, and then determining whether the clusters obtained are at all similar to the groupings that already exist. Agglomerative clustering is used here because it can operate on an arbitrary precomputed distance matrix; in this case the matrix being used is the mean of the distance percentiles from each of the different approaches.

In [None]:
# Infer a Doc2Vec embedding for every phenotype description in the dataset.
# NOTE(review): `dataset` is not created in any cell visible in this notebook; it has to exist before this cell runs.
texts = dataset.get_description_dictionary()
texts = list(texts.values())
# Load the model from the path defined in the model paths cell. The name used here before,
# `doc2vec_wikipedia_filename`, is not defined anywhere in the visible notebook.
dm = gensim.models.Doc2Vec.load(doc2vec_wikipedia_path)
# Each description is lowercased and split on whitespace before inference.
vecs = [dm.infer_vector(t.lower().split()) for t in texts]

In [None]:
# Break every description into sentences and embed each sentence with the same Doc2Vec model,
# using the same lowercase / whitespace tokenization as for the full descriptions.
sents = flatten([sent_tokenize(description) for description in texts])
svecs = list(map(lambda sentence: dm.infer_vector(sentence.lower().split()), sents))
# Report the number of sentence vectors, then the number of description vectors.
print(len(svecs), len(vecs), sep="\n")

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Fit k-means with eight clusters and a fixed random state on the description vectors,
# then show the cluster label assigned to each description.
kmeans = KMeans(random_state=0, n_clusters=8)
kmeans.fit(vecs)
kmeans.labels_

In [None]:
# Fit a fresh k-means model with the same settings and display the label predicted for each description vector.
KMeans(n_clusters=8, random_state=0).fit_predict(vecs)

In [None]:
# Fit a fresh k-means model with the same settings and display the transformed description vectors
# (for scikit-learn's KMeans this is the cluster-distance space, one column per cluster).
KMeans(n_clusters=8, random_state=0).fit_transform(vecs)

In [None]:
# Score the fitted model on the vectors it was trained on.
# NOTE(review): `kmeans` is rebound to a sentence-level model in a later cell, so this only refers to the
# description-level model when cells are run top to bottom.
kmeans.score(vecs)

In [None]:
# Fit a two-component PCA on the description vectors and project those same vectors onto it.
pca = PCA(n_components=2)
datapoint = pca.fit(vecs).transform(vecs)
# Number of projected points (one per description vector).
len(datapoint)

In [None]:
# Fraction of the total variance captured by each of the two fitted principal components.
pca.explained_variance_ratio_

In [None]:
# Combine the two PCA coordinates of each description with its k-means cluster label.
a = pd.DataFrame(datapoint, columns=["component_1","component_2"])
a["cluster"] = kmeans.labels_
# Save with the other outputs of this run instead of a user-specific absolute path,
# so the cell also works on machines other than the author's.
a.to_csv(os.path.join(OUTPUT_DIR,"kmeans_pca_descriptions.csv"), index=False)
a.head()

In [None]:
# Repeat the k-means clustering and PCA projection with the sentence vectors instead of the description vectors.
# NOTE(review): this rebinds `kmeans`, `pca`, and `datapoint`, so the description-level objects are no longer
# available after this cell has run.
kmeans = KMeans(n_clusters=8, random_state=0).fit(svecs)
pca = PCA(n_components=2).fit(svecs)
datapoint = pca.transform(svecs)
b = pd.DataFrame(datapoint, columns=["component_1","component_2"])
b["cluster"] = kmeans.labels_
# Written to its own file in this run's output directory. The previous target was a hard-coded absolute
# path named "a.csv", the same file name used for the description-level table.
b.to_csv(os.path.join(OUTPUT_DIR,"kmeans_pca_sentences.csv"), index=False)
b.head()

In [None]:
# Cluster the sentences with GSDMM (Movie Group Process), using the local copy of the gsdmm package.
sys.path.append("../lib/gsdmm-master/gsdmm")
from mgp import MovieGroupProcess
m = MovieGroupProcess(K=8, alpha=0.1, beta=0.1, n_iters=30)
# fit() takes tokenized documents plus the vocabulary size; raw strings would be iterated character by character.
# Sentences are tokenized the same way as for the Doc2Vec inference (lowercase, whitespace split).
# NOTE(review): the gsdmm docstring asks for the unique tokens of each document, so repeated tokens
# within a sentence are dropped here (order kept) — confirm this is the intended input.
tokenized_sents = [list(dict.fromkeys(s.lower().split())) for s in sents]
vocab_size = len({token for tokens in tokenized_sents for token in tokens})
y = m.fit(tokenized_sents, vocab_size)
print(y)

In [None]:
# Distance matrix of mean distance percentiles across all methods, and the lookup that translates
# a row/column position in that matrix into an ID.
mean_pct_array = name_to_array["mean"]
to_id = array_index_to_id

# Complete-linkage agglomerative clustering on the precomputed distances, with a fixed cluster count.
number_of_clusters = 42
ac = AgglomerativeClustering(affinity="precomputed", linkage="complete", n_clusters=number_of_clusters)
clustering = ac.fit(mean_pct_array)

# Record the assignment in both directions: ID -> cluster label and cluster label -> list of IDs.
id_to_cluster = dict()
cluster_to_ids = defaultdict(list)
for idx,c in enumerate(clustering.labels_):
    member_id = to_id[idx]
    id_to_cluster[member_id] = c
    cluster_to_ids[c].append(member_id)

In [None]:
# Create the dataframe containing, for each predefined group, the fraction of its genes assigned to each cluster.
group_to_cluster_vector = {}
for group_id,ids in group_id_to_ids.items():

    # Count how many of this group's genes fall into each cluster.
    mean_cluster_vector = np.zeros(number_of_clusters)
    for i in ids:
        cluster = id_to_cluster[i]
        mean_cluster_vector[cluster] = mean_cluster_vector[cluster]+1
    # Normalize the counts so the vector for this group sums to one.
    mean_cluster_vector = mean_cluster_vector/mean_cluster_vector.sum(axis=0,keepdims=1)
    group_to_cluster_vector[group_id] = mean_cluster_vector

# One column per group and one row per cluster.
ac_df = pd.DataFrame(group_to_cluster_vector)

# Changing the order of the Lloyd, Meinke phenotype subsets to match other figures for consistency.
lmtm_df = pd.read_csv(lloyd_function_hierarchy_path)
columns_in_order = [col for col in lmtm_df["Subset Symbol"].values if col in ac_df.columns]
ac_df = ac_df[columns_in_order]

# Reordering so consistency with the curated subsets can be checked by looking at the diagonal.
# Each cluster row is keyed by the position of the column holding its largest value, then sorted by that position.
ac_df["idxmax"] = ac_df.idxmax(axis = 1)
ac_df["idxmax"] = ac_df["idxmax"].apply(lambda x: ac_df.columns.get_loc(x))
ac_df = ac_df.sort_values(by="idxmax")
ac_df = ac_df.drop(columns=["idxmax"])
# Keep the cluster label as a column and add an explicit column for the row order.
ac_df = ac_df.reset_index(drop=False).rename({"index":"cluster"},axis=1).reset_index(drop=False).rename({"index":"order"},axis=1)
ac_df.to_csv(os.path.join(OUTPUT_DIR,"part_6_agglomerative_clustering.csv"), index=False)
ac_df

In [None]:
# Display the mapping from each cluster label to the list of IDs assigned to that cluster.
cluster_to_ids

### Approach 3: Agglomerative clustering and silhouette scores for each NLP method

In [None]:
# Scan a range of cluster counts and record the silhouette score of the resulting agglomerative
# clustering for every distance matrix. Only runs when the NOTEBOOK flag is set.
if NOTEBOOK:
    from sklearn.metrics.cluster import silhouette_score
    # Note that homogeneity scores don't fit for evaluating how close the clustering is to pathway membership, etc.
    # This is because genes can be assigned to more than one pathway, metric would have to be changed to account for this.
    # So all this section does is determines which values of n_clusters provide good clustering results for each matrix.
    n_clusters_silhouette_scores = defaultdict(dict)
    min_n_clusters = 20
    max_n_clusters = 80
    step_size = 4
    # Dedicated names are used below so that `number_of_clusters` (the integer used by the clustering cells),
    # `name`, `ac`, and `clustering` from earlier cells are not overwritten by this scan.
    # np.arange excludes the upper bound, so max_n_clusters itself is not evaluated.
    candidate_n_clusters = np.arange(min_n_clusters, max_n_clusters, step_size)
    for n in candidate_n_clusters:
        for method_name in names:
            distance_matrix = name_to_array[method_name]
            scan_labels = AgglomerativeClustering(n_clusters=n, linkage="complete", affinity="precomputed").fit(distance_matrix).labels_
            sil_score = silhouette_score(distance_matrix, scan_labels, metric="precomputed")
            n_clusters_silhouette_scores[method_name][n] = sil_score
    # One row per cluster count `n`, one column per method.
    sil_df = pd.DataFrame(n_clusters_silhouette_scores).reset_index(drop=False).rename({"index":"n"},axis="columns")
    sil_df.to_csv(os.path.join(OUTPUT_DIR,"part_6_silhouette_scores_by_n.csv"), index=False)
    # An expression nested inside `if` is not auto-displayed by the notebook, so display it explicitly.
    # `display` is the function IPython provides in notebook kernels.
    display(sil_df.head(10))

<a id="part_9"></a>
# Part 9. Human Phenologs

<a id="phenologs"></a>
### Looking for phenolog relationships between clusters and OMIM disease phenotypes
This section produces a table of values that provides a score for each pairing of a cluster found in this dataset of plant genes with a disease phenotype. Currently the value indicates the fraction of the plant genes in that cluster that have orthologs associated with that disease phenotype. This should be replaced or supplemented with a p-value for evaluating the significance of this value given the distribution of genes and their mappings to all of the disease phenotypes. All the rows from the input dataframe containing the PantherDB and OMIM information where the ID from this dataset is not known or the mapping to a phenotype was unsuccessful are removed at this step; fix this if the metric for evaluating cluster-to-phenotype phenolog mappings needs this information.

In [None]:
# Load the table that links plant genes to human orthologs and those orthologs to OMIM disease phenotypes.
panther_to_omim_filename = "../data/orthology_related_files/ath_to_hsa/pantherdb_omim_df.csv"  
name_to_id = dataset.get_name_to_id_dictionary()
omim_df = pd.read_csv(panther_to_omim_filename)

# Translate each gene identifier into the ID used in this dataset (missing when the gene is unknown), then
# keep only rows with both an ID and a phenotype name and give the remaining columns fixed types.
omim_df["id"] = omim_df["gene_identifier"].map(lambda gene_name: name_to_id.get(gene_name,None))
omim_df = (
    omim_df
    .dropna(subset=["id","phenotype_mim_name"])
    .astype({"phenotype_mim_name": str, "id": "int64", "phenotype_mim_number": "int64"})
)
# The compressed phenotype name is everything before the first comma of the full name.
omim_df["compressed_phenotype_mim_name"] = omim_df["phenotype_mim_name"].map(lambda full_name: full_name.split(",")[0])

# For every ID, collect the compressed phenotype names and the human ortholog gene symbols, one entry per row.
id_to_mim_phenotype_names = defaultdict(list)
id_to_human_gene_symbols = defaultdict(list)
for i,p,s in zip(omim_df["id"].values, omim_df["compressed_phenotype_mim_name"].values, omim_df["human_ortholog_gene_symbol"].values):
    id_to_mim_phenotype_names[i].append(p)
    id_to_human_gene_symbols[i].append(s)
omim_df.head(5)

In [None]:
# How many genes in our dataset map to orthologs that map to the same OMIM phenotype?
# NOTE(review): this counts rows of omim_df per compressed phenotype name, so a gene that appears in several
# rows for the same phenotype would be counted more than once — confirm whether rows are unique per gene.
omim_df.groupby("compressed_phenotype_mim_name").size()

In [None]:
# Score every (cluster, OMIM phenotype) pair as a candidate phenolog using a hypergeometric test.
phenolog_x_dict = defaultdict(dict)
phenolog_p_dict = defaultdict(dict)
candidate_genes_dict = defaultdict(dict)
phenotypes = pd.unique(omim_df["compressed_phenotype_mim_name"].values)
clusters = list(cluster_to_ids.keys())

# The size of the population (genes in the dataset). Identical for every pair, so computed once.
M = len(id_to_cluster.keys())
# For each phenotype, the set of clustered genes that map to orthologs that map to it. Built in one pass
# over the genes instead of being recomputed for every (cluster, phenotype) pair.
phenotype_to_ids = defaultdict(set)
for i in id_to_cluster.keys():
    for mapped_phenotype in id_to_mim_phenotype_names.get(i,[]):
        phenotype_to_ids[mapped_phenotype].add(i)

for cluster,phenotype in itertools.product(clusters,phenotypes):

    # What are the candidate genes predicted if this phenolog pairing is real?
    # These are the human ortholog symbols of the cluster's genes that are not already linked to the phenotype.
    ids = cluster_to_ids[cluster]
    candidate_genes_dict[cluster][phenotype] = list(set(flatten([id_to_human_gene_symbols[i] for i in ids if phenotype not in id_to_mim_phenotype_names.get(i,[])])))

    # What is the p-value for this phenolog pairing?
    ids_with_phenotype = phenotype_to_ids.get(phenotype, set())
    # The number of elements we draw without replacement (genes in the cluster).
    N = len(ids)
    # The number of available successes in the population (genes that map to orthologs that map to this phenotype).
    n = len(ids_with_phenotype)
    # The number of successes drawn (genes in this cluster that map to orthologs that map to this phenotype).
    # This counts genes. De-duplicating the phenotype names first, as was done before, capped the count at one.
    x = sum(1 for i in ids if i in ids_with_phenotype)
    # P(X >= x); the survival function at x-1 equals 1-cdf(x-1) without the loss of precision near zero.
    prob = hypergeom.sf(x-1, M, n, N)
    phenolog_x_dict[cluster][phenotype] = x
    phenolog_p_dict[cluster][phenotype] = prob


# Convert the dictionary to a table of values with phenotypes as the rows and clusters as the columns.
phenolog_matrix = pd.DataFrame(phenolog_x_dict)
phenolog_matrix.head(5)

In [None]:
# Produce a melted version of the phenolog matrix sorted by p-value and including predicted candidate genes.
phenolog_matrix_reset = phenolog_matrix.reset_index(drop=False).rename({"index":"omim_phenotype_name"}, axis="columns")
# Every column of the un-reset matrix is a cluster, so all of them are melted. Slicing the columns with [1:]
# left the first cluster out of the results.
phenolog_df = pd.melt(phenolog_matrix_reset, id_vars=["omim_phenotype_name"], value_vars=list(phenolog_matrix.columns), var_name="cluster", value_name="x")
# What other information should be present in this melted phenologs matrix?
phenolog_df["size"] = phenolog_df["cluster"].map(lambda x: len(cluster_to_ids[x]))
# Look up the per-pair values row by row. Plain comprehensions avoid np.vectorize's output-type inference
# from the first row and also work when the frame is empty.
cluster_phenotype_pairs = list(zip(phenolog_df["cluster"], phenolog_df["omim_phenotype_name"]))
# NOTE(review): concatenate_with_bar_delim is not imported in the visible import cell — confirm it is in scope.
phenolog_df["candidate_gene_symbols"] = [concatenate_with_bar_delim(*candidate_genes_dict[c][p]) for c,p in cluster_phenotype_pairs]
phenolog_df["p_value"] = [phenolog_p_dict[c][p] for c,p in cluster_phenotype_pairs]
# Bonferroni correction across all of the tested (cluster, phenotype) pairs.
phenolog_df["p_adjusted"] = multipletests(phenolog_df["p_value"].values, method='bonferroni')[1]
phenolog_df = phenolog_df.sort_values(by=["p_value"], ascending=True)
phenolog_df = phenolog_df[["omim_phenotype_name", "cluster", "size", "x", "p_value", "p_adjusted", "candidate_gene_symbols"]]
phenolog_df.to_csv(os.path.join(OUTPUT_DIR,"part_6_phenologs.csv"), index=False)
phenolog_df.head(30)