In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.annotation.ontology import Ontology
from oats.datasets.string import String
from oats.datasets.edges import Edges
from oats.annotation.annotation import annotate_using_noble_coder
from oats.graphs import pairwise as pw
from oats.graphs.indexed import IndexedGraph
from oats.graphs.models import train_logistic_regression_model, apply_logistic_regression_model
from oats.graphs.models import train_random_forest_model, apply_random_forest_model
from oats.nlp.vocabulary import get_overrepresented_tokens, build_vocabulary_from_tokens
from oats.utils.utils import function_wrapper_with_duration
from oats.nlp.preprocess import concatenate_with_bar_delim

# Notebook-wide display configuration: figure resolution, warning filtering,
# and pandas table limits.
mpl.rcParams.update({"figure.dpi": 400})
warnings.simplefilter("ignore")
for display_option, display_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(display_option, display_value)

# Fetch the NLTK resources requested by this notebook. The final call is left
# as a bare expression so the cell still displays its return value.
for nltk_resource in ("punkt", "brown"):
    nltk.download(nltk_resource, quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)

True

In [2]:
# Locations of every input file used by this notebook. All paths are relative to the
# directory the notebook is run from.
dataset_filename = "../data/pickles/text_plus_annotations_dataset.pickle"        # The full dataset pickle.
groupings_filename = "../data/pickles/lloyd_subsets.pickle"                      # The groupings pickle.
background_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/background.txt"       # Text file with background content.
phenotypes_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/phenotypes_small.txt" # Text file with specific content.
doc2vec_pubmed_filename = "../gensim/pubmed_dbow/doc2vec_2.bin"                  # File holding saved Doc2Vec model (pubmed_dbow directory).
doc2vec_wikipedia_filename = "../gensim/enwiki_dbow/doc2vec.bin"                 # File holding saved Doc2Vec model (enwiki_dbow directory).
word2vec_model_filename = "../gensim/wiki_sg/word2vec.bin"                       # File holding saved Word2Vec model.
ontology_filename = "../ontologies/mo.obo"                                       # Ontology file in OBO format.
noblecoder_jarfile_path = "../lib/NobleCoder-1.0.jar"                            # Jar for NOBLE Coder tool.
biobert_pmc_path = "../gensim/biobert_v1.0_pmc/pytorch_model"                    # Path for PyTorch BioBERT model (biobert_v1.0_pmc directory).
biobert_pubmed_path = "../gensim/biobert_v1.0_pubmed/pytorch_model"              # Path for PyTorch BioBERT model (biobert_v1.0_pubmed directory).
biobert_pubmed_pmc_path = "../gensim/biobert_v1.0_pubmed_pmc/pytorch_model"      # Path for PyTorch BioBERT model (biobert_v1.0_pubmed_pmc directory).

In [3]:
# Load the dataset object from its pickle file and print a summary before and after filtering.
# NOTE(review): load_from_pickle presumably unpickles the file; only use it on trusted files.
dataset = load_from_pickle(dataset_filename)
dataset.describe()
# Restrict the dataset in place: first by species code "ath", then to entries that have a
# description (the printed summaries show the row count dropping and one species remaining).
dataset.filter_by_species("ath")
dataset.filter_has_description()
dataset.describe()

Number of rows in the dataframe: 30169
Number of unique IDs:            30169
Number of unique descriptions:   4566
Number of unique gene name sets: 30169
Number of species represented:   6
Number of rows in the dataframe: 6030
Number of unique IDs:            6030
Number of unique descriptions:   3611
Number of unique gene name sets: 6030
Number of species represented:   1


In [4]:
# Files and models related to the machine learning text embedding methods.
# Each model is read from the saved gensim file named in the paths cell above.
doc2vec_wiki_model = gensim.models.Doc2Vec.load(doc2vec_wikipedia_filename)
doc2vec_pubmed_model = gensim.models.Doc2Vec.load(doc2vec_pubmed_filename)
# NOTE(review): word2vec_model is loaded here but is not used by any cell visible in this notebook.
word2vec_model = gensim.models.Word2Vec.load(word2vec_model_filename)

In [5]:
# Mapping from each ID in the dataset to the raw text description associated with it.
descriptions = dataset.get_description_dictionary()

# Build one preprocessed copy of the descriptions per preprocessing strategy, because the
# downstream methods expect differently prepared text.
descriptions_full_preprocessing = {
    identifier: " ".join(preprocess_string(raw_text))
    for identifier, raw_text in descriptions.items()
}
descriptions_simple_preprocessing = {
    identifier: " ".join(simple_preprocess(raw_text))
    for identifier, raw_text in descriptions.items()
}
descriptions_no_stopwords = {
    identifier: remove_stopwords(raw_text)
    for identifier, raw_text in descriptions.items()
}

In [6]:
def get_pos_tokens(text, pos):
    """Return the tokens of `text` whose POS tag equals `pos`, joined by single spaces.

    The text is tokenized with `word_tokenize` and tagged with `nltk.pos_tag`. A token is
    kept only when its tag matches `pos` exactly, ignoring case.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    kept_tokens = [token for token, tag in tagged_tokens if tag.lower() == pos.lower()]
    return " ".join(kept_tokens)


# Descriptions reduced to tokens tagged "NN", plus preprocessed copies of that reduced text.
descriptions_noun_only = {
    identifier: get_pos_tokens(raw_text, "NN")
    for identifier, raw_text in descriptions.items()
}
descriptions_noun_only_full_preprocessing = {
    identifier: " ".join(preprocess_string(noun_text))
    for identifier, noun_text in descriptions_noun_only.items()
}
descriptions_noun_only_simple_preprocessing = {
    identifier: " ".join(simple_preprocess(noun_text))
    for identifier, noun_text in descriptions_noun_only.items()
}

# Descriptions reduced to tokens tagged "JJ", plus preprocessed copies of that reduced text.
descriptions_adj_only = {
    identifier: get_pos_tokens(raw_text, "JJ")
    for identifier, raw_text in descriptions.items()
}
descriptions_adj_only_full_preprocessing = {
    identifier: " ".join(preprocess_string(adj_text))
    for identifier, adj_text in descriptions_adj_only.items()
}
descriptions_adj_only_simple_preprocessing = {
    identifier: " ".join(simple_preprocess(adj_text))
    for identifier, adj_text in descriptions_adj_only.items()
}

In [7]:
# Annotate the raw text descriptions with ontology terms using NOBLE Coder, once with
# precise=1 and once with precise=0. Both runs share the same positional arguments.
noblecoder_shared_args = (descriptions, noblecoder_jarfile_path, "mo")
annotations_noblecoder_precise = annotate_using_noble_coder(*noblecoder_shared_args, precise=1)
annotations_noblecoder_partial = annotate_using_noble_coder(*noblecoder_shared_args, precise=0)

In [8]:
# Build the Ontology object from the OBO file; it is passed to the annotation-based
# pairwise distance methods defined in the method list below.
ontology = Ontology(ontology_filename)

In [9]:
# Same preprocessing steps as for the dataset descriptions, but applied to the query string.
# NOTE(review): this is an absolute, machine-specific path; point it at the query file to use.
query_filename = "/Users/irbraun/Desktop/autophagy.txt"

# Read the query inside a context manager so the file handle is always closed.
with open(query_filename,"r") as query_file:
    query_text = query_file.read()

# The pairwise functions take dictionaries mapping an ID to text, so the single query is
# stored under the ID 0.
query = {0:query_text}
query_full_preprocessing = {i:" ".join(preprocess_string(d)) for i,d in query.items()}
query_simple_preprocessing = {i:" ".join(simple_preprocess(d)) for i,d in query.items()}
query_no_stopwords = {i:remove_stopwords(d) for i,d in query.items()}

# Part-of-speech filtered variants; get_pos_tokens is the helper defined in the earlier
# cell that builds the noun-only and adjective-only descriptions.
query_noun_only =  {i:get_pos_tokens(d,"NN") for i,d in query.items()}
query_noun_only_full_preprocessing = {i:" ".join(preprocess_string(d)) for i,d in query_noun_only.items()}
query_noun_only_simple_preprocessing = {i:" ".join(simple_preprocess(d)) for i,d in query_noun_only.items()}
query_adj_only =  {i:get_pos_tokens(d,"JJ") for i,d in query.items()}
query_adj_only_full_preprocessing = {i:" ".join(preprocess_string(d)) for i,d in query_adj_only.items()}
query_adj_only_simple_preprocessing = {i:" ".join(simple_preprocess(d)) for i,d in query_adj_only.items()}

# Ontology term annotations for the query, with the same two settings used for the descriptions.
query_annotations_noblecoder_precise = annotate_using_noble_coder(query, noblecoder_jarfile_path, "mo", precise=1)
query_annotations_noblecoder_partial = annotate_using_noble_coder(query, noblecoder_jarfile_path, "mo", precise=0)

In [10]:
# Define a list of tuples; each tuple is used to build one matrix of pairwise distances.
# The naming scheme for methods should include both a name substring and then a hyperparameter substring
# separated by a colon. Anything after the colon will be removed from the name and put in a separate
# column in the output table. This is so that the name column can be directly used for making figures, so
# if two hyperparameter choices are both going to be used in a figure, keep them in the name substring, not
# the hyperparameter section. The required items in each tuple are:
# Index 0: name of the method
# Index 1: function to call for running this method
# Index 2: arguments to pass to that function as a dictionary of keyword args
# Index 3: distance metric to apply to vectors generated with that method


name_function_args_tuples = [
    
    # Methods that use neural networks to generate embeddings.
    ("Doc2Vec Wikipedia:Size=300", pw.pairwise_doc2vec_twogroup, {"model":doc2vec_wiki_model, "object_dict_1":query, "object_dict_2":descriptions, "metric":"cosine"}, spatial.distance.cosine),
    ("Doc2Vec PubMed:Size=100", pw.pairwise_doc2vec_twogroup, {"model":doc2vec_pubmed_model, "object_dict_1":query, "object_dict_2":descriptions, "metric":"cosine"}, spatial.distance.cosine),

    # NOTE(review): the BERT/BioBERT entries below are disabled. They reference model and tokenizer
    # objects (e.g. bert_model_base, bert_tokenizer_base) that are not created in the cells visible here.
    #("BERT Base:Layers=2,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":2}, spatial.distance.cosine),
    #("BERT Base:Layers=3,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":3}, spatial.distance.cosine),
    #("BERT Base:Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #("BERT Base:Layers=2,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":2}, spatial.distance.cosine),
    #("BERT Base:Layers=3,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":3}, spatial.distance.cosine),
    #("BERT Base:Layers=4,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":4}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=2,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":2}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=3,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":3}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #("BioBERT:PubMed,Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pubmed, "tokenizer":bert_tokenizer_pubmed, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #("BioBERT:PubMed,PMC,Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pubmed_pmc, "tokenizer":bert_tokenizer_pubmed_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
        
    # Methods that use variations on the n-grams approach with full preprocessing (includes stemming).
    ("N-Grams:Full,Words,1-grams,2-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,2),"max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:Full,Words,1-grams,2-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,2), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:Full,Words,1-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:Full,Words,1-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:Full,Words,1-grams,2-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,2),"max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Words,1-grams,2-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,2), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Words,1-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Words,1-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_full_preprocessing, "object_dict_2":descriptions_full_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    
    # Methods that use variations on the n-grams approach with simple preprocessing (no stemming).
    ("N-Grams:simple,Words,1-grams,2-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,2),"max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:simple,Words,1-grams,2-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,2), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:simple,Words,1-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:simple,Words,1-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:simple,Words,1-grams,2-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,2),"max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:simple,Words,1-grams,2-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,2), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:simple,Words,1-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:simple,Words,1-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_simple_preprocessing, "object_dict_2":descriptions_simple_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    
    # Methods that use variations on the n-grams approach selecting for specific parts-of-speech.
    ("N-Grams:Full,Nouns,1-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_noun_only_full_preprocessing,"object_dict_2":descriptions_noun_only_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:Full,Nouns,1-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_noun_only_full_preprocessing,"object_dict_2":descriptions_noun_only_full_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:Full,Nouns,1-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_noun_only_full_preprocessing,"object_dict_2":descriptions_noun_only_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Nouns,1-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_noun_only_full_preprocessing,"object_dict_2":descriptions_noun_only_full_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Adjectives,1-grams", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_adj_only_full_preprocessing,"object_dict_2":descriptions_adj_only_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.cosine),
    ("N-Grams:Full,Adjectives,1-grams,Binary", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_adj_only_full_preprocessing,"object_dict_2":descriptions_adj_only_full_preprocessing, "metric":"jaccard", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":False}, spatial.distance.jaccard),
    ("N-Grams:Full,Adjectives,1-grams,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_adj_only_full_preprocessing,"object_dict_2":descriptions_adj_only_full_preprocessing, "metric":"cosine", "binary":False, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    ("N-Grams:Full,Adjectives,1-grams,Binary,TFIDF", pw.pairwise_ngrams_twogroup, {"object_dict_1":query_adj_only_full_preprocessing,"object_dict_2":descriptions_adj_only_full_preprocessing, "metric":"cosine", "binary":True, "analyzer":"word", "ngram_range":(1,1), "max_features":10000, "tfidf":True}, spatial.distance.cosine),
    
    # Methods that use terms inferred from automated annotation of the text.
    ("NOBLE Coder:Precise", pw.pairwise_annotations_twogroup, {"annotations_dict_1":query_annotations_noblecoder_precise,"annotations_dict_2":annotations_noblecoder_precise, "ontology":ontology, "binary":True, "metric":"jaccard", "tfidf":False}, spatial.distance.jaccard),
    ("NOBLE Coder:Partial", pw.pairwise_annotations_twogroup, {"annotations_dict_1":query_annotations_noblecoder_partial,"annotations_dict_2":annotations_noblecoder_partial, "ontology":ontology, "binary":True, "metric":"jaccard", "tfidf":False}, spatial.distance.jaccard),
    ("NOBLE Coder:Precise,TFIDF", pw.pairwise_annotations_twogroup, {"annotations_dict_1":query_annotations_noblecoder_precise,"annotations_dict_2":annotations_noblecoder_precise, "ontology":ontology, "binary":True, "metric":"cosine", "tfidf":True}, spatial.distance.cosine),
    ("NOBLE Coder:Partial,TFIDF", pw.pairwise_annotations_twogroup, {"annotations_dict_1":query_annotations_noblecoder_partial,"annotations_dict_2":annotations_noblecoder_partial, "ontology":ontology, "binary":True, "metric":"cosine", "tfidf":True}, spatial.distance.cosine),
]

In [11]:
# Run every configured method one after another (no parallelism), keeping the resulting
# graph and the formatted run time for each.
graphs, names, durations = {}, [], []
for method_name, method_function, method_kwargs, _metric in name_function_args_tuples:
    graph, duration = function_wrapper_with_duration(function=method_function, args=method_kwargs)
    elapsed = to_hms(duration)
    graphs[method_name] = graph
    names.append(method_name)
    durations.append(elapsed)
    print("{:50} {}".format(method_name, elapsed))

# One row per method with its formatted run time.
durations_df = pd.DataFrame({"method": names, "duration": durations})

Doc2Vec Wikipedia:Size=300                         00:00:09
Doc2Vec PubMed:Size=100                            00:00:02
N-Grams:Full,Words,1-grams,2-grams                 00:00:01
N-Grams:Full,Words,1-grams,2-grams,Binary          00:00:01
N-Grams:Full,Words,1-grams                         00:00:00
N-Grams:Full,Words,1-grams,Binary                  00:00:00
N-Grams:Full,Words,1-grams,2-grams,TFIDF           00:00:00
N-Grams:Full,Words,1-grams,2-grams,Binary,TFIDF    00:00:00
N-Grams:Full,Words,1-grams,TFIDF                   00:00:00
N-Grams:Full,Words,1-grams,Binary,TFIDF            00:00:00
N-Grams:simple,Words,1-grams,2-grams               00:00:01
N-Grams:simple,Words,1-grams,2-grams,Binary        00:00:01
N-Grams:simple,Words,1-grams                       00:00:00
N-Grams:simple,Words,1-grams,Binary                00:00:00
N-Grams:simple,Words,1-grams,2-grams,TFIDF         00:00:01
N-Grams:simple,Words,1-grams,2-grams,Binary,TFIDF  00:00:01
N-Grams:simple,Words,1-grams,TFIDF      

In [12]:
# Combine the edgelist dataframe of every method into a single dataframe.
# metric_dict keeps the distance function (tuple index 3) for each method name.
metric_dict = {
    method_name: metric
    for method_name, _function, _kwargs, metric in name_function_args_tuples
}
methods = list(graphs)
edgelists = {method_name: graph.edgelist for method_name, graph in graphs.items()}
merged_edgelists = pw.merge_edgelists(edgelists, default_value=0.000)
df = pw.remove_self_loops(merged_edgelists)
df.head(10)

Unnamed: 0,from,to,Doc2Vec Wikipedia:Size=300,Doc2Vec PubMed:Size=100,"N-Grams:Full,Words,1-grams,2-grams","N-Grams:Full,Words,1-grams,2-grams,Binary","N-Grams:Full,Words,1-grams","N-Grams:Full,Words,1-grams,Binary","N-Grams:Full,Words,1-grams,2-grams,TFIDF","N-Grams:Full,Words,1-grams,2-grams,Binary,TFIDF","N-Grams:Full,Words,1-grams,TFIDF","N-Grams:Full,Words,1-grams,Binary,TFIDF","N-Grams:simple,Words,1-grams,2-grams","N-Grams:simple,Words,1-grams,2-grams,Binary","N-Grams:simple,Words,1-grams","N-Grams:simple,Words,1-grams,Binary","N-Grams:simple,Words,1-grams,2-grams,TFIDF","N-Grams:simple,Words,1-grams,2-grams,Binary,TFIDF","N-Grams:simple,Words,1-grams,TFIDF","N-Grams:simple,Words,1-grams,Binary,TFIDF","N-Grams:Full,Nouns,1-grams","N-Grams:Full,Nouns,1-grams,Binary","N-Grams:Full,Nouns,1-grams,TFIDF","N-Grams:Full,Nouns,1-grams,Binary,TFIDF","N-Grams:Full,Adjectives,1-grams","N-Grams:Full,Adjectives,1-grams,Binary","N-Grams:Full,Adjectives,1-grams,TFIDF","N-Grams:Full,Adjectives,1-grams,Binary,TFIDF",NOBLE Coder:Precise,NOBLE Coder:Partial,"NOBLE Coder:Precise,TFIDF","NOBLE Coder:Partial,TFIDF"
0,0,1,0.402668,0.096372,0.934962,0.981132,0.917216,0.969231,0.975861,0.990633,0.958982,0.978215,0.944005,0.966887,0.925083,0.944444,0.98252,0.982311,0.969876,0.964319,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.592233,0.67619,0.661614,0.820084
1,0,7,0.347427,0.312569,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.818182,0.882353,0.819782,0.899158
2,0,8,0.405793,0.233833,0.874477,0.932515,0.83617,0.894231,0.952482,0.952371,0.926126,0.912557,0.913132,0.953586,0.890804,0.919708,0.963891,0.959358,0.945586,0.92715,0.896283,0.909091,0.935968,0.920319,0.959107,0.96875,0.975273,0.961189,0.4,0.59116,0.52107,0.691006
3,0,9,0.335423,0.378854,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.886792,0.948276,0.951849,0.976353
4,0,10,0.34996,0.387915,0.932209,0.950617,0.91835,0.931034,0.976083,0.970167,0.967422,0.95162,0.892376,0.97,0.867401,0.961538,0.979842,0.987146,0.972439,0.98364,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.833333,0.913793,0.852901,0.844832
5,0,11,0.500701,0.157152,0.929649,0.968944,0.914879,0.954545,0.977744,0.980768,0.968528,0.966423,0.871389,0.968326,0.839376,0.95302,0.965681,0.980008,0.949264,0.968343,0.937006,0.955556,0.974367,0.971506,1.0,1.0,1.0,1.0,0.5375,0.70339,0.681654,0.810955
6,0,12,0.460424,0.404019,0.891072,0.944828,0.875748,0.923913,0.964049,0.964806,0.963395,0.951558,0.898805,0.956757,0.885062,0.940171,0.962875,0.966682,0.961412,0.958623,0.913431,0.909091,0.960821,0.928015,1.0,1.0,1.0,1.0,0.482353,0.685714,0.584773,0.788256
7,0,13,0.422794,0.45107,0.850214,0.93956,0.818992,0.89899,0.927447,0.944878,0.899681,0.904329,0.837516,0.950758,0.793918,0.913669,0.934538,0.948441,0.904896,0.915488,0.925064,0.957447,0.958428,0.962241,0.912294,0.939394,0.928257,0.912213,0.525253,0.572864,0.520025,0.673403
8,0,14,0.299372,0.257639,0.898943,0.943396,0.865769,0.913043,0.960424,0.965168,0.935772,0.93629,0.921893,0.967105,0.895672,0.946237,0.974324,0.973922,0.956986,0.952191,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.529412,0.638554,0.440392,0.799154
9,0,15,0.316917,0.336396,0.881886,0.946237,0.849244,0.918033,0.951467,0.966149,0.926288,0.93876,0.873581,0.949153,0.834072,0.923077,0.957137,0.967064,0.934049,0.947145,0.96363,0.965517,0.979878,0.966304,1.0,1.0,1.0,1.0,0.432836,0.657895,0.568844,0.779206


In [13]:
# Replace each method's distances with their ranks within that method's column, then
# order the rows by the mean of those ranks across all methods.
method_ranks = df[methods].rank()
df[methods] = method_ranks
df["rank"] = method_ranks.mean(axis=1)
df = df.sort_values(by="rank")
df.head(10)

Unnamed: 0,from,to,Doc2Vec Wikipedia:Size=300,Doc2Vec PubMed:Size=100,"N-Grams:Full,Words,1-grams,2-grams","N-Grams:Full,Words,1-grams,2-grams,Binary","N-Grams:Full,Words,1-grams","N-Grams:Full,Words,1-grams,Binary","N-Grams:Full,Words,1-grams,2-grams,TFIDF","N-Grams:Full,Words,1-grams,2-grams,Binary,TFIDF","N-Grams:Full,Words,1-grams,TFIDF","N-Grams:Full,Words,1-grams,Binary,TFIDF","N-Grams:simple,Words,1-grams,2-grams","N-Grams:simple,Words,1-grams,2-grams,Binary","N-Grams:simple,Words,1-grams","N-Grams:simple,Words,1-grams,Binary","N-Grams:simple,Words,1-grams,2-grams,TFIDF","N-Grams:simple,Words,1-grams,2-grams,Binary,TFIDF","N-Grams:simple,Words,1-grams,TFIDF","N-Grams:simple,Words,1-grams,Binary,TFIDF","N-Grams:Full,Nouns,1-grams","N-Grams:Full,Nouns,1-grams,Binary","N-Grams:Full,Nouns,1-grams,TFIDF","N-Grams:Full,Nouns,1-grams,Binary,TFIDF","N-Grams:Full,Adjectives,1-grams","N-Grams:Full,Adjectives,1-grams,Binary","N-Grams:Full,Adjectives,1-grams,TFIDF","N-Grams:Full,Adjectives,1-grams,Binary,TFIDF",NOBLE Coder:Precise,NOBLE Coder:Partial,"NOBLE Coder:Precise,TFIDF","NOBLE Coder:Partial,TFIDF",rank
3690,0,5223,46.0,67.0,112.0,26.5,133.5,51.0,124.0,90.0,164.0,160.0,495.0,36.0,642.0,44.5,177.0,66.0,189.0,102.0,427.5,96.5,438.0,509.0,753.0,193.0,805.0,720.0,264.5,27.0,903.0,151.0,267.1
346,0,394,8.0,2417.0,79.0,70.5,123.0,140.5,33.0,66.0,74.0,109.0,94.0,70.0,118.0,84.5,48.0,105.0,61.0,98.0,415.0,80.0,381.0,372.0,394.0,40.5,325.0,439.0,35.0,811.5,544.0,396.0,267.716667
344,0,392,659.0,662.0,162.0,55.5,137.0,65.0,105.0,69.0,79.0,48.0,232.0,18.5,210.0,11.0,130.0,76.0,68.0,30.0,447.0,80.0,416.0,389.0,485.0,40.5,443.0,436.0,1002.0,201.0,1341.0,134.0,274.383333
1215,0,1362,35.0,268.0,299.0,96.0,251.0,59.5,332.0,252.0,254.0,122.0,161.0,42.0,135.0,16.0,203.0,196.0,152.0,65.0,503.0,29.0,580.0,391.0,574.0,400.0,596.0,891.0,65.0,237.0,620.0,819.0,288.116667
3106,0,4639,195.0,239.0,130.0,176.0,119.0,107.5,178.0,253.0,130.0,153.0,248.5,34.0,275.0,35.5,102.0,54.0,108.0,48.0,419.0,486.0,461.0,911.0,512.0,73.5,589.0,670.0,190.0,791.5,832.0,217.0,291.25
1818,0,2034,63.0,1421.0,21.0,132.0,22.0,44.0,55.0,275.0,40.0,126.0,108.0,126.0,64.0,18.5,69.0,231.0,44.0,73.0,310.0,128.0,336.0,715.0,527.5,193.0,576.0,727.0,468.0,982.0,1267.0,75.0,307.9
245,0,287,1988.0,1971.0,34.0,11.0,49.0,4.0,56.0,52.0,119.0,57.0,195.0,141.0,222.0,77.0,84.0,69.0,106.0,57.0,346.0,10.5,387.0,368.0,666.0,291.5,592.0,563.0,11.0,11.0,684.0,71.0,309.766667
1035,0,1149,240.0,1152.0,97.0,38.5,152.0,215.0,24.0,10.0,64.0,38.0,196.0,16.0,200.0,24.5,50.0,9.0,63.0,23.0,384.0,56.0,337.0,299.0,728.5,335.0,513.0,440.0,169.5,618.0,676.0,2643.0,327.033333
2211,0,2440,729.0,1083.0,3.0,3.0,6.0,25.0,2.0,6.0,1.0,1.0,3.0,3.5,3.0,5.0,2.0,6.0,1.0,1.0,321.0,933.0,314.0,466.0,322.0,16.5,316.0,318.0,2253.5,536.0,2044.0,411.0,337.816667
1237,0,1384,421.0,1372.0,101.0,15.0,135.0,36.0,161.0,62.0,306.0,170.0,383.0,28.0,573.5,79.0,287.0,53.0,463.0,148.0,565.0,47.5,803.0,551.0,673.5,87.0,662.0,473.0,551.5,191.0,1001.0,138.0,351.233333


In [14]:
# Find out which genes from the provided table are already in the dataset, and with what text.
# Normalizing the gene names to lowercase doesn't impact how many are in the data.
names_dict = dataset.get_name_to_id_dictionary(unambiguous=True)
# NOTE(review): the table is read from a machine-specific location under the home directory.
adf = pd.read_csv("~/Desktop/autophagy_related_genes.csv",usecols=[0,1],names=["name","other"])
adf["in_data"] = adf["name"].map(lambda x: (x in names_dict))
# Genes that are missing from the dataset get the empty string as their identifier.
adf["identifier"] = adf["name"].map(lambda x: (names_dict[x] if (x in names_dict) else ""))
# Compare against the empty-string sentinel by value (!=). An identity test (`is not`) against a
# string literal depends on interning and raises a SyntaxWarning on Python 3.8 and later.
adf["text"] = adf["identifier"].map(lambda x: (descriptions[x] if (x != "") else ""))
adf.sort_values("in_data",ascending=False)

Unnamed: 0,name,other,in_data,identifier,text
14,AT3G07525,AtATG10,True,2371.0,Early senescence. Late flowering. Reduced fert...
21,AT3G49590,AtATG13a,True,4963.0,Premature senescence under a short-day photope...
19,AT3G19190,AtATG2,True,4850.0,early senescence; accumulation of high levels ...
18,AT3G18770,AtATG13b,True,4845.0,Premature senescence under a short-day photope...
1,AT1G50030,TOR,True,914.0,Embryo defective-Preglobular / Globular. large...
16,AT3G13970,AtATG12b,True,4201.0,Small plants and premature senescence under no...
15,AT3G08850,RAPTOR B,True,888.0,Embryo defective-Preglobular. Heterozygotes sh...
36,AT5G01770,RAPTOR A,True,5386.0,"Under non-limiting conditions of light, water,..."
12,AT3G01090,KIN10,True,1755.0,Reduced starch transport under phosphate starv...
24,AT3G59950,AtATG4b,True,5034.0,Small plants and premature senescence under no...


In [33]:
# Build the table of top-ranked genes for the query and write it to a CSV file.
# `results` is the same object as `df` (not a copy), so the columns added below are also
# added to `df`, and its mean-rank column is overwritten with the ordinal rank.
results = df
gene_dict = dataset.get_gene_dictionary()

# Collect the identifiers of the known genes once, instead of rebuilding and scanning the
# identifier array for every row.
known_identifiers = set(adf["identifier"].values)

results["phenotype"] = results["to"].map(lambda x: descriptions[x])
# The first name of each gene is used as its display name; the remaining names are its aliases.
results["gene_name"] = results["to"].map(lambda x: concatenate_with_bar_delim(*gene_dict[x].names[0:1]))
results["gene_aliases"] = results["to"].map(lambda x: concatenate_with_bar_delim(*gene_dict[x].names[1:]))
results["known"] = results["to"].map(lambda x: (x in known_identifiers))

# Rows are numbered 1..N in their current order.
results["rank"] = np.arange(1, len(results)+1)
results = results[["rank","known","gene_name","phenotype","gene_aliases"]].head(50)
# NOTE(review): the output is written to a machine-specific location under the home directory.
results.to_csv("~/Desktop/autophagy_query.csv",index=False)
results.head()

Unnamed: 0,rank,known,gene_name,phenotype,gene_aliases
3690,1,False,AT4G22330,"Irregular wax crystals, reduced stature, small...",ATCES1|AtACER|CERAMIDASE|T10I14.160|T10I14_160
346,2,False,At1g66730,Delayed germination. Sensitive to low temperat...,LIG6|AT1G66730|AtLIG6|DNA LIGASE 6|F4N21.14|F4...
344,3,False,At5g40770,Curled rosette leaves. Delayed germination. Se...,PHB3|AT5G40770|ATPHB3|EER3|prohibitin 3|AT5G40...
1215,4,False,At1g20450,Abnormal seed shape. Low germination rate. Sen...,ERD10|AT1G20450|LTI29|LTI45|LOW TEMPERATURE IN...
3106,5,False,AT2G42700,abnormally accumulated storage protein precurs...,MIP3|MAG2-interacting protein 3|F14N22.4
