In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import itertools
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.annotation.ontology import Ontology
from oats.datasets.string import String
from oats.datasets.known import Known
from oats.annotation.annotation import annotate_using_rabin_karp
from oats.graphs.pairwise import pairwise_doc2vec_onegroup, pairwise_counting_onegroup, pairwise_annotations_onegroup
from oats.graphs.pairwise import merge_edgelists, subset_edgelist_with_ids, pairwise_word2vec_onegroup
from oats.graphs.pairwise import remove_self_loops
from oats.graphs.indexed import IndexedGraph
from oats.graphs.models import train_logistic_regression_model, apply_logistic_regression_model
from oats.graphs.models import train_random_forest_model, apply_random_forest_model
from oats.pubmed.querying import search, fetch_details

# Notebook-wide settings: high-resolution figures, suppressed warnings, and
# generous pandas display limits so printed frames are not cut off.
mpl.rcParams["figure.dpi"] = 400
warnings.simplefilter('ignore')
for pandas_option, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(pandas_option, option_value)

# NLTK resources fetched up front. The Brown corpus is read later through
# nltk.corpus.brown; punkt is presumably what sent_tokenize relies on.
nltk.download('punkt')
nltk.download('brown')

[nltk_data] Downloading package punkt to /Users/irbraun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/irbraun/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [13]:
from oats.nlp.vocabulary import vocabulary_by_feature_selection
from nltk.corpus import brown

# Background corpus: running text from four Brown corpus categories, joined
# into a single whitespace-separated string.
background_categories = ['news', "editorial", "reviews", "lore"]
background_text = " ".join(brown.words(categories=background_categories))


# Foreground corpus: the first abstract section of every PubMed hit for the
# query below. Records that lack an abstract are skipped.
limit = 1000
query = "arabidopsis AND phenotype"
results = search(query, limit)
id_list = results['IdList']
interesting_texts = []
if len(id_list) > 0:
    papers = fetch_details(id_list)
    for paper in papers['PubmedArticle']:
        try:
            abstract_text = paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0]
        except KeyError:
            continue
        interesting_texts.append(abstract_text)
    print(len(interesting_texts))
interesting_text = " ".join(interesting_texts)


# Select up to 40 vocabulary entries from the foreground text relative to the
# background text and list them one per line.
vocab = vocabulary_by_feature_selection(interesting_text, background_text, max_features=40)
for term in vocab.keys():
    print(term)

997
in
arabidopsis
and
plants
we
expression
plant
genes
that
of
protein
sup
stress
cell
mutant
growth
sub
gene
root
proteins
mutants
response
development
thaliana
function
results
role
type
these
induced
transcription
activity
analysis
levels
showed
signaling
study
involved
cells
wild


In [3]:
# Reading in the entire dataset, subsetting for Arabidopsis and all annotation types.
# Each call below modifies `dataset` in place (no return value is used), so the
# order of the calls and re-running this cell both matter.
dataset = load_from_pickle("../data/pickles/full_dataset.pickle")
dataset.filter_by_species("ath")
dataset.collapse_by_all_gene_names()
dataset.filter_has_description()
dataset.filter_has_annotation()
# NOTE(review): no seed is passed here, so the sample presumably changes between
# runs; later cells index specific IDs (522, 24198) -- confirm they survive.
dataset.filter_random_k(20)
# Prints a short summary of the remaining rows.
dataset.describe()

Number of rows in the dataframe: 20
Number of unique IDs:            20
Number of unique descriptions:   18
Number of unique gene name sets: 20
Number of species represented:   1


In [4]:
# For every gene in the dataset, query PubMed for the top hit that mentions any
# of the gene's names and store that article's first abstract section under the
# gene's identifier. Genes with no hit, or whose hit has no abstract, are left
# out of `id_to_abstract_text`.
genes = dataset.get_gene_dictionary()
id_to_abstract_text = {}
limit = 1  # Only the first search hit per gene is requested.
for identifier, gene_obj in genes.items():
    query = "arabidopsis AND ({})".format(" OR ".join(gene_obj.names))
    results = search(query, limit)
    id_list = results['IdList']
    if len(id_list) > 0:
        papers = fetch_details(id_list)
        for i, paper in enumerate(papers['PubmedArticle']):
            try:
                abstract_text = paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0]
            except (KeyError, IndexError):
                # Some records carry no abstract (or an empty one). Skip them
                # instead of aborting the whole loop, matching how the
                # vocabulary-building cell treats the same lookup.
                continue
            id_to_abstract_text[identifier] = abstract_text

In [5]:
# Show the abstract stored for the first identifier in insertion order.
first_identifier = list(id_to_abstract_text.keys())[0]
print(id_to_abstract_text[first_identifier])

As part of our analysis of branched-chain amino acid metabolism in plants, we analyzed the function of Arabidopsis thaliana BRANCHED-CHAIN AMINOTRANSFERASE4 (BCAT4). Recombinant BCAT4 showed high efficiency with Met and its derivatives and the corresponding 2-oxo acids, suggesting its participation in the chain elongation pathway of Met-derived glucosinolate biosynthesis. This was substantiated by in vivo analysis of two BCAT4 T-DNA knockout mutants, in which Met-derived aliphatic glucosinolate accumulation is reduced by approximately 50%. The increase in free Met and S-methylmethionine levels in these mutants, together with in vitro substrate specificity, strongly implicate BCAT4 in catalysis of the initial deamination of Met to 4-methylthio-2-oxobutyrate. BCAT4 transcription is induced by wounding and is predominantly observed in the phloem. BCAT4 transcript accumulation also follows a diurnal rhythm, and green fluorescent protein tagging experiments and subcellular protein fractions

In [6]:
from oats.nlp.search import check_text_for_patterns_rabin_karp
from nltk.tokenize import sent_tokenize
# Reduce each abstract to the sentences for which the Rabin-Karp pattern check
# against the gene's names succeeds, joined back together with single spaces.
descriptions = {}
for identifier, abstract_text in id_to_abstract_text.items():
    matching_sentences = (
        sentence
        for sentence in sent_tokenize(abstract_text)
        if check_text_for_patterns_rabin_karp(genes[identifier].names, sentence)
    )
    descriptions[identifier] = " ".join(matching_sentences)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

# Two descriptions to contrast. Both IDs must be present in `descriptions`,
# which depends on the (random) subsample drawn when the dataset was loaded.
text1 = descriptions[522]
text2 = descriptions[24198]
# Pad the first text with a synthetic, very frequent word so the ranking below
# has an obvious expected winner. The explicit separator keeps the first
# "orange" from fusing with the last token of the description.
text1 = text1 + " " + "orange orange orange orange orange orange orange orange orange orange orange orange orange orange orange"
print(text1)
print()
print(text2)

# Document-term matrix with one row per text.
vectorizer = CountVectorizer(input='content')
dtm = vectorizer.fit_transform([text1,text2])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = np.array(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())
dtm = dtm.toarray()
# Occurrences per 1000 counted tokens, computed separately for each text.
rates = 1000 * dtm / np.sum(dtm, axis=1, keepdims=True)
print(rates[0])
print("asdf")
print(rates[1])
# Keyness here is simply the rate difference (text1 minus text2); the ten terms
# with the largest difference are printed.
keyness = rates[0]-rates[1]
ranking = np.argsort(keyness)[::-1]
print(vocab[ranking][0:10])


labels = ["A","B"]


# chi2 returns two arrays, the chi2 test statistic and an
# array of "p-values", which we'll ignore

#keyness, _ = chi2(dtm, labels)
#ranking = np.argsort(keyness)[::-1]
#print(vocab[ranking][0:10])



As part of our analysis of branched-chain amino acid metabolism in plants, we analyzed the function of Arabidopsis thaliana BRANCHED-CHAIN AMINOTRANSFERASE4 (BCAT4). Recombinant BCAT4 showed high efficiency with Met and its derivatives and the corresponding 2-oxo acids, suggesting its participation in the chain elongation pathway of Met-derived glucosinolate biosynthesis. This was substantiated by in vivo analysis of two BCAT4 T-DNA knockout mutants, in which Met-derived aliphatic glucosinolate accumulation is reduced by approximately 50%. The increase in free Met and S-methylmethionine levels in these mutants, together with in vitro substrate specificity, strongly implicate BCAT4 in catalysis of the initial deamination of Met to 4-methylthio-2-oxobutyrate. BCAT4 transcription is induced by wounding and is predominantly observed in the phloem. BCAT4 transcript accumulation also follows a diurnal rhythm, and green fluorescent protein tagging experiments and subcellular protein fractions

In [8]:
# Compare each gene's curated description against the abstract retrieved for
# the same gene, using doc2vec and bag-of-words similarity columns.
doc2vec_model_filename = "../gensim/enwiki_dbow/doc2vec.bin"
doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_filename)


# Rebinds `descriptions` (previously the sentence-filtered abstracts) to the
# dataset's own descriptions, restricted to genes for which an abstract exists.
descriptions = dataset.get_description_dictionary()
descriptions = {k:v for k,v in descriptions.items() if k in id_to_abstract_text}
abstracts = id_to_abstract_text
    
# NOTE(review): neither pairwise_edgelist_doc2vec_twogroup nor
# pairwise_edgelist_counting_twogroup is imported anywhere in this notebook, and
# the recorded output of this cell is a NameError. Only *_onegroup helpers are
# imported from oats.graphs.pairwise -- confirm the intended two-group function
# names against that module and import them.
name_to_df_mapping = {}
name_to_df_mapping["d2v"] = pairwise_edgelist_doc2vec_twogroup(doc2vec_model, descriptions, abstracts, "cosine")
name_to_df_mapping["bow"] = pairwise_edgelist_counting_twogroup(descriptions, abstracts, "cosine", binary=False) 
# Combine the per-method edge lists into one frame; 0.000 is passed as the default value.
df = merge_edgelists(name_to_df_mapping, default_value=0.000)
print(df.head(8))
print(df.shape[0])

# doc2vec values for rows whose "from" and "to" identifiers are equal.
x = df[df["from"]==df["to"]]["d2v"].values
print(x)

NameError: name 'pairwise_edgelist_doc2vec_twogroup' is not defined

In [None]:



# Disabled scratch example: the code below is wrapped in a bare string literal,
# so none of it executes. Its own header says "up to 10" articles, but the
# search call inside passes limit=4, and the `limit`/`query` variables it sets
# are not used by that call.
"""
# Getting up to 10 articles about maize
limit = 1
query = ""
results = search("maize AND arabidopsis", limit=4)
id_list = results['IdList']
papers = fetch_details(id_list)

for i, paper in enumerate(papers['PubmedArticle']): 
    print("\n\nFound Paper #{}".format(i+1))
    print(paper['MedlineCitation']['Article']['ArticleTitle'])
    print(paper["MedlineCitation"]["PMID"])
    print(paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0])
"""

In [None]:
# Preview of the PANTHER plant ortholog table: print the "|"-separated fields
# of the first two columns for the first 100 rows.
# Open question: does this file list reciprocal pairs explicitly?
ortholog_table = pd.read_table("/Users/irbraun/Desktop/orthologs.txt")
df = ortholog_table.head(100)
for row in df.itertuples():
    # Position 0 of each tuple is the index, so 1 and 2 are the first two columns.
    gene1_list, gene2_list = (row[position].split("|") for position in (1, 2))
    print(gene1_list)
    print(gene2_list)