In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp, hypergeom
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from statsmodels.sandbox.stats.multicomp import multipletests
from nltk.corpus import brown
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.neighbors import KNeighborsClassifier
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.cluster import AgglomerativeClustering

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.utils.utils import function_wrapper_with_duration
from oats.biology.dataset import Dataset
from oats.biology.groupings import Groupings
from oats.biology.relationships import ProteinInteractions, AnyInteractions
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder
from oats.distances import pairwise as pw
from oats.distances.edgelists import merge_edgelists, make_undirected, remove_self_loops, subset_with_ids

# Notebook-wide plotting, warning, and display configuration.
mpl.rcParams["figure.dpi"] = 400  # figure resolution used by matplotlib
# NOTE(review): this silences every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Raise the pandas display limits so wide/long frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Request the NLTK resources by name; quiet=True is passed to each download call.
# The final call is the cell's last expression, so its return value is what the cell displays.
nltk.download('punkt', quiet=True)
nltk.download('brown', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
# Load the saved dataset object from disk.
# NOTE(review): load_from_pickle presumably unpickles the file; only point it at trusted pickles -- confirm.
dataset_filename = "../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle"
dataset = load_from_pickle(dataset_filename)
# Build an AnyInteractions object from the edgelist file, handing it the dataset's
# name-to-ID dictionary and a default value of 0.00.
pppn_edgelist_path = "../data/supplemental_files_oellrich_walls/13007_2015_53_MOESM9_ESM.txt"
pppn_edgelist = AnyInteractions(dataset.get_name_to_id_dictionary(), pppn_edgelist_path, default=0.00)

In [3]:
# Build a ProteinInteractions object from the dataset's gene dictionary, the STRING naming file,
# and the per-species STRING interaction files listed below.
# NOTE(review): the step that would reduce the dataset to genes found in STRING
# (dataset.filter_with_ids) is commented out below, so no filtering actually happens in this cell.
naming_file = "../data/group_related_files/string/all_organisms.name_2_string.tsv"
interaction_files = [
    "../data/group_related_files/string/3702.protein.links.detailed.v11.0.txt", # Arabidopsis thaliana
    "../data/group_related_files/string/4577.protein.links.detailed.v11.0.txt", # maize
    "../data/group_related_files/string/4530.protein.links.detailed.v11.0.txt", # rice (NCBI taxon 4530 is Oryza sativa; previously labeled tomato)
    "../data/group_related_files/string/4081.protein.links.detailed.v11.0.txt", # tomato (NCBI taxon 4081 is Solanum lycopersicum; previously labeled medicago)
    "../data/group_related_files/string/3880.protein.links.detailed.v11.0.txt", # medicago (NCBI taxon 3880 is Medicago truncatula; previously labeled rice)
    "../data/group_related_files/string/3847.protein.links.detailed.v11.0.txt", # soybean
]

genes = dataset.get_gene_dictionary()
string_data = ProteinInteractions(genes, naming_file, *interaction_files)
#dataset.filter_with_ids(string_data.ids)
# Last expression of the cell: displays the object's df attribute.
string_data.df

KeyboardInterrupt: 

In [None]:
# Preview the first 300 rows of string_data.df.
string_data.df.head(300)

In [None]:
# Size check on the edgelist: row count of its df, the number of IDs it holds,
# and that number squared (for comparison against the row count).
print(pppn_edgelist.df.shape[0])
print(len(pppn_edgelist.ids))
print(len(pppn_edgelist.ids)**2)

In [None]:
# Show the last 30 rows of pppn_edgelist.df.
pppn_edgelist.df.tail(30)

In [None]:
# NOTE(review): `tasdfa` is not defined anywhere in the visible notebook, so this cell raises
# NameError. Presumably a deliberate stop so "Run All" halts here -- confirm, or delete the cell.
print(tasdfa)

In [None]:
# Read the part_5_topic_modeling.csv table from one specific dated output directory and display it.
# NOTE(review): the path is tied to a single run (04_25_2020_h18m07s00-phe-c).
df = pd.read_csv("../outputs/04_25_2020_h18m07s00-phe-c/part_5_topic_modeling.csv")
df

In [None]:
# Construct the project's Ontology object from go.obo; the print marks when construction has finished.
o = Ontology("../ontologies/go.obo")
print("done")

In [None]:
# Display len() of whatever object `o` is currently bound to.
len(o)


In [None]:
# NOTE(review): this import sits mid-notebook instead of in the import cell at the top.
import pronto
# Rebinds `o` -- an oats Ontology in an earlier cell -- to a pronto.Ontology read from go-basic.obo,
# so what `o` refers to depends on which cell ran last.
o = pronto.Ontology("../ontologies/go-basic.obo")
print("done")

In [None]:
# how fast is it to get root terms?
a = []
for t in o.terms():
    parents = [t for t in t.superclasses(with_self=False)]
    if t.name is not None:
        if len(parents) == 0:
            if "obsolete" not in t.name:
                a.append(t.id)
print(a)

In [None]:
# Scratch version of the depth computation: starting from the root IDs in `a`, repeatedly add the
# direct subclasses of everything found so far until a pass adds nothing new.
num_terms = sum(1 for _ in o.terms())
depths = dict.fromkeys(a, 0)
#checked = set(depths.keys())

depth = 1
cont = True
while cont:
    before = len(depths)
    # Progress report: terms with a depth so far vs. total terms in the ontology.
    print(before)
    print(num_terms)

    # Gather the direct subclasses of every term that already has a depth.
    new_terms = []
    for t in depths.keys():
        new_terms.extend(y.id for y in o[t].subclasses(with_self=False, distance=1))

    # Only first sightings get recorded, so a term keeps the pass number at which it was first seen.
    for i in new_terms:
        depths.setdefault(i, depth)

    depth += 1
    after = len(depths)
    # Keep going only while the last pass discovered something.
    cont = before != after

print("done")

# This finds every term reachable with this method. Any term missed by the search above
# (no .name? obsolete? etc.) still needs a default depth assigned afterwards; this note
# originally proposed defaulting those terms to 1.

In [None]:
# NOTE(review): repeated mid-notebook import; pronto is also imported in other cells.
import pronto
# Parse go-basic.obo into a pronto.Ontology for use by get_term_depth_dictionary.
pronto_ontology_obj = pronto.Ontology("../ontologies/go-basic.obo")


def get_term_depth_dictionary(pronto_ontology_obj):
    """Map every term ID in an ontology to its depth below the root term(s).

    Root terms are the named, non-obsolete terms that have no superclasses; they get depth 0.
    Every term reachable from a root through direct-subclass links gets the number of links on
    the shortest such path. Terms that are never reached this way (for example unnamed or
    obsolete terms) are also given depth 0, indicating minimal specificity.

    Args:
        pronto_ontology_obj: An ontology object that provides ``terms()`` and lookup by term ID,
            and whose terms provide ``id``, ``name``, ``superclasses(with_self=...)`` and
            ``subclasses(with_self=..., distance=...)``. This notebook passes a ``pronto.Ontology``.

    Returns:
        dict: Mapping from term ID to integer depth in the hierarchy.
    """

    # Find the root term(s) of the ontology.
    root_term_ids = []
    for term in pronto_ontology_obj.terms():
        # Do the cheap name checks first so the superclass lookup is skipped for terms that
        # could never qualify as a root (unnamed or obsolete).
        if term.name is None or "obsolete" in term.name:
            continue
        # A single inherited term is enough to rule this term out, so stop at the first one
        # instead of materializing the full list of superclasses.
        has_inherited_terms = any(True for _ in term.superclasses(with_self=False))
        if not has_inherited_terms:
            root_term_ids.append(term.id)

    # Breadth-first walk downwards from the roots. Only the terms discovered in the previous pass
    # are expanded in each pass, rather than rescanning the whole dictionary every time.
    depths = {term_id: 0 for term_id in root_term_ids}
    frontier = list(depths)
    depth = 1
    while frontier:
        next_frontier = []
        for old_term_id in frontier:
            for new_term in pronto_ontology_obj[old_term_id].subclasses(with_self=False, distance=1):
                # The first time a term is seen is along a shortest path, so never overwrite it.
                if new_term.id not in depths:
                    depths[new_term.id] = depth
                    next_frontier.append(new_term.id)
        frontier = next_frontier
        depth += 1

    # Add any other remaining terms to the dictionary with a depth of 0 indicating minimal specificity.
    for term in pronto_ontology_obj.terms():
        depths.setdefault(term.id, 0)

    # Return the dictionary mapping term IDs to their depth in the hierarchy.
    return depths
    
# Compute the term-ID -> depth mapping for the loaded ontology.
d = get_term_depth_dictionary(pronto_ontology_obj)

# NOTE(review): this prints the entire dictionary (one entry per ontology term).
print(d)
print("done")

In [None]:
# Materialize the ontology's terms and look at the one at position 100: its name, then its ID.
terms = list(o.terms())
t = terms[100]
print(t.name, t.id, sep="\n")


In [None]:
# Names of the superclasses of `t` (excluding `t` itself).
# NOTE(review): rebinds `a`, which an earlier cell uses for the list of root term IDs.
a = [x.name for x in t.superclasses(with_self=False)]
print(a)

In [None]:
# Load the small test ontology with the project's Ontology class (rebinding `o` again) and
# print each key/value pair of its subclass_dict.
o = Ontology("../../oats/tests/data/test_ontology.obo")
for k,v in o.subclass_dict.items():
    print(k,v)

In [None]:
import pronto
pronto_o = pronto.Ontology("../ontologies/go-basic.obo")
print("done")
# Echo every term's name; then print a marker for unnamed or obsolete terms,
# and the name a second time for all others.
for t in pronto_o.terms():
    print(t.name)
    if (t.name is None) or ("obsolete" in t.name):
        print("HERE")
    else:
        print(t.name)

#import pronto
#pronto_o = pronto.Ontology("../ontologies/po.obo")

#term = pronto_o["PO:0000003"]

#for term in pronto_o.terms():
#sup = [t for t in term.subclasses(distance=1)]
#print(sup)
#break

In [None]:
# Print the description of every synonym attached to every term in the pronto ontology.
# NOTE(review): one printed line per synonym -- output may be very long for a full ontology.
for t in pronto_o.terms():
    for s in t.synonyms:
        print(s.description)