### TODO
1. Change 'datasets' to 'biodata'
2. Change 'annotation.annotation' to 'annotation.annotate'
3. Change 'collapse_by_all_gene_names' to something more meaningful
4. Deal with capitalization in gene merging.
5. Change 'noble_coder_jar_path' to just 'jar'

In [1]:
import pandas as pd
import pprint
from tabulate import tabulate
import sys
sys.path.append("../../oats")

from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.datasets.dataset import Dataset

# Find good candidate genes to include in a small example dataset.
data = load_from_pickle("../data/pickles/gene_phenotype_dataset_all_text_and_annotations.pickle")
data.filter_has_description()

from oats.datasets.groupings import Groupings
pathway_species_files = {
    "ath":"../data/group_related_files/pmn/aracyc_pathways.20180702", 
    "zma":"../data/group_related_files/pmn/corncyc_pathways.20180702"}
groupings = Groupings(pathway_species_files, "pmn")
id_to_groups, group_to_ids = groupings.get_groupings_for_dataset(data)

# Keep only IDs that map to more than one group, then split them by species.
# The species lookup is fetched once instead of once per ID per comprehension.
species_by_id = data.get_species_dictionary()
good_ids = [k for k, v in id_to_groups.items() if len(v) > 1]
good_ath_ids = [i for i in good_ids if species_by_id[i] == "ath"]
good_zma_ids = [i for i in good_ids if species_by_id[i] == "zma"]

# Uncomment to shrink the candidate lists while experimenting.
#good_ath_ids = good_ath_ids[1:2000]
#good_zma_ids = good_zma_ids[1:20]

df = data.to_pandas()
# A set gives constant-time membership checks; the IDs are dictionary keys,
# so they are guaranteed to be hashable.
wanted_ids = set(good_ath_ids) | set(good_zma_ids)
df["keep"] = df["id"].isin(wanted_ids)
# Copy the filtered frame so the next column assignment writes to an
# independent DataFrame rather than a slice of the original.
df = df[df["keep"]].copy()
# Narrow further to rows whose description mentions "drought" (case-sensitive).
df["keep"] = df["description"].map(lambda x: "drought" in x)
df = df[df["keep"]]
# The helper "keep" column is still present in the written CSV.
df.to_csv("/Users/irbraun/Desktop/fromnb.csv")

# BioData

In [2]:
from oats.datasets.dataset import Dataset
# Build a Dataset from the small example CSV. This rebinds `data`, replacing
# the object that was loaded from the pickle in the first cell.
data = Dataset("/Users/irbraun/oats/examples/example.csv")
# Last expression of the cell, so the notebook displays the returned table.
data.to_pandas()

Unnamed: 0,id,species,gene_names,gene_synonyms,description,term_ids,sources
0,0,ath,At1g74030,F2P9_10,Decreased root hair density. Distorted trichomes.,GO:0009507|PO:0001170,example
1,1,ath,At1g74030|ENO1,F2P9_10|AT1G74030.1,Trichomes are less turgescent and are distorte...,GO:0009735,example
2,2,ath,ENO1|enolase 1,F2P9_10|AT1G74030.1,Plants also have fewer root hairs with respect...,GO:0009735,example
3,3,ath,At3g56960|PIP5K4,phosphatidyl inositol monophosphate 5 kinase 4,Decreased stomatal opening.,PO:0009046,example
4,4,ath,At3g56960,T8M16.6,Delayed stomatal opening.,GO:0009860,example
5,5,ath,At1g74920|ALDH10A8,aldehyde dehydrogenase 10A8,Sensitive to drought. Sensitive to mannitol.,GO:0005618|GO:0009516,example
6,6,ath,At1g74920,F25A4_11,Sensitive to salt.,PO:0007123,example
7,7,zma,nec4|GRMZM5G870342,nec4|cpx1,Necrotic leaf. Affected tissue dies.,,example
8,8,zma,GRMZM5G870342,cpx1|dks8,Pale green seedling. Yellow green leaf.,,example
9,9,zma,GRMZM2G117878|ufgt2,,Salt stress intolerant.,,example


In [3]:
# Save the dataset table (before collapsing) as plain text for the README.
# headers="keys" takes the column names as headers; showindex="never" leaves
# the DataFrame index out of the rendered table.
with open("/Users/irbraun/oats/examples/readme_table_1.txt", 'w') as outputfile:
    outputfile.write(tabulate(data.to_pandas(), headers="keys", showindex="never"))

In [4]:
# Both calls are made for their effect on `data`; their return values are
# discarded. NOTE(review): presumably the first merges entries that share a
# gene name and the second drops entries without a description - confirm
# against the Dataset implementation.
data.collapse_by_all_gene_names()
data.filter_has_description()
# Last expression of the cell, so the notebook displays the resulting table.
data.to_pandas()

Unnamed: 0,id,species,gene_names,gene_synonyms,description,term_ids,sources
0,0,ath,At1g74030|ENO1|enolase 1,F2P9_10|AT1G74030.1,Decreased root hair density. Distorted trichom...,GO:0009507|PO:0001170|GO:0009735,example
1,1,ath,At3g56960|PIP5K4,phosphatidyl inositol monophosphate 5 kinase 4...,Decreased stomatal opening. Delayed stomatal o...,PO:0009046|GO:0009860,example
2,2,ath,At1g74920|ALDH10A8,aldehyde dehydrogenase 10A8|F25A4_11,Sensitive to drought. Sensitive to mannitol. S...,GO:0005618|GO:0009516|PO:0007123,example
3,3,zma,nec4|GRMZM5G870342,nec4|cpx1|dks8,Necrotic leaf. Affected tissue dies. Pale gree...,,example
4,4,zma,GRMZM2G117878|ufgt2,UDP-glycosyltransferase 76C1,Salt stress intolerant. Drought susceptible.,,example
5,5,zma,ccd8|GRMZM2G446858|Zm00001d043442,Zmccd8|ccd8-trDs|ccd8|ccd8a,"A plant with a thin culm, giving the plant an ...",GO:0010311|GO:0022622|GO:1901601|GO:0010016,example


In [5]:
# Save the dataset table again, after the collapse/filter step, as plain text
# for the README. headers="keys" takes the column names as headers;
# showindex="never" leaves the DataFrame index out of the rendered table.
with open("/Users/irbraun/oats/examples/readme_table_2.txt", 'w') as outputfile:
    outputfile.write(tabulate(data.to_pandas(), headers="keys", showindex="never"))

In [6]:
# Display the summary returned by the dataset's describe() method.
data.describe()

Unnamed: 0,species,num_genes,unique_descriptions
0,ath,3,3
1,zma,3,3
2,total,6,6


In [7]:
from oats.datasets.groupings import Groupings

# Pathway files, keyed by species code.
pathway_species_files = dict(
    ath="../data/group_related_files/pmn/aracyc_pathways.20180702",
    zma="../data/group_related_files/pmn/corncyc_pathways.20180702",
)
groupings = Groupings(pathway_species_files, "pmn")

# Look up the groups for the current dataset; both directions of the mapping
# are returned, and the ID -> groups direction is printed.
id_to_groups, group_to_ids = groupings.get_groupings_for_dataset(data)
pprint.pprint(id_to_groups)

{0: ['PWY-1042',
     'PWY-5723',
     'PWY66-399',
     'PWY-5484',
     'GLUCONEO-PWY',
     'GLYCOLYSIS'],
 1: ['PWY-6351', 'PWY-6352'],
 2: ['PWY-2', 'PWY1F-353'],
 3: [],
 4: [],
 5: ['PWY-7101', 'PWY-6806']}


# Annotation

In [8]:
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder

In [9]:

# Fetch the description dictionary from the dataset and print it; the
# `descriptions` name is reused by the annotation and distance cells below.
descriptions = data.get_description_dictionary()
pprint.pprint(descriptions)

{0: 'Decreased root hair density. Distorted trichomes. Trichomes are less '
    'turgescent and are distorted with respect to the wild type. Plants also '
    'have fewer root hairs with respect to wild type.',
 1: 'Decreased stomatal opening. Delayed stomatal opening.',
 2: 'Sensitive to drought. Sensitive to mannitol. Sensitive to salt.',
 3: 'Necrotic leaf. Affected tissue dies. Pale green seedling.  Yellow green '
    'leaf.',
 4: 'Salt stress intolerant. Drought susceptible.',
 5: 'A plant with a thin culm, giving the plant an overall slender appearance. '
    'Small ears. Short plant. Slender plant.'}


In [10]:
# Load the ontology from the PATO .obo file.
ont = Ontology("../ontologies/pato.obo")  
# Annotate the descriptions via the NOBLE Coder jar, passing "pato" and
# precise=1. NOTE(review): the exact meaning of `precise` is not visible
# here - confirm in annotate_using_noble_coder.
annots = annotate_using_noble_coder(descriptions, "../lib/NobleCoder-1.0.jar"  , "pato", precise=1)
pprint.pprint(annots)

{0: ['PATO:0001617', 'PATO:0001997', 'PATO:0001019'],
 1: ['PATO:0001997', 'PATO:0000502'],
 2: ['PATO:0000516'],
 3: ['PATO:0001941', 'PATO:0001272', 'PATO:0000647'],
 4: ['PATO:0001152'],
 5: ['PATO:0000574',
     'PATO:0000592',
     'PATO:0000569',
     'PATO:0002212',
     'PATO:0000587']}


In [11]:
# Translate every annotated term ID into its ontology label, keeping the same
# keys as `annots`.
annot_labels = {
    description_id: [ont.get_label_from_id(term_id) for term_id in term_ids]
    for description_id, term_ids in annots.items()
}
pprint.pprint(annot_labels)

{0: ['deformed', 'decreased amount', 'mass density'],
 1: ['decreased amount', 'delayed'],
 2: ['sensitive toward'],
 3: ['yellow green', 'desaturated green', 'necrotic'],
 4: ['susceptible toward'],
 5: ['decreased length',
     'decreased thickness',
     'decreased height',
     'slender',
     'decreased size']}


In [12]:
# NOTE(review): a notebook only echoes the last expression of a cell, so this
# bare attribute access appears to display nothing - confirm whether it is
# still needed.
ont.forward_term_dict
# Displayed result: the terms held by the wrapped ontology object.
ont.pronto_ontology_obj.terms

OrderedDict([('PATO:0000000', <PATO:0000000: obsolete pato>),
             ('PATO:0000001', <PATO:0000001: quality>),
             ('PATO:0000002', <PATO:0000002: obsolete value>),
             ('PATO:0000003', <PATO:0000003: obsolete assay>),
             ('PATO:0000004', <PATO:0000004: mobility>),
             ('PATO:0000005', <PATO:0000005: obsolete absolute activity>),
             ('PATO:0000006', <PATO:0000006: obsolete process>),
             ('PATO:0000007', <PATO:0000007: obsolete relative activity>),
             ('PATO:0000008', <PATO:0000008: speed>),
             ('PATO:0000009', <PATO:0000009: obsolete absolute speed>),
             ('PATO:0000010', <PATO:0000010: obsolete relative speed>),
             ('PATO:0000011', <PATO:0000011: age>),
             ('PATO:0000012', <PATO:0000012: obsolete absolute age>),
             ('PATO:0000013', <PATO:0000013: obsolete relative age>),
             ('PATO:0000014', <PATO:0000014: color>),
             ('PATO:0000015', <PATO:0000

# Distances

In [13]:
from oats.graphs import pairwise as pw


In [17]:
# Pairwise comparison of the ontology annotations using the "jaccard" metric.
dists = pw.pairwise_square_annotations(
    annots,
    ont,
    metric="jaccard",
)
# Render every entry as a fixed four-decimal string for display.
array_to_print = [
    [format(value, ".4f") for value in row]
    for row in dists.array.round(4).tolist()
]
array_to_print

[['0.0000', '0.5714', '0.8333', '0.7500', '0.8462', '0.6842'],
 ['0.5714', '0.0000', '0.9286', '0.9000', '0.9333', '0.8182'],
 ['0.8333', '0.9286', '0.0000', '0.8462', '0.3333', '0.8889'],
 ['0.7500', '0.9000', '0.8462', '0.0000', '0.8571', '0.8696'],
 ['0.8462', '0.9333', '0.3333', '0.8571', '0.0000', '0.8947'],
 ['0.6842', '0.8182', '0.8889', '0.8696', '0.8947', '0.0000']]

In [18]:
# Pairwise comparison of the description texts as binary n-gram vectors,
# using the "jaccard" metric.
dists = pw.pairwise_square_ngrams(
    ids_to_texts=descriptions,
    binary=True,
    metric="jaccard",
)
# Render every entry as a fixed four-decimal string for display.
array_to_print = [
    [format(value, ".4f") for value in row]
    for row in dists.array.round(4).tolist()
]
array_to_print

[['0.0000', '0.9583', '0.9600', '1.0000', '1.0000', '0.9375'],
 ['0.9583', '0.0000', '1.0000', '1.0000', '1.0000', '1.0000'],
 ['0.9600', '1.0000', '0.0000', '1.0000', '0.7500', '1.0000'],
 ['1.0000', '1.0000', '1.0000', '0.0000', '1.0000', '1.0000'],
 ['1.0000', '1.0000', '0.7500', '1.0000', '0.0000', '1.0000'],
 ['0.9375', '1.0000', '1.0000', '1.0000', '1.0000', '0.0000']]

In [15]:
# Pairwise comparison of the description texts through a 3-topic "lda" model,
# using the "euclidean" metric.
dists = pw.pairwise_square_topic_model(
    ids_to_texts=descriptions,
    num_topics=3,
    algorithm="lda",
    metric="euclidean",
)
# Render every entry as a fixed four-decimal string for display.
array_to_print = [
    [format(value, ".4f") for value in row]
    for row in dists.array.round(4).tolist()
]
array_to_print

[['0.0000', '0.1291', '0.1209', '1.0854', '0.0968', '0.0520'],
 ['0.1291', '0.0000', '0.0082', '0.9738', '0.0323', '0.0771'],
 ['0.1209', '0.0082', '0.0000', '0.9807', '0.0241', '0.0689'],
 ['1.0854', '0.9738', '0.9807', '0.0000', '1.0013', '1.0401'],
 ['0.0968', '0.0323', '0.0241', '1.0013', '0.0000', '0.0448'],
 ['0.0520', '0.0771', '0.0689', '1.0401', '0.0448', '0.0000']]

In [16]:
import gensim
# Load the saved Doc2Vec model from disk.
model = gensim.models.Doc2Vec.load("../gensim/enwiki_dbow/doc2vec.bin")
# Pairwise comparison of the description texts with that model, using the
# "cosine" metric.
dists = pw.pairwise_square_doc2vec(
    model=model,
    ids_to_texts=descriptions,
    metric="cosine",
)
# Render every entry as a fixed four-decimal string for display.
array_to_print = [
    [format(value, ".4f") for value in row]
    for row in dists.array.round(4).tolist()
]
array_to_print

[['0.0000', '0.3744', '0.4556', '0.3406', '0.4194', '0.3527'],
 ['0.3744', '0.0000', '0.4453', '0.3653', '0.4015', '0.4177'],
 ['0.4556', '0.4453', '0.0000', '0.4572', '0.4138', '0.4599'],
 ['0.3406', '0.3653', '0.4572', '0.0000', '0.4194', '0.3084'],
 ['0.4194', '0.4015', '0.4138', '0.4194', '0.0000', '0.4539'],
 ['0.3527', '0.4177', '0.4599', '0.3084', '0.4539', '0.0000']]