### TODO
1. Change 'datasets' to 'biodata'
2. Change 'annotation.annotation' to 'annotation.annotate'
3. Change 'collapse_by_all_gene_names' to something more meaningful

In [1]:
# General-purpose helpers: pprint for readable dictionary output and tabulate
# for rendering DataFrames as plain-text tables.
import pandas as pd
import pprint
from tabulate import tabulate
import sys
# Extend the module search path with "../../oats", presumably so that the
# local `oats` package imported in the following cells can be found.
# The path is relative, so it is resolved against the directory the kernel
# was started in.
sys.path.append("../../oats")

# BioData

In [2]:
from oats.datasets.dataset import Dataset

# Build the dataset from the example CSV file, then show its contents as a
# DataFrame (the last expression is what the cell displays).
example_csv_path = "/Users/irbraun/oats/example/example.csv"
data = Dataset(example_csv_path)
data.to_pandas()

Unnamed: 0,id,species,gene_names,gene_synonyms,description,term_ids,sources
0,0,ath,At1g74030,F2P9_10,Decreased root hair density. Distorted trichomes.,GO:0009507|PO:0001170,example
1,1,ath,At1g74030|ENO1,F2P9_10|AT1G74030.1,Trichomes are less turgescent and are distorte...,GO:0009735,example
2,2,ath,ENO1|enolase 1,F2P9_10|AT1G74030.1,Plants also have fewer root hairs with respect...,GO:0009735,example
3,3,ath,At3g56960|PIP5K4,phosphatidyl inositol monophosphate 5 kinase 4,Decreased stomatal opening.,PO:0009046,example
4,4,ath,At3g56960,T8M16.6,Delayed stomatal opening.,GO:0009860,example
5,5,zma,nec4|GRMZM5G870342,nec4|cpx1,Necrotic leaf. Affected tissue dies.,,example
6,6,zma,GRMZM5G870342,cpx1|dks8,Pale green seedling. Yellow green leaf.,,example
7,7,zma,ccd8,Zmccd8,"A plant with a thin culm, giving the plant an ...",GO:0010311|GO:0022622,example
8,8,zma,ccd8|GRMZM2G446858,ccd8-trDs|ccd8,Short plant.,GO:1901601,example
9,9,zma,ccd8|Zm00001d043442,ccd8|ccd8a,Slender plant.,GO:0010016,example


In [3]:
# Save the uncollapsed dataset as a plain-text table (column names as headers,
# no index column) for use in the README.
# The encoding is given explicitly so the bytes written are the same on every
# platform; without it open() falls back to the locale's preferred encoding.
# NOTE(review): this is an absolute, user-specific path while the other data
# paths in this notebook are relative — consider making it relative as well.
with open("/Users/irbraun/oats/example/readme_table_1.txt", "w", encoding="utf-8") as outputfile:
    outputfile.write(tabulate(data.to_pandas(), headers="keys", showindex="never"))

In [4]:
# Merge the rows that share gene names into single rows. The call changes
# `data` in place: its return value is not used, and the table displayed by
# this cell has 4 rows instead of the original 10, with the gene names,
# synonyms, descriptions and term IDs of the merged rows combined.
# Because of the in-place change, re-running this cell operates on the
# already-collapsed dataset.
data.collapse_by_all_gene_names()
# Presumably removes rows without a description (TODO confirm); every row in
# the example has one, so no effect is visible in the output here.
data.filter_has_description()
data.to_pandas()

Unnamed: 0,id,species,gene_names,gene_synonyms,description,term_ids,sources
0,0,ath,At1g74030|ENO1|enolase 1,F2P9_10|AT1G74030.1,Decreased root hair density. Distorted trichom...,GO:0009507|PO:0001170|GO:0009735,example
1,1,ath,At3g56960|PIP5K4,phosphatidyl inositol monophosphate 5 kinase 4...,Decreased stomatal opening. Delayed stomatal o...,PO:0009046|GO:0009860,example
2,2,zma,nec4|GRMZM5G870342,nec4|cpx1|dks8,Necrotic leaf. Affected tissue dies. Pale gree...,,example
3,3,zma,ccd8|GRMZM2G446858|Zm00001d043442,Zmccd8|ccd8-trDs|ccd8|ccd8a,"A plant with a thin culm, giving the plant an ...",GO:0010311|GO:0022622|GO:1901601|GO:0010016,example


In [5]:
# Save the collapsed dataset as a plain-text table (column names as headers,
# no index column) for use in the README.
# The encoding is given explicitly so the bytes written are the same on every
# platform; without it open() falls back to the locale's preferred encoding.
# NOTE(review): this is an absolute, user-specific path while the other data
# paths in this notebook are relative — consider making it relative as well.
with open("/Users/irbraun/oats/example/readme_table_2.txt", "w", encoding="utf-8") as outputfile:
    outputfile.write(tabulate(data.to_pandas(), headers="keys", showindex="never"))

In [6]:
from oats.datasets.groupings import Groupings

# One pathway file per species, keyed by the same species codes ("ath", "zma")
# that appear in the dataset's `species` column.
pathway_species_files = {
    "ath":"../data/group_related_files/pmn/aracyc_pathways.20180702", 
    "zma":"../data/group_related_files/pmn/corncyc_pathways.20180702"}
# NOTE(review): "pmn" presumably identifies the source/format of the pathway
# files — confirm against the Groupings constructor.
groupings = Groupings(pathway_species_files, "pmn")
# Two mappings come back. The first, printed below, maps each record ID of the
# dataset to a list of pathway IDs (possibly empty). The second is presumably
# the inverse mapping from pathway ID to record IDs; it is not shown here.
id_to_groups, group_to_ids = groupings.get_groupings_for_dataset(data)
pprint.pprint(id_to_groups)

{0: ['PWY-1042',
     'PWY-5723',
     'PWY66-399',
     'PWY-5484',
     'GLUCONEO-PWY',
     'GLYCOLYSIS'],
 1: ['PWY-6351', 'PWY-6352'],
 2: [],
 3: ['PWY-7101', 'PWY-6806']}


# Annotation

In [7]:
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import annotate_using_noble_coder

In [8]:
# Create an Ontology object from the PATO .obo file.
# NOTE(review): `ont` is not referenced again in the cells visible here.
ont = Ontology("../ontologies/pato.obo")
# Mapping of record ID -> description text for the (collapsed) dataset; the
# printed output shows one combined description string per record.
descriptions = data.get_description_dictionary()
pprint.pprint(descriptions)

{0: 'Decreased root hair density. Distorted trichomes. Trichomes are less '
    'turgescent and are distorted with respect to the wild type. Plants also '
    'have fewer root hairs with respect to wild type.',
 1: 'Decreased stomatal opening. Delayed stomatal opening.',
 2: 'Necrotic leaf. Affected tissue dies. Pale green seedling.  Yellow green '
    'leaf.',
 3: 'A plant with a thin culm, giving the plant an overall slender appearance. '
    'Small ears. Short plant. Slender plant.'}


In [9]:
# Location of the NOBLE Coder jar file that is handed to the annotation function.
noblecoder_jarfile_path = "../lib/NobleCoder-1.0.jar"      
# Annotate every description. The printed result maps each record ID to a list
# of ontology term IDs (both PATO and PO terms appear in the output).
# NOTE(review): the meaning of the "mo" argument and of precise=1 cannot be
# determined from this notebook — confirm against annotate_using_noble_coder.
annots = annotate_using_noble_coder(descriptions, noblecoder_jarfile_path, "mo", precise=1)
pprint.pprint(annots)

{0: ['PATO:0001019',
     'PATO:0001617',
     'PO:0000282',
     'PATO:0001997',
     'PO:0009005'],
 1: ['PATO:0001997', 'PATO:0000502'],
 2: ['PATO:0001941', 'PO:0025034', 'PATO:0001272', 'PATO:0000647'],
 3: ['PATO:0002212',
     'PATO:0000574',
     'PATO:0000592',
     'PATO:0000569',
     'PATO:0000587']}


# Distances

In [10]:
from oats.graphs import pairwise as pw
import gensim

# Load a previously saved Doc2Vec model from disk; it is passed to
# pw.pairwise_square_doc2vec in the next cell.
# NOTE(review): the directory name suggests a DBOW model trained on English
# Wikipedia — confirm the model's provenance.
model = gensim.models.Doc2Vec.load("../gensim/enwiki_dbow/doc2vec.bin" )

In [11]:
# Square matrix of pairwise distances between the descriptions, computed with
# the loaded Doc2Vec model and the cosine metric; rounded to 4 decimals for display.
dists = pw.pairwise_square_doc2vec(
    model=model,
    ids_to_texts=descriptions,
    metric="cosine",
)
rounded_distances = dists.array.round(4)
pprint.pprint(rounded_distances)

array([[0.    , 0.375 , 0.3715, 0.3413],
       [0.375 , 0.    , 0.3743, 0.4075],
       [0.3715, 0.3743, 0.    , 0.3178],
       [0.3413, 0.4075, 0.3178, 0.    ]])


In [12]:
# Square matrix of pairwise distances between the descriptions, computed from
# n-grams with the Jaccard metric; rounded to 4 decimals for display.
dists = pw.pairwise_square_ngrams(
    ids_to_texts=descriptions,
    binary=True,
    metric="jaccard",
)
rounded_distances = dists.array.round(4)
pprint.pprint(rounded_distances)

array([[0.    , 0.9583, 1.    , 0.9375],
       [0.9583, 0.    , 1.    , 1.    ],
       [1.    , 1.    , 0.    , 1.    ],
       [0.9375, 1.    , 1.    , 0.    ]])


In [13]:
# Square matrix of pairwise Euclidean distances between the descriptions,
# computed with a 3-topic LDA topic model.
dists = pw.pairwise_square_topic_model(
    ids_to_texts=descriptions,
    num_topics=3,
    algorithm="lda",
    metric="euclidean",
)
# Turn every entry into a fixed 4-decimal string so the matrix displays with
# aligned, uniformly formatted values.
rounded_rows = dists.array.round(4).tolist()
array_to_print = [[format(value, ".4f") for value in row] for row in rounded_rows]
array_to_print

[['0.0000', '0.1287', '1.0864', '1.0912'],
 ['0.1287', '0.0000', '0.9750', '0.9805'],
 ['1.0864', '0.9750', '0.0000', '1.0428'],
 ['1.0912', '0.9805', '1.0428', '0.0000']]