In [8]:
import os
import random
import sys

import numpy as np
import pandas as pd

sys.path.append("../.")
from phenolog.datasets.dataset import Dataset
from phenolog.language.nlp import get_clean_description
from phenolog.annotation.ontology import Ontology
from phenolog.annotation.annotation import annotate_using_rabin_karp, annotate_using_noble_coder, write_annotations_to_tsv_file
from phenolog.graphs.similarity import get_similarity_df_using_fastsemsim
from phenolog.graphs.similarity import get_similarity_df_using_doc2vec
from phenolog.graphs.similarity import get_similarity_df_using_bagofwords
from phenolog.graphs.similarity import get_similarity_df_using_setofwords
from phenolog.graphs.similarity import get_similarity_df_using_annotations_unweighted_jaccard
from phenolog.graphs.similarity import get_similarity_df_using_annotations_weighted_jaccard

from phenolog.graphs.models import combine_dfs_with_name_dict
from phenolog.graphs.models import apply_mean
from phenolog.graphs.models import train_linear_regression_model
from phenolog.graphs.models import apply_linear_regression_model
from phenolog.graphs.models import train_random_forest_model
from phenolog.graphs.models import apply_random_forest_model



In [2]:
# Build a single Dataset from every reshaped CSV, loading them in a fixed order,
# then print its summary.
dataset = Dataset()
for csv_path in (
    "../data/reshaped/arabidopsis_phenotypes.csv",
    "../data/reshaped/maize_phenotypes.csv",
    "../data/reshaped/ppn_phenotypes.csv",
    "../data/reshaped/ppn_phenes.csv",
):
    dataset.add_data(pd.read_csv(csv_path, lineterminator="\n"))
dataset.describe()


Describing the Dataset object...
Number of rows in the dataframe: 23918
Number of unique IDs: 23918
Number of unique descriptions: 10974
Number of unique gene name sets: 10394
Number of species represented: 6


In [3]:
# Subsample the data that is available.
# Seed the global RNGs first so that a fresh Restart & Run All selects the same
# rows every time; without a seed the rest of the notebook is not reproducible.
# NOTE(review): the implementation of randomly_subsample_dataset is not visible
# here. This assumes it draws from the stdlib `random` and/or the NumPy global
# generator -- confirm, and prefer passing a seed to it directly if it accepts one.
SUBSAMPLE_SEED = 42
SUBSAMPLE_SIZE = 10
random.seed(SUBSAMPLE_SEED)
np.random.seed(SUBSAMPLE_SEED)
dataset.randomly_subsample_dataset(n=SUBSAMPLE_SIZE)
dataset.describe()


Describing the Dataset object...
Number of rows in the dataframe: 10
Number of unique IDs: 10
Number of unique descriptions: 10
Number of unique gene name sets: 10
Number of species represented: 3


In [4]:
# Map every unique ID in the dataset to its cleaned phenotype description.
description_dict = {
    identifier: get_clean_description(raw_text)
    for identifier, raw_text in dataset.get_description_dictionary().items()
}

# Preview: each ID followed by the first 50 characters of its cleaned description.
for identifier, description in description_dict.items():
    print(identifier, description[:50], sep="\t")

21337	pale green
11649	collapsed endosperm endosperm collapsed and partia
18753	small leaves under continuous light
17594	highlevel resistance to many strains of tobacco mo
16066	late flowering
4441	in the mutant about 2 of the trichomes showed a cl
10925	reduced amounts of xylose and fucose in cell wall 
566	mutant plant does not show the post illumination i
14810	abnormal leaf growth
14915	aborted kernel


In [5]:
# Paths to the merged ontology, the annotation TSV to write, and the doc2vec model.
merged_ontology_file = "../ontologies/mo.obo"
annotations_file = "../data/annotations/annotations_with_mo.tsv"
doc2vec_model_file = "../gensim/apnews_dbow/doc2vec.bin"

# Load the ontology, annotate the descriptions against it, and save the
# annotations to the TSV file.
mo = Ontology(merged_ontology_file)
annotations = annotate_using_rabin_karp(description_dict, mo)
write_annotations_to_tsv_file(annotations, annotations_file)

# Preview: each ID followed by at most its first five annotated terms.
for identifier, term_list in annotations.items():
    print(identifier, term_list[:5], sep="\t")

21337	['PATO:0000320', 'PATO:0000328', 'PO:0009038', 'PO:0009054', 'PATO:0001272']
11649	['PATO:0000460', 'PATO:0001511', 'PATO:0001341', 'PATO:0001478', 'PO:0009089']
18753	['PATO:0000587', 'PATO:0000665', 'PATO:0000689', 'PATO:0001341', 'NCBITaxon:1']
17594	['PATO:0001034', 'PATO:0001046', 'PATO:0001046', 'PATO:0001341']
16066	['PATO:0000502', 'PATO:0001341', 'PO:0009046']
4441	['PATO:0000161', 'PATO:0001470', 'PATO:0000322', 'PATO:0001341', 'PO:0000282']
10925	['PATO:0000070', 'PATO:0000304', 'PATO:0000322', 'PATO:0000587', 'PATO:0001997']
566	['PATO:0000018', 'PATO:0000322', 'PATO:0000327', 'PATO:0001341', 'PATO:0001999']
14810	['PATO:0000460', 'PATO:0000461', 'PATO:0001341', 'PO:0025034']
14915	['PATO:0001341', 'PO:0025263', 'PO:0030104']


In [6]:
# Compute one similarity dataframe per method. The order here must match the
# order of the labels in `methods` below.
df1 = get_similarity_df_using_fastsemsim(merged_ontology_file, annotations_file, description_dict)
df2 = get_similarity_df_using_doc2vec(doc2vec_model_file, description_dict)
df3 = get_similarity_df_using_bagofwords(description_dict)
df4 = get_similarity_df_using_setofwords(description_dict)
df5 = get_similarity_df_using_annotations_unweighted_jaccard(annotations, mo)
df6 = get_similarity_df_using_annotations_weighted_jaccard(annotations, mo)

# Pair each method label with its dataframe, position by position.
methods = ["ontology", "doc2vec", "bagofwords", "setofwords", "onto_unwt", "onto_wt"]
dfs = [df1, df2, df3, df4, df5, df6]
method_to_df = dict(zip(methods, dfs))

1 [1.8182 2 [3.6364 3 [5.4545 4 [7.2727 5 [9.0909 6 [10.9091 %7 [12.7273 %8 [14.5455 %9 [16.3636 %10 [18.1818 11 [20.0000 12 [21.8182 13 [23.6364 14 [25.4545 15 [27.2727 16 [29.0909 17 [30.9091 18 [32.7273 19 [34.5455 20 [36.3636 21 [38.1818 22 [40.0000 23 [41.8182 24 [43.6364 25 [45.4545 26 [47.2727 27 [49.0909 28 [50.9091 29 [52.7273 30 [54.5455 31 [56.3636 32 [58.1818 33 [60.0000 34 [61.8182 35 [63.6364 36 [65.4545 37 [67.2727 38 [69.0909 39 [70.9091 40 [72.7273 41 [74.5455 42 [76.3636 43 [78.1818 44 [80.0000 45 [81.8182 46 [83.6364 47 [85.4545 48 [87.2727 49 [89.0909 50 [90.9091 51 [92.7273 52 [94.5455 53 [96.3636 54 [98.1818 55 [100.0000 %]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [10]:
# Combine the per-method similarity dataframes into one table keyed by method name.
merged_df = combine_dfs_with_name_dict(method_to_df)
# Leave the preview as the cell's last expression so Jupyter renders it as a
# table; wrapping it in print() would force plain-text output.
merged_df.head()

    from     to  ontology   doc2vec  bagofwords  setofwords  onto_unwt  \
0  21337  21337  1.000000  1.000000         1.0         1.0   1.000000   
1  21337  11649  0.346154  0.469696         0.0         0.0   0.356164   
2  21337  18753  0.156627  0.612293         0.0         0.0   0.186667   
3  21337  17594  0.151515  0.445399         0.0         0.0   0.177419   
4  21337  16066  0.492537  0.568404         0.0         0.0   0.507692   

    onto_wt  
0  1.000000  
1  0.124120  
2  0.092680  
3  0.082881  
4  0.225099  
