In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Add the parent directory (relative to the current working directory) to the module
# search path before the `phenolog` imports below.
# NOTE(review): presumably the phenolog package lives one level above this notebook — confirm.
sys.path.append("../.")

from phenolog.datasets.dataset import Dataset
from phenolog.language.nlp import get_clean_description
from phenolog.annotation.ontology import Ontology
from phenolog.annotation.annotation import annotate_using_rabin_karp, annotate_using_noble_coder 
from phenolog.annotation.annotation import write_annotations_to_tsv_file, read_annotations_from_tsv_file

from phenolog.graphs.similarity import get_similarity_df_using_fastsemsim
from phenolog.graphs.similarity import get_similarity_df_using_doc2vec
from phenolog.graphs.similarity import get_similarity_df_using_bagofwords
from phenolog.graphs.similarity import get_similarity_df_using_setofwords
from phenolog.graphs.similarity import get_similarity_df_using_annotations_unweighted_jaccard
from phenolog.graphs.similarity import get_similarity_df_using_annotations_weighted_jaccard

from phenolog.graphs.models import combine_dfs_with_name_dict
from phenolog.graphs.models import apply_mean
from phenolog.graphs.models import train_linear_regression_model
from phenolog.graphs.models import apply_linear_regression_model
from phenolog.graphs.models import train_random_forest_model
from phenolog.graphs.models import apply_random_forest_model

### Create a dataset object and read in the prepared data files from different plant species
The `Dataset.add_data()` method expects a pandas.DataFrame that must have at least the columns ("species", "description", "gene_names", "reference"). Any other columns in the passed-in dataframe are ignored. The gene names field of the dataframe should be delimited by the bar character (`|`). There are methods available in the `language.nlp` module to help with obtaining and formatting gene names to be consistent with what this method expects. The `describe()` method used here provides basic information about what the contents of the dataset look like. 

In [2]:
# Prepared phenotype files, listed in the order they are added to the dataset.
phenotype_csv_paths = [
    "../data/reshaped/arabidopsis_phenotypes.csv",
    "../data/reshaped/maize_phenotypes.csv",
    "../data/reshaped/ppn_phenotypes.csv",
    "../data/reshaped/ppn_phenes.csv",
]

# Read each file into a dataframe and add its rows to one shared Dataset object.
dataset = Dataset()
for csv_path in phenotype_csv_paths:
    # The line terminator is given explicitly so that only "\n" ends a row.
    dataset.add_data(pd.read_csv(csv_path, lineterminator="\n"))

# Report the summary counts for the combined dataset.
dataset.describe()


Describing the Dataset object...
Number of rows in the dataframe: 23918
Number of unique IDs: 23918
Number of unique descriptions: 10974
Number of unique gene name sets: 10394
Number of species represented: 6


In [3]:
# Subsample the data that is available. The dataset object is modified in place,
# so every later cell works on the reduced set of rows.
# NOTE(review): the subsample is random and presumably unseeded, so the rows kept
# may differ between runs — confirm whether a seed can be supplied.
subsample_size = 10
dataset.randomly_subsample_dataset(n=subsample_size)

# Report the summary counts again, now for the subsample.
dataset.describe()


Describing the Dataset object...
Number of rows in the dataframe: 10
Number of unique IDs: 10
Number of unique descriptions: 10
Number of unique gene name sets: 10
Number of species represented: 3


In [None]:
# Prepare a dictionary of phenotype descriptions where each has a unique ID value.
# The raw dictionary is kept under its own name, and each description is passed
# through get_clean_description to build the dictionary used by the later cells.
raw_descriptions = dataset.get_description_dictionary()
description_dict = {
    identifier: get_clean_description(text)
    for identifier, text in raw_descriptions.items()
}

# Preview: one line per ID with the first 50 characters of its cleaned description
# (assuming the cleaned descriptions are strings).
preview_template = "{}\t{}"
for identifier, description in description_dict.items():
    print(preview_template.format(identifier, description[:50]))

In [None]:
# File locations used by this cell and by the similarity cell that follows:
# the merged ontology, the TSV file the annotations are written to, and the doc2vec model.
merged_ontology_file = "../ontologies/mo.obo"
annotations_file = "../data/annotations/annotations_with_mo.tsv"
doc2vec_model_file = "../gensim/apnews_dbow/doc2vec.bin"

# Load the ontology and annotate every description with it using the
# Rabin-Karp based annotator.
mo = Ontology(merged_ontology_file)
annotations = annotate_using_rabin_karp(description_dict, mo)

# Save the annotations to disk; the similarity cell passes this same file path
# to get_similarity_df_using_fastsemsim.
write_annotations_to_tsv_file(annotations, annotations_file)

# Preview: one line per ID with at most the first five annotated terms.
annotation_template = "{}\t{}"
for identifier, term_list in annotations.items():
    print(annotation_template.format(identifier, term_list[:5]))

In [None]:
# Names of the similarity methods, in the same order as the dataframes built below.
methods = ["ontology", "doc2vec", "bagofwords", "setofwords", "onto_unwt", "onto_wt"]

# Build one similarity dataframe per method. The calls run top to bottom, so the
# order of evaluation matches the order of the method names above.
dfs = [
    get_similarity_df_using_fastsemsim(merged_ontology_file, annotations_file, description_dict),
    get_similarity_df_using_doc2vec(doc2vec_model_file, description_dict),
    get_similarity_df_using_bagofwords(description_dict),
    get_similarity_df_using_setofwords(description_dict),
    get_similarity_df_using_annotations_unweighted_jaccard(annotations, mo),
    get_similarity_df_using_annotations_weighted_jaccard(annotations, mo),
]

# Keep the individual dataframes available under their numbered names.
df1, df2, df3, df4, df5, df6 = dfs

# Map each method name to its similarity dataframe.
method_to_df = dict(zip(methods, dfs))

In [None]:
# Combine the per-method similarity dataframes into a single dataframe.
# NOTE(review): presumably this yields one column per method name, since later
# cells pass `methods` as the predictor columns — confirm.
merged_df = combine_dfs_with_name_dict(method_to_df)

# Preview the first rows of the combined dataframe.
merged_preview = merged_df.head()
print(merged_preview)

In [None]:
# Combine the different graphs by applying a mean over the method columns.
output_df = apply_mean(df=merged_df, predictor_columns=methods)

# Preview the first rows of the result.
mean_preview = output_df.head()
print(mean_preview)

In [None]:
# Combine the different graphs by training and applying a linear regression model.
# The target values are random placeholders, so this cell only demonstrates the
# train/apply workflow. They are drawn from a locally seeded generator so that
# re-running the notebook reproduces the same targets without touching NumPy's
# global random state.
target_value_seed = 0
target_value_rng = np.random.RandomState(target_value_seed)
merged_df["target_value"] = target_value_rng.random_sample(merged_df.shape[0]) # Target values are floats in [0, 1).

# The model is trained and applied on the same dataframe, using the method columns as predictors.
model = train_linear_regression_model(df=merged_df, predictor_columns=methods, target_column="target_value")
output_df = apply_linear_regression_model(df=merged_df, predictor_columns=methods, model=model)
print(output_df.head(16))

In [None]:
# Combine the different graphs by training and applying a random forest model.
# The target classes are random placeholders, so this cell only demonstrates the
# train/apply workflow. They are drawn from a locally seeded generator so that
# re-running the notebook reproduces the same labels without touching NumPy's
# global random state.
# NOTE(review): the random forest training itself may also be randomized inside
# train_random_forest_model — confirm whether it accepts a seed.
target_class_seed = 1
target_class_rng = np.random.RandomState(target_class_seed)
merged_df["target_class"] = target_class_rng.randint(0, 2, merged_df.shape[0]) # Target classes are 0 or 1, randomly.

# The model is trained and applied on the same dataframe, using the method columns as predictors.
model = train_random_forest_model(df=merged_df, predictor_columns=methods, target_column="target_class")
output_df = apply_random_forest_model(df=merged_df, predictor_columns=methods, model=model)
print(output_df.head(16))