### Can machine learning approaches learn relationships between concepts that are in ontologies?
If the neural network document encoding models (Doc2Vec) are being successfully trained, then they should be able to recapture some of the domain-specific information that is written into relationships present in biological ontologies. Specifically, two concepts which have a parent-child relationship in PATO or PO can be considered to be highly similar in this context. We compare the distances between the labels for these pairs of terms as inferred by both the general Doc2Vec model trained on the English Wikipedia corpus, as well as our own models trained specifically on abstracts from PubMed that are specific to plant phenotypes. Here we generate figures to compare the results for a specific set of handpicked phrase or term pairs, as well as a second figure over all pairs parsed from the hierarchies in each ontology to check whether the result generalizes to the ontologies as a whole.

In [1]:
import pronto
from scipy.spatial.distance import cosine
from scipy.spatial.distance import jaccard

import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.annotation.ontology import Ontology
from oats.datasets.string import String
from oats.datasets.edges import Edges
from oats.annotation.annotation import annotate_using_noble_coder
from oats.graphs import pairwise as pw
from oats.graphs.indexed import IndexedGraph
from oats.graphs.weighting import train_logistic_regression_model, apply_logistic_regression_model
from oats.graphs.weighting import train_random_forest_model, apply_random_forest_model
from oats.nlp.vocabulary import get_overrepresented_tokens, build_vocabulary_from_tokens
from oats.utils.utils import function_wrapper_with_duration
from oats.nlp.preprocess import concatenate_with_bar_delim

from _nb_utils import Method

# Notebook-wide plotting, warning, and display configuration.
mpl.rcParams["figure.dpi"] = 400
warnings.simplefilter('ignore')

# Raise the pandas display limits (rows, columns, total width).
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Quietly fetch the NLTK resources. The last download is kept as a bare
# statement so that it remains the final expression of the cell.
for resource in ('punkt', 'brown'):
    nltk.download(resource, quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)


True

### Generate a dataset of sibling and parent-child term label pairs
The purpose of this cell is to create a dataframe that holds all of the pairs of term labels that represent either sibling (the two terms have the same parent term) or parent-child relationships in the PATO and PO ontologies. In other words, this is a dataset of all the pairs of term labels in those ontologies where the terms are either connected by a single edge in the graph (parent-child) or share a common parent (siblings). These pairs are expected to be terms which are very similar and have a close semantic relationship in general. Therefore, this dataset can be used to evaluate which methods for measuring similarity between short text phrases are most useful for capturing the relationships that are encoded in these ontologies. We want to know if the distances found for these pairs are smaller than the general background distribution of distances between all the possible pairings of labels in these ontologies, or some other background distribution such as the distances between all of the phenotype descriptions in a dataset of descriptions.

In [2]:
# Build two things in one pass over PATO and PO:
#   relationship_dict[id_a][id_b] -> "parent_child" or "sibling" (stored in both directions), and
#   tuples -> rows of (ontology, relationship, label_1, label_2) for the dataframe below.
# A single dict is used for both relationships; a later assignment for the same ID pair overwrites an earlier one.
relationship_dict = defaultdict(dict)
ontologies = {
    "PATO": pronto.Ontology("../ontologies/pato.obo"),
    "PO": pronto.Ontology("../ontologies/po.obo"),
}
tuples = []
for ont_name, ont in ontologies.items():
    delim = "[DELIM]"
    # Sibling label pairs are collected in a set so that a pair reached through more than
    # one shared parent is only emitted once per ontology.
    sibling_pairs = set()
    for term in ont:
        # One row (and one symmetric dictionary entry) per direct parent of this term.
        for parent in term.parents.id:
            parent_term = ont[parent]
            tuples.append((ont_name, "parent_child", term.name, parent_term.name))
            relationship_dict[term.id][parent_term.id] = "parent_child"
            relationship_dict[parent_term.id][term.id] = "parent_child"
        # Every unordered pair of this term's children is a sibling pair; IDs are sorted
        # so that the same pair always produces the same joined label string.
        for child_id_pair in itertools.combinations(term.children.id, 2):
            first_id, second_id = sorted(child_id_pair)
            relationship_dict[first_id][second_id] = "sibling"
            relationship_dict[second_id][first_id] = "sibling"
            sibling_pairs.add("{}{}{}".format(ont[first_id].name, delim, ont[second_id].name))
    for joined_labels in sibling_pairs:
        labels = joined_labels.split(delim)
        tuples.append((ont_name, "sibling", labels[0], labels[1]))
pairs_df = pd.DataFrame(tuples, columns=["ontology", "relationship", "label_1", "label_2"])
pairs_df.head(10)

Unnamed: 0,ontology,relationship,label_1,label_2
0,PATO,parent_child,mobility,physical quality
1,PATO,parent_child,speed,movement quality
2,PATO,parent_child,age,time
3,PATO,parent_child,color,optical quality
4,PATO,parent_child,color hue,chromatic property
5,PATO,parent_child,color brightness,optical quality
6,PATO,parent_child,color saturation,chromatic property
7,PATO,parent_child,fluorescence,luminous flux
8,PATO,parent_child,color pattern,spatial pattern
9,PATO,parent_child,compatibility,behavioral quality


### Generate a dataset of all possible label pairs and their Jaccard distances

In [3]:
# For each ontology, compute the Jaccard distance between every pair of (non-obsolete) terms and
# collect the results in one dataframe with the term IDs and labels attached.
ontologies = {
    "PATO": pronto.Ontology("../ontologies/pato.obo"),
    "PO": pronto.Ontology("../ontologies/po.obo"),
}
# NOTE(review): the same mo.obo ontology object is used for both PATO and PO below;
# presumably it is a merged ontology containing both -- confirm.
ontology = Ontology("../ontologies/mo.obo")
edgelists = []
for ont_name, ont in ontologies.items():
    annotations = {}
    id_to_term_label = {}
    id_to_term_id = {}
    # Each kept term is keyed by its enumeration index and annotated with only itself.
    for i, term in enumerate(ont):
        if "obsolete" in term.name:
            continue
        id_to_term_label[i] = term.name
        id_to_term_id[i] = term.id
        annotations[i] = [term.id]

    edgelist = pw.pairwise_square_annotations(annotations, ontology, "jaccard").edgelist

    # Translate the integer node IDs in the edgelist back to term IDs and labels.
    for new_column, node_column, lookup in (
        ("term_1", "from", id_to_term_id),
        ("term_2", "to", id_to_term_id),
        ("label_1", "from", id_to_term_label),
        ("label_2", "to", id_to_term_label),
    ):
        edgelist[new_column] = edgelist[node_column].map(lookup.__getitem__)
    edgelist["ontology"] = ont_name

    output_columns = ["ontology", "term_1", "term_2", "label_1", "label_2", "distance"]
    edgelist = edgelist.rename(columns={"value": "distance"})[output_columns]
    edgelists.append(edgelist)
df = pd.concat(edgelists, ignore_index=True)
df.head(10)

Unnamed: 0,ontology,term_1,term_2,label_1,label_2,distance
0,PATO,PATO:0000001,PATO:0000001,quality,quality,0.0
1,PATO,PATO:0000001,PATO:0000004,quality,mobility,0.8
2,PATO,PATO:0000001,PATO:0000008,quality,speed,0.833333
3,PATO,PATO:0000001,PATO:0000011,quality,age,0.833333
4,PATO,PATO:0000001,PATO:0000014,quality,color,0.875
5,PATO,PATO:0000001,PATO:0000015,quality,color hue,0.888889
6,PATO,PATO:0000001,PATO:0000016,quality,color brightness,0.875
7,PATO,PATO:0000001,PATO:0000017,quality,color saturation,0.888889
8,PATO,PATO:0000001,PATO:0000018,quality,fluorescence,0.888889
9,PATO,PATO:0000001,PATO:0000019,quality,color pattern,0.857143


### Evaluating all methods for recapturing relationships encoded in the ontologies
This section is similar to the main analysis notebook that generates the pairwise distance matrices for all of the different NLP methods, and many of those cells have been copied and pasted here. The main difference is that the descriptions dataset is actually term labels from the ontologies, so the Jaccard similarity between each possible pair of terms is treated as ground truth in order to evaluate how well these relationships are captured by each of the methods. This section uses only one ontology at a time because we are only interested in the pairs of terms that come from the same ontology and therefore have a meaningful distance measure between them.

In [4]:
# Build the ID-keyed dictionaries that the NLP methods expect, for a single ontology (PO).
# The integer key of each term is its position when iterating over the ontology.
ont_name = "PO"
ont = pronto.Ontology("../ontologies/po.obo")
ontology = Ontology("../ontologies/po.obo")

# Terms whose label contains "obsolete" are skipped; their positions are left unused.
kept_terms = [(position, term) for position, term in enumerate(ont) if "obsolete" not in term.name]
annotations = {position: [term.id] for position, term in kept_terms}    # each term annotated with itself
descriptions = {position: term.name for position, term in kept_terms}   # term label stands in for a description
id_to_term_id = {position: term.id for position, term in kept_terms}

### Sections borrowed from the main analysis notebook
If the dictionary between IDs and term labels is stored as 'descriptions', then the cells from the main analysis notebook can be borrowed directly.

In [5]:
# The summarizing output dictionary has the shape TABLE[method][metric] --> value.
TOPIC = "Ontology Stuff"
DATA = "PO"
TABLE = defaultdict(dict)

# Every run writes into its own timestamped directory under ../outputs.
# makedirs (rather than mkdir) also creates a missing ../outputs parent, and exist_ok=True keeps
# a re-run within the same second from failing because the directory already exists.
OUTPUT_DIR = os.path.join("../outputs",datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Relative paths to the datasets, text corpora, saved models, ontology file, and external tools.
dataset_filename = "../data/pickles/text_plus_annotations_dataset.pickle"        # Pickle of the full text-plus-annotations dataset.
groupings_filename = "../data/pickles/lloyd_subsets.pickle"                      # Pickle of the groupings (lloyd_subsets).
background_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/background.txt"       # Untagged text file with background content.
phenotypes_corpus_filename = "../data/corpus_related_files/untagged_text_corpora/phenotypes_small.txt" # Untagged text file with phenotype-specific content.
doc2vec_pubmed_filename = "../gensim/pubmed_dbow/doc2vec_2.bin"                  # Saved Doc2Vec model from the pubmed_dbow directory.
doc2vec_wikipedia_filename = "../gensim/enwiki_dbow/doc2vec.bin"                 # Saved Doc2Vec model from the enwiki_dbow directory.
word2vec_model_filename = "../gensim/wiki_sg/word2vec.bin"                       # Saved Word2Vec model from the wiki_sg directory.
ontology_filename = "../ontologies/mo.obo"                                       # Ontology file in OBO format.
noblecoder_jarfile_path = "../lib/NobleCoder-1.0.jar"                            # Jar for the NOBLE Coder tool.
biobert_pmc_path = "../gensim/biobert_v1.0_pmc/pytorch_model"                    # PyTorch BioBERT v1.0 model (pmc variant).
biobert_pubmed_path = "../gensim/biobert_v1.0_pubmed/pytorch_model"              # PyTorch BioBERT v1.0 model (pubmed variant).
biobert_pubmed_pmc_path = "../gensim/biobert_v1.0_pubmed_pmc/pytorch_model"      # PyTorch BioBERT v1.0 model (pubmed_pmc variant).

# Load the saved gensim embedding models used by the embedding-based methods.
# The two Doc2Vec models are loaded in the order (Wikipedia, PubMed).
doc2vec_wiki_model, doc2vec_pubmed_model = [
    gensim.models.Doc2Vec.load(model_path)
    for model_path in (doc2vec_wikipedia_filename, doc2vec_pubmed_filename)
]
word2vec_model = gensim.models.Word2Vec.load(word2vec_model_filename)

# Loading of the BERT / BioBERT tokenizers and models is currently disabled.
#bert_tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
#bert_tokenizer_pmc = BertTokenizer.from_pretrained(biobert_pmc_path)
#bert_tokenizer_pubmed = BertTokenizer.from_pretrained(biobert_pubmed_path)
#bert_tokenizer_pubmed_pmc = BertTokenizer.from_pretrained(biobert_pubmed_pmc_path)
#bert_model_base = BertModel.from_pretrained('bert-base-uncased')
#bert_model_pmc = BertModel.from_pretrained(biobert_pmc_path)
#bert_model_pubmed = BertModel.from_pretrained(biobert_pubmed_path)
#bert_model_pubmed_pmc = BertModel.from_pretrained(biobert_pubmed_pmc_path)

# Preprocessing of the text descriptions. Different methods are necessary for different approaches,
# so several variants of the same {id: text} dictionary are produced here.

def get_pos_tokens(text, pos):
    """Return the tokens of `text` whose NLTK part-of-speech tag equals `pos` exactly
    (compared case-insensitively), joined by single spaces."""
    wanted_tag = pos.lower()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return " ".join([token for token, tag in tagged_tokens if tag.lower() == wanted_tag])


def _full_preprocess(text):
    """Apply gensim's preprocess_string to `text` and re-join the resulting tokens with spaces."""
    return " ".join(preprocess_string(text))


def _simple_preprocess(text):
    """Apply gensim's simple_preprocess to `text` and re-join the resulting tokens with spaces."""
    return " ".join(simple_preprocess(text))


def _transform_texts(transform, ids_to_texts):
    """Return a new dictionary with `transform` applied to every text, keeping the same IDs."""
    return {text_id: transform(text) for text_id, text in ids_to_texts.items()}


# Variants computed from the complete descriptions.
descriptions_full_preprocessing = _transform_texts(_full_preprocess, descriptions)
descriptions_simple_preprocessing = _transform_texts(_simple_preprocess, descriptions)
descriptions_no_stopwords = _transform_texts(remove_stopwords, descriptions)

# Variants restricted to tokens tagged exactly "NN".
descriptions_noun_only = _transform_texts(lambda text: get_pos_tokens(text, "NN"), descriptions)
descriptions_noun_only_full_preprocessing = _transform_texts(_full_preprocess, descriptions_noun_only)
descriptions_noun_only_simple_preprocessing = _transform_texts(_simple_preprocess, descriptions_noun_only)

# Variants restricted to tokens tagged exactly "JJ".
descriptions_adj_only = _transform_texts(lambda text: get_pos_tokens(text, "JJ"), descriptions)
descriptions_adj_only_full_preprocessing = _transform_texts(_full_preprocess, descriptions_adj_only)
descriptions_adj_only_simple_preprocessing = _transform_texts(_simple_preprocess, descriptions_adj_only)

In [6]:
# The list of approaches for calculating distance between text descriptions. Each entry is a Method
# object (from the utilities for this notebook) constructed from, in order:
#   1. the method name,
#   2. a string describing the hyperparameter choices,
#   3. the function to call to run the method,
#   4. the keyword arguments to pass to that function,
#   5. the scipy.spatial.distance metric to associate with the method.

methods = [

    # Methods that use neural networks to generate embeddings.
    Method(
        "Doc2Vec Wikipedia",
        "Size=300",
        pw.pairwise_square_doc2vec,
        dict(model=doc2vec_wiki_model, ids_to_texts=descriptions, metric="cosine"),
        spatial.distance.cosine,
    ),
    Method(
        "Doc2Vec PubMed",
        "Size=100",
        pw.pairwise_square_doc2vec,
        dict(model=doc2vec_pubmed_model, ids_to_texts=descriptions, metric="cosine"),
        spatial.distance.cosine,
    ),
    Method(
        "Word2Vec Wikipedia",
        "Size=300,Mean",
        pw.pairwise_square_word2vec,
        dict(model=word2vec_model, ids_to_texts=descriptions, metric="cosine", method="mean"),
        spatial.distance.cosine,
    ),
    Method(
        "Word2Vec Wikipedia",
        "Size=300,Max",
        pw.pairwise_square_word2vec,
        dict(model=word2vec_model, ids_to_texts=descriptions, metric="cosine", method="max"),
        spatial.distance.cosine,
    ),

    # BERT / BioBERT entries, currently disabled.
    #("BERT Base:Layers=2,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":2}, spatial.distance.cosine),
    #("BERT Base:Layers=3,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":3}, spatial.distance.cosine),
    #("BERT Base:Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #("BERT Base:Layers=2,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":2}, spatial.distance.cosine),
    #("BERT Base:Layers=3,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":3}, spatial.distance.cosine),
    #("BERT Base:Layers=4,Summed", pw.pairwise_bert_onegroup, {"model":bert_model_base, "tokenizer":bert_tokenizer_base, "object_dict":descriptions, "metric":"cosine", "method":"sum", "layers":4}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=2,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":2}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=3,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":3}, spatial.distance.cosine),
    #("BioBERT:PMC,Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pmc, "tokenizer":bert_tokenizer_pmc, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #("BioBERT:PubMed,Layers=4,Concatenated", pw.pairwise_bert_onegroup, {"model":bert_model_pubmed, "tokenizer":bert_tokenizer_pubmed, "object_dict":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),
    #Method("BioBERT", "PubMed,PMC,Layers=4,Concatenated", pw.pairwise_square_bert, {"model":bert_model_pubmed_pmc, "tokenizer":bert_tokenizer_pubmed_pmc, "ids_to_texts":descriptions, "metric":"cosine", "method":"concat", "layers":4}, spatial.distance.cosine),

    # N-grams approach over word 1-grams and 2-grams.
    # NOTE(review): the hyperparameter string says "Full", but this entry is given the raw
    # `descriptions` rather than `descriptions_full_preprocessing` -- confirm that is intended.
    Method(
        "N-Grams",
        "Full,Words,1-grams,2-grams",
        pw.pairwise_square_ngrams,
        dict(
            ids_to_texts=descriptions,
            metric="cosine",
            binary=False,
            analyzer="word",
            ngram_range=(1,2),
            max_features=100000,
            tfidf=False,
        ),
        spatial.distance.cosine,
    ),

    # Jaccard distance computed from the ontology term annotations rather than from the label text.
    # NOTE(review): the n-gram style arguments (binary, analyzer, ngram_range, tfidf) are passed
    # here as well; presumably pairwise_square_annotations ignores them -- confirm.
    Method(
        "Jaccard",
        "",
        pw.pairwise_square_annotations,
        dict(
            ids_to_annotations=annotations,
            ontology=ontology,
            metric="jaccard",
            binary=True,
            analyzer="word",
            ngram_range=(1,1),
            tfidf=False,
        ),
        spatial.distance.jaccard,
    ),
]

In [7]:
# Generate all the pairwise distance matrices (not in parallel), timing each method.
# graphs maps "name:hyperparameters" -> the object returned by that method's function.
graphs = {}
names = []
durations = []
for method in methods:
    graph,duration = function_wrapper_with_duration(function=method.function, args=method.kwargs)
    duration_hms = to_hms(duration)
    graphs[method.name_with_hyperparameters] = graph
    names.append(method.name_with_hyperparameters)
    durations.append(duration_hms)
    print("{:50} {}".format(method.name_with_hyperparameters,duration_hms))
durations_df = pd.DataFrame({"method":names,"duration":durations})
durations_df.to_csv(os.path.join(OUTPUT_DIR,"durations.csv"), index=False)

# Map every method name to its associated distance metric. The comprehension must bind `method`
# itself; iterating under a different name would silently reuse the variable left over from the
# loop above and produce a single entry for the last method only.
metric_dict = {method.name_with_hyperparameters:method.metric for method in methods}

# Merging all of the edgelist dataframes together.
# From here on `methods` is rebound to the list of method name strings, which are also the
# names of the distance columns in the merged dataframe.
methods = list(graphs.keys())
edgelists = {k:v.edgelist for k,v in graphs.items()}
df = pw.merge_edgelists(edgelists, default_value=0.000)
df = pw.remove_self_loops(df)
df.tail(10)

Doc2Vec Wikipedia:Size=300                         00:00:02
Doc2Vec PubMed:Size=100                            00:00:01
Word2Vec Wikipedia:Size=300,Mean                   00:00:03
Word2Vec Wikipedia:Size=300,Max                    00:00:03
N-Grams:Full,Words,1-grams,2-grams                 00:00:05
Jaccard:                                           00:00:01


Unnamed: 0,from,to,Doc2Vec Wikipedia:Size=300,Doc2Vec PubMed:Size=100,"Word2Vec Wikipedia:Size=300,Mean","Word2Vec Wikipedia:Size=300,Max","N-Grams:Full,Words,1-grams,2-grams",Jaccard:
1624489,1812,1813,1.05004,1.087368,0.53577,0.878026,1.0,0.625
1624490,1812,1814,1.043532,1.052812,0.56736,0.867917,1.0,0.5
1624491,1812,1815,1.008526,1.225449,0.63347,0.918647,1.0,0.571429
1624492,1812,1816,0.995252,1.014143,0.858569,0.999921,0.0,1.0
1624494,1813,1814,0.15021,0.569704,0.192139,0.184564,0.666667,0.25
1624495,1813,1815,0.265966,0.569928,0.36794,0.276213,0.825922,0.125
1624496,1813,1816,0.934372,1.108542,0.868246,0.852361,0.0,1.0
1624498,1814,1815,0.188479,0.717914,0.226844,0.19643,0.477767,0.142857
1624499,1814,1816,0.935248,0.950913,0.866944,0.87745,0.0,1.0
1624501,1815,1816,0.978292,0.94745,0.822466,0.856101,0.0,1.0


### Spearman rank-order correlation coefficient and p-value for each method
The purpose of this section is to see how well the distance values that each method generates between the labels of all the term pairs in the ontology correspond to the distance values generated by looking only at the Jaccard distance between the terms themselves, ignoring the labels and accounting directly for the hierarchical graph specified by the ontology. Spearman's ρ is used to evaluate the correlation between these distributions of distance values, and the results are output to a table. The distributions are also subset to include only the pairs where the labels do not have one or more words in common, and the correlation coefficient is recalculated.

In [8]:
from scipy.stats import spearmanr

# Rows where the n-grams distance is exactly 1 are treated as the pairs whose labels share no words.
df_no_shared_words = df[df["N-Grams:Full,Words,1-grams,2-grams"]==1]

# For every method, correlate its distances with the Jaccard column twice: once over all
# pairs ("rho", "p") and once over the no-shared-words subset ("rho_unshared", "p_unshared").
for method in methods:
    for frame, suffix in ((df, ""), (df_no_shared_words, "_unshared")):
        correlation = spearmanr(frame["Jaccard:"].values, frame[method].values)
        TABLE[method].update({
            "rho" + suffix: correlation.correlation,
            "p" + suffix: correlation.pvalue,
        })

### Look at distance distributions for specific term relationships
The purpose of this section is to use the specific relationships between either sibling terms or parent-child term pairs and their labels in order to see how each method compares in capturing the relationships between these closely related terms. The distance values found by each method are converted to percentiles so that the distributions of scores between methods will be comparable, and then the dataframe is subset to only include the edges between term pairs that are siblings or parent-child pairs, and then the dataframe is written to file.

In [9]:
# Convert the distance values of every method to percentile ranks so the methods are comparable.
df[methods] = df[methods].rank(pct=True)

# Translate the integer node IDs back to ontology term IDs.
df["term_1"] = df["from"].map(lambda x: id_to_term_id[x])
df["term_2"] = df["to"].map(lambda x: id_to_term_id[x])

# Use the relationships dictionary saved above to label each edge as "sibling", "parent_child",
# or None. A plain comprehension with .get() is used instead of np.vectorize: vectorize infers
# its output dtype from the first result and fails on empty input, and indexing the defaultdict
# directly would insert an empty entry for every term that is looked up.
df["relationship"] = [
    relationship_dict.get(term_1, {}).get(term_2)
    for term_1, term_2 in zip(df["term_1"], df["term_2"])
]

# Keep only the edges with one of the two relationships, and only the columns written to file.
df = df[df["relationship"].isin(["sibling", "parent_child"])]
df = df[flatten(["from","to","relationship",methods])]

# Mean and median percentile per method, separately for sibling and parent-child pairs.
for method in methods:
    for prefix, relationship in (("sib", "sibling"), ("par", "parent_child")):
        percentiles = df.loc[df["relationship"]==relationship, method]
        TABLE[method].update({
            prefix + "_mean": percentiles.mean(),
            prefix + "_median": percentiles.median(),
        })
df.to_csv(os.path.join(OUTPUT_DIR,"distance_percentiles.csv"), index=False)

In [10]:
# Turn TABLE[method][metric] into a dataframe with one row per method and one column per metric.
results = pd.DataFrame(TABLE).T

# The descriptive columns go first, followed by whatever metric columns TABLE produced.
columns = flatten(["Hyperparams","Group","Order","Topic","Data",results.columns])
for column_name, column_value in (
    ("Hyperparams", ""),
    ("Group", ""),
    ("Order", np.arange(results.shape[0])),
    ("Topic", TOPIC),
    ("Data", DATA),
):
    results[column_name] = column_value

# Move the "name:hyperparameters" index into a regular column called "Method".
results = results[columns].reset_index().rename(columns={"index":"Method"})

# Split each "name:hyperparameters" string: the part before the first separator is the method
# name, the part after it is the hyperparameter string ("-" when there is no separator at all).
hyperparam_sep = ":"
full_names = list(results["Method"])
results["Hyperparams"] = [
    full_name.split(hyperparam_sep)[1] if hyperparam_sep in full_name else "-"
    for full_name in full_names
]
results["Method"] = [full_name.split(hyperparam_sep)[0] for full_name in full_names]

results.to_csv(os.path.join(OUTPUT_DIR,"full_table.csv"), index=False)
results

Unnamed: 0,Method,Hyperparams,Group,Order,Topic,Data,rho,p,rho_unshared,p_unshared,sib_mean,sib_median,par_mean,par_median
0,Doc2Vec Wikipedia,Size=300,,0,Ontology Stuff,PO,0.221711,0.0,0.195396,0.0,0.383934,0.322938,0.21713,0.012715
1,Doc2Vec PubMed,Size=100,,1,Ontology Stuff,PO,0.025925,3.0546249999999997e-239,0.003912,2e-06,0.51157,0.511206,0.42577,0.379653
2,Word2Vec Wikipedia,"Size=300,Mean",,2,Ontology Stuff,PO,0.161944,0.0,0.113303,0.0,0.414267,0.379388,0.229295,0.014544
3,Word2Vec Wikipedia,"Size=300,Max",,3,Ontology Stuff,PO,0.030972,0.0,-0.032159,0.0,0.475103,0.515018,0.287947,0.097332
4,N-Grams,"Full,Words,1-grams,2-grams",,4,Ontology Stuff,PO,0.140649,0.0,,,0.359152,0.537906,0.152134,0.009621
5,Jaccard,,,5,Ontology Stuff,PO,1.0,0.0,1.0,0.0,0.164754,0.077613,0.113833,0.013554
