In [1]:
import random
import sys
import warnings

import gensim
import numpy as np
import pandas as pd
from gensim.models.callbacks import CallbackAny2Vec
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string
from gensim.utils import simple_preprocess
from nltk import sent_tokenize
from nltk import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.metrics import precision_recall_curve

warnings.simplefilter('ignore')

sys.path.append("../../oats")
from oats.annotation.ontology import Ontology
from oats.distances import pairwise as pw
from oats.utils.utils import flatten

In [2]:
# Read the ontology file and index every term whose name does not contain
# "obsolete" by its position in the filtered list.
path = "../ontologies/po.obo"
ont = Ontology(path)
term_ids_and_names = [
    (term.id, term.name)
    for term in ont.terms()
    if "obsolete" not in term.name
]
# Three views of the same integer key: a one-element annotation list, the bare
# term ID, and the human-readable term name.
key_to_annotations = {key: [term_id] for key, (term_id, _) in enumerate(term_ids_and_names)}
key_to_term_id = {key: term_id for key, (term_id, _) in enumerate(term_ids_and_names)}
key_to_text_string = {key: name for key, (_, name) in enumerate(term_ids_and_names)}
# Term names after gensim's preprocess_string, re-joined with single spaces.
key_to_preprocessed_text_string = {
    key: " ".join(preprocess_string(name))
    for key, name in key_to_text_string.items()
}

In [3]:
# For every term ID, collect the IDs of the terms that sit right next to it in
# the ontology graph: superclasses and subclasses at distance 1, plus siblings
# (other children of any of its parents).
parents = {}
children = {}
for term in ont.terms():
    direct_supers = term.superclasses(with_self=False, distance=1)
    direct_subs = term.subclasses(with_self=False, distance=1)
    parents[term.id] = [t.id for t in direct_supers]
    children[term.id] = [t.id for t in direct_subs]

siblings = {}
for term in ont.terms():
    # One list of co-children per parent, excluding the term itself.
    co_children_per_parent = []
    for parent_id in parents[term.id]:
        co_children_per_parent.append([t for t in children[parent_id] if t != term.id])
    siblings[term.id] = flatten(co_children_per_parent)

# All three mappings must be keyed by the same set of terms.
assert len(parents) == len(children)
assert len(parents) == len(siblings)

# Union (as a flat list) of parents, children and siblings for each term.
any_close = {}
for key in parents.keys():
    neighbour_groups = [parents[key], children[key], siblings[key]]
    any_close[key] = flatten(neighbour_groups)

In [4]:
# Pairwise Jaccard values between all indexed terms, as an edge list with
# "from", "to" and "value" columns.
df = pw.with_annotations(key_to_annotations, ont, "jaccard", tfidf=False).edgelist
# Drop self-pairs. The explicit .copy() makes the filtered frame independent, so
# the column assignments below never write into a slice of the original
# (the SettingWithCopy pattern, which the global warnings filter would hide).
df = df[df["from"]!=df["to"]].copy()
# Translate the integer keys back into term IDs and term names.
df["from_id"] = df["from"].map(lambda x: key_to_term_id[x])
df["to_id"] = df["to"].map(lambda x: key_to_term_id[x])
df["from_text"] = df["from"].map(lambda x: key_to_text_string[x])
df["to_text"] = df["to"].map(lambda x: key_to_text_string[x])
# "close": the target term is a parent, child or sibling of the source term.
df["close"] = df.apply(lambda x: x["to_id"] in any_close[x["from_id"]], axis=1)
# "token_overlap": the two names share at least one whitespace-separated token.
df["token_overlap"] = df.apply(lambda x: len(set(x["from_text"].split()).intersection(set(x["to_text"].split())))>0, axis=1)
df.head(20)

Unnamed: 0,from,to,value,from_id,to_id,from_text,to_text,close,token_overlap
1,0,1,0.666667,BFO:0000002,BFO:0000003,continuant,occurrent,False,False
2,0,2,0.666667,BFO:0000002,BFO:0000004,continuant,independent continuant,True,True
3,0,3,0.8,BFO:0000002,BFO:0000006,continuant,spatial region,False,False
4,0,4,1.0,BFO:0000002,BFO:0000015,continuant,process,False,False
5,0,5,0.75,BFO:0000002,BFO:0000040,continuant,material entity,False,False
6,0,6,0.75,BFO:0000002,BFO:0000141,continuant,immaterial entity,False,False
7,0,7,0.833333,BFO:0000002,CARO:0000000,continuant,anatomical entity,False,False
8,0,8,0.833333,BFO:0000002,CARO:0001010,continuant,organism or virus or viroid,False,False
9,0,9,0.8,BFO:0000002,CARO:0030000,continuant,biological entity,False,False
10,0,10,1.0,BFO:0000002,GO:0003674,continuant,molecular_function,False,False


In [5]:
# (rows, columns) of the pairwise edge list after self-pairs were removed.
df.shape

(1619100, 9)

In [6]:
# Restrict to pairs whose names share no token, so that structural closeness
# cannot be guessed from shared words, then split by the "close" label.
no_shared_tokens = df["token_overlap"].eq(False)
is_close = df["close"].eq(True)
is_far = df["close"].eq(False)
positive_df = df[no_shared_tokens & is_close]
negative_df = df[no_shared_tokens & is_far]
# Every non-overlapping pair must land in exactly one of the two groups.
assert negative_df.shape[0] + positive_df.shape[0] == df[no_shared_tokens].shape[0]

# Balance the classes: keep all positives and an equally sized, seeded sample
# of negatives.
num_positive_examples = positive_df.shape[0]
sampled_negatives = negative_df.sample(num_positive_examples, random_state=2)
training_df = pd.concat([positive_df, sampled_negatives])
# Release the full edge list; only training_df is used from here on.
del df
training_df.shape

(19436, 9)

In [7]:
# Preview the first ten rows of the class-balanced training set.
training_df.head(10)

Unnamed: 0,from,to,value,from_id,to_id,from_text,to_text,close,token_overlap
60,0,60,0.75,BFO:0000002,PO:0000034,continuant,vascular system,True,False
81,0,81,0.777778,BFO:0000002,PO:0000055,continuant,bud,True,False
148,0,148,0.75,BFO:0000002,PO:0000423,continuant,plant zygote,True,False
960,0,960,0.8,BFO:0000002,PO:0020035,continuant,epicotyl,True,False
1067,0,1067,0.75,BFO:0000002,PO:0025025,continuant,root system,True,False
1124,0,1124,0.777778,BFO:0000002,PO:0025082,continuant,reproductive shoot system,True,False
1131,0,1131,0.75,BFO:0000002,PO:0025094,continuant,sporangium,True,False
1161,0,1161,0.75,BFO:0000002,PO:0025124,continuant,multicellular plant gametangium,True,False
1440,0,1440,0.777778,BFO:0000002,PO:0025405,continuant,leaf marginal meristem,True,False
1464,0,1464,0.777778,BFO:0000002,PO:0025429,continuant,leaf plate meristem,True,False


In [10]:
# Score every training pair with the word2vec model stored under plants_sg,
# using the preprocessed term names as input text.
path = "../models/plants_sg/word2vec_ep500_dim150.model"
model = gensim.models.Word2Vec.load(path)
result = pw.with_word2vec(model, key_to_preprocessed_text_string, "cosine", "mean")


def _pair_value(row):
    """Return the entry of result.array addressed by this row's (from, to) keys."""
    from_index = result.id_to_index[row["from"]]
    to_index = result.id_to_index[row["to"]]
    return result.array[from_index, to_index]


# Column "m" holds the model-derived value for each pair.
training_df["m"] = training_df.apply(_pair_value, axis=1)
training_df.head(10)

Unnamed: 0,from,to,value,from_id,to_id,from_text,to_text,close,token_overlap,m
60,0,60,0.75,BFO:0000002,PO:0000034,continuant,vascular system,True,False,1.062577
81,0,81,0.777778,BFO:0000002,PO:0000055,continuant,bud,True,False,0.886492
148,0,148,0.75,BFO:0000002,PO:0000423,continuant,plant zygote,True,False,0.989557
960,0,960,0.8,BFO:0000002,PO:0020035,continuant,epicotyl,True,False,0.951276
1067,0,1067,0.75,BFO:0000002,PO:0025025,continuant,root system,True,False,0.901442
1124,0,1124,0.777778,BFO:0000002,PO:0025082,continuant,reproductive shoot system,True,False,0.89782
1131,0,1131,0.75,BFO:0000002,PO:0025094,continuant,sporangium,True,False,0.85254
1161,0,1161,0.75,BFO:0000002,PO:0025124,continuant,multicellular plant gametangium,True,False,0.923595
1440,0,1440,0.777778,BFO:0000002,PO:0025405,continuant,leaf marginal meristem,True,False,0.94154
1464,0,1464,0.777778,BFO:0000002,PO:0025429,continuant,leaf plate meristem,True,False,0.950756


In [11]:
# Maximum F1 over all thresholds when 1 - m is used as the score for "close".
# precision_recall_curve is imported with the other imports at the top of the
# notebook.
def f_beta(pr, re, beta):
    """Return the F-beta score for each (precision, recall) pair.

    Where precision and recall are both zero the quotient is 0/0 and yields
    NaN; the caller uses np.nanmax, which skips those entries.
    """
    return [((1+beta**2)*p*r)/(((beta**2)*p)+r) for p, r in zip(pr, re)]


y_true = list(training_df["close"].values*1)
y_prob = list(1-training_df["m"].values)
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
f_1_scores = f_beta(precision, recall, beta=1)
# The classes are balanced, so labelling every pair "close" already gives
# precision 0.5 and recall 1, i.e. F1 of about 0.667; a maximum near that
# value means the score separates the classes no better than that baseline.
f_1_max = np.nanmax(f_1_scores)
f_1_max

0.6666895345247487

In [12]:
# Repeat the evaluation with the model stored under wiki_sg. This cell passes
# the raw term names (key_to_text_string) rather than the preprocessed ones.
# It overwrites both `model` and the "m" column written by the previous
# scoring cell.
def f_beta(pr, re, beta):
    """Return the F-beta score for each (precision, recall) pair.

    Where precision and recall are both zero the quotient is 0/0 and yields
    NaN; the caller uses np.nanmax, which skips those entries.
    """
    return [((1+beta**2)*p*r)/(((beta**2)*p)+r) for p, r in zip(pr, re)]


path = "../models/wiki_sg/word2vec.bin"
model = gensim.models.Word2Vec.load(path)
result = pw.with_word2vec(model, key_to_text_string, "cosine", "mean")
training_df["m"] = training_df.apply(lambda x: result.array[result.id_to_index[x["from"]],result.id_to_index[x["to"]]], axis=1)

y_true = list(training_df["close"].values*1)
y_prob = list(1-training_df["m"].values)
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
f_1_scores = f_beta(precision, recall, beta=1)
# With balanced classes the trivial "everything is close" labelling scores
# about 0.667, so that is the value to beat.
f_1_max = np.nanmax(f_1_scores)
f_1_max

0.6667124039517014

In [13]:
# Inspect 30 randomly drawn training pairs. No random_state is passed, so the
# rows shown differ from run to run.
training_df.sample(30)


Unnamed: 0,from,to,value,from_id,to_id,from_text,to_text,close,token_overlap,m
383518,227,796,0.285714,PO:0003000,PO:0008019,transition zone,leaf lamina base,True,False,0.739817
677515,426,1666,0.416667,PO:0006022,PO:0030003,bundle sheath extension,protonema,False,False,0.690284
817146,532,1324,0.583333,PO:0006333,PO:0025289,seed chalaza,seedling coleorhiza,True,False,0.538723
678474,427,1252,0.571429,PO:0006023,PO:0025215,bundle sheath,phyllome stomatal complex,False,False,0.613861
873863,578,794,0.875,PO:0006436,PO:0008017,rachilla of sessile spikelet of ear,leaf sheath pulvinus,False,False,0.319676
1402341,1139,1371,0.894737,PO:0025102,PO:0025336,shoot internode differentiation zone,transition phyllode leaf,False,False,0.540361
420181,250,1556,0.444444,PO:0004010,PO:0025521,meristematic cell,unicellular plant gametangium,True,False,0.443629
1318631,1022,1784,0.75,PO:0020130,PO:0030121,central root cap,capitulum inflorescence,False,False,0.650762
181308,103,1264,0.583333,PO:0000084,PO:0025227,plant sperm cell,tetrad of megaspores,True,False,0.506737
10307,5,1322,0.818182,BFO:0000040,PO:0025287,material entity,seedling coleoptile,True,False,0.799187


In [14]:
# Sanity check: cosine distance between two related everyday words.
# Vectors are read through model.wv with a single key, which returns one 1-D
# vector per word; the previous model[["word"]] form went through the
# deprecated Word2Vec.__getitem__ and produced (1, dim) arrays.
a = model.wv["driving"]
b = model.wv["driver"]
cosine(a,b)

0.33738744258880615

In [15]:
# Chance baseline: the same scores, randomly reassigned to pairs, so any
# relation between score and label is destroyed.
def f_beta(pr, re, beta):
    """Return the F-beta score for each (precision, recall) pair.

    Where precision and recall are both zero the quotient is 0/0 and yields
    NaN; the caller uses np.nanmax, which skips those entries.
    """
    return [((1+beta**2)*p*r)/(((beta**2)*p)+r) for p, r in zip(pr, re)]


y_true = list(training_df["close"].values*1)
y_prob = list(1-training_df["m"].values)
# A dedicated, seeded generator keeps the baseline reproducible between runs
# and leaves the module-level random state untouched.
random.Random(2).shuffle(y_prob)
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
f_1_scores = f_beta(precision, recall, beta=1)
# With balanced classes, a maximum F1 of about 0.667 is what labelling every
# pair "close" achieves; compare the model results against this number.
f_1_max = np.nanmax(f_1_scores)
f_1_max

0.6666895345247487

In [16]:
# NOTE(review): `setophere` is not defined anywhere in the visible notebook, so
# this line raises NameError. Presumably a deliberate way to stop "Run All"
# before the scratch cells below - confirm.
print(setophere)

NameError: name 'setophere' is not defined

In [None]:
# Down-sample df so that no "bin" group keeps more rows than the 0.5 bin has.
# NOTE(review): `df` is deleted by an earlier cell (`del df`) and no visible
# cell creates a "bin" column, so this cell cannot run from a fresh kernel as
# written.
max_retain_in_each_bin = len(df[df["bin"]==0.5])
df = df.groupby("bin", group_keys=False).apply(lambda x: x.sample(min(max_retain_in_each_bin,len(x))))
df.shape

In [None]:
# Load the model stored under plants_sg and attach its pairwise value to every
# row of df as column "thing", using the raw (not preprocessed) term names.
# NOTE(review): depends on `df`, which an earlier cell deletes (`del df`).
path = "../models/plants_sg/word2vec_ep500_dim150.model"
model = gensim.models.Word2Vec.load(path)
# A bare expression in the middle of a cell is not displayed.
model

result = pw.with_word2vec(model, key_to_text_string, "cosine", "mean")
df["thing"] = df.apply(lambda x: result.array[result.id_to_index[x["from"]],result.id_to_index[x["to"]]], axis=1)
# Displays the whole frame as the cell output.
df

In [None]:
# Pearson correlation between the ontology-based "value" column and the
# word2vec-derived "thing" column.
from scipy.stats import pearsonr
pearsonr(df["value"],df["thing"])

In [None]:
# Same comparison as column "thing", but on preprocessed term names.
# The dictionary uses the notebook's existing name, key_to_preprocessed_text_string
# (the earlier misspelling created a second, near-identically named global).
key_to_preprocessed_text_string = {i:" ".join(preprocess_string(s)) for i,s in key_to_text_string.items()}
result = pw.with_word2vec(model, key_to_preprocessed_text_string, "cosine", "mean")
df["thing2"] = df.apply(lambda x: result.array[result.id_to_index[x["from"]],result.id_to_index[x["to"]]], axis=1)
df







In [None]:
# Spearman rank correlation between the ontology-based "value" column and the
# "thing2" column (word2vec on preprocessed names).
# pearsonr is imported here but not used in this cell.
from scipy.stats import pearsonr
from scipy.stats import spearmanr
spearmanr(df["value"],df["thing2"])

In [None]:
# Attach term IDs and term names to each pair.
# NOTE(review): `annotations` and `term_ids_to_strings` are not defined in any
# visible cell (the executed cells use key_to_annotations / key_to_text_string
# instead), and `df` is deleted earlier - this looks like a leftover from an
# older version of the notebook; confirm before running.
df["from_id"] = df["from"].map(lambda x: annotations[x][0])
df["to_id"] = df["to"].map(lambda x: annotations[x][0])
df["from_text"] = df["from_id"].map(lambda x: term_ids_to_strings[x])
df["to_text"] = df["to_id"].map(lambda x: term_ids_to_strings[x])
df

In [None]:
# Reload the model stored under plants_sg and display its repr.
path = "../models/plants_sg/word2vec_ep500_dim150.model"
model = gensim.models.Word2Vec.load(path)
model

In [None]:














def f(from_text, to_text, model):
    """Return the cosine distance between the embeddings of two texts.

    Each text is turned into a single vector by
    pw.vectorize_with_word2vec(text, model, "max"); the two vectors are then
    compared with scipy's cosine distance.

    Args:
        from_text: text of the first term.
        to_text: text of the second term.
        model: word2vec model handed through to the vectorizer.

    Returns:
        The cosine distance between the two text vectors.
    """
    from_vector = pw.vectorize_with_word2vec(from_text, model, "max")
    # Vectorize the *second* text here; embedding from_text twice would always
    # compare a text with itself.
    to_vector = pw.vectorize_with_word2vec(to_text, model, "max")
    return cosine(from_vector, to_vector)




    
# Call f on every row's (from_text, to_text) pair with the currently loaded
# model and store the result in column "thing", then display the frame.
df["thing"] = df.apply(lambda row: f(row["from_text"],row["to_text"],model), axis=1)
df





In [None]:
# Input paths to text datasets.
# Plain-text corpus that the next cell reads in full and splits into sentences.
plant_abstracts_corpus_path = "../data/corpus_related_files/untagged_text_corpora/phenotypes_all.txt"
# CSV file from which the next cell reads the "descriptions" column.
plant_phenotype_descriptions_path = "../../plant-data/genes_texts_annots.csv"

In [None]:
# Preparing the dataset that combines the dataset of plant phenotype descriptions and scrapped abstracts.
corpus = open(plant_abstracts_corpus_path, 'r').read()
sentences_from_corpus = sent_tokenize(corpus)
phenotype_descriptions = " ".join(pd.read_csv(plant_phenotype_descriptions_path)["descriptions"].values)
times_to_duplicate_phenotype_dataset = 5
sentences_from_descriptions = sent_tokenize(phenotype_descriptions)
sentences_from_descriptions = list(np.repeat(sentences_from_descriptions, times_to_duplicate_phenotype_dataset))
sentences = sentences_from_corpus+sentences_from_descriptions
random.shuffle(sentences)
sentences = [preprocess_string(sentence) for sentence in sentences]
print(len(sentences))

In [None]:
# Progress marker printed before the training cells below.
print("starting training")

In [None]:
class LossLogger(CallbackAny2Vec):
    """Training callback that records the loss reported after every epoch.

    After each epoch it stores the value of model.get_latest_training_loss()
    and the difference from the value seen at the previous epoch. For the
    first epoch the difference is the reported loss itself.

    Attributes are plain lists aligned by position:
        epochs: 1-based epoch numbers.
        losses: loss value reported at the end of each epoch.
        deltas: change in reported loss relative to the previous epoch.
    """
    def __init__(self):
        self.epochs = []
        self.epoch = 1
        self.losses = []
        self.deltas = []
        # Defined up front (instead of on first use) so the attribute always
        # exists; starting at zero makes the first delta equal the first loss.
        self.loss_previous_step = 0.0
    def on_epoch_end(self, model):
        """Record the loss and its change for the epoch that just finished."""
        loss = model.get_latest_training_loss()
        delta = loss - self.loss_previous_step
        self.loss_previous_step = loss
        self.losses.append(loss)
        self.epochs.append(self.epoch)
        self.epoch += 1
        self.deltas.append(delta)

In [None]:
# Training the word2vec neural network with the current set of hyperparameters. 
# Train a word2vec model with 50-dimensional vectors for 500 epochs, logging the
# loss after every epoch.
# alpha and min_alpha are equal, so the learning rate stays at 0.025 throughout.
# NOTE(review): no `sg` argument is passed, so gensim's default training
# algorithm is used - confirm that this matches the "plants_sg" directory name.
loss_logger = LossLogger()
model = gensim.models.Word2Vec(
    min_count=4,
    window=10,
    size=50,
    workers=4,
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(sentences)
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=500,
    compute_loss=True,
    callbacks=[loss_logger],
)

# Persist the trained model.
output_path = "../models/plants_sg/word2vec_ep500_dim50.model"
model.save(output_path)
print("done training 1 ")

In [None]:
# Training the word2vec neural network with the current set of hyperparameters. 
# Train a word2vec model with 100-dimensional vectors for 500 epochs, logging
# the loss after every epoch.
# alpha and min_alpha are equal, so the learning rate stays at 0.025 throughout.
# NOTE(review): no `sg` argument is passed, so gensim's default training
# algorithm is used - confirm that this matches the "plants_sg" directory name.
loss_logger = LossLogger()
model = gensim.models.Word2Vec(
    min_count=4,
    window=10,
    size=100,
    workers=4,
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(sentences)
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=500,
    compute_loss=True,
    callbacks=[loss_logger],
)

# Persist the trained model.
output_path = "../models/plants_sg/word2vec_ep500_dim100.model"
model.save(output_path)
print("done training 2")

In [None]:
# Training the word2vec neural network with the current set of hyperparameters. 
# Train a word2vec model with 150-dimensional vectors for 500 epochs, logging
# the loss after every epoch.
# alpha and min_alpha are equal, so the learning rate stays at 0.025 throughout.
# NOTE(review): no `sg` argument is passed, so gensim's default training
# algorithm is used - confirm that this matches the "plants_sg" directory name.
loss_logger = LossLogger()
model = gensim.models.Word2Vec(
    min_count=4,
    window=10,
    size=150,
    workers=4,
    alpha=0.025,
    min_alpha=0.025,
)
model.build_vocab(sentences)
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=500,
    compute_loss=True,
    callbacks=[loss_logger],
)

# Persist the trained model.
output_path = "../models/plants_sg/word2vec_ep500_dim150.model"
model.save(output_path)
print("done training 3")

In [None]:
# NOTE(review): `stophere` is not defined anywhere in the visible notebook, so
# this line raises NameError. Presumably a deliberate way to stop "Run All"
# after training - confirm.
print(stophere)

In [None]:
# Show the tokens preprocess_string produces for a handful of domain words
# (presumably to find the processed forms to look up in the model - confirm).
preprocess_string("plants leaves genes proteins tall wide abscisic root nodule")

In [None]:
# Spot-check the trained model: cosine distances between a few pairs of
# preprocessed domain words, followed by the vocabulary size.
# Vectors are read through model.wv with a single key (one 1-D vector per
# word) instead of the deprecated Word2Vec.__getitem__ with a list, which
# returned (1, dim) arrays. `cosine` comes from the notebook's import cell.
print(cosine(model.wv["plant"],model.wv["leav"]))
print(cosine(model.wv["gene"],model.wv["leav"]))
print(cosine(model.wv["gene"],model.wv["protein"]))
print(cosine(model.wv["auxin"],model.wv["hormon"]))
print(cosine(model.wv["hormon"],model.wv["abscis"]))
print(cosine(model.wv["root"],model.wv["nodul"]))
len(model.wv.vocab)

In [None]:
# Checking to make sure the model can be loaded and used for looking up embeddings.
path = "../models/wiki_sg/word2vec.bin"
model_from_wikipedia = gensim.models.Word2Vec.load(path)
a_word_in_vocab = list(model_from_wikipedia.wv.vocab.keys())[0]
vector = model_from_wikipedia[a_word_in_vocab]
print(len(vector))

In [None]:
# Vocabulary size of the current model.
len(model.wv.vocab)

In [None]:
# Merge vectors from the wiki_sg file into the current model, train for one
# more epoch on the prepared sentences, and report the vocabulary size.
# NOTE(review): an earlier cell loads this same path with
# gensim.models.Word2Vec.load, whereas this call reads it as a word2vec-format
# binary file - confirm which format the file really has.
# NOTE(review): `loss_logger` is the instance left over from the last training
# cell, so its epoch counter and previous-loss value carry over into this run.
model.intersect_word2vec_format("../models/wiki_sg/word2vec.bin", binary=True, lockf=1.0)
model.train(sentences, epochs=1, total_examples=model.corpus_count, compute_loss=True, callbacks=[loss_logger])
len(model.wv.vocab)