In [1]:
import datetime
import nltk
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import torch
import itertools
import multiprocessing as mp
from collections import Counter, defaultdict
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split, KFold
from scipy import spatial, stats
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from gensim.parsing.preprocessing import strip_non_alphanum, stem_text, preprocess_string, remove_stopwords
from gensim.utils import simple_preprocess

sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten, to_hms
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.annotation.ontology import Ontology
from oats.datasets.string import String
from oats.datasets.edges import Edges
from oats.annotation.annotation import annotate_using_noble_coder
from oats.graphs import pairwise as pw
from oats.graphs.indexed import IndexedGraph
from oats.graphs.models import train_logistic_regression_model, apply_logistic_regression_model
from oats.graphs.models import train_random_forest_model, apply_random_forest_model
from oats.nlp.vocabulary import get_overrepresented_tokens, build_vocabulary_from_tokens
from oats.utils.utils import function_wrapper_with_duration
from oats.nlp.preprocess import concatenate_with_bar_delim

from _nb_utils import Method

# Notebook-wide settings: figure resolution, and ignore every warning category.
mpl.rcParams["figure.dpi"] = 400
warnings.simplefilter('ignore')

# Raise the pandas display limits (rows, columns, and total line width).
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

# Quietly download the NLTK resources. The last call is deliberately kept as a
# bare final expression so its return value stays the cell's displayed output.
for _resource in ('punkt', 'brown'):
    nltk.download(_resource, quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [2]:
# Pickled inputs: the full dataset and the groupings.
dataset_filename = "../data/pickles/text_plus_annotations_dataset.pickle"
groupings_filename = "../data/pickles/lloyd_subsets.pickle"

# Plain-text corpora: background content and the more specific phenotype content.
background_corpus_filename = (
    "../data/corpus_related_files/untagged_text_corpora/background.txt"
)
phenotypes_corpus_filename = (
    "../data/corpus_related_files/untagged_text_corpora/phenotypes_small.txt"
)

# Saved gensim models: two Doc2Vec models and one Word2Vec model.
doc2vec_pubmed_filename = "../gensim/pubmed_dbow/doc2vec_2.bin"
doc2vec_wikipedia_filename = "../gensim/enwiki_dbow/doc2vec.bin"
word2vec_model_filename = "../gensim/wiki_sg/word2vec.bin"

# Ontology file in OBO format, and the jar for the NOBLE Coder tool.
ontology_filename = "../ontologies/mo.obo"
noblecoder_jarfile_path = "../lib/NobleCoder-1.0.jar"

# Paths for the PyTorch BioBERT models.
biobert_pmc_path = "../gensim/biobert_v1.0_pmc/pytorch_model"
biobert_pubmed_path = "../gensim/biobert_v1.0_pubmed/pytorch_model"
biobert_pubmed_pmc_path = "../gensim/biobert_v1.0_pubmed_pmc/pytorch_model"

In [3]:
# Load the saved Doc2Vec model from the path in doc2vec_wikipedia_filename.
# NOTE(review): presumably trained on English Wikipedia, judging by the path — confirm.
doc2vec_wiki_model = gensim.models.Doc2Vec.load(doc2vec_wikipedia_filename)

In [4]:
# Two small id -> text mappings used as toy inputs. Only dict2 is passed to the
# pairwise call in this cell.
dict1 = {
    0: "dog and cat",
    1: "cat and oaf",
}
dict2 = {
    4: "something else",
    5: "cat and mouse",
    6: "something entirely different",
}

# Square pairwise comparison of the dict2 texts with the loaded Doc2Vec model
# and the "cosine" metric.
results = pw.pairwise_square_doc2vec(doc2vec_wiki_model, dict2, "cosine")

# Print the matrix, then the column and row index -> id lookups, one per line.
print(
    results.array,
    results.col_index_to_id,
    results.row_index_to_id,
    sep="\n",
)

[[0.         0.4219073  0.17203338]
 [0.4219073  0.         0.42653266]
 [0.17203338 0.42653266 0.        ]]
{0: 4, 1: 5, 2: 6}
{0: 4, 1: 5, 2: 6}


In [7]:
# Square pairwise comparison of the dict2 texts through the word-vector routine,
# with "cosine" as the metric and "max" as the fourth argument.
# NOTE(review): a Doc2Vec model is passed to the word2vec function here, and no
# cell visible in this notebook loads word2vec_model_filename — confirm that
# using the Doc2Vec model's word vectors is intended.
results = pw.pairwise_square_word2vec(doc2vec_wiki_model, dict2, "cosine", "max")

# Print the matrix, then the column and row index -> id lookups, one per line.
print(
    results.array,
    results.col_index_to_id,
    results.row_index_to_id,
    sep="\n",
)

[[0.         0.55883661 0.29804973]
 [0.55883661 0.         0.32946022]
 [0.29804973 0.32946022 0.        ]]
{0: 4, 1: 5, 2: 6}
{0: 4, 1: 5, 2: 6}


In [6]:
# Square pairwise comparison of the dict2 texts using the n-grams routine and
# the "cosine" metric (no embedding model is passed to this call).
results = pw.pairwise_square_ngrams(dict2, "cosine")

# Print the matrix, the row and column index -> id lookups, and finally the
# per-id vectors stored on the result, one item per line.
print(
    results.array,
    results.row_index_to_id,
    results.col_index_to_id,
    results.vector_dictionary,
    sep="\n",
)

[[0.         1.         0.59175171]
 [1.         0.         1.        ]
 [0.59175171 1.         0.        ]]
{0: 4, 1: 5, 2: 6}
{0: 4, 1: 5, 2: 6}
{4: array([0, 0, 0, 1, 0, 0, 1]), 5: array([1, 1, 0, 0, 0, 1, 0]), 6: array([0, 0, 1, 0, 1, 0, 1])}
