In [6]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import time
import math
import sys
import gensim
import os
import warnings
import itertools
from collections import Counter
from inspect import signature
from scipy.stats import ks_2samp
from sklearn.metrics import precision_recall_curve, f1_score, auc
from collections import defaultdict

sys.path.append("../../oats")
from oats.pubmed.querying import search, fetch_details
from oats.utils.utils import save_to_pickle, load_from_pickle, merge_list_dicts, flatten
from oats.datasets.dataset import Dataset
from oats.datasets.groupings import Groupings
from oats.datasets.string import get_stringdb_information
from oats.annotation.ontology import Ontology
from oats.annotation.annotation import write_annotations_to_tsv_file, read_annotations_from_tsv_file
from oats.graphs.pairwise import pairwise_edgelist_doc2vec, pairwise_edgelist_counting
from oats.graphs.pairwise import pairwise_edgelist_doc2vec_twogroup, pairwise_edgelist_counting_twogroup
from oats.graphs.pairwise import pairwise_edgelist_annotations, merge_edgelists, subset_edgelist_with_ids
from oats.graphs.pairwise import remove_self_loops
from oats.objectives.functions import balance_classes
from oats.graphs.indexed import IndexedGraph

# Notebook-wide configuration: warning handling, figure resolution, pandas display limits.
warnings.simplefilter("ignore")  # suppress all warnings for the rest of the session
mpl.rcParams.update({"figure.dpi": 200})
# Raise pandas' display limits so wide/long frames are printed without truncation.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Reading in the entire dataset, subsetting for Arabidopsis and all annotation types.
# The filter/collapse calls below are invoked only for their effect on `dataset`
# (their return values are not used), so their order matters.
# NOTE(review): load_from_pickle presumably unpickles the file — only point it at trusted data.
dataset = load_from_pickle("../data/pickles/full_dataset.pickle")
dataset.filter_by_species("ath")  # "ath" is presumably the Arabidopsis species code — confirm
dataset.collapse_by_all_gene_names()
dataset.filter_has_description()
dataset.filter_has_annotation()
# Keeps a random sample with k=20. No seed is set in this cell, so the sampled genes
# (and every downstream number) may differ between runs — TODO confirm whether
# filter_random_k accepts or uses a seed.
dataset.filter_random_k(k=20)
dataset.describe()

Number of rows in the dataframe: 20
Number of unique IDs:            20
Number of unique descriptions:   18
Number of unique gene name sets: 20
Number of species represented:   1


In [3]:
# Build `id_to_abstract_text`: for each gene in the dataset, query PubMed for
# "arabidopsis AND (<name1> OR <name2> ...)" and store the first abstract section of
# the returned paper(s) under the gene's identifier. Genes with no hits, or whose
# hits carry no abstract, are simply left out of the mapping.
genes = dataset.get_gene_dictionary()
id_to_abstract_text = {}
limit = 1  # maximum number of PubMed hits requested per gene (loop-invariant)
for identifier, gene_obj in genes.items():
    query = "arabidopsis AND ({})".format(" OR ".join(gene_obj.names))
    results = search(query, limit)
    id_list = results['IdList']
    if len(id_list) > 0:
        papers = fetch_details(id_list)
        for paper in papers['PubmedArticle']:
            # Not every PubMed record has an abstract; a missing 'Abstract' /
            # 'AbstractText' entry (or an empty list) used to abort the whole loop
            # with KeyError/IndexError. Skip such records instead.
            try:
                abstract_text = paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0]
            except (KeyError, IndexError):
                continue
            # With more than one paper per gene the last one with an abstract wins.
            id_to_abstract_text[identifier] = abstract_text

In [9]:
# Load the Doc2Vec model stored at this path.
doc2vec_model_filename = "../gensim/enwiki_dbow/doc2vec.bin"
doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_filename)

# Two text groups keyed by the same gene IDs: curated descriptions and fetched abstracts.
# Only descriptions for genes that also have an abstract are kept.
abstracts = id_to_abstract_text
descriptions = {
    gene_id: text
    for gene_id, text in dataset.get_description_dictionary().items()
    if gene_id in abstracts
}

# One pairwise edgelist per method, merged into a single frame with one column per method.
name_to_df_mapping = {
    "d2v": pairwise_edgelist_doc2vec_twogroup(doc2vec_model, descriptions, abstracts, "cosine"),
    "bow": pairwise_edgelist_counting_twogroup(descriptions, abstracts, "cosine", binary=False),
}
df = merge_edgelists(name_to_df_mapping, default_value=0.000)
print(df.head(8))
print(len(df))

# Doc2Vec values for the rows where both endpoints carry the same ID,
# i.e. a gene's description paired with its own abstract.
same_id_rows = df["from"] == df["to"]
x = df.loc[same_id_rows, "d2v"].values
print(x)







   from     to       d2v       bow
0   522    522  0.561337  0.524136
1   522  27919  0.489754  0.654781
2   522  27899  0.591065  0.594711
3   522  25184  0.561430  0.729682
4   522  24198  0.648393  0.674352
5   522   9105  0.562026  0.573750
6   522   1240  0.572151  0.617332
7   522   1322  0.640348  0.621515
196
[0.5613372  0.4012332  0.38893368 0.42769828 0.43321684 0.52474205
 0.47727839 0.45785402 0.42360762 0.50984089 0.40095927 0.52506476
 0.46756784 0.54131532]


In [5]:



# Disabled example code: the triple-quoted string below is never executed. Because it is
# the cell's only expression, the notebook just echoes it back as a string value.
# NOTE(review): the embedded comment says "up to 10 articles" while the embedded call
# passes limit=4, and the embedded `limit`/`query` assignments are unused — reconcile
# before re-enabling, or delete this cell.
"""
# Getting up to 10 articles about maize
limit = 1
query = ""
results = search("maize AND arabidopsis", limit=4)
id_list = results['IdList']
papers = fetch_details(id_list)

for i, paper in enumerate(papers['PubmedArticle']): 
    print("\n\nFound Paper #{}".format(i+1))
    print(paper['MedlineCitation']['Article']['ArticleTitle'])
    print(paper["MedlineCitation"]["PMID"])
    print(paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0])
"""

'\n# Getting up to 10 articles about maize\nlimit = 1\nquery = ""\nresults = search("maize AND arabidopsis", limit=4)\nid_list = results[\'IdList\']\npapers = fetch_details(id_list)\n\nfor i, paper in enumerate(papers[\'PubmedArticle\']): \n    print("\n\nFound Paper #{}".format(i+1))\n    print(paper[\'MedlineCitation\'][\'Article\'][\'ArticleTitle\'])\n    print(paper["MedlineCitation"]["PMID"])\n    print(paper[\'MedlineCitation\'][\'Article\'][\'Abstract\']["AbstractText"][0])\n'

In [None]:
# Reading in the first 100 rows of the PANTHER plant-species orthologs table and printing
# the "|"-separated fields of its first two columns for each row. No orthologs dictionary
# is built yet; this cell only inspects the file's structure.
# TODO: figure out whether reciprocal pairs are given in this file.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere until
# the file is moved under the project's data directory and the path made relative.
orthologs_path = "/Users/irbraun/Desktop/orthologs.txt"
# nrows=100 stops parsing after the rows that are actually used, instead of loading the
# whole file and then discarding everything past head(100).
df = pd.read_table(orthologs_path, nrows=100)
for row in df.itertuples():
    # itertuples() places the index at position 0, so 1 and 2 are the first two columns.
    gene1_list = row[1].split("|")
    gene2_list = row[2].split("|")
    print(gene1_list)
    print(gene2_list)