In [1]:
from nltk.tokenize import sent_tokenize
import pandas as pd
import sys
import time
import os
sys.path.append("../../oats")
from oats.pubmed.query import search, fetch_details
from oats.utils.utils import save_to_pickle, load_from_pickle

### Collecting and saving a dataset of sentences from Arabidopsis articles

In [2]:
# Build the dataset of sentences from PubMed abstracts matching the Arabidopsis phenotype query.
# `path` is the output directory used by the later cells of this section.
path = "../data/corpus_related_files/brat_annotations_ath_corpus"
query = "arabidopsis AND phenotype"
limit = 100
results = search(query, limit)
id_list = results["IdList"]
papers = fetch_details(id_list)

# Two parallel lists of equal length: sentences[k] comes from the abstract of pubmed_ids[k].
sentences = []
pubmed_ids = []
for i, paper in enumerate(papers['PubmedArticle']):
    try:
        citation = paper['MedlineCitation']
        # Only the first AbstractText entry is used; any further entries are ignored.
        abstract_text = citation['Article']['Abstract']["AbstractText"][0]
    except (KeyError, IndexError):
        # Missing abstract, or an empty AbstractText list: nothing to tokenize for this article.
        continue
    # Prefer the PMID stored in the record itself. Pairing by position with id_list is only
    # correct if fetch_details returns exactly one 'PubmedArticle' per requested ID, in order,
    # so it is kept only as a fallback for records without a 'PMID' entry.
    # NOTE(review): assumes records follow the PubMed layout (MedlineCitation -> PMID) — confirm.
    pmid = str(citation['PMID']) if 'PMID' in citation else str(id_list[i])
    abstract_sentences = sent_tokenize(abstract_text)
    sentences.extend(abstract_sentences)
    pubmed_ids.extend([pmid] * len(abstract_sentences))

In [3]:
# Produce a text file that can be annotated using brat.
# The process of annotating with brat should then produce the ann file.
delimited_sentences = "\n[DELIM]\n".join(sentences)
# Write with an explicit encoding: the abstracts can contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to represent them. The context
# manager closes the file even if the write fails.
with open(os.path.join(path,"sentences.txt"), "w", encoding="utf-8") as file_to_annotate:
    file_to_annotate.write(delimited_sentences)

# Pickle the list of sentence strings, the parallel list of PubMed IDs, and the delimited
# sentence string that was written to the text file, so they can be reloaded together later.
saved_object_dict = {
    "sentences": sentences,
    "pubmed_ids": pubmed_ids,
    "delimited_sentences": delimited_sentences,
}
save_to_pickle(obj=saved_object_dict, path=os.path.join(path,"saved_objects.pickle"))

In [None]:
# Put the untagged sentences and their PubMed IDs in a table, write it as CSV (row index
# included), and preview the first 20 rows.
# NOTE(review): the maize section of this notebook writes its CSV with additional empty
# "index" and "tag" columns and index=False, whereas this cell writes only pmid/sentence with
# index=True — confirm which layout downstream code expects.
untagged_columns = {"pmid": pubmed_ids, "sentence": sentences}
df = pd.DataFrame(untagged_columns)
dataset_path = os.path.join(path, "untagged_dataset.csv")
df.to_csv(dataset_path, index=True)
print(df.head(20))

### Collecting and saving a dataset of sentences from maize articles

In [6]:
# Build the dataset of sentences from PubMed abstracts matching the maize phenotype query.
# `path` is the output directory used by the later cells of this section.
path = "../data/corpus_related_files/brat_annotations_zma_corpus"
query = "maize AND phenotype"
limit = 100
results = search(query, limit)
id_list = results["IdList"]
papers = fetch_details(id_list)

# Two parallel lists of equal length: sentences[k] comes from the abstract of pubmed_ids[k].
sentences = []
pubmed_ids = []
for i, paper in enumerate(papers['PubmedArticle']):
    try:
        citation = paper['MedlineCitation']
        # Only the first AbstractText entry is used; any further entries are ignored.
        abstract_text = citation['Article']['Abstract']["AbstractText"][0]
    except (KeyError, IndexError):
        # Missing abstract, or an empty AbstractText list: nothing to tokenize for this article.
        continue
    # Prefer the PMID stored in the record itself. Pairing by position with id_list is only
    # correct if fetch_details returns exactly one 'PubmedArticle' per requested ID, in order,
    # so it is kept only as a fallback for records without a 'PMID' entry.
    # NOTE(review): assumes records follow the PubMed layout (MedlineCitation -> PMID) — confirm.
    pmid = str(citation['PMID']) if 'PMID' in citation else str(id_list[i])
    abstract_sentences = sent_tokenize(abstract_text)
    sentences.extend(abstract_sentences)
    pubmed_ids.extend([pmid] * len(abstract_sentences))

In [7]:
# Produce a text file that can be annotated using brat.
# The process of annotating with brat should then produce the ann file.
delimited_sentences = "\n[DELIM]\n".join(sentences)
# Write with an explicit encoding: the abstracts can contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to represent them. The context
# manager closes the file even if the write fails.
with open(os.path.join(path,"sentences.txt"), "w", encoding="utf-8") as file_to_annotate:
    file_to_annotate.write(delimited_sentences)

# Pickle the list of sentence strings, the parallel list of PubMed IDs, and the delimited
# sentence string that was written to the text file, so they can be reloaded together later.
saved_object_dict = {
    "sentences": sentences,
    "pubmed_ids": pubmed_ids,
    "delimited_sentences": delimited_sentences,
}
save_to_pickle(obj=saved_object_dict, path=os.path.join(path,"saved_objects.pickle"))

In [9]:
# Put the untagged sentences and their PubMed IDs in a table alongside empty "index" and "tag"
# columns, write it as CSV (no row index), and preview the first 20 rows.
# NOTE(review): the Arabidopsis section of this notebook writes its CSV with only
# pmid/sentence columns and index=True — confirm which layout downstream code expects.
untagged_columns = {
    "index": "",
    "pmid": pubmed_ids,
    "tag": "",
    "sentence": sentences,
}
df = pd.DataFrame(untagged_columns)
dataset_path = os.path.join(path, "untagged_dataset.csv")
df.to_csv(dataset_path, index=False)
print(df.head(20))

   index      pmid tag                                           sentence
0         30699124      The evolution of maize (Zea mays L.) is highly...
1         30699124      Also, morphological and genetic traits of crop...
2         30699124      In contrast in the Tarapacá region (18-21° S),...
3         30699124      2500-400 yr BP) displayed extensive maize agri...
4         30699124      The presence of archaeological macro-botanical...
5         30699124      Thus, in this study, we ask how the morphologi...
6         30699124      To answer this, we measured and compared morph...
7         30699124      To established genetic diversity eight microsa...
8         30699124      Genetic diversity was estimated by allelic fre...
9         30699124      Differences between populations and genetic st...
10        30699124      Our results indicate significant phenotypic di...
11        30699124      This result is suggestive of an introduction o...
12        30699124      Additionally, 