In [2]:
from nltk.tokenize import sent_tokenize
import pandas as pd
import sys
import time
sys.path.append("../../oats")
from oats.pubmed.query import search, fetch_details

In [None]:
# Build the dataset of sentences from PubMed abstracts related to Arabidopsis phenotypes.
query = "arabidopsis AND phenotype"
limit = 5
results = search(query, limit)
id_list = results["IdList"]
papers = fetch_details(id_list)

# Two parallel lists of equal length: each sentence and the PubMed ID of the paper it came from.
sentences = []
pubmed_ids = []
# NOTE(review): pairing by position assumes fetch_details returns one 'PubmedArticle'
# entry per requested ID, in the same order as id_list -- confirm against oats.pubmed.query.
for i, paper in enumerate(papers['PubmedArticle']):
    # Only the record lookup is guarded, so unrelated errors raised while
    # tokenizing are not silently swallowed.
    try:
        # Only the first element of AbstractText is used.
        abstract_text = paper['MedlineCitation']['Article']['Abstract']["AbstractText"][0]
    except (KeyError, IndexError):
        # No usable abstract (a key is missing or AbstractText is empty): skip this paper.
        continue
    abstract_sentences = sent_tokenize(abstract_text)
    sentences.extend(abstract_sentences)
    # One ID per sentence keeps the two lists aligned.
    pubmed_ids.extend(id_list[i] for _ in abstract_sentences)

In [14]:
# Produce a text file that can be annotated using brat.
# The process of annotating with brat should then produce the ann file.
# Sentences are separated by a "[DELIM]" line so that character offsets in the
# annotation file can later be mapped back to a sentence position.
deliminated_sentences = "\n[DELIM]\n".join(sentences)
# The context manager closes the file even if the write fails; the encoding is
# explicit so the output does not depend on the platform's default locale.
with open("/Users/irbraun/Desktop/sentences.txt", "w", encoding="utf-8") as file_to_annotate:
    file_to_annotate.write(deliminated_sentences)

In [7]:
# Map the annotations made using brat back to the full sentences.
annotations_path = "/Users/irbraun/brat/brat-v1.3_Crunchy_Frog/data/plants/sentences.ann"
header = ["num","tag","start","end","first_word"]
# sep=r"\s+" splits on any run of whitespace; it replaces the deprecated delim_whitespace=True.
annotations = pd.read_table(annotations_path, names=header, sep=r"\s+")


def get_sentence_index(x):
    """Return the number of "[DELIM]" markers contained in the string x.

    When x is the text that precedes an annotation's start offset, this count is
    the zero-based position of the annotated sentence among the delimited sentences.
    """
    return x.count("[DELIM]")


# One sentence index per annotation, derived from its start offset.
# NOTE(review): assumes the offsets in the .ann file refer to the text held in
# deliminated_sentences -- confirm the annotated file was produced from it.
indices = [get_sentence_index(deliminated_sentences[:i]) for i in annotations["start"].values]

In [13]:
# Create a pandas dataframe with all the annotations and the PubMed IDs.
dataset_path = "~/Desktop/arabidopsis_sentence_dataset.csv"
# A set gives constant-time membership checks instead of scanning the list per sentence.
annotated_indices = set(indices)
# tag is 1 for sentences that carry at least one annotation, 0 otherwise.
tags = [int(i in annotated_indices) for i in range(len(sentences))]
# Annotated sentences come first; within each tag value rows are ordered by descending pmid.
df = pd.DataFrame({"pmid":pubmed_ids,"sentence":sentences,"tag":tags}).sort_values(
    by=["tag","pmid"], ascending=False
)
df.to_csv(dataset_path, index=False)
print(df.head(20))

        pmid                                           sentence  tag
10  29394403  Mature seeds from TCDD-treated plants had a ch...    1
14  29394403  Our findings reveal novel effects of dioxins t...    1
22  27302005  Similar to Arabidopsis WT seedlings, the suspe...    1
23  27302005  However, the exponential growth rate of the ce...    1
26  27302005  The stay-green, sugar-insensitive phenotype of...    1
0   30824017  DNA methylation carried out by different methy...    0
1   30824017  Accordingly, in Arabidopsis thaliana loss of D...    0
2   30824017  The present study describes novel growth disor...    0
3   30824017  By using an auxin responsive reporter gene, we...    0
4   30824017  In addition, we demonstrated that the defectiv...    0
5   30824017  Finally, we provided evidence of the direct an...    0
31  29487188  Splicing of pre-mRNA involves two consecutive ...    0
32  29487188  In addition to core spliceosomal proteins, eac...    0
33  29487188  Although the <i>Arab