# Dataset Notebook
<a href="https://colab.research.google.com/github/hacksaremeta/IS-Sentence-Completion/blob/datasets/src/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
- Provides functionality regarding data including fetch, persistence and TF2/Keras preparation utils

## DataManager Class
- Provides save and load functionality for datasets in json format

In [1]:
import os, json, logging, string
from Bio import Entrez, Medline
from typing import Any

In [2]:
class DataManager():
    """Provides save and load functionality for datasets in json format"""
    def __init__(self, email, root_dir):
        self.email = email
        self.root_dir = root_dir
        self.log = logging.getLogger(self.__class__.__name__)

    def _exists_dataset(self, name):
        """Checks whether a dataset with the given name exists"""
        if not os.path.isdir(self.root_dir):
            return False
            
        for file in os.listdir(self.root_dir):
            if file.endswith(".json"):
                with open(os.path.join(self.root_dir, file), 'r') as f:
                    content = json.load(f)
                    if content["name"] == name:
                        return True
        return False

    def _fetch_papers(self, query : str, limit : int) -> 'list[dict]':
        """Retrieves data from PubMed"""
        Entrez.email = self.email
        record = Entrez.read(Entrez.esearch(db="pubmed", term=query, retmax=limit))
        idlist = record["IdList"]

        self.log.info("\nFound %d records for %s." % (len(idlist), query.strip()))

        records = Medline.parse(Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode = "text"))
        return list(records)

    def _fetch_abstracts(self, query : str, limit : int) -> 'list[str]':
        """Retrieves abstracts from PubMed"""
        papers = self._fetch_papers(query, limit)
        list_of_abstracts = [p['AB'] for p in papers]

        return list_of_abstracts
        
    def create_dataset(self, queries : 'list[str]', name : str, limit=50, overwrite=False) -> None:
        """
        Wraps other methods in this class
        Creates a dataset from multiple queries
        Does nothing if the dataset is already present (param overwrite)
        Limits every query to <limit> results
        """
        exists_dataset = self._exists_dataset(name)
        if not exists_dataset or (exists_dataset and overwrite):
            self.log.info("Dataset does not exist, fetching from PubMed...")

            res = dict()
            res["name"] = name
            res["data"] = list()
            
            for q in queries:
                q_data = dict()
                q_data["query"] = q
                q_data["abstracts"] = self._fetch_abstracts(q, limit)
                res["data"].append(q_data)
            
            self._save_dataset(res, name)
        else:
            self.log.info("Dataset already exists, skipping fetch")

    def _save_dataset(self, dataset: dict, name : str) -> None:
        """
        Creates a file <name>.json in the dataset directory
        For JSON file structure see below
        Param dataset has a structure analogous to the JSON file
        """
        if not os.path.isdir(self.root_dir):
            os.makedirs(self.root_dir)

        with open(os.path.join(self.root_dir, name + ".json"), 'w') as f:
            json.dump(dataset, f, indent=2)
        
    def load_full_dataset(self, name : str) -> 'list[str]':
        """
        Finds the file that matches given <name> in JSON information,
        parses it, loading all abstracts into a list (one string for each abstract)
        and returns it (Error if dataset doesn't exist)
        """

        if  not self._exists_dataset(name):
            self.log.info("Dataset does not exist")
            
        else:
           with open(os.path.join(self.root_dir, name+'.json'), 'r') as file:

                abstract_list=[]
                jsonObject = json.load(file)

                data_list= jsonObject['data']

                for item in data_list:
                    abstract_list.extend(item['abstracts'])
                return abstract_list

    def load_query_from_dataset(self, name : str, query : str) -> 'list[str]':
        """Like load_full_dataset but only loads abstracts for a single query"""


        result = self._exists_dataset(name)

        if  result:

            with open(os.path.join(self.root_dir, name+'.json'), 'r') as file:

                query_abstracts=[]
                jsonObject = json.load(file)
                data_list= jsonObject['data']

                q_names = [x['query'] for x in data_list]

                if query not in q_names:
                    self.log.info("The Query that you are searching for,does not exist in the Dataset")
                else:

                      for queries in data_list:
                            if queries['query'] == query:
                              query_abstracts.extend(queries['abstracts'])
                              return query_abstracts

        else:
             self.log.info("Dataset does not exist")


    def remove_punctuation(self, name:str) -> 'list[str]':


            abstracts_list= self.load_full_dataset(name)

            for text in abstracts:

                text = text.translate(str.maketrans('', '', string.punctuation))
                abstracts_list.append(text)


            return  abstracts_list

### DataManager Example

In [3]:
# Usage example
if __name__ == "__main__":
    # Set log level
    logging.basicConfig(level=logging.DEBUG, format='[%(levelname)s] %(name)s: %(message)s')

    # Create DataManager in '../res/datasets' folder
    data_folder = os.path.join("..", "res", "datasets")
    dman = DataManager("mymail@example.com", data_folder)

    dataset_name = "RNA Dataset"
    queries = ["RNA", "mRNA", "tRNA"]

    # Gather maximum of 100 abstracts for each query
    # I would suggest around 5 - 20 abstracts in total for the small data sets
    # and maybe 500 - 5000 for the final ones but we'll have to test
    # since that depends on how long it takes to train the network
    # This only queries PubMed if data if the data is not already present
    dman.create_dataset(queries, dataset_name, 5)

    # Load the dataset
    abstracts = dman.load_full_dataset(dataset_name)
    abstracts_mrna = dman.load_query_from_dataset(dataset_name, queries[1])

    ab = dman.remove_punctuation(dataset_name)
    # Do stuff with abstracts
    pass


[INFO] DataManager: Dataset already exists, skipping fetch


In [4]:
ab

['Long noncoding RNA nuclear paraspeckle assembly transcript 1 (lncRNA NEAT1) is abnormally expressed in numerous tumors and functions as an oncogene, but the role of NEAT1 in laryngocarcinoma is largely unknown. Our study validated that NEAT1 expression was markedly upregulated in laryngocarcinoma tissues and cells. Downregulation of NEAT1 dramatically suppressed cell proliferation and invasion through inhibiting miR-524-5p expression. Additionally, NEAT1 overexpression promoted cell growth and metastasis, while overexpression of miR-524-5p could reverse the effect. NEAT1 increased the expression of histone deacetylase 1 gene (HDAC1) via sponging miR-524-5p. Mechanistically, overexpression of HDAC1 recovered the cancer-inhibiting effects of miR-524-5p mimic or NEAT1 silence by deacetylation of tensin homolog deleted on chromosome ten (PTEN) and inhibiting AKT signal pathway. Moreover, in vivo experiments indicated that silence of NEAT1 signally suppressed tumor growth. Taken together,

## DataUtils class
- Static class providing functionality for data preparation for TF2/Keras

In [5]:
class DataUtils():
    @staticmethod
    def extract_features_and_labels(sequences : 'list[list[Any]]', train_len : int) -> 'tuple[list[Any], list[Any]]':
        """
        Extracts features of size <train_len> from the sequences
        Also extracts every (<train_len>+1)-th word as labels
        Returns tuple(features, labels)
        """
        features = []
        labels = []
        for s in sequences:
            for i in range(train_len, len(s)):

                # Extract <train_len> + 1 words and
                # shift by 1 after each iteration
                # That way it generates a lot of training
                # samples from a relatively small amount of data
                ex = s[i-train_len : i+1]

                # First <train_len> words are features
                features.append(ex[:-1])
                
                # (<train_len>+1)-th word is label
                labels.append(ex[-1])
        
        return (features, labels)
