In [1]:
!pip install bioservices
from bioservices import UniProt

# imports
import numpy as np
import pandas as pd
import io
from scipy import stats
import os
import json
from pandas.io.json import build_table_schema
import csv
import collections
import re
import xml.etree.ElementTree as ET
import requests

# Notebook-wide pandas display settings.
# Fully qualified option names are used on purpose: an abbreviated name such
# as 'max_rows' is resolved as a pattern and raises OptionError when more than
# one registered option matches it (e.g. 'display.max_rows' and
# 'styler.render.max_rows').
pd.set_option('display.max_rows', 99999)
pd.set_option('display.max_colwidth', 400)
# Prints the description and current value of the option just set.
pd.describe_option('display.max_colwidth')


class BioDatasets:
    """Loaders that read biomedical dataset dumps into pandas DataFrames.

    Every loader is a static method, so it can be called on the class
    (``BioDatasets.gen_reactome()``) or on an instance. The file-based loaders
    accept an optional path; when it is omitted they read the hard-coded
    Windows location they were originally written for.
    """

    @staticmethod
    def _read_tsv_in_chunks(source, chunksize, **read_csv_kwargs):
        """Read a tab-separated source chunk by chunk into one DataFrame.

        Args:
            source: File path or file-like object handed to ``pd.read_csv``.
            chunksize: Number of rows parsed per chunk.
            **read_csv_kwargs: Extra keyword arguments for ``pd.read_csv``
                (``header``, ``names``, ``comment``, ``skiprows``, ...).

        Returns:
            A DataFrame with a fresh 0..n-1 index; an empty DataFrame when the
            reader yields no chunk.
        """
        # Collect first and concatenate once: concatenating inside the loop
        # copies every previously read row again on each iteration.
        chunks = list(pd.read_csv(source, sep='\t', chunksize=chunksize,
                                  **read_csv_kwargs))
        if not chunks:
            return pd.DataFrame()
        return pd.concat(chunks, ignore_index=True)

    @staticmethod
    def gen_reactome(path=r'C:\Datasets\REACTOME_UniProt2Reactome_All_Levels.txt'):
        """Load the Reactome UniProt-to-pathway mapping.

        Args:
            path: Location of the headerless tab-separated mapping file.

        Returns:
            DataFrame with the six columns named below.
        """
        columns = ["UNIPROT identifier", "Reactome Pathway Stable identifier",
                   "URL", "Event Name", "Evidence code", "Species"]
        return BioDatasets._read_tsv_in_chunks(
            path, 1000, header=None, names=columns)

    @staticmethod
    def gen_disgenet(path=r'C:\Datasets\DISGENET_curated_gene_disease_associations.tsv'):
        """Load the DisGeNET curated gene-disease association table.

        Args:
            path: Location of the tab-separated associations file.

        Returns:
            DataFrame with the sixteen columns named below.
        """
        columns = ["geneId", "geneSymbol", "DSI", "DPI", "diseaseId",
                   "diseaseName", "diseaseType", "diseaseClass",
                   "diseaseSemanticType", "score", "EI", "YearInitial",
                   "YearFinal", "NofPmids", "NofSnps", "source"]
        # NOTE(review): skiprows=1 drops the first physical line and header=0
        # then consumes the next non-comment line as a header (replaced by
        # `names`). If the file has only a single header line, the first data
        # row is lost -- confirm against the actual file layout.
        return BioDatasets._read_tsv_in_chunks(
            path, 1000, header=0, comment='#', skiprows=1, names=columns)

    @staticmethod
    def gen_omim(path=r'C:\Datasets\OMIM_genemap2.txt'):
        """Load the OMIM genemap2 table.

        Args:
            path: Location of the tab-separated genemap2 file.

        Returns:
            DataFrame with the fourteen columns named below.
        """
        columns = ["Chromosome", "Genomic_Position_Start",
                   "Genomic_Position_End", "Cyto_Location",
                   "Computed_Cyto_Location", "MIM_Number", "Gene_Symbols",
                   "Gene_Name", "Approved_Symbol", "Entrez_Gene_ID",
                   "Ensembl_Gene_ID", "Comments", "Phenotypes",
                   "Mouse_Gene_Symbol/ID"]
        # NOTE(review): same skiprows=1 + header=0 combination as in
        # gen_disgenet; the first non-comment line after the skipped one is
        # treated as a header and discarded -- confirm this is intended.
        return BioDatasets._read_tsv_in_chunks(
            path, 1000, header=0, comment='#', skiprows=1, names=columns)

    @staticmethod
    def gen_iid_human(path=r'C:\Datasets\IID_human_annotated_PPIs.txt'):
        """Load the IID human annotated PPI table.

        Args:
            path: Location of the tab-separated file; its first line supplies
                the column names.

        Returns:
            DataFrame with the columns found in the file header.
        """
        return BioDatasets._read_tsv_in_chunks(path, 10000, header=0)

    @staticmethod
    def gen_drug_bank(path=r'C:\Datasets\DRUGBANK_all_full_database.xml',
                      aliases_path='C:/Datasets/data/aliases.json'):
        """Parse the DrugBank XML dump into one row per drug.

        Side effect: writes a ``{drugbank_id: [alias, ...]}`` JSON mapping to
        ``aliases_path``.

        Args:
            path: Location of the DrugBank XML file.
            aliases_path: Where the alias mapping is written.

        Returns:
            DataFrame with columns drugbank_id, name, type, groups, atc_codes,
            categories, inchikey, inchi and description. Multi-valued fields
            are joined with ``|``.
        """
        # adapted from source: https://github.com/dhimmel/drugbank
        with open(path, encoding='utf8') as xml_file:
            root = ET.parse(xml_file).getroot()

        ns = '{http://www.drugbank.ca}'
        # The lookup paths do not depend on the drug, so build them once.
        primary_id_path = ns + "drugbank-id[@primary='true']"
        group_path = "{ns}groups/{ns}group".format(ns=ns)
        atc_path = "{ns}atc-codes/{ns}atc-code".format(ns=ns)
        category_path = "{ns}categories/{ns}category".format(ns=ns)
        inchi_path = ("{ns}calculated-properties/{ns}property"
                      "[{ns}kind='InChI']/{ns}value").format(ns=ns)
        inchikey_path = ("{ns}calculated-properties/{ns}property"
                         "[{ns}kind='InChIKey']/{ns}value").format(ns=ns)
        alias_paths = (
            "{ns}international-brands/{ns}international-brand".format(ns=ns),
            "{ns}synonyms/{ns}synonym[@language='English']".format(ns=ns),
            "{ns}products/{ns}product/{ns}name".format(ns=ns),
        )

        rows = []
        for drug in root:
            assert drug.tag == ns + 'drug', \
                'unexpected element {!r} under the DrugBank root'.format(drug.tag)

            # Entries without a value are dropped from the multi-valued
            # fields: '|'.join() and sorted() cannot handle None.
            groups = [g.text for g in drug.findall(group_path)]
            atc_codes = [c.get('code') for c in drug.findall(atc_path)]
            categories = [c.findtext(ns + 'category')
                          for c in drug.findall(category_path)]
            aliases = {
                elem.text
                for alias_path in alias_paths
                for elem in drug.findall(alias_path)
                if elem.text is not None
            }

            rows.append({
                'type': drug.get('type'),
                'drugbank_id': drug.findtext(primary_id_path),
                'name': drug.findtext(ns + "name"),
                'description': drug.findtext(ns + "description"),
                'groups': [g for g in groups if g is not None],
                'atc_codes': [c for c in atc_codes if c is not None],
                'categories': [c for c in categories if c is not None],
                'inchi': drug.findtext(inchi_path),
                'inchikey': drug.findtext(inchikey_path),
                'aliases': sorted(aliases),
            })

        # The alias lists are dumped before they are collapsed to strings.
        alias_dict = {row['drugbank_id']: row['aliases'] for row in rows}
        with open(aliases_path, 'w') as fp:
            json.dump(alias_dict, fp, indent=2, sort_keys=True)

        # Collapse every list-valued field into a single '|'-separated string.
        for row in rows:
            for key, value in row.items():
                if isinstance(value, list):
                    row[key] = '|'.join(value)

        columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes',
                   'categories', 'inchikey', 'inchi', 'description']
        # Passing `columns` keeps the frame well-formed even with zero drugs.
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def gen_drug_bank_protein(path=r'C:\Datasets\DRUGBANK_all_full_database.xml'):
        """Extract drug-protein relations from the DrugBank XML dump.

        One row is produced per target, enzyme, carrier or transporter that
        carries exactly one UniProtKB identifier; other proteins are skipped.

        Args:
            path: Location of the DrugBank XML file.

        Returns:
            DataFrame with columns drugbank_id, category, organism,
            known_action, actions, uniprot_id and pubmed_ids. ``actions`` and
            ``pubmed_ids`` are ``|``-joined strings.
        """
        # adapted from source: https://github.com/dhimmel/drugbank
        with open(path, encoding='utf8') as xml_file:
            root = ET.parse(xml_file).getroot()

        ns = '{http://www.drugbank.ca}'
        primary_id_path = ns + "drugbank-id[@primary='true']"
        action_path = '{ns}actions/{ns}action'.format(ns=ns)
        uniprot_path = (
            "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier"
            "[{ns}resource='UniProtKB']/{ns}identifier").format(ns=ns)
        reference_path = "{ns}references[@format='textile']".format(ns=ns)
        pubmed_pattern = re.compile(r'pubmed/([0-9]+)')

        protein_rows = []
        for drug in root:
            drugbank_id = drug.findtext(primary_id_path)
            for category in ['target', 'enzyme', 'carrier', 'transporter']:
                protein_path = '{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category)
                for protein in drug.findall(protein_path):
                    uniprot_ids = [node.text
                                   for node in protein.findall(uniprot_path)]
                    if len(uniprot_ids) != 1:
                        continue
                    # Actions without text are skipped; join() rejects None.
                    actions = [a.text for a in protein.findall(action_path)
                               if a.text is not None]
                    # str() turns a missing reference block into 'None',
                    # which simply yields no PubMed matches.
                    ref_text = str(protein.findtext(reference_path))
                    protein_rows.append({
                        'drugbank_id': drugbank_id,
                        'category': category,
                        'organism': protein.findtext('{}organism'.format(ns)),
                        'known_action': protein.findtext(
                            '{}known-action'.format(ns)),
                        'actions': '|'.join(actions),
                        'uniprot_id': uniprot_ids[0],
                        'pubmed_ids': '|'.join(pubmed_pattern.findall(ref_text)),
                    })

        columns = ['drugbank_id', 'category', 'organism', 'known_action',
                   'actions', 'uniprot_id', 'pubmed_ids']
        return pd.DataFrame(protein_rows, columns=columns)

    @staticmethod
    def gen_mondo(path=r'C:\Datasets\MONDO_with_equivalents.json'):
        """Flatten the nodes of the MONDO JSON graph dump into a DataFrame.

        Side effect: sets the pandas ``display.max_columns`` option to None.

        Args:
            path: Location of the MONDO JSON file; it must have a top-level
                ``graphs`` entry whose items contain ``nodes``.

        Returns:
            DataFrame with one row per node plus the requested graph-level
            metadata columns (missing metadata is ignored).
        """
        with open(path, encoding='utf8') as f:
            mondo = pd.DataFrame(json.load(f))

        # MONDO - json
        pd.set_option("display.max_columns", None)  # to display all the columns
        return pd.json_normalize(
            mondo["graphs"], record_path=['nodes'],
            meta=['meta', ['definition', 'xrefs', 'synonyms']],
            errors='ignore')

    @staticmethod
    def gen_drugcentral(path=r'C:\Datasets\DRUGCENTRAL_drug.target.interaction.tsv'):
        """Load the DrugCentral drug-target interaction table.

        Args:
            path: Location of the tab-separated file; its header line is
                replaced by the column names below.

        Returns:
            DataFrame with the twenty columns named below.
        """
        columns = ["DRUG_NAME", "STRUCT_ID", "TARGET_NAME", "TARGET_CLASS",
                   "ACCESSION", "GENE", "SWISSPROT", "ACT_VALUE", "ACT_UNIT",
                   "ACT_TYPE", "ACT_COMMENT", "ACT_SOURCE", "RELATION", "MOA",
                   "MOA_SOURCE", "ACT_SOURCE_URL", "MOA_SOURCE_URL",
                   "ACTION_TYPE", "TDL", "ORGANISM"]
        return BioDatasets._read_tsv_in_chunks(
            path, 1000, header=0, names=columns)

    @staticmethod
    def gen_uniprot(query="reviewed:yes"):
        """Query UniProt through bioservices and tabulate the response.

        Args:
            query: UniProt search expression. The default lists all
                UniProtKB/Swiss-Prot entries (about UniProtKB).

        Returns:
            DataFrame with the seven columns named below.
        """
        service = UniProt()
        result = service.search(query, frmt="tab")
        # "Lenght" (sic) is kept as-is: callers may select the column by name.
        columns = ["Entry", "Entry name", "Status", "Protein names",
                   "Gene names", "Organism", "Lenght"]
        # NOTE(review): header=None keeps every line of the response as data,
        # including a header line if the service sends one -- confirm.
        return BioDatasets._read_tsv_in_chunks(
            io.StringIO(result), 10000, header=None, names=columns)

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]
