# Analysis of the IDP Knowledge Graph

__Authors:__  
Alasdair J G Gray ([ORCID:0000-0002-5711-4872](http://orcid.org/0000-0002-5711-4872)), _Heriot-Watt University, Edinburgh, UK_

Petros Papadopoulos ([ORCID:0000-0002-8110-7576](https://orcid.org/0000-0002-8110-7576)), _Heriot-Watt University, Edinburgh, UK_

Ivan Mičetić ([ORCID:0000-0003-1691-8425](https://orcid.org/0000-0003-1691-8425)), _University of Padua, Italy_

Andras Hatos ([ORCID:0000-0001-9224-9820](https://orcid.org/0000-0001-9224-9820)), _University of Padua, Italy_

Imran Asif ([ORCID:0000-0002-1144-6265](https://orcid.org/0000-0002-1144-6265)), _Heriot-Watt University, Edinburgh, UK_


__License:__ Apache 2.0

__Acknowledgements:__ This notebook was created during the Virtual BioHackathon-Europe 2020.

## Introduction

This notebook contains SPARQL queries to perform a data analysis of the Intrinsically Disordered Protein (IDP) Knowledge Graph. The IDP knowledge graph was constructed from Bioschemas markup embedded in DisProt, MobiDb, and Protein Ensemble Database (PED) that was harvested using the Bioschemas Markup Scraper and Extractor and converted into a knowledge graph using the process in this [notebook](https://github.com/elixir-europe/BioHackathon-projects-2020/blob/master/projects/24/IDPCentral/notebooks/ETLProcess.ipynb). 

### Library Imports

In [122]:
# Import and configure logging library
from datetime import datetime
import logging
logging.basicConfig(
    filename='idpQuery.log', 
    filemode='w', 
    format='%(levelname)s:%(message)s', 
    level=logging.INFO)
logging.info('Starting processing at %s' % datetime.now().time())

import ipywidgets as widgets
from ipywidgets import Layout
from IPython.core.display import display, HTML
from IPython.display import clear_output
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import glob

In [145]:
# Imports from RDFlib
import rdflib
from rdflib import ConjunctiveGraph, plugin
from rdflib.serializer import Serializer

### Result Display Function

The following function takes the results of a `SPARQL SELECT` query and displays them using a HTML table for human viewing.

In [146]:
def displayResults(queryResult):
    HTMLResult = '<p>Number of results: ' + str(len(queryResult['results']['bindings'])) + '</p>'
    HTMLResult = HTMLResult + '<table><tr style="color:white;background-color:#43BFC7;font-weight:bold">'
    # print variable names and build header:
    for varName in queryResult['head']['vars']:
        HTMLResult = HTMLResult + '<td>' + varName + '</td>'
    HTMLResult = HTMLResult + '</tr>'
    
    # print values from each row and build table of results
    for row in queryResult['results']['bindings']:
        HTMLResult = HTMLResult + '<tr>'   
        for column in row:
            #print("COLUMN:", column)
            if column != "":
                HTMLResult = HTMLResult + '<td>' +  str(row[column]['value']) + '</td>'
            else:
                HTMLResult = HTMLResult + '<td>' + "N/A"+ '</td>'
        HTMLResult = HTMLResult + '</tr>'
    HTMLResult = HTMLResult + '</table>'
    display(HTML(HTMLResult))

## Loading IDP-KG

The data is read in from an N-QUADS file (`IDPKG.nq`). The data is expected to be in multiple named graphs, based on where the data was extracted from, with provenance data in the default graph.

In [253]:
idpKG = None
opt = ''  #selection option

def set_variable(loadingOpt, endpoint):
    global idpKG
    if loadingOpt == 'sparql':
        idpKG = SPARQLWrapper(endpoint)
        idpKG.setReturnFormat(JSON)
    else:
        idpKG = ConjunctiveGraph()
        idpKG.parse("IDPKG-Sample25.nq", format="nquads")
        #idpKG.serialize(format="json-ld") 
        logging.info("\tIDP-KG has %s statements." % len(idpKG))

def query_idpkg(query, loadingOpt):
    if loadingOpt == 'sparql':
        idpKG.setQuery(query)
        results = idpKG.queryAndConvert()
        return results
    else:
        results = idpKG.query(query)
        return json.loads(results.serialize(format="json"))
#############################################
#Create Selection GUI
rdo1 = widgets.RadioButtons(
    options=['SPARQL Endpoint:', 'Local (IDPKG-Sample25.nq)'],
    #     value='pineapple',
    #description='Pizza topping:',
    name = 'select',
    disabled=False,
    layout=Layout(width='30%')
)
    
txt = widgets.Text(
    value='http://137.195.27.15:7200/repositories/IDPKG-Full',
    placeholder='Enter endpoint',
    disabled=False,
    layout=Layout(width='75%', height='5px')
)
    
btn = widgets.Button(
    description='Execute',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Execute',
    icon='check'
)
output = widgets.Output()

def createSelectionGUI():
    btn.on_click(on_button_clicked)
    display(widgets.HBox([rdo1, txt]), widgets.VBox([btn]), output)

def on_button_clicked(e):
    with output:
        global opt
        if 'sparql' in rdo1.value.lower():
            if txt.value == '':
                display(HTML('<span style="color:red">Please enter SPARQL endpoint.</span>'))
            else:
                set_variable('sparql', txt.value)
                opt = 'sparql'
        else:
            set_variable('local', '')
            opt = 'local'
            
        clear_output(True)
        
        #Execute all queries
        for queryFile in glob.glob("../queries/*.rq"):
            with open('../queries/'+queryFile) as f:
                query = f.read()
                #print(lines)
                #displayResults(query_idpkg(query, opt))
                if 'construct' not in query.lower():
                    display(HTML('<hr />'))
                    print(queryFile)
                    try:
                        displayResults(query_idpkg(query, opt))
                    except Exception as e:
                        print(str(e))
                    #break

In [254]:
createSelectionGUI()

HBox(children=(RadioButtons(layout=Layout(width='30%'), options=('SPARQL Endpoint:', 'Local (IDPKG-Sample25.nq…

VBox(children=(Button(description='Execute', icon='check', style=ButtonStyle(), tooltip='Execute'),))

Output()

## Knowledge Graph Statistics

This section reports various statistics about the IDP-KG. The choice of statistics was inspired by the [HCLS Dataset Description Community Profile](https://www.w3.org/TR/hcls-dataset/#s6_6).

### Number of Triples

In [259]:
logging.info(' Number of Triples - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(*) AS ?triples) 
WHERE {
    GRAPH ?g {
        ?s ?p ?o 
    }
}
""", opt))
logging.info('Query Completed.')

0
triples
7709


### Number of Typed Entities

Note that we use the `DISTINCT` keyword in the query since the same entity can appear in multiple named graphs.

In [260]:
logging.info(' Number of Typed Entities - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?s) AS ?entities) 
WHERE { 
    GRAPH ?g { 
        ?s a [] 
    }
}
""", opt))
logging.info('Query Completed.')

0
entities
1584


### Number of Unique Subjects

In [257]:
logging.info(' Number of Unique Subjects - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?s) AS ?subjects) 
WHERE { 
    GRAPH ?g { 
        ?s ?p ?o
    }
}
""", opt))
logging.info('Query Completed.')

0
subjects
1668


### Number of Unique Properties

In [188]:
logging.info(' Number of Unique Properties - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?p) AS ?properties) 
WHERE { 
    GRAPH ?g { 
        ?s ?p ?o 
    }
}
""", opt))
logging.info('Query Completed.')

0
properties
33


### Number of Unique Objects

In [261]:
logging.info(' Number of Unique Objects - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?o) AS ?objects) 
WHERE { 
    GRAPH ?g { 
        ?s ?p ?o
    }
    FILTER(!isLiteral(?o))
}
""", opt))
logging.info('Query Completed.')

0
objects
1782


### Number of Unique Classes

In [262]:
logging.info(' Number of Unique Object Classes - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?o) AS ?classes) 
WHERE { 
    GRAPH ?g { 
        ?s a ?o 
    }
}
""", opt))
logging.info('Query Completed.')

0
classes
8


### Number of Unique Literals

In [263]:
logging.info(' Number of Unique Literals - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?o) AS ?objects) 
WHERE { 
    GRAPH ?g { 
        ?s ?p ?o 
    }
    FILTER(isLiteral(?o))
}
""", opt))
logging.info('Query Completed.')

0
objects
675


### Number of Graphs

In [264]:
logging.info(' Number of Unique Graphs - Query Started.')
displayResults(query_idpkg("""
SELECT (COUNT(DISTINCT ?g) AS ?graphs) 
WHERE { 
  GRAPH ?g 
    { ?s ?p ?o }
}
""", opt))
logging.info('Query Completed.')

0
graphs
85


### Instances per Class

In [265]:
logging.info(' Classes & Distinct Instances - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?Class (COUNT(DISTINCT ?s) AS ?distinctInstances) 
WHERE {
    GRAPH ?g {
        ?s a ?Class
    }
} 
GROUP BY ?Class
ORDER BY ?Class
""", opt))
logging.info('Query Completed.')

0,1
Class,distinctInstances
https://schema.org/DataCatalog,1
https://schema.org/Dataset,2
https://schema.org/DefinedTerm,80
https://schema.org/PropertyValue,669
https://schema.org/Protein,73
https://schema.org/ScholarlyArticle,76
https://schema.org/SequenceAnnotation,341
https://schema.org/SequenceRange,342


### Properties and their Occurence

In [266]:
logging.info(' Number of Unique Predicates - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?p (COUNT(?p) AS ?triples) 
WHERE {
    GRAPH ?g {
        ?s ?p ?o
    }
} 
GROUP BY ?p
ORDER BY ?p
""", opt))
logging.info('Query Completed.')

0,1
p,triples
http://purl.org/pav/createdWith,84
http://purl.org/pav/retrievedFrom,84
http://purl.org/pav/retrievedOn,84
http://rdfs.org/ns/void#inDataset,83
http://www.w3.org/1999/02/22-rdf-syntax-ns#type,1907
http://www.w3.org/2002/07/owl#sameAs,103
https://schema.org/additionalProperty,669
https://schema.org/citation,1
https://schema.org/creator,1


### Property, number of unique typed subjects, and triples

In [267]:
logging.info(' scount	stype	p	triples - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT (COUNT(DISTINCT ?s) AS ?scount) ?stype ?p (COUNT(?p) AS ?triples) 
WHERE {
    GRAPH ?g {
        ?s ?p ?o .
        ?s a ?stype 
    }
} 
GROUP BY ?p ?stype
ORDER BY ?stype ?p
""", opt))
logging.info('Query Completed.')

0,1,2,3
scount,stype,p,triples
https://schema.org/DataCatalog,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,1,1
https://schema.org/DataCatalog,https://schema.org/citation,1,1
https://schema.org/DataCatalog,https://schema.org/dataset,1,1
https://schema.org/DataCatalog,https://schema.org/dateModified,1,1
https://schema.org/DataCatalog,https://schema.org/datePublished,1,1
https://schema.org/DataCatalog,https://schema.org/description,1,1
https://schema.org/DataCatalog,https://schema.org/encodingFormat,1,2
https://schema.org/DataCatalog,https://schema.org/identifier,1,1
https://schema.org/DataCatalog,https://schema.org/keywords,1,6


### Number of Unique Typed Objects Linked to a Property

In [268]:
logging.info(' Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?p (COUNT(?p) AS ?triples) ?otype (COUNT(DISTINCT ?o) AS ?count)
WHERE {
    GRAPH ?g {
        ?s ?p ?o .
        ?o a ?otype
    }
} 
GROUP BY ?p ?otype
ORDER BY ?p
""", opt))
logging.info('Query Completed.')

0,1,2,3
p,triples,otype,count
https://schema.org/additionalProperty,https://schema.org/PropertyValue,669,669
https://schema.org/dataset,https://schema.org/Dataset,1,1
https://schema.org/hasSequenceAnnotation,https://schema.org/SequenceAnnotation,360,341
https://schema.org/sequenceLocation,https://schema.org/SequenceRange,342,342
https://schema.org/subjectOf,https://schema.org/ScholarlyArticle,246,76
https://schema.org/value,https://schema.org/DefinedTerm,669,80


### Triples and Number of Unique Literals Related to a Property

In [269]:
logging.info(' Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?p (COUNT(?p) AS ?triples) (COUNT(DISTINCT ?o) AS ?literals)
WHERE {
    GRAPH ?g {
        ?s ?p ?o
    }
    FILTER (isLiteral(?o))
} 
GROUP BY ?p
ORDER BY ?p
""", opt))
logging.info('Query Completed.')

0,1,2
p,triples,literals
http://purl.org/pav/retrievedOn,84,80
http://rdfs.org/ns/void#inDataset,55,2
https://schema.org/dateModified,2,1
https://schema.org/datePublished,1,1
https://schema.org/description,50,2
https://schema.org/encodingFormat,2,2
https://schema.org/hasBioPolymerSequence,107,81
https://schema.org/identifier,105,76
https://schema.org/keywords,13,7


### Number of Unique Subject Types Linked to Unique Object Types

In [270]:
logging.info(' Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT (COUNT(DISTINCT ?s) AS ?scount) ?stype ?p ?otype (COUNT(DISTINCT ?o) AS ?ocount)
WHERE {
    GRAPH ?g {
        ?s ?p ?o .
        ?s a ?stype .
        ?o a ?otype .
    }
} 
GROUP BY ?p ?stype ?otype
ORDER BY ?p
""", opt))
logging.info('Query Completed.')

0,1,2,3,4
scount,stype,p,otype,ocount
https://schema.org/SequenceAnnotation,https://schema.org/additionalProperty,https://schema.org/PropertyValue,341,669
https://schema.org/DataCatalog,https://schema.org/dataset,https://schema.org/Dataset,1,1
https://schema.org/Protein,https://schema.org/hasSequenceAnnotation,https://schema.org/SequenceAnnotation,62,341
https://schema.org/SequenceAnnotation,https://schema.org/sequenceLocation,https://schema.org/SequenceRange,341,342
https://schema.org/SequenceAnnotation,https://schema.org/subjectOf,https://schema.org/ScholarlyArticle,246,76
https://schema.org/PropertyValue,https://schema.org/value,https://schema.org/DefinedTerm,669,80


## Data Content Statistics

The previous section gave generic dataset statistics. We will now focus on information about the data content that is of interest to the IDP community.

### Number of Distinct Proteins
Retrieve the number of distinct proteins in the IDP-KG.

_Note that a protein can be present in multiple datasets._

In [271]:
logging.info(' Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT (COUNT(DISTINCT ?s) AS ?Proteins) 
WHERE {
    GRAPH ?g {
        ?s a schema:Protein
    }
} 
""", opt))
logging.info('Query Completed.')

0
Proteins
73


## Analysis of Proteins

The queries in this section focus on the proteins contained in the Knowledge Graph.

### Proteins per Dataset

Display the number of proteins per dataset

In [272]:
logging.info(' Proteins per Dataset - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?dataset (COUNT(DISTINCT ?s) AS ?Proteins) 
WHERE {
    GRAPH ?g {
        ?s a schema:Protein
    }
    ?g void:inDataset ?dataset
} 
GROUP BY ?dataset
""", opt))
logging.info('Query Completed.')

0,1
dataset,Proteins
https://mobidb.org/#2020-09,28
https://proteinensemble.org/#2021-02-12,20
https://disprot.org/#2020-12,26


### Proteins from Multiple Datasets

A protein comes from multiple sources if the triple is found in multiple named graphs. The number of named graphs containing the triple indicates the number of sources containing the triple.

In [273]:
logging.info(' Proteins Dataset & Number of Datasets - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?protein (COUNT(?g) as ?numDatasets) (GROUP_CONCAT(?dataset;SEPARATOR=", ") AS ?datasets)
WHERE {
    GRAPH ?g {
        ?protein a schema:Protein .
    }
    ?g void:inDataset ?dataset .
}
GROUP BY ?protein
HAVING (COUNT(*) > 1)
ORDER BY ?numDatasets
""", opt))
logging.info('Query Completed.')

0,1,2
protein,numDatasets,datasets
https://idpcentral.org/id/Q5L4K5,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/P03265,2,"https://disprot.org/#2020-12, https://mobidb.org/#2020-09"
https://idpcentral.org/id/P37840,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/Q16143,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/P42212,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/P09525-1,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/P38634,4,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/P12296,4,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"
https://idpcentral.org/id/O14558,5,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"


### Proteins from Multiple Pages

A protein comes from multiple pages (sources) if the triple is found in multiple named graphs. The number of named graphs containing the triple indicates the number of sources containing the triple.

_Note that a protein can come from multiple pages within the same dataset._

In [274]:
logging.info(' Proteins Sources & Number of sources - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?protein (COUNT(?g) as ?numSources) (GROUP_CONCAT(?source;SEPARATOR=", ") AS ?sources)
WHERE {
    GRAPH ?g {
        ?protein a schema:Protein .
    }
    ?g pav:retrievedFrom ?source .
}
GROUP BY ?protein
HAVING (COUNT(*) > 1)
ORDER BY ?numSources
""", opt))
logging.info('Query Completed.')

0,1,2
protein,numSources,sources
https://idpcentral.org/id/Q5L4K5,2,"https://proteinensemble.org/PED00025, https://proteinensemble.org/PED00011"
https://idpcentral.org/id/P03265,2,"https://disprot.org/DP00003, https://mobidb.org/P03265"
https://idpcentral.org/id/P37840,2,"https://proteinensemble.org/PED00006, https://proteinensemble.org/PED00024"
https://idpcentral.org/id/Q16143,2,"https://proteinensemble.org/PED00006, https://proteinensemble.org/PED00003"
https://idpcentral.org/id/P42212,2,"https://proteinensemble.org/PED00010, https://proteinensemble.org/PED00007"
https://idpcentral.org/id/P09525-1,2,"https://proteinensemble.org/PED00010, https://proteinensemble.org/PED00007"
https://idpcentral.org/id/P38634,4,"https://proteinensemble.org/PED00014, https://proteinensemble.org/PED00023, https://proteinensemble.org/PED00001, https://proteinensemble.org/PED00001"
https://idpcentral.org/id/P12296,4,"https://proteinensemble.org/PED00009, https://proteinensemble.org/PED00026, https://proteinensemble.org/PED00012, https://proteinensemble.org/PED00015"
https://idpcentral.org/id/O14558,5,"https://proteinensemble.org/PED00002, https://proteinensemble.org/PED00019, https://proteinensemble.org/PED00021, https://proteinensemble.org/PED00005, https://proteinensemble.org/PED00008"


### Minimal Protein Information

Retreive a minimal amount of information about the proteins.

In [275]:
logging.info(' Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT  ?s ?name ?description
    (GROUP_CONCAT(DISTINCT ?identifier;SEPARATOR=',<br/>') AS ?identifiers)
    ?associatedDisease
    ?encodedBy
    ?taxonomicRange
    (GROUP_CONCAT(DISTINCT ?sameAs;SEPARATOR=',<br/>') AS ?sameAs)
    (GROUP_CONCAT(DISTINCT ?source;SEPARATOR=',<br/>') AS ?sources)
    (GROUP_CONCAT(DISTINCT ?dataset;SEPARATOR=',<br/>') AS ?datasets)
WHERE {
    GRAPH ?g {
# Bioschemas Minimal Properties
        ?s a schema:Protein .
        OPTIONAL {?s schema:identifier ?identifier }
        OPTIONAL {?s schema:name ?name }
## Bioschemas Recommended properties
        OPTIONAL {?s schema:associatedDisease ?associatedDisease}
        OPTIONAL {?s schema:description ?description}
        OPTIONAL {?s schema:isEncodedByBioChemEntity ?encodedBy}
        OPTIONAL {?s schema:taxonomicRange ?taxonomicRange }
        OPTIONAL {?s schema:url ?url}
        OPTIONAL {?s schema:sameAs ?sameAs }
    }
    ?g pav:retrievedFrom ?source
    OPTIONAL {?g void:inDataset ?dataset}
}
GROUP BY ?s
""", opt))
logging.info(' Query Completed.')

0,1,2,3,4,5,6,7,8,9
s,name,description,identifiers,associatedDisease,encodedBy,taxonomicRange,sameAs,sources,datasets
https://idpcentral.org/id/A0A0G2JXC5,https://identifiers.org/taxonomy:10116,https://identifiers.org/mobidb:A0A0G2JXC5,"https://mobidb.org/A0A0G2JXC5, http://purl.uniprot.org/uniprot/A0A0G2JXC5",https://mobidb.org/A0A0G2JXC5,https://mobidb.org/#2020-09,,,,
https://idpcentral.org/id/P38634,Cell division control protein 4,"https://identifiers.org/uniprot:P38634, https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286","http://purl.uniprot.org/uniprot/P38634, http://purl.uniprot.org/uniprot/P52286, https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P38634_A_1, https://proteinensemble.org/PED00014#P52286_C_0, http://purl.uniprot.org/uniprot/P07834, https://proteinensemble.org/PED00023#P38634_A_1, https://proteinensemble.org/PED00001#P38634_A_1","https://proteinensemble.org/PED00014, https://proteinensemble.org/PED00023, https://proteinensemble.org/PED00001",https://proteinensemble.org/#2021-02-12,,,,
https://idpcentral.org/id/P52286,Cell division control protein 4,"https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286, https://identifiers.org/uniprot:P38634","https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P38634_A_1, https://proteinensemble.org/PED00014#P52286_C_0, http://purl.uniprot.org/uniprot/P07834, http://purl.uniprot.org/uniprot/P52286, http://purl.uniprot.org/uniprot/P38634",https://proteinensemble.org/PED00014,https://proteinensemble.org/#2021-02-12,,,,
https://idpcentral.org/id/P07834,Cell division control protein 4,"https://identifiers.org/uniprot:P38634, https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286","http://purl.uniprot.org/uniprot/P52286, http://purl.uniprot.org/uniprot/P38634, https://proteinensemble.org/PED00014#P38634_A_1, http://purl.uniprot.org/uniprot/P07834, https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P52286_C_0",https://proteinensemble.org/PED00014,https://proteinensemble.org/#2021-02-12,,,,
https://idpcentral.org/id/A5YKK6,CCR4-NOT transcription complex subunit 1,https://identifiers.org/taxonomy:9606,https://identifiers.org/mobidb:A5YKK6,"http://purl.uniprot.org/uniprot/A5YKK6, https://mobidb.org/A5YKK6",https://mobidb.org/A5YKK6,https://mobidb.org/#2020-09,,,
https://idpcentral.org/id/P13551,Elongation factor G,https://bioschemas.org/crawl/v1/disprot/DP00021/20210813/23/disprot.org/DP00021/2054333771,https://identifiers.org/disprot:DP00021,"http://purl.uniprot.org/uniprot/P13551, https://disprot.org/DP00021",https://disprot.org/DP00021,https://disprot.org/#2020-12,,,
https://idpcentral.org/id/A0A0H3CFC9,https://identifiers.org/taxonomy:565050,https://identifiers.org/mobidb:A0A0H3CFC9,"http://purl.uniprot.org/uniprot/A0A0H3CFC9, https://mobidb.org/A0A0H3CFC9",https://mobidb.org/A0A0H3CFC9,https://mobidb.org/#2020-09,,,,
https://idpcentral.org/id/A5YV76,3-hydroxyacyl-[acyl-carrier-protein] dehydratase,https://identifiers.org/taxonomy:9823,https://identifiers.org/mobidb:A5YV76,"http://purl.uniprot.org/uniprot/A5YV76, https://mobidb.org/A5YV76",https://mobidb.org/A5YV76,https://mobidb.org/#2020-09,,,
https://idpcentral.org/id/P49913,Cathelicidin antimicrobial peptide,https://bioschemas.org/crawl/v1/disprot/DP00004/20210813/8/disprot.org/DP00004/1514688226,https://identifiers.org/disprot:DP00004,"http://purl.uniprot.org/uniprot/P49913, https://disprot.org/DP00004",https://disprot.org/DP00004,https://disprot.org/#2020-12,,,


### Full Protein Information

Retrieve basic information about the proteins in the knowledge graph.

In [276]:
logging.info(' Full Protein Information - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT  ?s ?name ?description
    (GROUP_CONCAT(DISTINCT ?identifier;SEPARATOR=',<br/>') AS ?identifiers)
    ?associatedDisease
    (GROUP_CONCAT(DISTINCT ?annotation;SEPARATOR=',<br/>') AS ?annotations)
    ?encodedBy
    ?taxonomicRange
    ?url
    ?alternateName
    ?bioChemInteraction
    ?bioChemSimilarity
    ?bioChemEntity
    (GROUP_CONCAT(DISTINCT ?sequence;SEPARATOR=',<br/>') AS ?sequences)
    ?molFunction
    ?representation
    ?image
    ?process
    ?cellularLocation
    ?parentEntity
    (GROUP_CONCAT(DISTINCT ?sameAs;SEPARATOR=',<br/>') AS ?sameAs)
    (GROUP_CONCAT(DISTINCT ?source;SEPARATOR=',<br/>') AS ?sources)
    (GROUP_CONCAT(DISTINCT ?dataset;SEPARATOR=',<br/>') AS ?datasets)
WHERE {
    GRAPH ?g {
# Bioschemas Minimal Properties
        ?s a schema:Protein .
        OPTIONAL {?s schema:identifier ?identifier }
        OPTIONAL {?s schema:name ?name }
## Bioschemas Recommended properties
        OPTIONAL {?s schema:associatedDisease ?associatedDisease}
        OPTIONAL {?s schema:description ?description}
        #OPTIONAL 
        {?s schema:hasSequenceAnnotation ?annotation }
        OPTIONAL {?s schema:isEncodedByBioChemEntity ?encodedBy}
        OPTIONAL {?s schema:taxonomicRange ?taxonomicRange }
        OPTIONAL {?s schema:url ?url}
## Bioschemas Optional properties
        OPTIONAL {?s schema:alternateName ?alternateName}
        OPTIONAL {?s schema:bioChemInteraction ?bioChemInteraction}
        OPTIONAL {?s schema:bioChemSimilarity ?bioChemSimilarity}
        OPTIONAL {?s schema:hasBioChemEntityPart ?bioChemEntity}
        OPTIONAL {?s schema:hasBioPolymerSequence ?sequence}
        OPTIONAL {?s schema:hasMolecularFunction ?molFunction}
        OPTIONAL {?s schema:hasRepresentation ?representation }
        OPTIONAL {?s schema:image ?image}
        OPTIONAL {?s schema:isInvolvedInBiologicalProcess ?process}
        OPTIONAL {?s schema:isLocatedInSubcellularLocation ?cellularLocation}
        OPTIONAL {?s schema:isPartOfBioChemEntity ?parentEntity}
        OPTIONAL {?s schema:sameAs ?sameAs }
    }
    ?g pav:retrievedFrom ?source ;
    OPTIONAL {?g void:inDataset ?dataset}
}
GROUP BY ?s
""", opt))
logging.info('Query Completed.')

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
s,name,description,identifiers,associatedDisease,annotations,encodedBy,taxonomicRange,url,alternateName,bioChemInteraction,bioChemSimilarity,bioChemEntity,sequences,molFunction,representation,image,process,cellularLocation,parentEntity,sameAs,sources,datasets
https://idpcentral.org/id/A0A0G2JXC5,https://identifiers.org/taxonomy:10116,https://identifiers.org/mobidb:A0A0G2JXC5,"https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.272_366, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.402_455, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.713_746, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.559_583, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.781_854, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.871_923, https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.220_256",MGGVTITKAKVDFSSVVCLPPSVIAVNGLDGGGAGENDEEPVLLSLSAAPSPQSEAVANELQELSLQPELTLGLHPGRNPNLPPLSERKNVLQLKLQQRRTREELVSQGIMPPLKSPAAFHEQRRSLERARTEDYLKRKIRSRPERSELVRMHILEETSAEPSLQAKQLKLKRARLADDLNEKIAQRPGPMELVEKNILPVESSLKEALIVGQVNYPKVADSSSFDEDSSDALSPEQPASHESQGSVPSPLESRASDLLPSATSISPTQVLSQLPMAPDPGETLFLAEQPPLPPPPLLPPSLTSGSIVPTAKPAPTLIKQSQPKSASEKSQRSKKAKELKPKVKKLKYHQYIPPDQKQDKGAPAMDSSYAKILQQQQLFLQLQILNQQQQQQQQQHYNYQAILPAPPKPSGETPGSSAPTPSRSLSTSSSSSSGTPGPGGLARQNSTALAGKPGALPANLDDMKVAELKQELKLRSLPVSGTKTELIERLRAYQDQVSPAPGAPKAPATTSVLSKAGEVVVAFPAALLSTGSALVTAGLAPAEMVVATVTSNGMVKFGSTGSTPPVSPTPSERSLLSTGDENSTPGDAFGEMVTSPLTQLTLQASPLQIVKEEGARAASCCLSPGARAELEGLDKDQMLQEKDKQIEELTRMLQQKQQLVELLRLQLEQQKRAQQPAPASSPVKRESSFSSCQLSCQPQGAARAFGPGLVVPTTNHGDAQAPAPESPPVVVKQEAGPPEPDLAPASQLLLGSQGTSFLKKVSPPTLVTDSTGTHLILTVTNKSADGPGLPTGSPQQPLSQPGSPAPGPPAQMDLEHPPQPSFATPTSLLKKEPPGYEETVTQQPKQQENGSSSQHMDDLFDILIQSGEISADFKEPPSLPGKEKSPPAEAYGPPLTPQPSPLSELPQAAPPPGSPTLPGRLEDFLESSTGLPLLTSGHEGPEPLSLIDDLHSQMLSSSAILDHPPSPMDTSELHFAPEPSSGMGLDLAVGHLDSMDWLELSSGGPVLSLAPLSTTAPSLFSMDFLDGHDLQLHWDSCL,"https://mobidb.org/A0A0G2JXC5, http://purl.uniprot.org/uniprot/A0A0G2JXC5",https://mobidb.org/A0A0G2JXC5,https://mobidb.org/#2020-09,,,,,,,,,,,,,,,
https://idpcentral.org/id/P38634,Cell division control protein 4,"https://identifiers.org/uniprot:P38634, https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286","https://proteinensemble.org/PED00014#P52286_C_0_4_186, https://proteinensemble.org/PED00014#P38634_A_1_1_90, https://proteinensemble.org/PED00014#P07834_D_0_270_744, https://proteinensemble.org/PED00023#P38634_A_1_1_90, https://proteinensemble.org/PED00001#P38634_A_1_1_90","MTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRT, SNVVLVSGEGERFTVDKKIAERSLLLKNYLNDMHDSNLQNNSDSESDSDSETNHKSKDNNNGDDDDEDDDEIVMPVPNVRSSVLQKVIEWAEHHRDSNFPDEDDDDSRKSAPVDSWDREFLKVDQEMLYEIILAANYLNIKPLLDAGCKVVAEMIRGRSPEEIRRTFNIVNDFTPEEEAAIRR, LKRDLITSLPFEISLKIFNYLQFEDIINSLGVSQNWNKIIRKSTSLWKKLLISENFVSPKGFNSLNLKLSQKYPKLSQQDRLRLSFLENIFILKNWYNPKFVPQRTTLRGHMTSVITCLQFEDNYVITGADDKMIRVYDSINKKFLLQLSGHDGGVWALKYAHGGILVSGSTDRTVRVWDIKKGCCTHVFKGHNSTVRCLDIVEYKNIKYIVTGSRDNTLHVWKLPKESSVPDHGEEHDYPLVFHTPEENPYFVGVLRGHMASVRTVSGHGNIVVSGSYDNTLIVWDVAQMKCLYILSGHTDRIYSTIYDHERKRCISASMDTTIRIWDLENIWNNGECSYATNSASPCAKILGAMYTLQGHTALVGLLRLSDKFLVSAAADGSIRGWDANDYSRKFSYHHTNLSAITTFYVSDNILVSGSENQFNIYNLRSGKLVHANILKDADQIWSVNFKGKTLVAAVEKDGQSFLEILDFS","http://purl.uniprot.org/uniprot/P38634, http://purl.uniprot.org/uniprot/P52286, https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P38634_A_1, https://proteinensemble.org/PED00014#P52286_C_0, http://purl.uniprot.org/uniprot/P07834, https://proteinensemble.org/PED00023#P38634_A_1, https://proteinensemble.org/PED00001#P38634_A_1","https://proteinensemble.org/PED00014, https://proteinensemble.org/PED00023, https://proteinensemble.org/PED00001",https://proteinensemble.org/#2021-02-12,,,,,,,,,,,,,,,
https://idpcentral.org/id/P52286,Cell division control protein 4,"https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286, https://identifiers.org/uniprot:P38634","https://proteinensemble.org/PED00014#P38634_A_1_1_90, https://proteinensemble.org/PED00014#P52286_C_0_4_186, https://proteinensemble.org/PED00014#P07834_D_0_270_744","SNVVLVSGEGERFTVDKKIAERSLLLKNYLNDMHDSNLQNNSDSESDSDSETNHKSKDNNNGDDDDEDDDEIVMPVPNVRSSVLQKVIEWAEHHRDSNFPDEDDDDSRKSAPVDSWDREFLKVDQEMLYEIILAANYLNIKPLLDAGCKVVAEMIRGRSPEEIRRTFNIVNDFTPEEEAAIRR, MTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRT, LKRDLITSLPFEISLKIFNYLQFEDIINSLGVSQNWNKIIRKSTSLWKKLLISENFVSPKGFNSLNLKLSQKYPKLSQQDRLRLSFLENIFILKNWYNPKFVPQRTTLRGHMTSVITCLQFEDNYVITGADDKMIRVYDSINKKFLLQLSGHDGGVWALKYAHGGILVSGSTDRTVRVWDIKKGCCTHVFKGHNSTVRCLDIVEYKNIKYIVTGSRDNTLHVWKLPKESSVPDHGEEHDYPLVFHTPEENPYFVGVLRGHMASVRTVSGHGNIVVSGSYDNTLIVWDVAQMKCLYILSGHTDRIYSTIYDHERKRCISASMDTTIRIWDLENIWNNGECSYATNSASPCAKILGAMYTLQGHTALVGLLRLSDKFLVSAAADGSIRGWDANDYSRKFSYHHTNLSAITTFYVSDNILVSGSENQFNIYNLRSGKLVHANILKDADQIWSVNFKGKTLVAAVEKDGQSFLEILDFS","https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P38634_A_1, https://proteinensemble.org/PED00014#P52286_C_0, http://purl.uniprot.org/uniprot/P07834, http://purl.uniprot.org/uniprot/P52286, http://purl.uniprot.org/uniprot/P38634",https://proteinensemble.org/PED00014,https://proteinensemble.org/#2021-02-12,,,,,,,,,,,,,,,
https://idpcentral.org/id/P07834,Cell division control protein 4,"https://identifiers.org/uniprot:P38634, https://identifiers.org/uniprot:P07834, https://identifiers.org/uniprot:P52286","https://proteinensemble.org/PED00014#P07834_D_0_270_744, https://proteinensemble.org/PED00014#P38634_A_1_1_90, https://proteinensemble.org/PED00014#P52286_C_0_4_186","LKRDLITSLPFEISLKIFNYLQFEDIINSLGVSQNWNKIIRKSTSLWKKLLISENFVSPKGFNSLNLKLSQKYPKLSQQDRLRLSFLENIFILKNWYNPKFVPQRTTLRGHMTSVITCLQFEDNYVITGADDKMIRVYDSINKKFLLQLSGHDGGVWALKYAHGGILVSGSTDRTVRVWDIKKGCCTHVFKGHNSTVRCLDIVEYKNIKYIVTGSRDNTLHVWKLPKESSVPDHGEEHDYPLVFHTPEENPYFVGVLRGHMASVRTVSGHGNIVVSGSYDNTLIVWDVAQMKCLYILSGHTDRIYSTIYDHERKRCISASMDTTIRIWDLENIWNNGECSYATNSASPCAKILGAMYTLQGHTALVGLLRLSDKFLVSAAADGSIRGWDANDYSRKFSYHHTNLSAITTFYVSDNILVSGSENQFNIYNLRSGKLVHANILKDADQIWSVNFKGKTLVAAVEKDGQSFLEILDFS, SNVVLVSGEGERFTVDKKIAERSLLLKNYLNDMHDSNLQNNSDSESDSDSETNHKSKDNNNGDDDDEDDDEIVMPVPNVRSSVLQKVIEWAEHHRDSNFPDEDDDDSRKSAPVDSWDREFLKVDQEMLYEIILAANYLNIKPLLDAGCKVVAEMIRGRSPEEIRRTFNIVNDFTPEEEAAIRR, MTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRT","http://purl.uniprot.org/uniprot/P52286, http://purl.uniprot.org/uniprot/P38634, https://proteinensemble.org/PED00014#P38634_A_1, http://purl.uniprot.org/uniprot/P07834, https://proteinensemble.org/PED00014#P07834_D_0, https://proteinensemble.org/PED00014#P52286_C_0",https://proteinensemble.org/PED00014,https://proteinensemble.org/#2021-02-12,,,,,,,,,,,,,,,
https://idpcentral.org/id/A5YKK6,CCR4-NOT transcription complex subunit 1,https://identifiers.org/taxonomy:9606,https://identifiers.org/mobidb:A5YKK6,"https://mobidb.org/A5YKK6#prediction-disorder-mobidb_lite.1315_1352, https://mobidb.org/A5YKK6#prediction-disorder-mobidb_lite.720_768",MNLDSLSLALSQISYLVDNLTKKNYRASQQEIQHIVNRHGPEADRHLLRCLFSHVDFSGDGKSSGKDFHQTQFLIQECALLITKPNFISTLSYAIDNPLHYQKSLKPAPHLFAQLSKVLKLSKVQEVIFGLALLNSSSSDLRGFAAQFIKQKLPDLLRSYIDADVSGNQEGGFQDIAIEVLHLLLSHLLFGQKGAFGVGQEQIDAFLKTLRRDFPQERCPVVLAPLLYPEKRDILMDRILPDSGGVAKTMMESSLADFMQEVGYGFCASIEECRNIIVQFGVREVTAAQVARVLGMMARTHSGLTDGIPLQSISAPGSGIWSDGKDKSDGAQAHTWNVEVLIDVLKELNPSLNFKEVTYELDHPGFQIRDSKGLHNVVYGIQRGLGMEVFPVDLIYRPWKHAEGQLSFIQHSLINPEIFCFADYPCHTVATDILKAPPEDDNREIATWKSLDLIESLLRLAEVGQYEQVKQLFSFPIKHCPDMLVLALLQINTSWHTLRHELISTLMPIFLGNHPNSAIILHYAWHGQGQSPSIRQLIMHAMAEWYMRGEQYDQAKLSRILDVAQDLKALSMLLNGTPFAFVIDLAALASRREYLKLDKWLTDKIREHGEPFIQACMTFLKRRCPSILGGLAPEKDQPKSAQLPPETLATMLACLQACAGSVSQELSETILTMVANCSNVMNKARQPPPGVMPKGRPPSASSLDAISPVQIDPLAGMTSLSIGGSAAPHTQSMQGFPPNLGSAFSTPQSPAKAFPPLSTPNQTTAFSGIGGLSSQLPVGGLGTGSLTGIGTGALGLPAVNNDPFVQRKLGTSGLNQPTFQQSKMKPSDLSQVWPEANQHFSKEIDDEANSYFQRIYNHPPHPTMSVDEVLEMLQRFKDSTIKREREVFNCMLRNLFEEYRFFPQYPDKELHITACLFGGIIEKGLVTYMALGLALRYVLEALRKPFGSKMYYFGIAALDRFKNRLKDYPQYCQHLASISHFMQFPHHLQEYIEYGQQSRDPPVKMQGSITTPGSIALAQAQAQAQVPAKAPLAGQVSTMVTTSTTTTVAKTVTVTRPTGVSFKKDVPPSINTTNIDTLLVATDQTERIVEPPENIQEKIAFIFNNLSQSNMTQKVEELKETVKEEFMPWVSQYLVMKRVSIEPNFHSLYSNFLDTLKNPEFNKMVLNETYRNIKVLLTSDKAAANFSDRSLLKNLGHWLGMITLAKNKPILHTDLDVKSLLLEAYVKGQQELLYVVPFVAKVLESSIRSVVFRPPNPWTMAIMNVLAELHQEHDLKLNLKFEIEVLCKNLALDINELKPGNLLKDKDRLKNLDEQLSAPKKDVKQPEELPPITTTTTSTTPATNTTCTATVPPQPQYSYHDINVYSLAGLAPHITLNPTIPLFQAHPQLKQCVRQAIERAVQELVHPVVDRSIKIAMTTCEQIVRKDFALDSEESRMRIAAHHMMRNLTAGMAMITCREPLLMSISTNLKNSFASALRTASPQQREMMDQAAAQLAQDNCELACCFIQKTAVEKAGPEMDKRLATEFELRKHARQEGRRYCDPVVLTYQAERMPEQIRLKVGGVDPKQLAVYEEFARNVPGFLPTNDLSQPTGFLAQPMKQAWATDDVAQIYDKCITELEQHLHAIPPTLAMNPQAQALRSLLEVVVLSRNSRDAIAALGLLQKAVEGLLDATSGADADLLLRYRECHLLVLKALQDGRAYGSPWCNKQITRCLIECRDEYKYNVEAVELLIRNHLVNMQQYDLHLAQSMENGLNYMAVAFAMQLVKILLVDERSVAHVTEADLFHTIETLMRINAHSRGNAPEGLPQLMEVVRSNYEAMIDRAHGGPNFMMHSGISQASEYDDPPGLREKAEYLLREWVNLYHSAAAGRDSTKAFSAFVGQMHQQGILKTDDLITRFFRLCTEMCVEISYRAQAEQQHNPAANPTMIRAKCYHNLDAFVRLIALLVKHSGEATNTVTKINLLNKVLGIVVGVLLQDHDVRQSEFQQLPYHRIFIMLLLELNAPEHVLETINFQTLTAFCNTFHILRPTKAPGFVYAWLELISHRIFIARMLAHTPQQKGWPMYAQLLIDLFKYLAPFLRNVELTKPMQILYKGTLRVLLVLLHDFPEFLCDYHYGFCDVIPPNCIQLRNLILSAFPRNMRLPDPFTPNLKVDMLSEINIAPRILTNFTGVMPPQFKKDLDSYLKTRSPVTFLSDLRSNLQVSNEPGNRYNLQLINALVLYVGTQAIAHIHNKGSTPSMSTITHSAHMDIFQNLAVDLDTEGRYLFLNAIANQLRYPNSHTHYFSCTMLYLFAEANTEAIQEQITRVLLERLIVNRPHPWGLLITFIELIKNPAFKFWNHEFVHCAPEIEKLFQSVAQCCMGQKQAQQVMEGTGAS,"http://purl.uniprot.org/uniprot/A5YKK6, https://mobidb.org/A5YKK6",https://mobidb.org/A5YKK6,https://mobidb.org/#2020-09,,,,,,,,,,,,,,
https://idpcentral.org/id/P13551,Elongation factor G,https://bioschemas.org/crawl/v1/disprot/DP00021/20210813/23/disprot.org/DP00021/2054333771,https://identifiers.org/disprot:DP00021,"https://disprot.org/DP00021r003, https://disprot.org/DP00021r005, https://disprot.org/DP00021r002, https://disprot.org/DP00021r004",MAVKVEYDLKRLRNIGIAAHIDAGKTTTTERILYYTGRIHKIGEVHEGAATMDFMEQERERGITITAAVTTCFWKDHRINIIDTPGHVDFTIEVERSMRVLDGAIVVFDSSQGVEPQSETVWRQAEKYKVPRIAFANKMDKTGADLWLVIRTMQERLGARPVVMQLPIGREDTFSGIIDVLRMKAYTYGNDLGTDIREIPIPEEYLDQAREYHEKLVEVAADFDENIMLKYLEGEEPTEEELVAAIRKGTIDLKITPVFLGSALKNKGVQLLLDAVVDYLPSPLDIPPIKGTTPEGEVVEIHPDPNGPLAALAFKIMADPYVGRLTFIRVYSGTLTSGSYVYNTTKGRKERVARLLRMHANHREEVEELKAGDLGAVVGLKETITGDTLVGEDAPRVILESIEVPEPVIDVAIEPKTKADQEKLSQALARLAEEDPTFRVSTHPETGQTIISGMGELHLEIIVDRLKREFKVDANVGKPQVAYRETITKPVDVEGKFIRQTGGRGQYGHVKIKVEPLPRGSGFEFVNAIVGGVIPKEYIPAVQKGIEEAMQSGPLIGFPVVDIKVTLYDGSYHEVDSSEMAFKIAGSMAIKEAVQKGDPVILEPIMRVEVTTPEEYMGDVIGDLNARRGQILGMEPRGNAQVIRAFVPLAEMFGYATDLRSKTQGRGSFVMFFDHYQEVPKQVQEKLIKGQ,"http://purl.uniprot.org/uniprot/P13551, https://disprot.org/DP00021",https://disprot.org/DP00021,https://disprot.org/#2020-12,,,,,,,,,,,,,,
https://idpcentral.org/id/A5YV76,3-hydroxyacyl-[acyl-carrier-protein] dehydratase,https://identifiers.org/taxonomy:9823,https://identifiers.org/mobidb:A5YV76,https://mobidb.org/A5YV76#prediction-disorder-mobidb_lite.2191_2212,MEEVVIAGMSGKLPESENLEEFWANLIGGVDMVTADDRRWKAGLYGLPRRMGKLKDLSRFDASFFGVHSKQANTMDPQLRMLLEVTYEAIVDGGINPASLRGTSTGVWVGVSSSDASEALSRDPETLVGYSMIGCQRAMMANRLSFFFDFKGPSITIDTACSSSLLALQSAYQAIRGGECSAAVVGGLNVLLKPNSSLQFMKLGMLSQDGTCRSFDAEGTGYCRAEAVVAVLLTKKSLARRVYATILNAGTNTDGSKEQGVTFPSGDVQEQLIRSLYAPAGPDPESLEYIEAHGTGTKVGDPQELNGIVNALCATRREPLLIGSTKSNMGHPEPASGVAALIKVLLSLEHGVWAPNLHYHTPNPEIPALQDGRLQVVDRPLPIRGGNVGINSFGFGGSNVHVILQPNSRPAPPPAQHAALPRLLQASGRTLEAVQTLLEQGLRHSRDLAFVGMLNEIAAVSPVAMPFRGYAVLGGEAGSQEVQQVPGSKRPVWFICSGMGAQWQGMGLSLMRLDRFRDSILRSDQALKPLGLRVSDLLLSTDEAVLDDIVSSFVSLTSIQIALIDLLTSLGLQPDGIIGHSLGEVACGYADGCLTQEEAVLSSYWRGYCIKEANVLPGAMAAVGLSWEECKQRCPPGIVPACHNSKDTVTISGPQAAMSEFLQQLKREDVFVKEVRTGGIAFHSYFMESIAPTLLRQLRKVILDPKPRSKRWLSTSIPEAQWQGSLARTFSAEYSVNNLVSPVLFQEALQHVPAHAVVVEIAPHALLQAVLKRSLESSCTIIPLMKKDHRDNLEFFLSNVGRLHLAGVSVNPNGLFPPVEFPAPRGTPLISPHXKWDHSQAWDVPSAADFPSGSSCSSVAVYKFDVSPESPDHYLVDHCIDGRVLFPGTGYLWLTWKTLARALSQNLEETPVVFEDVTLHQATILPKTGTVSLEVRLLEASHAFEVSDSNGSLIASGKVYQWESPDPKLFDTRAAVDPADSTAEFRLSQGDVYKDLRLRGYDYGPFFQLVLESDLEGNRGRLQWNDSWVSFLDAMLHMSILAPGQLGLYLPTRFTSIRIDPVTHRQKLYTLQDTTQAADVVVDRNLNTVVAGGALFLGAHSSVAPRRPQEHLKPILEKFCFTPHVESGCLAGNTALQEELQLCRGLAQALQTKVAQQGLKMVVPGLDGAQAPREAPQQSLPRLLAAACQLQLNGNLQLELGQVLAQERPLLCDDPLLSGLLDAPALKACVDTALENMASPKMKVVEVLAGDGQLYSRIPALLNTQPVMDLDYTATDRNPQALEAAQAKLEQLHVTQGQWDPANPAPGSLGKADLLVCNCALATLGDPAVAVGNMAATLKEGGFLLLHTLLAGHPLGEMVGFLTSPEQGGRHLLSQDQWESLFAGASLHLVALKRSFYGSVLFLCRQQTPQDSPVFLSVEDTSFRWVDSLKDILADASSRPVWLMAVGCSTSGVVGMVNCLRKEPGGHRIRCVLVSNLSSTSPAPEMHPSSSELQKVLQGDLVMNVYRDGAWGAFRHFPLEQDRPEKQTEHAFVNVLSRGDLSSIRWVCSPLHYALPASCQDRLCSVYYTSLNFRDVMLATGKLSPDSIPGKWLTRDCMLGMEFSGRDASGRRVMGMVPAEGLATSVLLLQHATWEVPSTWTLEEAASVPIVYTTAYYSLVVRGRMQPGESVLIHSGSGGVGQAAIAIALSRGCRVFTTVGSAEKRAYLQARFPQLDETCFANSRDTSFEQHVLRHTAGKGVDLVLNSLAEEKLQASVRCLAQHGRFLEIGKFDLSNNHALGMAVFLKNVTFHGILLDSLFEEGGATWQEVSELLKAGIQEGVVQPLKCTVFPRTKVEAAFRYMAQGKHIGKVVIQVREEEQGPAPRGLPPIALTGLSKTFCPPHKSYVITGGLGGFGLQLAQWLRLRGAQKLVLTSRSGIRTGYQARQVREWRRQGVQVLVSTSNASSLDGARSLITEATQLGPVGGVFNLAMVLRDAVLENQTPEFFQDVSKPKYSGTANLDRVTREACPELDYFVIFSSVSCGRGNAGQANYGFANSAMERICEKRRHDGLPGLAVQWGAIGDVGVVLETMGTNDTVIGGTLPQRIASCLEVLDLFLSQPHPVLSSFVLAEKKAAAPRDGSSQKDLVKAVAHILGIRDVASINPDSTLVDLGLDSLMGVEVRQILEREHDLVLSMREVRQLSLRKLQELSSKTSTDADPATPTSHEDSPVRQQATLNLSTLLVNPEGPTLTRLNSVQSAERPLFLVHPIEGSITVFHGLAAKLSIPTYGLQCTGAAPLDSIQSLASYYIECIRQVQPEGPYRIAGYSYGACVAFEMCSQLQAQQSATPGNHSLFLFDGSHTFVLAYTQSVRAKMTPGCEAEAEAKAMYFFVQQFTDMEQGKVLEALIPLQGLEARVAATVDLITQSHAGLDRHALSFAARSFYQKLRAAENYWPQATYHGNVTLLRAKTGGAYGEDLGADYNLSQVCDGKVSVHVIEGDHRTLLEGSGLESILSIIHSCLAEPRVSVREG,"http://purl.uniprot.org/uniprot/A5YV76, https://mobidb.org/A5YV76",https://mobidb.org/A5YV76,https://mobidb.org/#2020-09,,,,,,,,,,,,,,
https://idpcentral.org/id/P49913,Cathelicidin antimicrobial peptide,https://bioschemas.org/crawl/v1/disprot/DP00004/20210813/8/disprot.org/DP00004/1514688226,https://identifiers.org/disprot:DP00004,"https://disprot.org/DP00004r004, https://disprot.org/DP00004r002, https://disprot.org/DP00004r001",MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGINQRSSDANLYRLLDLDPRPTMDGDPDTPKPVSFTVKETVCPRTTQQSPEDCDFKKDGLVKRCMGTVTLNQARGSFDISCDKDNKRFALLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,"http://purl.uniprot.org/uniprot/P49913, https://disprot.org/DP00004",https://disprot.org/DP00004,https://disprot.org/#2020-12,,,,,,,,,,,,,,
https://idpcentral.org/id/A1Z9S6,https://identifiers.org/taxonomy:7227,https://identifiers.org/mobidb:A1Z9S6,"https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.1836_1954, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.18_44, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.2847_2881, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.2265_2425, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.2100_2243, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.961_1028, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.1046_1419, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.763_948, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.2789_2828, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.1482_1824, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.730_751, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.3131_3172, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.421_568, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.2480_2539, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.584_687, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.1966_2000, https://mobidb.org/A1Z9S6#prediction-disorder-mobidb_lite.62_394",MEDIEYLDEYKDLVLPGIKSSAPTASSAGVPRQRRISSDSSNSSSIDADIFQKLFHGKYLDDELLKEGSGSKSQDRRRRPPSPSSSDESNDALEALFCGKSSQSKLSKRRERYESFDSVDDLGEALMRPSHRLTVPKPSTSQAGSKKSQDAAPHRSISNSSSSNIAMSSGQTYACPKNKKTNSVANKTAVSANGDKWASGKNKTVTIVKPQKTDLRPSRLEPKTDPLNLGDLYGDSDSDSSYEYESDFYGDQDSDEEDEPVIDISTDTSRTTSVADTVTPVVSDDEGQEPQSELNQRHANDQESFLSSTYERLQSYLSNLSSAEPPPIYKKQNSARKSRSNEKKPSSSQQVKHANEQSAASDKEANKKERDLEPPEASSSAKSSASKASGSRKLYDVANNATLEAVERMSLDLAEQIMEIDVERSYEFEKRSASKSRSLSRSKIPADSSTSQLGHVRKLTYSSERLDQPEPLGTQLRRSKSESLPKRGRPRGQKQRKNRGATMEEKSANCRDGEGEPVKRKRGRPRKIIPKEEAKTAETENTIESLNTNVPLSIDTSKENPETETVNLEVPIKQDELVSDLDNAKELTGSTNLPQDDIEMASNHQETDLKCAPDRVALDKSESTPKVEEEQLCKVDTPSDTALDESKVSESAKNHIELEDKDKDKEETQKESPNGNSKETNENSVIVTNEVELPAKKAEAKAEAGNIVEESDSQLAEDFKLAEEILAAEVGKGVEANEVSVTSVQGEQNPVIEIVKELEQETISEVVPAQNDQSSVEDQTLADKENPVEKPSPVKAPSSSKDEPPAEENLPAPDQDPIEQQKTPVAKNQQHDKEHNEAPKAESLSVSDIPSSSVTPSKKRNHSSPANTPKKSKEIEALQSSVPRRALRSDKATPQNLRESRSKRTLKTELTLLMDDTMRRSSPRLGRSPAESHSSHERSPMEKKVTVSKLAKDLITIDKEKEIELKSLPDASETKDVKITKTTTASDTSILTDENPSSSKTEMKKLKGKPLKAKKMSRTSETEVKKAIADSNEDIPSIFSIKCVEEHLTSSESEQKDEKEELLCPKPQIDCTNTDLEQSTAIETDTEQVEEKRSNRRKSRRIRNEKFKTETDTLSDHLDAKKAENASLEISMRPKCTLETQQSDPVTAKNKRNSGRLSRKEKSVINAAKSEKDKSPSAISQSTERKQLLNENPSKKDKKTEQSGNKKEAVVGPLDKTETSSSTNIIDKKSNESFDSAMQPSDRLNQKESAFTKLSSISSPKKIMKDQDKDLDALSKGGDSNPTIRDTGEDSRQTDKKHQENDTKHEEEDSSKLKANIDETKSSSEKDAEPISKDSSQDSAKPRLSKPKSRNKRKKNEKKPNDSIAESDIEGGFQVNTETVQATCSTPSESNKKDMVKSDETNEEPNLSETEIGRIRKRGQAFHIENPKDDLHITPQNENQSIAGVNFEKQVPLPESVESDTPIMKIPTKTYLMCTKNKTSLLSASEDPDIVLEPQKLITTSKGDSNPDLDNANNLETSSTQDPKEHEFSDQTFTDNSDIIPSCTKKSQIVFPTTPTKSSDQTKNSFITPNRSPKSKRNVSKEAKRLDNSFEESQNAASESSASKVQKELRTPTASCRKLRVLIKRTPTSSLPTNSRKSIFKKTPAKSKRLTKILESMEKTPSREPSVSLGEVNPDSDPVAAESVAVLHESDRDLESNEIPNEEVFEDTEEASAEDTDNKLKKKEDDHELEVNDICAASKNPITDDSTKDASSNKSTDSDVLQETKDELSNSLINATQGEDTPIKELTEEEVPNNKTVEDESKKQEILKDLEPDNAALEEDTASTAKAAEEMDLYIKEKSNVKSVLAEPETDVTDDEELAQSPIPNSSETTSVTDDPEPSTSSVVKRSLRKREADSSQPDEAAKRKQRQDVEKSLTGKKEQVKPARRRQLAEVEERPSLKRSKTESEAKSTVQGKYISIIGNETIMSSTTAPIRETNREAASTSPSARKSAVQEAKHVETTKHIILGPPGKKLLHSDSPAAEVKKPMVQTLLSSTLSLQKPSTLDDGSPLKIRKSLKKSIADENIDGDQSIFSSSSVLNKNTSVVAPRKVNISVSLLQSKDTQVETAASSSETPILTKKEKLKTQKSTKKPEGNKKTESKKKSLVQGPQMKTQKSEEAVSGPKILNKYLKSETESSRKTVSTVTGRKQIGQLEVLKKPESRKSEESLVEAISRKKQSQVQRLSKIDGRKSEGTSLPQPDVSKSETALKAALPKETEFPVQDAEIEKMSKGRGHQNAVKNTKTEQPKSKPKTEVRSLQAEAATELMDSMDSQSDVSDIRATFPESQGIFNVPGHMTRAISSNRSLAPTPTPMSDSQRNASKERFTPVSDQKKPIRESQTLSKRRARGGRNQPLVSKRKAGEAEDGTAVINPKRPREMDEEDHPQQNDHVQESAFAAFPVKITAASSVIPQVVRSTGNTVPQNISPRKLCVKINRRPYNKWLRSTQERNEEQEGSRNVTSLPLLGETSETDSAAESMSESILQSQVQSEPAIQPLPASQPDSCTLQASDLRIRESSAQLAPIAAYDSPAANDSSTSPALDIAPESAQTAKATLNTALCPSTEKHLPDEPTLLESSKKVAEPQKLQTFQAKCLPVPIPEVKSEPEDIMDEHSPNEPMPMVAAAPATPQPHAITEDAGPDTIQVNTLGVSTSSRPLELHSIPSASDPDGNPNAIGQTKMYSFLYPKRYKQSYDDVGLDFCCPNLDGPMRAIDFTRLHSKAEVPVLEIPQFLVITTKFISKADKNMPSKVRAKLELLDKSKERDSSKLTPTATTPTADPTGPSSFSPAPASVGPATQPFPSIIQNLLSAPLPDPGLFNHPTTVDPSTSTPVVSGSSSSTTISADLDSLSKQLPRGTKLIKKSVQQVATNPSLAGTSMVINASPSFIQLPPICPNDKQRVELQARVQMFDLVLQTLSRRAANLSVAERQRTIEEIVRTSSLMAIDVDVGTKLLENYVHYLNKATSTMTPLTPAQINSSLGASTSSTLSKSIATSDIPQQGKKISADHAQQRSSLPATIPLYDGGRNTLGFPYSCSKSTAGRKSSYVATSTPVKASTSQAAAAAALGNAQPRSTLGIPKSVREDASQFVNLNTTVCMPAPRTNAKKKPGTSGPLKSMNSSPAVQKPALCKQQTAPARTLSKSTVSSVARAKSTGSLSAVLGETPADEFVSPAGMSLSTTGNPNVFIINHAVQSEESILPDSNSSVGHMETTVIKGELDDSAEIII,"https://mobidb.org/A1Z9S6, http://purl.uniprot.org/uniprot/A1Z9S6",https://mobidb.org/A1Z9S6,https://mobidb.org/#2020-09,,,,,,,,,,,,,,,


## Analysis of Sequence Annotations

### Sequence Annotations per Dataset

Display the number of sequence annotations per dataset.

In [277]:
logging.info('Sequence Annotations per Dataset - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?dataset (COUNT(DISTINCT ?s) AS ?annotations) 
WHERE {
    GRAPH ?g {
        ?s a schema:SequenceAnnotation
    }
    ?g void:inDataset ?dataset
} 
GROUP BY ?dataset
""", opt))
logging.info('Query Completed.')

0,1
dataset,annotations
https://mobidb.org/#2020-09,48
https://proteinensemble.org/#2021-02-12,47
https://disprot.org/#2020-12,246


### Sequence Annotations from Multiple Datasets

Display the number of sequence annotations that come from multiple datasets.

_Note that sequence annotations are not merged based on any feature so we would not expect any sequence annotations to match the criteria in this query._

In [278]:
logging.info(' Sequence Annotations from Multiple Datasets - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?annotation (COUNT(?g) as ?numDatasets) (GROUP_CONCAT(?dataset;SEPARATOR=", ") AS ?datasets)
WHERE {
    GRAPH ?g {
        ?annotation a schema:SequenceAnnotation .
    }
    ?g void:inDataset ?dataset .
}
GROUP BY ?annotation
HAVING (COUNT(*) > 1)
ORDER BY ?numDatasets
""", opt))
logging.info('Query Completed.')

0,1,2
annotation,numDatasets,datasets
https://proteinensemble.org/PED00001#P38634_A_1_1_90,2,"https://proteinensemble.org/#2021-02-12, https://proteinensemble.org/#2021-02-12"


### Sequence Annotations from Multiple Pages

Dislay the number of sequence annotations that come from multiple pages. It is conceivable that the same annotation comes from different pages in the same source, e.g. PED. However, as annotations are not combined, we would not expect any answers to the following query.

In [279]:
logging.info(' Sequence Annotations from Multiple Pages - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX pav: <http://purl.org/pav/>
SELECT ?annotation (COUNT(?g) as ?numSources) (GROUP_CONCAT(?source;SEPARATOR=", ") AS ?sources)
WHERE {
    GRAPH ?g {
        ?annotation a schema:SequenceAnnotation .
    }
    ?g pav:retrievedFrom ?source .
}
GROUP BY ?annotation
HAVING (COUNT(*) > 1)
ORDER BY ?numSources
""", opt))
logging.info('Query Completed.')

0,1,2
annotation,numSources,sources
https://proteinensemble.org/PED00001#P38634_A_1_1_90,2,"https://proteinensemble.org/PED00001, https://proteinensemble.org/PED00001"


### Sequence Annotation Information

Return information known about each sequence annotation.

In [280]:
logging.info(' Sequence Annotation Information - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>

SELECT ?s ?annotation ?start ?end ?termCode ?termName ?pubmedID
WHERE {
    graph ?g {
        ?s a schema:Protein;
           schema:hasSequenceAnnotation ?annotation .
        ?annotation schema:additionalProperty/schema:value ?term;
            schema:sequenceLocation ?range .
        ?range schema:rangeStart ?start ;
               schema:rangeEnd ?end .
        ?term schema:termCode ?termCode ;
            schema:name ?termName .
        OPTIONAL { ?annotation schema:subjectOf ?pubmedID }
    }
}    
ORDER BY ?s ?start ?end

""", opt))
logging.info('Query Completed.')

0,1,2,3,4,5,6
s,annotation,start,end,termCode,termName,pubmedID
https://idpcentral.org/id/A0A045GWT8,https://mobidb.org/A0A045GWT8#prediction-disorder-mobidb_lite.1_37,37,1,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.220_256,256,220,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.272_366,366,272,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.402_455,455,402,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.559_583,583,559,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.713_746,746,713,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.781_854,854,781,Disorder,DO:00076,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.871_923,923,871,Disorder,DO:00076,
https://idpcentral.org/id/A0A0H2ZP82,https://mobidb.org/A0A0H2ZP82#prediction-disorder-mobidb_lite.231_255,255,231,Disorder,DO:00076,


### Details of Scholarly Articles with respect to Annotations

Number of articles per annotation.

In [281]:
logging.info(' Details of Scholarly Articles with respect to Annotations - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?annotation (COUNT (?pubmedID) AS ?numArticles)
WHERE {
    graph ?g {
        ?annotation a schema:SequenceAnnotation;
            schema:subjectOf ?pubmedID
    }
}    
GROUP BY ?annotation
ORDER BY DESC(?numArticles)
""", opt))
logging.info('Query Completed.')

0,1
annotation,numArticles
https://disprot.org/DP00021r005,1
https://disprot.org/DP00021r004,1
https://disprot.org/DP00021r002,1
https://disprot.org/DP00021r003,1
https://disprot.org/DP00004r001,1
https://disprot.org/DP00004r004,1
https://disprot.org/DP00004r002,1
https://disprot.org/DP00023r008,1
https://disprot.org/DP00023r005,1


Number of annotations per article.

In [282]:
logging.info(' Number of annotations per article - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?pubmedID (COUNT (?pubmedID) AS ?numAnnotations)
WHERE {
    graph ?g {
        ?annotation a schema:SequenceAnnotation;
            schema:subjectOf ?pubmedID
    }
}    
GROUP BY ?pubmedID
ORDER BY DESC(?numAnnotations)
""", opt))
logging.info('Query Completed.')

0,1
pubmedID,numAnnotations
https://identifiers.org/pubmed:32338601,17
https://identifiers.org/pubmed:2810365,16
https://identifiers.org/pubmed:8876165,14
https://identifiers.org/pubmed:4344990,9
https://identifiers.org/pubmed:15024385,8
https://identifiers.org/pubmed:19841061,8
https://identifiers.org/pubmed:11746698,7
https://identifiers.org/pubmed:8620531,6
https://identifiers.org/pubmed:11570883,6


### Number of annotations by term code

For each term code, return the number of annotations using that code.

In [283]:
logging.info(' Number of annotations by term code - Query Started.')
displayResults(query_idpkg("""
PREFIX schema: <https://schema.org/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?termCode ?termName (COUNT (?annotation) AS ?numAnnotations)
WHERE {
    graph ?g {
        ?annotation schema:additionalProperty/schema:value ?term .
        ?term schema:termCode ?termCode ;
            schema:name ?termName .
    }
}    
GROUP BY ?termCode ?termName
ORDER BY DESC(?numAnnotations)
""", opt))
logging.info('Query Completed.')

0,1,2
termCode,termName,numAnnotations
IDPO:00076,Disorder,107
DO:00076,Disorder,48
IDPO:00063,Protein binding,40
IDPO:00208,CRYSOL,31
IDPO:00125,SAXS,27
IDPO:00213,EOM,24
IDPO:00212,Ensemble optimization,23
IDPO:00050,Disorder to order,22
IDPO:00120,NMR,20


## Find proteins with annotations in multiple datasets

We are looking for annotations where the protein is common but the annotation is different across the datasets.

### Proteins with Annotations in Multiple Datasets

In [284]:
logging.info(' Proteins with Annotations in Multiple Datasets - Query Started.')
displayResults(query_idpkg("""
PREFIX pav: <http://purl.org/pav/>
PREFIX schema: <https://schema.org/>
PREFIX void: <http://rdfs.org/ns/void#>

SELECT ?protein (SAMPLE(?proteinName) AS ?name) (COUNT(distinct ?annotation) AS ?annotationCount) (COUNT(distinct ?dataset) AS ?datasets)
WHERE {
    {
        SELECT DISTINCT ?protein ?proteinName
        WHERE {
		    GRAPH ?g {
        		?protein a schema:Protein .
		        OPTIONAL {?protein schema:name ?proteinName .}
		    }
        }
    }
    {
	    SELECT ?annotation ?dataset ?protein
    	WHERE {
        	GRAPH ?g {
            	?protein schema:hasSequenceAnnotation ?annotation
	        }
    	    ?g void:inDataset ?dataset .
	    }
    }
} 
GROUP BY ?protein
HAVING (COUNT(distinct ?dataset) > 1)
ORDER BY DESC(?annotationCount)
""", opt))
logging.info('Query Completed.')

0,1,2,3
protein,name,annotationCount,datasets
https://idpcentral.org/id/P03265,DNA-binding protein,4,2


### Proteins with Annotations in Multiple Pages

As sources such as PED can have the same protein detailed on multiple pages, it is also interesting to look at this at the page level.

The following query finds for each protein, its name (if known), a count of the number of sequence annotations, and a count of the number of sources from which the data has been extracted. Results are only returned if there are annotations from more than one source.

In [285]:
logging.info(' Proteins with Annotations in Multiple Pages - Query Started.')
displayResults(query_idpkg("""
PREFIX pav: <http://purl.org/pav/>
PREFIX schema: <https://schema.org/>
SELECT ?protein (SAMPLE(?proteinName) AS ?name) (COUNT(distinct ?annotation) AS ?annotationCount) (COUNT(distinct ?source) AS ?sourceCount)
WHERE {
    {
        SELECT DISTINCT ?protein ?proteinName
        WHERE {
		    GRAPH ?g {
        		?protein a schema:Protein .
		        OPTIONAL {?protein schema:name ?proteinName .}
		    }
        }
    }
    {
	    SELECT ?annotation ?source ?protein
    	WHERE {
        	GRAPH ?g {
            	?protein schema:hasSequenceAnnotation ?annotation
	        }
    	    ?g pav:retrievedFrom ?source .
	    }
    }
} 
GROUP BY ?protein
HAVING (COUNT(distinct ?source) > 1)
ORDER BY DESC(?annotationCount)
""", opt))
logging.info('Query Completed.')

0,1,2,3
protein,name,annotationCount,sourceCount
https://idpcentral.org/id/O14558,Heat shock protein beta-6,14,5
https://idpcentral.org/id/P42212,Isoform 1 of Annexin A4,6,2
https://idpcentral.org/id/P09525-1,Green fluorescent protein,6,2
https://idpcentral.org/id/P38634,Cell division control protein 4,5,3
https://idpcentral.org/id/P03265,DNA-binding protein,4,2
https://idpcentral.org/id/P12296,Genome polyprotein,4,4
https://idpcentral.org/id/P37840,Alpha-synuclein,4,2
https://idpcentral.org/id/Q16143,Beta-synuclein,4,2
https://idpcentral.org/id/Q5L4K5,Nucleocapsid,2,2


The following varient of the query will list the annotations and the source from which the annotation has come.

In [286]:
logging.info(' list the annotations - Query Started.')
displayResults(query_idpkg("""
PREFIX pav: <http://purl.org/pav/>
PREFIX schema: <https://schema.org/>
SELECT ?protein ?proteinName ?annotation ?source
WHERE {
    {
        SELECT DISTINCT ?protein ?proteinName
        WHERE {
		    GRAPH ?g {
        		?protein a schema:Protein .
		        OPTIONAL {?protein schema:name ?proteinName .}
		    }
        }
    }
    {
        SELECT ?annotation ?source ?protein
        WHERE {
            GRAPH ?g {
                ?protein schema:hasSequenceAnnotation ?annotation
            }
            ?g pav:retrievedFrom ?source .
        }
    }
} 
ORDER BY ?protein ?annotation
""", opt))
logging.info('Query Completed.')

0,1,2,3
protein,proteinName,annotation,source
https://idpcentral.org/id/A0A045GWT8,Prokaryotic ubiquitin-like protein Pup,https://mobidb.org/A0A045GWT8#prediction-disorder-mobidb_lite.1_37,https://mobidb.org/A0A045GWT8
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.220_256,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.272_366,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.402_455,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.559_583,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.713_746,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.781_854,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0G2JXC5,https://mobidb.org/A0A0G2JXC5#prediction-disorder-mobidb_lite.871_923,https://mobidb.org/A0A0G2JXC5,
https://idpcentral.org/id/A0A0H2ZP82,https://mobidb.org/A0A0H2ZP82#prediction-disorder-mobidb_lite.231_255,https://mobidb.org/A0A0H2ZP82,
