## Use saprQL to query DBpedia

In [1]:
import rdflib
g=rdflib.Graph()
g.load('http://dbpedia.org/resource/Biomedical')

for s,p,o in g:
    print (s,p,o)

http://dbpedia.org/resource/Biomedical http://www.w3.org/2000/01/rdf-schema#label Biomedical
http://dbpedia.org/resource/Biomedical http://xmlns.com/foaf/0.1/isPrimaryTopicOf http://en.wikipedia.org/wiki/Biomedical
http://dbpedia.org/resource/Nadine_Barrie_Smith http://dbpedia.org/ontology/field http://dbpedia.org/resource/Biomedical
http://dbpedia.org/resource/Biomedical http://dbpedia.org/ontology/wikiPageID 609268
http://dbpedia.org/resource/Musa_Hakan_Asyalı http://dbpedia.org/ontology/knownFor http://dbpedia.org/resource/Biomedical
http://dbpedia.org/resource/Charles_Stark_Draper_Laboratory http://dbpedia.org/ontology/industry http://dbpedia.org/resource/Biomedical
http://dbpedia.org/resource/Biomedical http://dbpedia.org/ontology/wikiPageRedirects http://dbpedia.org/resource/Medical_research
http://dbpedia.org/resource/Biomedical http://dbpedia.org/ontology/wikiPageRevisionID 288855867
http://en.wikipedia.org/wiki/Biomedical http://xmlns.com/foaf/0.1/primaryTopic http://dbpedia.o

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

prefix = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbpedia: <http://dbpedia.org/resource/>
    PREFIX dbo: <http://dbpedia.org/ontology/>PREFIX category: <http://dbpedia.org/resource/Category:>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>PREFIX foaf: <http://xmlns.com/foaf/0.1/>PREFIX dbpprop: <http://dbpedia.org/property/>
    PREFIX dbprop: <http://dbpedia.org/property/>PREFIX grs: <http://www.georss.org/georss/>
    PREFIX category: <http://dbpedia.org/resource/Category:>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX freebase: <http://rdf.freebase.com/ns/>
    PREFIX db: <http://dbpedia.org/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX http: <http://www.w3.org/2006/http#>"""

In [3]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery(prefix + """    
    SELECT ?z
    WHERE { <http://dbpedia.org/resource/Biomedical_engineering> rdf:type ?z }
""")
#Arye_Rosen
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    print(result)

{'z': {'type': 'uri', 'value': 'http://www.w3.org/2002/07/owl#Thing'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/ontology/Software'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Abstraction100002137'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Cognition100023271'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Content105809192'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/Discipline105996646'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/KnowledgeDomain105999266'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/PsychologicalFeature100023100'}}
{'z': {'type': 'uri', 'value': 'http://dbpedia.org/class/yago/WikicatEngineeringDisciplines'}}


In [2]:
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery(prefix + """
    SELECT ?z
    WHERE { <http://dbpedia.org/resource/Biomedical_engineering> rdfs:label ?z }
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for result in results["results"]["bindings"]:
    if result['z']['xml:lang'] == 'en':
        print(result['z']['value'])

Biomedical engineering


## Search things in DBpedia and Store into mongoDB

### Search things using DBpedia

In [2]:
from pymongo import MongoClient
from bson.objectid import ObjectId
import json

config = json.load(open("config.json"))

client = MongoClient()
client = MongoClient(config["server"], 27017)

db = client.Biomedical

#load search type for sparql
searchtype = json.load(open("searchtype.json"))
query_contents = json.load(open("query_content.json"))

In [3]:
#Replace symbol that may crash the query
def clean(word):
    word = word.replace(' ','_')
    word = word.replace('"','//')
    word = word.replace("'","/")
    return word


def clean_output(word):
    word = word.split('/')[-1]
    word = word.replace('_',' ')
    return word

In [4]:
#Search the page title and get its pages
def get_type_from_db(title):
    title = clean(title)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(prefix + """    
        SELECT ?z
        WHERE { <http://dbpedia.org/resource/"""+title+"""> rdf:type ?z }
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    value = []
    for result in results["results"]["bindings"]:
        value.append(result['z']['value'].split('/')[-1])
    if 'Person' in value:
        return 'Person'
    elif 'University' in value:
        return 'University'
    else:
        return 'Thing'

In [5]:
#Get all fields needed for a person
def construct(title,ctype):
    title = clean(title)
    results = {}
    results['category'] = clean(ctype)
    for content in query_contents[ctype]:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery(prefix + """    
            SELECT ?z
            WHERE { <http://dbpedia.org/resource/"""+title+"> "+searchtype[content]+":"+content+""" ?z }
        """)
        sparql.setReturnFormat(JSON)
        result = sparql.query().convert()
        output = []
        for r in result["results"]["bindings"]:
            #clean spaces except links
            if content != 'homepage' and 'Link' not in content:
                r['z']['value'] = clean_output(r['z']['value'])
            try:
                if r['z']['xml:lang'] == 'en':
                    output.append(r['z']['value'])
            except:
                output.append(r['z']['value'])
        results[content] = output
    return results

In [8]:
construct('Cassie_Mitchell','Person')

{'abstract': ['Cassie Mitchell (born 1981) is an American chemist and Paralympic athlete and cyclist.'],
 'almaMater': ['Georgia Institute of Technology',
  'Emory University',
  'Oklahoma State University–Stillwater'],
 'birthDate': ['1981-1-1'],
 'birthPlace': ['Muskogee, Oklahoma'],
 'category': 'Person',
 'field': ['Biomedical engineering', 'Chemical engineering'],
 'homepage': ['http://www.cassie-mitchell.com/paralympics.html'],
 'label': ['Cassie Mitchell'],
 'type': []}

In [29]:
construct('Columbia_University','University')

{'abstract': ["Columbia University (officially Columbia University in the City of New York) is a private Ivy League research university in Upper Manhattan, New York City. It was established in 1754 as King's College by royal charter of George II of Great Britain. Columbia is the oldest college in the state of New York and the fifth chartered institution of higher learning in the country, making it one of nine colonial colleges founded before the Declaration of Independence. After the American Revolutionary War, King's College briefly became a state entity, and was renamed Columbia College in 1784. A 1787 charter placed the institution under a private board of trustees before it was renamed Columbia University in 1896 when the campus was moved from Madison Avenue to its current location in Morningside Heights occupying 32 acres (13 ha) of land. Columbia is one of the fourteen founding members of the Association of American Universities, and was the first school in the United States to g

In [7]:
construct('Biomedical_engineering','Thing')

{'abstract': ['Biomedical engineering (BME) is the application of engineering principles and design concepts to medicine and biology for healthcare purposes (e.g. diagnostic or therapeutic). This field seeks to close the gap between engineering and medicine, combining the design and problem solving skills of engineering with medical and biological sciences to advance health care treatment, including diagnosis, monitoring, and therapy.Biomedical engineering has only recently emerged as its own study, compared to many other engineering fields. Such an evolution is common as a new field transitions from being an interdisciplinary specialization among already-established fields, to being considered a field in itself. Much of the work in biomedical engineering consists of research and development, spanning a broad array of subfields (see below). Prominent biomedical engineering applications include the development of biocompatible prostheses, various diagnostic and therapeutic medical devic

### Add Data to MongoDB

In [9]:
#check if the noe is already exists in the database
def check_not_exists(result):
    if result['category'] == 'Person':
        return db.Node.find({'category':result['category'],'label':result['label'],'birthDate':result['birthDate']}).count() == 0
    else:
        r = db.Node.find({'category':result['category'],'label':result['label']}).count()
        return r == 0
        
    
#Transfer the result format and save to mongoDB
def save_to_mongo(result):
    #flag = check_not_exists(result)
    flag = db.Node.find({'label':result['label']}).count() == 0
    if flag:
        db.Node.insert_one(result)
    else:
        pass
    return


#Add edges based on the nodes constructed in mongoDB
def add_relation_to_mongo():
    for node in db.Node,find({}):
        relationship = query_content[node['category']]
        for r in relationship:
            if len(node[r]) > 0:
                db.Edge.insert_one({'Source':node['title'],'Destination':node[r],'relationship':r})
    return 

In [11]:
# Use the title to search and extract
def get_info_from_db(title):
    page_type = get_type_from_db(title)
    result = construct(title, page_type)
    return result
    
    
# Transder pages to specific nodes with labels
def transfer_pages():
    pages = db.pages.find({'_id': {'$gt':ObjectId('5aeb72c443bf575d8c34a2e7')}},no_cursor_timeout=True)
    count = 0
    ct = 0
    for page in pages:
        if db.Node.find({'label':page['title']}).count() == 0:
            try:
                result = get_info_from_db(page['title'])
                if len(result['abstract']) == 0:
                    if ct%100 == 0:
                        print(page['_id'])
                    ct += 1
                    continue
                else:
                    save_to_mongo(result)
                    if count%100 == 0:
                        print(count)
                    count += 1
            except:
                print(page['title'])
        else:
            pass

In [12]:
transfer_pages()

5aeb72c643bf575d8c349f6c
0
Negative elongation factor
100
200
TRANSFAC
Tricho-rhino-phalangeal syndrome Type 1
TWIST2
Ultrabithorax
USF1
Whirly
300
400
500
600
700
800
900
5aeb74d643bf575d8c34a64a
1000


### Query data saved below

In [None]:
add_relation_to_mongo()

In [12]:
db.Node.remove({})

  """Entry point for launching an IPython kernel.


{'n': 1457, 'ok': 1.0}

In [16]:
db.pages.find({}).count()

13427