In [155]:
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import math
import os
import re
import unidecode
from nltk.tokenize import sent_tokenize

# Raw JSON responses retrieved from the SPARQL endpoint are stored here.
output_folder = '../dbpedia/'
# The plain-text corpus files we create are written here.
corpus_folder = '../data/corpora/'

def query_dbpedia(query, endpoint="http://dbpedia.org/sparql"):
    """Send a SPARQL query to an endpoint and return the converted JSON response.

    Args:
        query: The complete SPARQL query text.
        endpoint: URL of the SPARQL endpoint. Defaults to the public DBpedia
            endpoint, so existing single-argument calls behave as before.

    Returns:
        The response converted by SPARQLWrapper for the JSON return format;
        callers in this notebook index it as ``result['results']['bindings']``.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)
    sparql.setQuery(query)
    return sparql.query().convert()

In [142]:
# DBpedia ontology classes (dbo:<name>) whose English abstracts we download.
objectnames = [
    'Device',
    'Animal',
    'Biomolecule',
    'Company',
    'ChemicalSubstance',
    'Food',
    'Plant',
    'MeanOfTransportation',
]

In [160]:
# The endpoint is limited to this many rows per request, so results are fetched in pages.
page_size = 10000

# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

for objectname in objectnames:
    # Get the count of (resource, English abstract) pairs for this class.
    query = """PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX n1: <http://schema.org/>
        SELECT COUNT DISTINCT ?Device ?abstract
        WHERE { ?Device a dbo:"""+objectname+""" .
                ?Device dbo:abstract ?abstract . 
                filter(langMatches(lang(?abstract),"en"))}"""
    res = query_dbpedia(query)
    # The un-aliased aggregate is read back from the 'callret-0' binding.
    res = res['results']['bindings'][0]['callret-0'].get('value')
    print(str(res)+ " results found for "+objectname)

    # Figure out how many pages we need to request to cover every row.
    num = math.ceil(int(res) / page_size)

    for i in range(num):
        # NOTE(review): there is no ORDER BY, so consistent paging relies on the
        # endpoint returning rows in the same order for every request.
        query = """PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX n1: <http://schema.org/>
        SELECT DISTINCT ?Device ?abstract
        WHERE { ?Device a dbo:"""+objectname+""" .
                ?Device dbo:abstract ?abstract . 
                filter(langMatches(lang(?abstract),"en"))}
        LIMIT """+str(page_size)+"""
        OFFSET """+str(i*page_size)

        # Fetch first, open the file second: a failed request must not leave
        # behind an empty or truncated JSON file for the later processing step.
        page = query_dbpedia(query)
        name = os.path.join(output_folder, objectname + str(i) + '.json')
        with open(name, 'w') as outfile:
            json.dump(page, outfile)

11957 results found for Device
229955 results found for Animal
7335 results found for Biomolecule
107357 results found for Company
18505 results found for ChemicalSubstance
11358 results found for Food
62277 results found for Plant
55325 results found for MeanOfTransportation


In [None]:
# For good measure, we also get everything typed as schema.org Product.
# Note: this includes the productontology stuff to some degree.

objectname = "Product"

# The endpoint is limited to this many rows per request, so results are fetched in pages.
page_size = 10000

# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs(output_folder, exist_ok=True)

# Get the count of (resource, English abstract) pairs for this class.
query = """PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX n1: <http://schema.org/>
        SELECT COUNT DISTINCT ?Device ?abstract
        WHERE { ?Device a n1:"""+objectname+""" .
                ?Device dbo:abstract ?abstract . 
                filter(langMatches(lang(?abstract),"en"))}"""
res = query_dbpedia(query)
# The un-aliased aggregate is read back from the 'callret-0' binding.
res = res['results']['bindings'][0]['callret-0'].get('value')
print(str(res)+ " results found for "+objectname)

# Number of pages needed to cover every row.
num = math.ceil(int(res) / page_size)

for i in range(num):
    # NOTE(review): there is no ORDER BY, so consistent paging relies on the
    # endpoint returning rows in the same order for every request.
    q = """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX n1: <http://schema.org/>
    SELECT DISTINCT ?Device ?abstract
    WHERE { ?Device a n1:"""+objectname+""" .
            ?Device dbo:abstract ?abstract . 
            filter(langMatches(lang(?abstract),"en"))}
            LIMIT """+str(page_size)+"""
            OFFSET """+str(i*page_size)

    # Fetch first, open the file second: a failed request must not leave
    # behind an empty or truncated JSON file for the later processing step.
    page = query_dbpedia(q)
    name = os.path.join(output_folder, objectname + str(i) + '.json')
    with open(name, 'w') as outfile:
        json.dump(page, outfile)

In [None]:
# So we got a ton of JSON files; what's next?
# Collect the downloaded files from output_folder into a list named `files`.
# Only regular *.json files are kept, because every entry is later passed to
# json.load(); the list is sorted because os.listdir() returns names in
# arbitrary order.
files = sorted(
    f for f in os.listdir(output_folder)
    if f.endswith('.json') and os.path.isfile(os.path.join(output_folder, f))
)

In [144]:
def sentence_cleaner(raw):
    """Replace every character that is not an ASCII letter, digit or hyphen with a space.

    Hyphens are deliberately kept so hyphenated tokens survive cleaning.
    The replacement is one space per removed character, so the result has
    the same length as ``raw``.
    """
    disallowed = re.compile("[^a-zA-Z0-9-]")
    return disallowed.sub(" ", raw)

def comma_splitter(text):
    """Return ``text`` with every semicolon, double quote and comma deleted.

    The characters are removed outright (nothing is inserted in their place).
    """
    for mark in (';', '"', ','):
        text = text.replace(mark, '')
    return text

def spurious_whitespace_remover(text):
    """Collapse each run of two or more whitespace characters into one space.

    Leading and trailing whitespace is stripped from the result. A lone
    whitespace character inside the text is left exactly as it is.
    """
    squeezed = re.sub(r"\s{2,}", " ", text)
    return squeezed.strip()

In [156]:
# Running totals across every corpus file written below.
total_lines = 0
total_words = 0

# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs(corpus_folder, exist_ok=True)

for textfile in files:
    # Load the whole result page, then release the file handle before the
    # (much slower) tokenising and cleaning work starts.
    with open(os.path.join(output_folder, textfile)) as file:
        data = json.load(file)

    corpus = []  # won't go over 10000 abstracts per file, so building it in memory is OK
    for item in data['results']['bindings']:
        abstract = item['abstract']['value']
        for l in sent_tokenize(abstract):
            # Skip fragments too short to be a real sentence.
            if len(l) <= 3:
                continue
            l = l.rstrip("\n\r")  # remove trailing newlines
            l = unidecode.unidecode(l).lower()  # transliterate to ASCII, then lowercase
            l = comma_splitter(l)
            l = sentence_cleaner(l)
            l = spurious_whitespace_remover(l)
            # A sentence can clean down to nothing (e.g. only punctuation).
            # Do not emit it as a blank line, and do not count it as a word:
            # "".split(" ") would return [''] and add 1 to the total.
            if not l:
                continue
            corpus.append(l)
            total_words += len(l.split())

    # Save our corpus to disk, one cleaned sentence per line.
    print(str(len(corpus))+" lines found in "+textfile)
    total_lines += len(corpus)

    with open(os.path.join(corpus_folder, textfile.split('.')[0]+".txt"), "w") as f:
        for item in corpus:
            f.write("%s\n" % str(item))

31277 lines found in Animal0.json
22504 lines found in Animal1.json
25982 lines found in Animal10.json
33491 lines found in Animal11.json
30319 lines found in Animal12.json
27903 lines found in Animal13.json
41913 lines found in Animal14.json
34643 lines found in Animal15.json
24962 lines found in Animal16.json
27349 lines found in Animal17.json
27979 lines found in Animal18.json
33899 lines found in Animal19.json
37647 lines found in Animal2.json
24694 lines found in Animal20.json
29925 lines found in Animal21.json
35480 lines found in Animal22.json
34841 lines found in Animal3.json
24670 lines found in Animal4.json
29925 lines found in Animal5.json
29254 lines found in Animal6.json
32562 lines found in Animal7.json
28415 lines found in Animal8.json
38343 lines found in Animal9.json
32314 lines found in Biomolecule0.json
41691 lines found in ChemicalSubstance0.json
37994 lines found in ChemicalSubstance1.json
48707 lines found in Company0.json
53449 lines found in Company1.json
36755 

In [157]:
# Peek at one cleaned sentence. `corpus` is rebuilt for every input file, so
# at this point it holds the lines of the last file processed above.
corpus[0]

'the second uss bainbridge dd-1 was the first destroyer in the united states navy and the lead ship of the bainbridge-class'

In [158]:
total_words # total word count over all corpus lines: how many extra words we can use to train our word2vec

40916158

In [159]:
total_lines # how many extra lines (cleaned sentences) were collected across all corpus files

2287513