-
Notifications
You must be signed in to change notification settings - Fork 0
/
dbpedia.py
74 lines (50 loc) · 2.31 KB
/
dbpedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# DBPEDIA
import rdflib
import pandas as pd
import stringutils as su
from SPARQLWrapper import SPARQLWrapper, JSON
import logging
logger = logging.getLogger(__name__)
OWL_THING = "<http://www.w3.org/2002/07/owl#Thing>"
def get_typed_resources(class_list, db_types_file):
    """
    Get all the entries in a Wikipedia version missing infoboxes.

    NOTE(review): not yet implemented — this stub only logs its inputs and
    returns None. The intended result (per the original docstring) is a
    dataframe with:
        - id
        - URI
        - Types
        - Abstract
        - Text
        - Categories

    :param class_list: classes of interest (list or string; only logged here)
    :param db_types_file: path to the types file (currently unused)
    :return: None (implementation pending)
    """
    logger.debug("Starting get_typed_resources")
    # Lazy %s formatting: avoids TypeError when class_list is a list
    # (the old '"Class list" + class_list' crashed on non-str input) and
    # defers formatting until the DEBUG level is actually enabled.
    logger.debug("Class list: %s", class_list)
    logger.debug("End of get_typed_resources")
def get_resources_from_types(class_list, db_types_file, remove_owl_thing = True, encode=False):
    """
    Load individual/type pairs from a space-separated triples dump.

    Each input line is expected to look like:
        <individual> <typeprop> <type> .

    :param class_list: classes of interest (currently unused here; kept for
        interface compatibility — presumably filtered by the caller. TODO confirm)
    :param db_types_file: path to the space-separated types file
    :param remove_owl_thing: if True (default), drop rows whose type is owl:Thing
    :param encode: if True, URL-encode each individual URI via stringutils
    :return: DataFrame with columns 'individual' and 'type'
    """
    logger.debug("Starting get_resources_from_types from %s", db_types_file)
    df_types = pd.read_csv(db_types_file, sep=' ', names=["individual", "typeprop", "type", "dot"])
    logger.debug("Got %s instances from file %s", len(df_types), db_types_file)
    # Drop the columns that carry no information: the predicate is constant
    # and the trailing dot is triple-syntax noise.
    df_types = df_types.drop(columns=['typeprop', 'dot'])
    if remove_owl_thing:
        # owl:Thing types every individual, so it adds no discriminative value.
        df_types = df_types[df_types.type != OWL_THING]
    if encode:
        # Encode all individual URIs.
        df_types['individual'] = df_types['individual'].apply(su.encode_url)
    return df_types
def get_resource_abstracts(class_list, abstracts_file, encode=False):
    """
    Load individual/abstract pairs from a space-separated triples dump.

    Each input line is expected to look like:
        <individual> <abstractprop> <abstract> .

    :param class_list: classes of interest (currently unused here; kept for
        interface compatibility — presumably filtered by the caller. TODO confirm)
    :param abstracts_file: path to the space-separated abstracts file
    :param encode: if True, URL-encode each individual URI via stringutils
    :return: DataFrame with columns 'individual' and 'abstract'
    """
    logger.debug("Starting get_resource_abstracts from %s", abstracts_file)
    df_abstract = pd.read_csv(abstracts_file, sep=' ', names=["individual", "abstractprop", "abstract", "dot"])
    logger.debug("Got %s instance abstracts from file %s", len(df_abstract), abstracts_file)
    # Drop the columns that carry no information: the predicate is constant
    # and the trailing dot is triple-syntax noise.
    df_abstract = df_abstract.drop(columns=['abstractprop', 'dot'])
    if encode:
        # Encode all individual URIs.
        df_abstract['individual'] = df_abstract['individual'].apply(su.encode_url)
    return df_abstract