In [1]:
# Topic names, in alphabetical order. Each one is later looked up as a DBpedia
# "Category:<name>" resource (with spaces replaced by underscores).
topics = [
    'Business',
    'Culture',
    'Economy',
    'Education',
    'Energy',
    'Engineering',
    'Entertainment',
    'Ethics',
    'Events',
    'Food and drink',
    'Geography',
    'Government',
    'Health',
    'History',
    'Human nature',
    'Humanities',
    'Industry',
    'Knowledge',
    'Language',
    'Law',
    'Life',
    'Mass media',
    'Mathematics',
    'Military',
    'Music',
    'Nature',
    'Organizations',
    'People',
    'Philosophy',
    'Policy',
    'Politics',
    'Religion',
    'Science and technology',
    'Society',
    'Sports',
    'Universe',
    'World',
]

In [2]:
%%time
import urllib.parse as ul
import trident
g = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')

# Per-topic results, keyed by the underscore form of the topic name.
topic_subcats = {}
topic_articles = {}

# The predicate IDs do not depend on the topic, so resolve them once up front
# instead of once per loop iteration.
br = g.lookup_id('<http://www.w3.org/2004/02/skos/core#broader>')
subj = g.lookup_id('<http://purl.org/dc/terms/subject>')

for topic in topics:
    # The category resource name uses underscores where the topic has spaces.
    topic = topic.replace(" ","_")
    c = g.lookup_id(f'<http://dbpedia.org/resource/Category:{topic}>')
    if not c:
        # The lookup returned nothing usable for this topic; skip it.
        continue

    # Collect articles of the topic category itself and of the categories
    # reached by following `br` up to three hops from it.
    # (g.s(p, o) presumably lists the subjects of triples with predicate p and
    # object o -- confirm against the Trident API.)
    #
    # NOTE(review): `cats` only records the root plus the first two levels
    # (c2, c3); the third level (c4) contributes articles but is never added to
    # `cats`, so the printed "subcats" count covers fewer categories than the
    # article set does. Simply adding `cats.add(c4)` here would be wrong: a
    # category first reached as c4 would then be skipped when it shows up later
    # as c3, and its children's articles would be lost. Confirm whether the
    # count is meant to include the third level before changing this.
    arts = set()
    cats = set([c])
    arts.update(g.s(subj, c))
    for c2 in g.s(br, c):
        # Direct children are always expanded, even if already seen deeper.
        arts.update(g.s(subj, c2))
        cats.add(c2)
        for c3 in g.s(br, c2):
            if c3 not in cats:
                arts.update(g.s(subj, c3))
                cats.add(c3)
                for c4 in g.s(br, c3):
                    if c4 not in cats:
                        arts.update(g.s(subj, c4))

    topic_subcats[topic] = cats
    topic_articles[topic] = arts
    print(f"{topic:>30s}: {len(cats):5d} subcats, {len(arts):8d} articles")

                      Business:   454 subcats,    73164 articles
                       Culture:   886 subcats,   404422 articles
                       Economy:   352 subcats,    54204 articles
                     Education:   523 subcats,    50648 articles
                        Energy:   441 subcats,    18630 articles
                   Engineering:   195 subcats,    38870 articles
                 Entertainment:   981 subcats,   132142 articles
                        Ethics:   245 subcats,    38774 articles
                        Events:   789 subcats,    51487 articles
                Food_and_drink:   647 subcats,    41434 articles
                     Geography:   246 subcats,    44074 articles
                    Government:  1064 subcats,   141264 articles
                        Health:   587 subcats,   112370 articles
                       History:   516 subcats,    85833 articles
                  Human_nature:   127 subcats,  1040093 articles
                    Human

In [5]:
%%capture --no-display
from dask.distributed import Client
# Connect to the Dask scheduler at a fixed address. The bare name on the last
# line makes the notebook display the client summary.
scheduler_address = 'tcp://192.168.62.207:8786'
client = Client(address=scheduler_address)
client

0,1
Client  Scheduler: tcp://192.168.62.207:8786  Dashboard: http://192.168.62.207:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 673.47 GB


In [6]:
import dask.bag as db
import json
# Read every file matching the glob as text, parse each element with
# json.loads, and persist the parsed records.
triples_glob = 'hdfs://bricks07:9000/user/kruit/output/tabel-small/9-triples/*'
bag = db.read_text(triples_glob).map(json.loads).persist()
# Last expression of the cell: the number of records in the bag.
bag.count().compute()

16778

In [12]:
# Top header cells

from collections import Counter

def frequencies(bag, func):
    """Count how often each value produced by ``func`` occurs across ``bag``.

    ``func`` is called on every element of ``bag`` and must return an iterable;
    all values it produces are flattened per partition and tallied.

    Returns a ``collections.Counter`` mapping value -> number of occurrences.
    """
    def flatten_partition(items):
        # Concatenate the values func() produces for each element of one partition.
        flat = []
        for item in items:
            flat.extend(func(item))
        return flat

    counted_pairs = bag.map_partitions(flatten_partition).frequencies().compute()
    return Counter(dict(counted_pairs))


def get_header_cells(t):
    """Yield the stripped text of each usable header cell of table record ``t``.

    ``t['tableHeaders']`` is iterated as a sequence of header rows, each a
    sequence of cell dicts with an optional ``'text'`` entry. A cell is skipped
    when its stripped text is empty or starts with ``'_'`` (for example the
    ``'_pgTitle'`` cell visible in the sample record).

    A record whose ``'tableHeaders'`` is missing or ``None`` yields nothing
    instead of raising ``TypeError``, matching how the other extractors in this
    notebook treat a missing key.
    """
    for header_row in t.get('tableHeaders') or ():
        for cell in header_row:
            text = cell.get('text', '').strip()
            if text and not text.startswith('_'):
                yield text

# Print the ten most frequent header texts, count first.
top_header_cells = frequencies(bag, get_header_cells).most_common(10)
for cell, count in top_header_cells:
    print(f"{count:>4d} {cell}")

3026 Year
1827 Name
1664 Title
1372 Notes
1352 Team
1304 Date
1001 Rank
 878 #
 823 Result
 775 Event


In [26]:
import trident
# Open the Trident database that `label` queries.
# NOTE(review): this rebinds the global `g`, which an earlier cell bound to the
# categories database. Re-running that earlier cell afterwards would silently
# point `label` at the other database -- consider a dedicated name.
g = trident.Db('/export/scratch1/home/kruit/20200713-prop-skos')
# ID of the skos:prefLabel predicate; `label` uses it to find label literals.
plabel = g.lookup_id('<http://www.w3.org/2004/02/skos/core#prefLabel>')
def label(uri):
    """Return the first ``@en``-tagged prefLabel found for ``uri``, else ``None``.

    ``uri`` is wrapped in angle brackets and looked up in the global database
    ``g``. If the lookup gives a falsy result, or none of the values returned
    for the ``plabel`` predicate ends with ``'@en'``, the result is ``None``.
    """
    node_id = g.lookup_id(f"<{uri}>")
    if not node_id:
        return None
    for literal_id in g.o(node_id, plabel):
        literal = g.lookup_str(literal_id).strip()
        if literal.endswith('@en'):
            # Drop the first character and the last four; presumably the
            # literal looks like '"text"@en', leaving just the text.
            return literal[1:-4]
    return None

In [30]:
# Top classes
from collections import Counter

def get_coltypes(t):
    """Yield every class key stored under ``t['classes']``.

    ``t['classes']`` is treated as a mapping whose values are themselves
    mappings (class -> score); only the inner keys are yielded, once per
    occurrence. Presumably the outer keys identify columns -- confirm against
    the producer of these records. A record without ``'classes'`` yields nothing.
    """
    for class_scores in t.get('classes', {}).values():
        yield from class_scores.keys()

# Print the ten most frequent classes as "<count> <last URI segment> <label>";
# the label part is empty when label() finds nothing.
top_classes = frequencies(bag, get_coltypes).most_common(10)
for cls, count in top_classes:
    local_name = cls.rsplit('/', 1)[-1]
    l = f"{local_name} {label(cls) or ''}"
    print(f"{count:>6d} {l}")

25085 XMLSchema#string 
21535 XMLSchema#decimal 
11567 XMLSchema#dateTime 
8903 Q215627 person
8843 Q5 human
8843 Q164509 omnivore
8843 Q154954 natural person
7423 Q17442446 Wikimedia internal item
7404 Q12139612 list
7239 Q15633587 MediaWiki main-namespace page


In [29]:
wikiprops = '/export/scratch1/home/kruit/nary/data/kb/wikidata/wikidata-properties.txt'
# Map the text before the first space of each stripped line to the whole
# stripped line (so a key such as 'P54' maps to the full 'P54 ...' line).
# The file is opened in a `with` block so the handle is closed as soon as the
# dict is built instead of being left open for the rest of the session.
with open(wikiprops) as props_file:
    puri_name = {
        line.split(' ', 1)[0]: line
        for line in map(str.strip, props_file)
    }

In [31]:
# Top properties
from collections import Counter

def get_props(t):
    """Yield every property key stored under ``t['properties']``.

    ``t['properties']`` is treated as a mapping of mappings of mappings; the
    keys of the innermost mappings (property -> score) are yielded, once per
    occurrence. Presumably the two outer levels identify a pair of columns --
    confirm against the producer of these records. A record without
    ``'properties'`` yields nothing.
    """
    for inner_by_key in t.get('properties', {}).values():
        for prop_scores in inner_by_key.values():
            yield from prop_scores.keys()

# Print the ten most frequent properties. Each is reduced to its last URI
# segment and, when that ID is known, replaced by its line from puri_name.
top_props = frequencies(bag, get_props).most_common(10)
for prp, count in top_props:
    prp = prp.rsplit('/', 1)[-1]
    l = puri_name.get(prp, prp)
    print(f"{count:>6d} {l}")

1065 P54 member of sports team
 957 P17 country
 942 P131 located in the administrative territorial entity
 857 P1346 winner
 852 P175 performer
 594 P27 country of citizenship
 591 P361 part of
 546 P102 member of political party
 543 P161 cast member
 490 P527 has part


In [45]:
def filter_tables(ts, articles):
    """Placeholder: walks through ``ts`` without doing anything and returns ``None``.

    ``articles`` is accepted but not used yet.
    TODO: implement the actual filtering.
    """
    for _table in ts:
        continue
        
        
import trident
# Handle on the categories database under its own name; presumably needed
# because the global `g` was rebound to a different database in an earlier cell.
catdb = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')
def get_article_titles(arts):
    """Yield a title string for each ID in ``arts``.

    Each ID is resolved to a string through the global ``catdb``; the first and
    last characters are dropped and every occurrence of the DBpedia resource
    prefix is removed from what remains.
    """
    resource_prefix = 'http://dbpedia.org/resource/'
    for article_id in arts:
        term = catdb.lookup_str(article_id)
        # Presumably the stored term is '<uri>'; slicing removes the brackets.
        uri = term[1:-1]
        yield uri.replace(resource_prefix, '')

# for topic, articles in sorted(topic_articles.items(), key=lambda ta: len(ta[1]))[:1]:
#     set(get_article_titles(list(articles)[:10])

# Peek at one record: print its top-level keys, then its header rows.
sample_tables = bag.take(1)
for t in sample_tables:
    print(*t.keys())
    print(t['tableHeaders'])

columnIndexOffset tableIndex _id numCols numDataRows numHeaderRows numericColumns order pgId pgTitle sectionTitle tableCaption tableHeaders tableId tableCaptions wasStackedHor headerId links rows tbNr type numTables pivots entities classes keycol properties triples
[[{'text': '_pgTitle', 'surfaceLinks': []}, {'cellID': -1, 'textTokens': [], 'text': 'No.', 'tdHtmlString': '<th colspan="1" rowspan="1" scope="col"> No. </th>', 'surfaceLinks': [], 'subtableID': -1, 'isNumeric': False}, {'cellID': -1, 'textTokens': [], 'text': 'Date', 'tdHtmlString': '<th colspan="1" rowspan="1" scope="col"> Date </th>', 'surfaceLinks': [], 'subtableID': -1, 'isNumeric': False}, {'cellID': -1, 'textTokens': [], 'text': 'Opponents', 'tdHtmlString': '<th colspan="1" rowspan="1" scope="col"> Opponents </th>', 'surfaceLinks': [], 'subtableID': -1, 'isNumeric': False}, {'cellID': -1, 'textTokens': [], 'text': 'Venue', 'tdHtmlString': '<th colspan="1" rowspan="1" scope="col"> Venue </th>', 'surfaceLinks': [], 'su