In [15]:
import pandas as pd
# Full candidate list of topic names.
# NOTE: this list is immediately replaced by the assignment that follows it,
# so it has no effect at runtime.
topics = [
    'Business', 'Culture', 'Economy', 'Education', 'Energy', 
    'Engineering', 'Entertainment', 'Events', 'Food and drink', 'Geography', 
    'Government', 'Health', 'History', 'Humanities', 'Industry', 
    'Language', 'Law', 'Life', 'Mass media', 'Mathematics', 'Military', 
    'Music', 'Nature', 'People', 'Politics', 
    'Religion', 'Science and technology', 'Sports'
]
# Subset that is actually in effect (overrides the list above).
topics = ['Music', 'Politics', 'Sports', 'Business', 'Science']

In [37]:
%%time
import urllib.parse as ul
import trident
cat_db = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')

# The predicate ids do not depend on the topic, so they are resolved once here
# rather than on every loop iteration.
br = cat_db.lookup_id('<http://www.w3.org/2004/02/skos/core#broader>')
subj = cat_db.lookup_id('<http://purl.org/dc/terms/subject>')

# topic name (spaces replaced by underscores) -> set of category ids / article ids
topic_subcats = {}
topic_articles = {}
for topic in topics:
    topic = topic.replace(" ","_")
    c = cat_db.lookup_id(f'<http://dbpedia.org/resource/Category:{topic}>')
    if not c:
        # No usable id for this category: the topic is left out of both maps.
        continue

    # NOTE(review): cat_db.s(p, o) presumably returns the subjects of all
    # (s, p, o) triples -- confirm against the trident API.
    arts = set()
    cats = set([c])
    arts.update(cat_db.s(subj, c))
    # Level 1: everything returned for `broader` on the root category.
    for c2 in cat_db.s(br, c):
        arts.update(cat_db.s(subj, c2))
        cats.add(c2)
        # Level 2: same step applied to each level-1 category; categories
        # already collected are not processed again.
        for c3 in cat_db.s(br, c2):
            if c3 not in cats:
                arts.update(cat_db.s(subj, c3))
                cats.add(c3)
                # A third level (c4) is not traversed.

    topic_subcats[topic] = cats
    topic_articles[topic] = arts
    print(f"{topic:>30s}: {len(cats):5d} subcats, {len(arts):8d} articles")

                         Music:   558 subcats,    13063 articles
                      Politics:   543 subcats,    13777 articles
                        Sports:   838 subcats,    11816 articles
                      Business:   454 subcats,    11595 articles
                       Science:   463 subcats,    15224 articles
CPU times: user 36.1 ms, sys: 52.8 ms, total: 88.9 ms
Wall time: 88.3 ms


In [38]:
%%capture --no-display
from dask.distributed import Client
# Connect to a Dask scheduler at a fixed address.
# NOTE(review): the address is hard-coded; confirm it is reachable before re-running.
client = Client(address = 'tcp://192.168.62.207:8786')
# Last expression of the cell: the client object itself.
client

0,1
Client  Scheduler: tcp://192.168.62.207:8786  Dashboard: http://192.168.62.207:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 673.47 GB


In [39]:
import dask.bag as db
from takco.util import robust_json_loads_lines
# Glob pattern of the input files on HDFS.
fnames = 'hdfs:///user/kruit/output/tabel-small-4/9-triples/*'
# Read the files as text, turn each partition of lines into records with
# robust_json_loads_lines, and persist the result so later cells can reuse it.
bag = db.read_text(fnames).map_partitions(robust_json_loads_lines).persist()
# Number of records in the bag.
bag.count().compute()

4103

In [40]:
# Top header cells

from collections import Counter

def frequencies(bag, func):
    """Count how often each item produced by ``func`` occurs across ``bag``.

    ``func`` is called on every element of every partition and must return an
    iterable; all items it produces are pooled and tallied.

    Returns a ``collections.Counter`` mapping item -> number of occurrences.
    """
    def _expand(partition):
        # Flatten the per-element iterables of one partition into a single list.
        items = []
        for element in partition:
            items.extend(func(element))
        return items

    tallies = bag.map_partitions(_expand).frequencies().compute()
    return Counter(dict(tallies))


def get_header_cells(t):
    """Yield the stripped, non-empty text of every header cell of table ``t``.

    ``t`` must offer ``.get``; its ``'tableHeaders'`` entry is iterated as a
    sequence of header rows, each row a sequence of cell mappings with an
    optional ``'text'`` entry.

    Cells whose stripped text is empty or starts with ``'_'`` are skipped.
    A table whose ``'tableHeaders'`` entry is missing or ``None`` yields
    nothing instead of raising ``TypeError``; likewise a cell whose ``'text'``
    is missing or ``None`` is treated as empty.
    """
    for header_row in t.get('tableHeaders') or []:
        for cell in header_row:
            text = (cell.get('text') or '').strip()
            if text and not text.startswith('_'):
                yield text

def top_headers(bag, n=10):
    """Return the ``n`` most frequent header-cell texts in ``bag``.

    The result is a pandas Series indexed by header text with the occurrence
    count as value, most frequent first.
    """
    ranked = frequencies(bag, get_header_cells).most_common(n)
    return pd.Series(dict(ranked))

# Most frequent header texts over the whole bag (default n=10).
top_headers(bag)

Year      620
Team      420
Date      415
Name      410
Title     348
W         333
L         328
Notes     283
Result    248
Player    230
dtype: int64

In [41]:
import trident
# Trident database used for the label lookups in label().
prop_db = trident.Db('/export/scratch1/home/kruit/20200713-prop-skos')
# Id of the skos:prefLabel predicate, resolved once for reuse.
plabel = prop_db.lookup_id('<http://www.w3.org/2004/02/skos/core#prefLabel>')
def label(uri):
    """Return the first ``@en``-tagged ``skos:prefLabel`` of ``uri``, or ``None``.

    ``uri`` is given without angle brackets. ``None`` is returned when the
    lookup of ``<uri>`` gives a falsy id or when none of the label strings
    ends in ``'@en'``.
    """
    node_id = prop_db.lookup_id(f"<{uri}>")
    if not node_id:
        return None
    for label_id in prop_db.o(node_id, plabel):
        literal = prop_db.lookup_str(label_id).strip()
        if literal.endswith('@en'):
            # Drop the first character and the last four -- presumably the
            # opening quote and the closing '"@en' of the literal.
            return literal[1:-4]
    return None

In [42]:
# Top classes
from collections import Counter
# Identifier suffixes of classes that get_coltypes leaves out.
bad = [
    'Q17442446', 'Q12139612', 'Q15633587', 'Q14204246',
    'Q4167410', 'Q48522', 'Q11266439', 'Q4167836',
    'Q13406463', 'Q21025364', 'Q56248902', 'Q164509',
]

def get_coltypes(t):
    """Yield every class key found under ``t['classes']`` that is not excluded.

    ``t['classes']`` (default: empty) maps some key to a mapping of
    class -> score. A class is skipped when it ends with one of the
    suffixes listed in ``bad``; the scores themselves are not used.
    """
    for class_scores in t.get('classes', {}).values():
        for cls, _score in class_scores.items():
            if cls.endswith(tuple(bad)):
                continue
            yield cls

def ent_name(uri):
    """Return the last ``/``-separated segment of ``uri``, a space, and its label.

    When ``label(uri)`` gives nothing, the part after the space is empty.
    """
    local_id = uri.split('/')[-1]
    return local_id + " " + (label(uri) or '')
                
def top_coltypes(bag, n=10):
    """Return the labels of the most frequent column classes in ``bag``.

    The ``n`` most frequent classes are taken first and only then translated
    with ``label``; classes without a label are dropped, so the resulting
    Series (label -> count) can have fewer than ``n`` rows. If two classes
    share a label, the count of the later one is kept.
    """
    freqs = frequencies(bag, get_coltypes).most_common(n)
    labelled = {}
    for cls, count in freqs:
        # Resolve the label once per class: each call is a database lookup.
        name = label(cls)
        if name:
            labelled[name] = count
    return pd.Series(labelled)

# Labelled classes among the most frequent ones over the whole bag (default n=10).
top_coltypes(bag)

sports festival    549
sports season      484
human              478
person             478
natural person     478
season             472
dtype: int64

In [43]:
wikiprops = '/export/scratch1/home/kruit/nary/data/kb/wikidata/wikidata-properties.txt'

# Maps the first space-separated token of each stripped line to the tuple
# obtained by splitting that line once on a space, i.e. (token, rest) -- or a
# one-element tuple when the line contains no space.
puri_name = {}
# The context manager closes the file as soon as the table is built.
with open(wikiprops) as props_file:
    for line in map(str.strip, props_file):
        fields = tuple(line.split(' ', 1))
        puri_name[fields[0]] = fields

In [44]:
# Top properties
from collections import Counter

def get_props(t):
    """Yield the last ``/``-separated segment of every property key in ``t``.

    ``t['properties']`` (default: empty) is a mapping nested two levels deep
    whose innermost values map property -> score. The keys of the two outer
    levels and the scores are not used.
    """
    outer = t.get('properties', {})
    for inner in outer.values():
        for scored_props in inner.values():
            for prop_uri, _score in scored_props.items():
                yield prop_uri.rsplit('/', 1)[-1]

def top_props(bag, n=10):
    """Return the ``n`` most frequent properties in ``bag`` as a Series (name -> count).

    Property identifiers are translated through ``puri_name``. An identifier
    that is absent from ``puri_name``, or whose entry carries no name part,
    is shown under its raw identifier. If two properties share a name, the
    count of the later one is kept.
    """
    freqs = frequencies(bag, get_props).most_common(n)
    named = {}
    for prp, count in freqs:
        entry = puri_name.get(prp)
        # Use the raw identifier as fallback. Indexing the identifier string
        # itself with [1] would only give its second character, and a
        # one-element entry has no index 1 at all.
        if entry is not None and len(entry) > 1:
            name = entry[1]
        else:
            name = prp
        named[name] = count
    return pd.Series(named)
                
# Most frequent properties over the whole bag (default n=10).
top_props(bag)

cast member                                         37
member of sports team                               33
located in the administrative territorial entity    27
winner                                              25
participant of                                      21
participating team                                  20
performer                                           19
category's main topic                               12
director                                            11
topic's main category                               11
dtype: int64

In [45]:
def get_pivots(prov):
    """Yield every ``'pivot'`` value in a provenance tree, depth-first, in order.

    A node that has a ``'pivot'`` key yields that value and is not searched
    any further. Otherwise, if it has a ``'concat'`` key, each node listed
    there is searched in turn. Nodes with neither key contribute nothing.
    """
    pending = [prov]
    while pending:
        node = pending.pop()
        if 'pivot' in node:
            yield node['pivot']
        elif 'concat' in node:
            # Pushed in reverse so that children are popped in their original order.
            pending.extend(reversed(list(node['concat'])))
            
def get_table_pivots(t):
    """Yield the header cells covered by each pivot in ``t.provenance``.

    For every pivot, the header row at index ``pivot['level']`` of
    ``pivot['headers']`` is sliced from ``pivot['colfrom']`` up to and
    including ``pivot['colto']``.
    """
    for pivot in get_pivots(t.provenance):
        header_row = pivot['headers'][pivot['level']]
        first = pivot['colfrom']
        last = pivot['colto']
        yield from header_row[first:last + 1]

def top_pivots(bag, n=10):
    """Return the ``n`` most frequent pivoted header cells in ``bag``.

    The result is a pandas Series indexed by header cell with the occurrence
    count as value, most frequent first.
    """
    ranked = frequencies(bag, get_table_pivots).most_common(n)
    return pd.Series(dict(ranked))
        
# Most frequent pivoted header cells over the whole bag (default n=10).
top_pivots(bag)

1        1568
3        1425
2        1415
4        1383
Total    1002
5         991
7         968
6         959
Final     946
8         939
dtype: int64

In [46]:
import pandas as pd
from pathlib import Path

def get_pagetitles(prov):
    """Yield every ``'pgTitle'`` value in a provenance tree, depth-first, in order.

    A node that has a ``'pgTitle'`` key yields that value and is not searched
    any further. Otherwise, if it has a ``'concat'`` key, each node listed
    there is searched in turn. Titles are not de-duplicated.
    """
    pending = [prov]
    while pending:
        node = pending.pop()
        if 'pgTitle' in node:
            yield node['pgTitle']
        elif 'concat' in node:
            # Pushed in reverse so that children are popped in their original order.
            pending.extend(reversed(list(node['concat'])))
        
def filter_tables(ts, articles):
    """Return the tables in ``ts`` that stem from at least one page in ``articles``.

    ``articles`` is a set of page titles; a table is kept when the page titles
    found in its ``provenance`` share at least one element with that set.
    Input order is preserved.
    """
    return [t for t in ts if articles & set(get_pagetitles(t.provenance))]

def table_pagetitles(t):
    """Return all page titles found in the provenance of table ``t`` as a list."""
    titles = get_pagetitles(t.provenance)
    return [*titles]
        

def get_article_titles(arts):
    """Yield a readable title for every article id in ``arts``.

    Each id is resolved with ``cat_db.lookup_str``; the first and last
    character of the result are dropped, the DBpedia resource prefix is
    removed and underscores become spaces.
    """
    prefix = 'http://dbpedia.org/resource/'
    for art_id in arts:
        term = cat_db.lookup_str(art_id)
        # Presumably strips the enclosing '<' and '>' of the term -- confirm.
        uri = term[1:-1]
        yield uri.replace(prefix, '').replace('_', ' ')

# Flat collection of the page titles of all tables: one entry per title that
# table_pagetitles returns, duplicates included.
all_pagetitles = bag.map(table_pagetitles).flatten()
n_origtables = all_pagetitles.count().compute()
# The same collection with duplicate titles removed.
n_pages = all_pagetitles.distinct().count().compute()
print(f"Got a total of {n_origtables} original tables on {n_pages} pages")
print()

# Directory that receives the files written by this notebook.
root = Path('fig/wikicat')
# parents=True also creates the intermediate 'fig' directory; without it the
# call raises FileNotFoundError when 'fig' does not exist yet.
root.mkdir(parents=True, exist_ok=True)
        
# Statistic kind -> function computing it, in the order in which the kinds are
# reported. An explicit table replaces eval() on a constructed function name.
top_funcs = (
    ('headers', top_headers),
    ('coltypes', top_coltypes),
    ('props', top_props),
    ('pivots', top_pivots),
)

ntopics = len(topic_articles)
# kind -> topic -> Series with the top-20 entries of that kind
tops = {}
# Topics are handled from fewest to most articles; only the first five are processed.
for ti, (topic, articles) in enumerate(sorted(topic_articles.items(), key=lambda ta: len(ta[1]))[:5]):
    article_titles = set(get_article_titles(articles))
    print(topic, f"({len(article_titles)} articles)", f"[{ti+1}/{ntopics}]")

    # Keep only the tables that stem from one of this topic's articles.
    tables = bag.map_partitions(filter_tables, article_titles).persist()
    ntables = tables.count().compute()
    norigtables = tables.map(table_pagetitles).flatten().count().compute()
    ncols = tables.map(lambda t: t['numCols']).sum().compute()
    print(f'{ntables} supertables with {ncols} columns; {norigtables} original tables')

    if ntables:
        for kind, f in top_funcs:
            top = f(tables, n=20)
            tops.setdefault(kind, {})[topic] = top
            print(kind, dict(top.head(3)))

    print()

Got a total of 50456 original tables on 25699 pages

Business (11595 articles) [1/5]
4 supertables with 24 columns; 2036 original tables
headers {'Preceded by': 1, '#': 1, 'Employer': 1}
coltypes {'person': 1, 'human': 1, 'natural person': 1}
props {'follows': 1, 'followed by': 1, 'part of': 1}
pivots {}

Sports (11816 articles) [2/5]
45 supertables with 261 columns; 5740 original tables
headers {'Year': 17, 'Name': 9, 'Team': 7}
coltypes {'person': 14, 'natural person': 14, 'human': 14}
props {'child': 2, 'genre': 2, 'participating team': 2}
pivots {'NOR': 109, 'BAR': 98, 'BUR': 93}

Music (13063 articles) [3/5]
35 supertables with 160 columns; 11088 original tables
headers {'Year': 9, 'Principal location': 5, 'Result': 4}
coltypes {'release': 13, 'single': 7, 'album': 7}
props {'child': 2, 'followed by': 2, 'follows': 2}
pivots {'Chart (2008)': 222, 'Chart (2007)': 55, 'Chart (2002)': 48}

Politics (13777 articles) [4/5]
10 supertables with 51 columns; 4595 original tables
headers {'

In [47]:
# For every kind of statistic, combine the per-topic Series into one wide
# table, show its first rows and write it to a LaTeX file under `root`.
top_combined = {}
# `top` is the same mapping as tops[toptype] used in the inner loop.
for toptype, top in tops.items():
    # Column label: the kind name without its last character, title-cased
    # (e.g. 'headers' -> 'Header').
    dim = toptype[:-1].title()
    dfs = []
    for t, s in tops[toptype].items():
        # NOTE: this sets the index name on the Series stored in `tops` itself.
        s.index.name = dim
        # Long format: one column named after `dim`, the counts as 'n', and
        # the topic name in 'Top'.
        dfs.append(s.reset_index(name='n').assign(Top=t))

    df = pd.concat(dfs)
    # Values of 20 or more characters are cut to their first 20 plus '...'.
    df[dim] = df[dim].map(lambda x: x if len(x)<20 else x[:20] + '...')
    # Spread the topics over the columns. Rows are presumably aligned on the
    # positional index left by reset_index (i.e. the rank) -- TODO confirm for
    # the pandas version in use.
    df = df.pivot(columns=['Top'])
    df['n'] = df['n'].astype('Int64')
    # Render every cell as text and blank out cells that read 'nan'.
    df = df.astype('str').replace('nan', '')
    # Swap the two column levels so the topic comes first, then sort the columns.
    df.columns = df.columns.swaplevel()
    df = df.sort_index(axis=1)
    # NOTE(review): requires a module named `display` that offers display()
    # (e.g. `from IPython import display`); that import is not visible here.
    display.display(df.head())
    top_combined[toptype] = df
    df.to_latex(root.joinpath(toptype+'.tex'))

Top,Business,Business,Music,Music,Politics,Politics,Science,Science,Sports,Sports
Unnamed: 0_level_1,Header,n,Header,n,Header,n,Header,n,Header,n
0,Preceded by,1,Year,9,Inaugurated,3,Name,4,Year,17
1,#,1,Principal location,5,Left Office,3,Date,3,Name,9
2,Employer,1,Result,4,Succeeded by,2,Location,3,Team,7
3,# of Employees,1,Artist,4,Name,2,Yield,3,Date,7
4,Rank,1,Chart,4,Notes,2,Preceded by,2,Nationality,6


Top,Business,Business,Music,Music,Politics,Politics,Science,Science,Sports,Sports
Unnamed: 0_level_1,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n
0,person,1,release,13,position,5,person,2,person,14
1,human,1,single,7,person,4,natural person,2,natural person,14
2,natural person,1,album,7,natural person,4,human,2,human,14
3,ranked list,1,visual artwork,6,human,4,Wikimedia navigation...,2,award,7
4,work,1,person,4,role,3,chronostratigraphic ...,2,sports festival,7


Top,Business,Business,Music,Music,Politics,Politics,Science,Science,Sports,Sports
Unnamed: 0_level_1,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n
0,follows,1.0,child,2,position held,3.0,child,2,child,2
1,followed by,1.0,followed by,2,child,2.0,part of,2,genre,2
2,part of,1.0,follows,2,father,1.0,father,1,participating team,2
3,,,performer,1,part of,1.0,position held,1,father,1
4,,,father,1,,,follows,1,position held,1


Top,Music,Music,Science,Science,Sports,Sports
Unnamed: 0_level_1,Pivot,n,Pivot,n,Pivot,n
0,Chart (2008),222,2007,14,NOR,109
1,Chart (2007),55,2006,14,BAR,98
2,Chart (2002),48,2005,13,BUR,93
3,Chart (2003),46,Mass,13,LIV,93
4,Chart (2004),42,2008,12,BOL,91
