In [1]:
%%time
import pandas as pd
# Candidate top-level topic names. Further down in this cell each name has its
# spaces replaced by underscores and is looked up as
# <http://dbpedia.org/resource/Category:NAME> in the category database.
topics = [
    'Business', 'Economy', 'Education', 'Energy', 
    'Engineering', 'Events', 'Food and drink', 'Geography', 
    'Government', 'Health', 'History', 'Humanities', 'Industry', 
    'Language', 'Law', 'Mathematics', 'Military', 
    'Music', 'Nature', 'Politics', 
    'Religion', 'Science and technology', 'Sports'
]
# Smaller alternative topic list, kept for quick experiments:
# topics = ['Music', 'Politics', 'Sports', 'Business', 'Science']

import urllib.parse as ul
import trident
cat_db = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')

# The predicate ids are the same for every topic, so resolve them once
# instead of once per loop iteration.
br = cat_db.lookup_id('<http://www.w3.org/2004/02/skos/core#broader>')
subj = cat_db.lookup_id('<http://purl.org/dc/terms/subject>')

# topic name (underscored) -> set of category ids / set of article ids
topic_subcats = {}
topic_articles = {}
for topic in topics:
    topic = topic.replace(" ","_")
    c = cat_db.lookup_id(f'<http://dbpedia.org/resource/Category:{topic}>')
    if not c:
        # Category is not present in the database; skip this topic.
        continue

    # Collect articles of the root category and of its sub-categories down to
    # three `broader` hops. `cat_db.s(p, o)` is presumably "all subjects with
    # predicate p and object o" -- TODO confirm against the trident API.
    arts = set()
    cats = set([c])
    arts.update(cat_db.s(subj, c))
    for c2 in cat_db.s(br, c):
        arts.update(cat_db.s(subj, c2))
        cats.add(c2)
        for c3 in cat_db.s(br, c2):
            if c3 not in cats:
                arts.update(cat_db.s(subj, c3))
                cats.add(c3)
                for c4 in cat_db.s(br, c3):
                    # NOTE(review): depth-3 categories contribute articles but
                    # are never added to `cats`, so they are missing from
                    # `topic_subcats` and from the printed sub-category count.
                    # Left unchanged to keep the reported numbers stable.
                    if c4 not in cats:
                        arts.update(cat_db.s(subj, c4))

    topic_subcats[topic] = cats
    topic_articles[topic] = arts
    print(f"{topic:>30s}: {len(cats):5d} subcats, {len(arts):8d} articles")

# Keep only the ten topics with the most articles (largest first).
topics = sorted(topic_articles, key=lambda k: -len(topic_articles[k]))[:10]
print("Largest topics:", topics)
topic_articles = {t:topic_articles[t] for t in topics}

                      Business:   454 subcats,    73164 articles
                       Economy:   352 subcats,    54204 articles
                     Education:   523 subcats,    50648 articles
                        Energy:   441 subcats,    18630 articles
                   Engineering:   195 subcats,    38870 articles
                        Events:   789 subcats,    51487 articles
                Food_and_drink:   647 subcats,    41434 articles
                     Geography:   246 subcats,    44074 articles
                    Government:  1064 subcats,   141264 articles
                        Health:   587 subcats,   112370 articles
                       History:   516 subcats,    85833 articles
                    Humanities:   773 subcats,   164416 articles
                      Industry:   504 subcats,    45565 articles
                      Language:   339 subcats,    48625 articles
                           Law:   421 subcats,    53521 articles
                   Mathem

In [2]:
%%capture --no-display
from dask.distributed import Client
# Connect to a Dask scheduler at a fixed address.
# NOTE(review): the scheduler address is hard-coded, so this cell only works
# where that scheduler is reachable -- consider moving it to a config value.
client = Client(address = 'tcp://192.168.62.207:8686')
# Bare expression: the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://192.168.62.207:8686  Dashboard: http://192.168.62.207:8687/status,Cluster  Workers: 7  Cores: 7  Memory: 471.41 GB


In [3]:
import dask.bag as db
from takco.util import robust_json_loads_lines
# Alternative input location (HDFS), currently disabled:
# fnames = 'hdfs:///user/kruit/output/tabel-small-6+7+8+9/9-triples/*'
# Glob of input files; each file is read as text lines.
fnames = '/export/scratch1/home/kruit/scratch/output/tabel-small-4/9-triples/*'
# Parse each partition's lines with robust_json_loads_lines and persist the
# resulting bag so later cells reuse it instead of re-reading the files.
bag = db.read_text(fnames).map_partitions(robust_json_loads_lines).persist()
# Number of elements in the bag (shown as the cell output).
bag.count().compute()

4103

In [4]:
# Top header cells

from collections import Counter

def get_pagetitles(prov):
    """Yield the page titles recorded in a (possibly nested) provenance dict.

    A node carrying ``'pgTitle'`` yields that title and is not descended into;
    otherwise every entry of its ``'concat'`` list is visited recursively, in
    order. Nodes with neither key yield nothing.
    """
    if 'pgTitle' in prov:
        yield prov['pgTitle']
        return
    if 'concat' not in prov:
        return
    for part in prov['concat']:
        yield from get_pagetitles(part)
            
            
def frequencies(bag, func, pagetitle_filter=()):
    """Count the items that ``func`` yields over every table in ``bag``.

    Parameters
    ----------
    bag : dask bag of tables.
    func : callable taking one table and returning an iterable of hashable
        items.
    pagetitle_filter : optional collection of page titles. When non-empty,
        each item of a table is counted once per distinct page title in the
        table's provenance that also occurs in the filter (so tables with no
        matching title contribute nothing). In this mode the bag is computed
        locally and counted in-process. When empty, every item counts once and
        counting runs through ``bag.frequencies()``.

    Returns
    -------
    collections.Counter mapping item -> count.
    """
    def expand(tables, extract, wanted):
        # Flatten one batch of tables into a list in which each extracted item
        # is repeated `weight` times.
        items = []
        for table in tables:
            weight = 1
            if wanted is not None:
                weight = len(wanted.intersection(get_pagetitles(table.provenance)))
            for item in extract(table):
                items.extend([item] * weight)
        return items

    if pagetitle_filter:
        # Build the lookup set once. The filter can hold a whole topic's
        # article titles, so rebuilding it per table is expensive, and a
        # one-shot iterable would be exhausted after the first table.
        if isinstance(pagetitle_filter, (set, frozenset)):
            wanted = pagetitle_filter
        else:
            wanted = set(pagetitle_filter)
        return Counter(expand(bag.compute(), func, wanted))

    extract = bag.map_partitions(expand, func, None)
    return Counter(dict(extract.frequencies().compute()))



def get_header_cells(t):
    """Yield the stripped text of every usable header cell of table ``t``.

    ``t['tableHeaders']`` is iterated as a list of header rows, each a list of
    cell dicts. Cells whose stripped ``'text'`` is empty or starts with an
    underscore are skipped. A table without ``'tableHeaders'`` (or with it set
    to ``None``) yields nothing instead of raising, matching the defaulting
    used by ``get_coltypes`` and ``get_props``.
    """
    for header_row in t.get('tableHeaders') or ():
        for cell in header_row:
            # `or ''` also covers a cell whose text is explicitly None.
            text = (cell.get('text') or '').strip()
            if text and text[0] != '_':
                yield text

def top_headers(bag, pagetitle_filter=(), n=10):
    """Return the ``n`` most frequent header-cell texts as a Series of counts.

    Counting is delegated to ``frequencies`` with ``get_header_cells`` as the
    extractor; ``pagetitle_filter`` is passed through unchanged.
    """
    counts = frequencies(bag, get_header_cells, pagetitle_filter)
    most_frequent = counts.most_common(n)
    return pd.Series(dict(most_frequent))

top_headers(bag)

Year      620
Team      420
Date      415
Name      410
Title     348
W         333
L         328
Notes     283
Result    248
Player    230
dtype: int64

In [5]:
import trident
prop_db = trident.Db('/export/scratch1/home/kruit/20200713-prop-skos')
plabel = prop_db.lookup_id('<http://www.w3.org/2004/02/skos/core#prefLabel>')
def label(uri):
    """Return the first ``@en``-tagged prefLabel of ``uri``, or None.

    None is returned when the URI is not found in ``prop_db`` or when none of
    its prefLabel values ends with ``@en``.
    """
    node = prop_db.lookup_id(f"<{uri}>")
    if not node:
        return None
    for literal_id in prop_db.o(node, plabel):
        literal = prop_db.lookup_str(literal_id).strip()
        if not literal.endswith('@en'):
            continue
        # Drop the first character and the last four; presumably the opening
        # quote and the closing `"@en` of the literal -- TODO confirm format.
        return literal[1:-4]
    return None

In [6]:
# Top classes
from collections import Counter
# Identifier suffixes of classes that are excluded from the column-type counts.
bad = [
    'Q17442446',
    'Q12139612',
    'Q15633587',
    'Q14204246',
    'Q4167410',
    'Q48522',
    'Q11266439',
    'Q4167836',
    'Q13406463',
    'Q21025364',
    'Q56248902',
    'Q164509',
]

def get_coltypes(t):
    """Yield every class of every column of ``t`` that is not excluded.

    ``t['classes']`` (default: empty) is read as a mapping whose values are
    class -> score mappings; scores are ignored. A class is skipped when it
    ends with any entry of ``bad``.
    """
    excluded = tuple(bad)
    for column_classes in t.get('classes', {}).values():
        for cls, _ in column_classes.items():
            if cls.endswith(excluded):
                continue
            yield cls

def ent_name(uri):
    """Return the last path segment of ``uri``, a space, and its label (or '')."""
    local_id = uri.split('/')[-1]
    return local_id + " " + (label(uri) or '')
                
def top_coltypes(bag, pagetitle_filter=(), n=10):
    """Return counts of the most frequent column classes, keyed by label.

    The ``n`` most common classes (as extracted by ``get_coltypes``) are
    looked up with ``label``; classes without a label are dropped, so the
    result can have fewer than ``n`` rows. If two classes share a label, the
    later one in frequency order overwrites the earlier one.
    """
    freqs = frequencies(bag, get_coltypes, pagetitle_filter).most_common(n)
    labelled = {}
    for cls, count in freqs:
        # Resolve the label once per class; each call hits the database.
        name = label(cls)
        if name:
            labelled[name] = count
    return pd.Series(labelled)

top_coltypes(bag)

sports festival    549
sports season      484
human              478
person             478
natural person     478
season             472
dtype: int64

In [7]:
# Text file with one property per line: the id, a space, then the rest of the line.
wikiprops = '/export/scratch1/home/kruit/nary/data/kb/wikidata/wikidata-properties.txt'
# Maps the first space-separated token of each line to the tuple of
# (first token, remainder). A line without a space gives a 1-tuple.
# The file is opened in a `with` block so the handle is closed after loading.
with open(wikiprops) as props_file:
    puri_name = {
        parts[0]: parts
        for parts in (tuple(line.strip().split(' ', 1)) for line in props_file)
    }

In [8]:
# Top properties
from collections import Counter

def get_props(t):
    """Yield the last path segment of every property URI found in ``t``.

    ``t['properties']`` (default: empty) is read as a two-level mapping whose
    innermost values are property -> score mappings; scores are ignored.
    """
    for per_column in t.get('properties', {}).values():
        for prop_scores in per_column.values():
            for prop_uri, _ in prop_scores.items():
                yield prop_uri.rsplit('/', 1)[-1]

def top_props(bag, pagetitle_filter=(), n=10):
    """Return counts of the ``n`` most frequent properties, keyed by name.

    Property ids come from ``get_props``; names are looked up in
    ``puri_name``, whose values are ``(id, name)`` tuples. A property that is
    missing from ``puri_name``, or whose entry has no name part, is shown
    under its id. The previous ``puri_name.get(prp, prp)[1]`` indexed into
    the id string itself in that case, yielding its second character.
    """
    freqs = frequencies(bag, get_props, pagetitle_filter).most_common(n)
    named = {}
    for prp, count in freqs:
        entry = puri_name.get(prp)
        if entry is not None and len(entry) > 1:
            name = entry[1]
        else:
            name = prp
        named[name] = count
    return pd.Series(named)
                
top_props(bag)

cast member                                         37
member of sports team                               33
located in the administrative territorial entity    27
winner                                              25
participant of                                      21
participating team                                  20
performer                                           19
category's main topic                               12
director                                            11
topic's main category                               11
dtype: int64

In [9]:
def get_pivots(prov, pagetitle_filter = ()):
    """Yield the ``'pivot'`` entries found in a nested provenance dict.

    A node with a ``'pivot'`` key yields it when no filter is given or when
    the node's ``'pgTitle'`` is in ``pagetitle_filter``; such a node is never
    descended into. Any other node has its ``'concat'`` children visited
    recursively, in order.
    """
    if 'pivot' in prov:
        accepted = (not pagetitle_filter) or (prov['pgTitle'] in pagetitle_filter)
        if accepted:
            yield prov['pivot']
        return
    if 'concat' not in prov:
        return
    for part in prov['concat']:
        yield from get_pivots(part, pagetitle_filter=pagetitle_filter)
            
def get_table_pivots(t, pagetitle_filter = ()):
    """Yield the header cells covered by each pivot in ``t``'s provenance.

    For every pivot returned by ``get_pivots``, the header row at index
    ``pivot['level']`` is sliced from ``pivot['colfrom']`` through
    ``pivot['colto']`` inclusive.
    """
    for pivot in get_pivots(t.provenance, pagetitle_filter=pagetitle_filter):
        header_row = pivot['headers'][pivot['level']]
        first, last = pivot['colfrom'], pivot['colto']
        yield from header_row[first:last + 1]

def top_pivots(bag, pagetitle_filter=(), n=10):
    """Return the ``n`` most frequent pivot header cells as a Series of counts.

    The page-title filter is applied inside the extractor (through
    ``get_table_pivots``); ``frequencies`` itself is called without a filter.
    """
    def pivots_of(t):
        return get_table_pivots(t, pagetitle_filter)

    counts = frequencies(bag, pivots_of)
    return pd.Series(dict(counts.most_common(n)))
        
top_pivots(bag)

1        1568
3        1425
2        1415
4        1383
Total    1002
5         991
7         968
6         959
Final     946
8         939
dtype: int64

In [10]:
%%time
import pandas as pd
from pathlib import Path
        
def filter_tables(ts, articles):
    """Return the tables in ``ts`` having at least one page title in ``articles``.

    ``articles`` is intersected (``&``) with the set of page titles from each
    table's provenance; input order is preserved.
    """
    return [
        table
        for table in ts
        if articles & set(get_pagetitles(table.provenance))
    ]

def table_pagetitles(t):
    """Return the page titles from ``t``'s provenance as a list."""
    titles = get_pagetitles(t.provenance)
    return [*titles]
        

def get_article_titles(arts):
    """Yield a readable title for each article id in ``arts``.

    Each id is resolved through ``cat_db.lookup_str``. The first and last
    characters of the result are dropped, the DBpedia resource prefix is
    removed, and underscores become spaces.
    """
    resource_prefix = 'http://dbpedia.org/resource/'
    for art_id in arts:
        uri = cat_db.lookup_str(art_id)[1:-1]
        yield uri.replace(resource_prefix, '').replace('_', ' ')

# Overall corpus size: every page title in a table's provenance counts as one
# original table; distinct titles count as pages.
n_origtables = bag.map(table_pagetitles).flatten().count().compute()
n_pages = bag.map(table_pagetitles).flatten().distinct().count().compute()
print(f"Got a total of {n_origtables} original tables on {n_pages} pages")
print()

# Output directory for the exported tables; parents=True so a missing `fig/`
# directory does not make the cell fail.
root = Path('fig/wikicat')
root.mkdir(parents=True, exist_ok=True)

# Explicit dispatch table instead of eval(f"top_{kind}"). Insertion order is
# the order in which the statistics are computed and printed.
top_funcs = {
    'headers': top_headers,
    'coltypes': top_coltypes,
    'props': top_props,
    'pivots': top_pivots,
}

ntopics = len(topic_articles)
# kind -> topic -> Series of the top items of that kind
tops = {}
# Process topics from the smallest article set to the largest.
for ti, (topic, articles) in enumerate(sorted(topic_articles.items(), key=lambda ta: len(ta[1]))):
    article_titles = set(get_article_titles(articles))
    print(topic, f"({len(article_titles)} articles)", f"[{ti+1}/{ntopics}]")

    # Tables with at least one source page among this topic's articles.
    tables = bag.map_partitions(filter_tables, article_titles).persist()

    # Compute the flattened page titles once; both the distinct-title set and
    # the original-table count derive from the same list.
    origtitles = tables.map(table_pagetitles).flatten().compute()
    tabletitles = set(origtitles)
    matched_titles = tabletitles & article_titles
    print(f"{len(matched_titles)} article matches")
    print(list(matched_titles)[:10])

    ntables = tables.count().compute()
    norigtables = len(origtitles)
    ncols = tables.map(lambda t: t['numCols']).sum().compute()
    print(f'{ntables} supertables with {ncols} columns; {norigtables} original tables')

    if ntables:
        for kind, top_func in top_funcs.items():
            top = top_func(tables, pagetitle_filter=article_titles, n=20)
            tops.setdefault(kind, {})[topic] = top
            print(kind, dict(top.head(3)))

    print()

Got a total of 50456 original tables on 25699 pages

Economy (54204 articles) [1/10]
35 article matches
['Chiang Pin-kung', 'Dataram', 'Zemiology', 'List of countries by oil consumption', 'Legal Tender (song)', 'Stanley Aronowitz', 'List of countries by oil exports', 'David Flint', 'Takeo Fukuda', 'Forbes Global 2000']
29 supertables with 148 columns; 9094 original tables
headers {'Preceded by': 12, 'Succeeded by': 9, 'Location': 8}
coltypes {'person': 13, 'natural person': 13, 'human': 13}
props {'child': 19, 'father': 10, 'part of': 10}
pivots {'01/02': 2, '02/03': 2, '03/04': 2}

Military (59929 articles) [2/10]
208 article matches
['István Friedrich', 'Sam Manekshaw', 'Good Morning, Vietnam', 'Sundararajan Padmanabhan', 'Richard Vincent, Baron Vincent of Coleshill', 'John Quaife', 'Katakura Kagemitsu', 'Guthrum', 'Michael B. Donley', 'Johann Rudolf Pfyffer von Altishofen']
53 supertables with 280 columns; 8107 original tables
headers {'Preceded by': 162, 'Succeeded by': 131, 'Type'

In [11]:
# kind -> wide table with one (item, count) column pair per topic
top_combined = {}
for toptype, top in tops.items():
    # Column label for the item: 'headers' -> 'Header', 'props' -> 'Prop', ...
    dim = toptype[:-1].title()
    # rename_axis returns a renamed copy, so the Series stored in `tops` are
    # no longer modified in place.
    dfs = [
        s.rename_axis(dim).reset_index(name='n').assign(Top=t)
        for t, s in top.items()
    ]

    df = pd.concat(dfs)
    # Abbreviate long item names to keep the exported table narrow.
    df[dim] = df[dim].map(lambda x: x if len(x)<16 else x[:16] + '...')
    # The positional index left over from reset_index (rank within each topic)
    # becomes the row index; topics become a column level.
    df = df.pivot(columns=['Top'])
    df['n'] = df['n'].astype('Int64')
    # Blank out missing cells. Missing counts in the nullable Int64 columns
    # stringify as '<NA>' rather than 'nan', so both spellings are replaced.
    df = df.astype('str').replace(['nan', '<NA>'], '')
    df.columns = df.columns.swaplevel()
    df = df.sort_index(axis=1)
    # NOTE(review): `display` must be a module-like object exposing
    # `.display` (e.g. `from IPython import display`); no such import is
    # visible in this notebook, so this line depends on earlier kernel state.
    display.display(df.head())
    top_combined[toptype] = df
    df.to_latex(root.joinpath(toptype+'.tex'))

Top,Business,Business,Economy,Economy,Government,Government,Health,Health,History,History,Humanities,Humanities,Military,Military,Music,Music,Politics,Politics,Sports,Sports
Unnamed: 0_level_1,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n
0,Preceded by,48,Preceded by,12,Preceded by,550,Preceded by,163,Preceded by,96,Title,122,Preceded by,162,Year,343,Preceded by,130,Year,136
1,Succeeded by,41,Succeeded by,9,Succeeded by,435,Succeeded by,140,Succeeded by,60,Preceded by,120,Succeeded by,131,Title,222,Succeeded by,104,Preceded by,132
2,Year,14,Location,8,Name,39,Year,55,Name,16,Rank,108,Type,15,Chart,208,Name,25,Gold,101
3,Title,8,Traction Type,7,Party,37,Abbreviation,27,Date,13,Studio,84,Notes,15,Peak position,191,Year,20,Silver,101
4,Name,5,Date (From),7,#,32,Meaning,27,Year,11,Succeeded by,79,Name,14,Peak chart posit...,154,Party,19,Bronze,101


Top,Business,Business,Economy,Economy,Government,Government,Health,Health,History,History,Humanities,Humanities,Military,Military,Music,Music,Politics,Politics,Sports,Sports
Unnamed: 0_level_1,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n
0,person,59,person,13,person,567,human,222,person,109,human,143,natural person,169,release,467,human,150,human,316
1,natural person,59,natural person,13,natural person,567,natural person,222,natural person,109,person,143,human,169,single,283,person,150,natural person,316
2,human,59,human,13,human,567,person,222,human,109,natural person,143,person,169,discography,254,natural person,150,person,316
3,business,4,television progr...,2,United States di...,21,release,25,release,10,Wikimedia topic ...,85,armed organizati...,8,bibliography,253,release,29,sports festival,107
4,visual artwork,3,series of creati...,2,trial court,21,album,16,single,7,release,36,Wikipedia:Books,6,series of creati...,253,single,24,Wikimedia portal...,61


Top,Business,Business,Economy,Economy,Government,Government,Health,Health,History,History,Humanities,Humanities,Military,Military,Music,Music,Politics,Politics,Sports,Sports
Unnamed: 0_level_1,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n
0,child,84,child,19,child,866,child,273,child,116,child,158,child,259,performer,113,child,204,child,162
1,father,43,father,10,position held,444,position held,140,position held,60,position held,83,position held,132,followed by,61,position held,110,father,82
2,part of,42,part of,10,father,442,father,137,part of,60,part of,81,father,130,follows,61,father,104,position held,80
3,position held,41,position held,9,part of,424,part of,137,father,59,father,80,part of,129,child,47,part of,100,part of,80
4,follows,5,follows,3,follows,109,given name,33,follows,38,follows,40,follows,32,part of,42,follows,26,follows,50


Top,Business,Business,Economy,Economy,Government,Government,Health,Health,History,History,Humanities,Humanities,Military,Military,Music,Music,Politics,Politics,Sports,Sports
Unnamed: 0_level_1,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n
0,Domestic League,2,01/02,2,Senior status,11,Batting,4,Part 1,8,Part 1,8,In service ( ca ...,4,US,187,Chart (2008),5,League,26
1,Domestic Cup,2,02/03,2,Senior,10,Bowling,4,Part 2,8,Part 2,8,745,1,US Country,171,Chart (2003),4,Total,26
2,European Competi...,2,03/04,2,Secretary of Sta...,4,Billboard 200,3,Part 3,8,Part 3,8,810,1,UK,114,Total,3,League Cup,16
3,Other Tournament...,2,04/05,2,Attorney General...,4,UK,3,Part 4,8,Part 4,8,842,1,CAN Country,81,Chart (1971),2,1,12
4,Total,2,05/06,2,State Senate,4,Chart (2007),3,Speed over 1km,4,2011,5,959,1,CAN,71,Chart (2000),2,3,12
